コード例 #1
0
ファイル: sentiment.py プロジェクト: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """Tag sentence-level sentiment for sentences chosen by year/condition.

        ``year == 0`` selects sentences from every year. Condition 'all'
        processes every non-empty sentence; 'empty'/'failed' restrict to
        non-empty sentences whose sentiment metric is still empty.
        Logs elapsed time (in minutes) on exit, even on Ctrl-C.
        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        start = dt.now()
        try:
            # Year 0 is the sentinel for "all years".
            sentences = (qs.query_all('sentence', ids=False)
                         if year == 0
                         else qs.query_by_year(year, 'sentence', ids=False))

            if condition == 'all':
                sentences = sentences.exclude(text='').iterator()
            elif condition in ('empty', 'failed'):
                sentences = (sentences
                             .filter(metrics__sentiment={})
                             .exclude(text='')
                             .iterator())

            # NOTE(review): presumably closed so the tagger's worker
            # processes open fresh DB connections — confirm.
            connections.close_all()
            taggers.SentimentTagger(settings, processes, sentences).tag()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            elapsed = helpers.get_elapsed(start, dt.now())
            logger.info('Time: {:.2f} minutes.'.format(elapsed))
コード例 #2
0
    def handle(self, *args, **options):  # pragma: no cover
        """Run comment-level tagging over all non-empty comments.

        ``year == 0`` selects comments from every year. The 'condition'
        option is read (and therefore required) but not otherwise used
        here. Logs elapsed time (in minutes) on exit, even on Ctrl-C.
        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        start = dt.now()
        try:
            # Year 0 is the sentinel for "all years".
            if year != 0:
                comments = qs.query_by_year(year, 'comment', ids=False)
            else:
                comments = qs.query_all('comment', ids=False)

            comments = comments.exclude(text='').iterator()

            # NOTE(review): presumably closed so the tagger's worker
            # processes open fresh DB connections — confirm.
            connections.close_all()
            taggers.CommentLevelTagger(settings, processes, comments).tag()

        except KeyboardInterrupt:  # pragma: no cover
            logger.warning('Attempting to abort...')
        finally:
            elapsed = helpers.get_elapsed(start, dt.now())
            logger.info('Time: {:.2f} minutes.'.format(elapsed))
コード例 #3
0
def get_mean_yngve(treestrings):
    """Return the mean Yngve score across a list of parse-tree strings.

    Each tree string is scored with ``yngve_redux``, which returns a
    ``(total_score, count)`` pair; the mean is the sum of all scores
    divided by the sum of all counts.

    Args:
        treestrings: list of parse-tree strings.

    Returns:
        The mean score as a float, or 0.0 when there is nothing to
        average (the division by zero is logged as a warning).

    Raises:
        ValueError: if ``treestrings`` is not a list.
    """
    # isinstance (rather than comparing type() to list) is the idiomatic
    # check and also accepts list subclasses.
    if not isinstance(treestrings, list):
        raise ValueError(
            'Input to get_mean_yngve() must be a list of strings.')

    total = 0
    count = 0
    for treestring in treestrings:
        score, trees = yngve_redux(treestring)
        total += score
        count += trees

    try:
        # True division already yields a float; no float() cast needed.
        return total / count
    except ZeroDivisionError:
        logger.warning('ZeroDivisionError for Yngve calculation.')
        return 0.0
コード例 #4
0
def do(iqueue, cqueue):  # pragma: no cover
    """Worker loop: compute baseline metrics for sentences pulled from a queue.

    Consumes ``(sentence, metrics)`` items from *iqueue* until the
    ``parallel.EOI`` end-of-input sentinel arrives, writes the requested
    baseline metrics into ``sent.metrics['baselines']``, saves the
    sentence inside a transaction, and reports ``(1, sent.id)`` on
    *cqueue* for each processed item.

    Args:
        iqueue: input queue of ``(sentence, metrics)`` pairs, terminated
            by ``parallel.EOI``.
        cqueue: output queue receiving ``(1, sent.id)`` per item, and
            ``parallel.DD`` once the sentinel is seen.
    """
    while True:
        item = iqueue.get()
        if item == parallel.EOI:
            # Acknowledge end-of-input so the coordinator can shut down.
            cqueue.put(parallel.DD)
            break

        (sent, metrics) = item
        with transaction.atomic():
            try:
                # (token, pos) pairs for this sentence.
                tokens = sent.token_set.all().values_list('token', 'pos')

                # Ensure the 'baselines' bucket exists before any of the
                # metric branches below writes into it.
                if metrics and 'baselines' not in sent.metrics:
                    sent.metrics['baselines'] = dict()

                if 'sent_length' in metrics:
                    sent.metrics['baselines']['length'] = tokens.count()
                if 'type_token_ratio' in metrics:
                    results = helpers.get_type_token_ratio(tokens)
                    sent.metrics['baselines']['type_token_ratio'] = results
                if 'pronoun_density' in metrics:
                    results = helpers.get_pronoun_density(tokens)
                    sent.metrics['baselines']['pronoun_density'] = results
                if 'flesch_kincaid' in metrics:
                    # Each item is a single sentence, hence sentcount=1.
                    toks = [t[0] for t in tokens]
                    results = calc_flesch_kincaid(
                            # wordcount, sentcount, syllcount
                            len(toks), 1, helpers.get_syllable_count(toks)
                        )
                    sent.metrics['baselines']['flesch_kincaid'] = results
                if 'stop_word_ratio' in metrics:
                    logger.warning("NotImplemented: 'stop_word_ratio'")
                if 'question_ratio' in metrics:
                    logger.warning("NotImplemented: 'question_ratio'")
                if 'conceptual_similarity' in metrics:
                    logger.warning("NotImplemented: 'conceptual_similarity'")

                sent.save()
            except Error as err:  # pragma: no cover
                # Log and continue with the next item rather than killing
                # the worker; the failed sentence is still acknowledged.
                sys.stderr.write('Exception\n')
                sys.stderr.write('  Sentence  {}\n'.format(sent.id))
                extype, exvalue, extrace = sys.exc_info()
                traceback.print_exception(extype, exvalue, extrace)

        cqueue.put((1, sent.id))
コード例 #5
0
ファイル: sandbox.py プロジェクト: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """Report per-year counts of reviews/comments/messages and their
        sentences, then identify orphaned and duplicated sentences.

        For a given year, a sentence is an *orphan* when it is attached
        to neither a comment nor a message of that year, and a
        *duplicate* when it is attached to both. Results are printed to
        stdout; elapsed time is logged on exit, even on Ctrl-C.
        """
        YEARS = range(2008, 2017)

        def _per_year():
            # Fresh {year: []} accumulator for each statistic, replacing
            # the repeated hand-written dict literal.
            return {year: [] for year in YEARS}

        processes = options['processes']
        begin = dt.now()
        try:
            review_ids = _per_year()
            print("REVIEWS:")
            for year in YEARS:
                review_ids[year] = list(
                    qs.query_by_year(year, 'review', ids=True))
                print("\t{0}: {1}".format(
                    str(year), str(len(review_ids[year]))))
                connections.close_all()

            comment_ids = _per_year()
            message_ids = _per_year()
            for year in review_ids:
                comment_ids[year] = list(
                    qs.query_by_year(year, 'comment', ids=True))
                connections.close_all()
                message_ids[year] = list(
                    qs.query_by_year(year, 'message', ids=True))
                connections.close_all()

            print("COMMENTS:")
            for year, ids in comment_ids.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            print("MESSAGES:")
            for year, ids in message_ids.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            # Sets, not lists: membership is tested once per
            # (sentence, year) pair below, and list membership would make
            # that loop quadratic in the number of sentences.
            comment_sentences_ids = {year: set() for year in YEARS}
            message_sentences_ids = {year: set() for year in YEARS}

            print("COMMENT_SENTENCES:")
            for year, ids in comment_ids.items():
                comments = Comment.objects.filter(id__in=ids)
                connections.close_all()
                for comment in comments:
                    # flat=True yields plain ids; without it, values_list
                    # returns 1-tuples that can never match the sentence
                    # ids compared against below (the original bug).
                    comment_sentences_ids[year].update(
                        comment.sentences.values_list('id', flat=True))
                print("\t{0}: {1}".format(
                    str(year), str(len(comment_sentences_ids[year]))))

            print("MESSAGE_SENTENCES:")
            for year, ids in message_ids.items():
                messages = Message.objects.filter(id__in=ids)
                connections.close_all()
                for message in messages:
                    message_sentences_ids[year].update(
                        message.sentences.values_list('id', flat=True))
                print("\t{0}: {1}".format(
                    str(year), str(len(message_sentences_ids[year]))))

            sentences = list(
                qs.query_all('sentence', ids=False).values_list(
                    'id', 'text'))
            connections.close_all()

            orphans = _per_year()
            duplicates = _per_year()
            # Years on the outer loop so the progress line prints once per
            # year rather than once per (sentence, year) pair.
            for year in review_ids:
                print("YEAR: {0}".format(str(year)))
                in_comments = comment_sentences_ids[year]
                in_messages = message_sentences_ids[year]
                for sentence_id, _text in sentences:
                    attached_c = sentence_id in in_comments
                    attached_m = sentence_id in in_messages
                    if not attached_c and not attached_m:
                        orphans[year].append(sentence_id)
                    elif attached_c and attached_m:
                        duplicates[year].append(sentence_id)

            print("================")
            print("ORPHANS:")
            for year, ids in orphans.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            print("DUPLICATES:")
            for year, ids in duplicates.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            connections.close_all()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'
                .format(helpers.get_elapsed(begin, dt.now())))