Code example #1
File: sentiment.py  Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                sents = qs.query_all('sentence', ids=False)
            else:
                sents = qs.query_by_year(year, 'sentence', ids=False)

            if condition == 'all':
                sents = sents.exclude(text='').iterator()
            elif condition in ('empty', 'failed'):
                sents = sents.filter(metrics__sentiment={}).exclude(
                    text='').iterator()

            connections.close_all()
            tagger = taggers.SentimentTagger(settings, processes, sents)
            tagger.tag()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
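
The options read in handle() ('processes', 'condition', 'year') imply the command declares matching arguments. A minimal sketch of such a declaration, assuming Django's standard add_arguments hook; the actual flag names and defaults in sira-nlp may differ:

def add_arguments(self, parser):
    # Hypothetical: flag names inferred from the options[...] keys above.
    parser.add_argument('--processes', type=int, default=1,
                        help='Number of worker processes for the tagger.')
    parser.add_argument('--condition', default='all',
                        choices=['all', 'empty', 'failed'],
                        help='Which sentences to tag (the branches in handle()).')
    parser.add_argument('--year', type=int, default=0,
                        help='Restrict to one year; 0 means all years.')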
Code example #2
    def handle(self, *args, **options):  # pragma: no cover
        """

        """
        processes = options['processes']
        condition = options['condition']
        year = options['year']
        begin = dt.now()
        try:
            if year == 0:
                comms = qs.query_all('comment', ids=False)
            else:
                comms = qs.query_by_year(year, 'comment', ids=False)

            comms = comms.exclude(text='').iterator()

            connections.close_all()
            tagger = taggers.CommentLevelTagger(settings, processes, comms)
            tagger.tag()

        except KeyboardInterrupt:  # pragma: no cover
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
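
Both commands call connections.close_all() immediately before constructing a multiprocess tagger. A minimal sketch of the idiom, assuming the taggers fork worker processes (as the processes option suggests): database connections must not be shared across fork(), so the parent closes them and each child opens its own.

import multiprocessing

from django.db import connections

def run_parallel(worker, items, processes):
    # Close inherited connections so forked children open fresh ones;
    # sharing one database socket across processes corrupts the stream.
    connections.close_all()
    with multiprocessing.Pool(processes) as pool:
        return pool.map(worker, items)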
Code example #3
    def test_get_elapsed(self):
        data = (
            datetime.datetime(2017, 1, 1, 0, 0, 0, 0),
            datetime.datetime(2017, 1, 1, 0, 1, 0, 0),
        )
        expected = 1
        actual = helpers.get_elapsed(*data)
        self.assertEqual(expected, actual)
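
The test pins down the contract of helpers.get_elapsed: a one-minute difference yields 1. A minimal implementation consistent with that contract (hypothetical; the project's actual helper may differ):

import datetime

def get_elapsed(begin, end):
    """Return the time between two datetimes, in minutes."""
    return (end - begin).total_seconds() / 60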
Code example #4
File: tfidf.py  Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        # Grab the command line arguments.
        processes = options['processes']
        key = options['key']
        population = options['population']
        chunksize = options['chunksize']
        max_length = options['maxlength']
        top = options['top']
        random = options['random']

        begin = dt.now()
        try:
            info('tfidf Command')

            pop_review_ids = query_rIDs(population)
            pop_num_docs = len(pop_review_ids)
            info('  Population has {:,} reviews'.format(pop_num_docs))

            sample_review_ids = pop_review_ids
            if random is not None:
                sample_review_ids = get_random_sample(population,
                                                      pop_review_ids, random)
            sample_num_docs = len(sample_review_ids)

            info('  Computing the denominator of IDF')
            df = query_DF(pop_review_ids, key=key)

            info('  Computing the IDF in TF-IDF')
            idf = get_idf_dict(df, pop_num_docs, key=key)

            connections.close_all()  # Hack: let worker processes open their own connections.
            tfidfs = load_tfidf_dict(sample_review_ids,
                                     idf,
                                     processes,
                                     key=key)

            types = get_types(tfidfs, max_length, top)
            info('  {:,} types chosen'.format(len(types)))
            filepath = TFIDF_TOKENS_PATH if key == 'token' else TFIDF_LEMMAS_PATH
            write_csvs(tfidfs, filepath, chunksize, types)

            assert len(tfidfs) == sample_num_docs
        except KeyboardInterrupt:
            warning('Attempting to abort.')
        finally:
            info('Time: {:.2f} mins'.format(
                helpers.get_elapsed(begin, dt.now())))
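
For reference, the IDF step above turns each term's document frequency into an inverse-document-frequency weight. A plain, unsmoothed version might look like the following; the project's get_idf_dict may smooth or scale differently, and the name idf_from_df is hypothetical:

import math

def idf_from_df(df, num_docs):
    # Standard IDF: rarer terms (smaller df) receive larger weights.
    return {term: math.log(num_docs / freq) for term, freq in df.items()}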
Code example #5
File: sandbox.py  Project: andymeneely/sira-nlp
    def handle(self, *args, **options):
        """

        """
        processes = options['processes']
        begin = dt.now()
        try:
            review_ids = {year: [] for year in range(2008, 2017)}
            print("REVIEWS:")
            for year in review_ids:
                review_ids[year] = list(qs.query_by_year(year, 'review', ids=True))
                print("\t{0}: {1}".format(year, len(review_ids[year])))
                connections.close_all()


            comment_ids = {year: [] for year in range(2008, 2017)}
            message_ids = {year: [] for year in range(2008, 2017)}
            for year in review_ids:
                comment_ids[year] = list(qs.query_by_year(year, 'comment', ids=True))
                connections.close_all()
                message_ids[year] = list(qs.query_by_year(year, 'message', ids=True))
                connections.close_all()

            print("COMMENTS:")
            for k, v in comment_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            print("MESSAGES:")
            for k, v in message_ids.items():
                print("\t{0}: {1}".format(str(k), str(len(v))))

            comment_sentences_ids = {year: [] for year in range(2008, 2017)}
            message_sentences_ids = {year: [] for year in range(2008, 2017)}

            print("COMMENT_SENTENCES:")
            for year, ids in comment_ids.items():
                comments = Comment.objects.filter(id__in=ids)
                connections.close_all()
                for c in comments:
                    # flat=True yields bare IDs (not one-item tuples), so the
                    # membership tests against sentence IDs below can match.
                    comment_sentences_ids[year] += list(
                        c.sentences.values_list('id', flat=True))
                print("\t{0}: {1}".format(year, len(comment_sentences_ids[year])))

            print("MESSAGE_SENTENCES:")
            for year, ids in message_ids.items():
                messages = Message.objects.filter(id__in=ids)
                connections.close_all()
                for m in messages:
                    message_sentences_ids[year] += list(m.sentences.values_list('id'))
                print("\t{0}: {1}".format(str(year), str(len(message_sentences_ids[year]))))
#            for year, ids, in message_ids.items():
#                message_sentences_ids[year] = list(MessageSentences.objects.filter(message_id__in=ids).values_list('sentence_id', flat=True))
#                connections.close_all()
#                print("\t{0}: {1}".format(str(year), str(len(message_sentences_ids[year]))))

            sentences = list(qs.query_all('sentence', ids=False).values_list('id', 'text'))
            connections.close_all()

            orphans = {year: [] for year in range(2008, 2017)}
            duplicates = {year: [] for year in range(2008, 2017)}
            # Sets make the per-sentence membership checks O(1) instead of
            # scanning a list for every sentence/year combination.
            comment_sets = {y: set(v) for y, v in comment_sentences_ids.items()}
            message_sets = {y: set(v) for y, v in message_sentences_ids.items()}
            for sentence_id, _ in sentences:
                for year in review_ids:
                    in_comment = sentence_id in comment_sets[year]
                    in_message = sentence_id in message_sets[year]
                    if not in_comment and not in_message:
                        orphans[year].append(sentence_id)
                    elif in_comment and in_message:
                        duplicates[year].append(sentence_id)

            print("================")
            print("ORPHANS:")
            for year, ids in orphans.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            print("DUPLICATES:")
            for year, ids in duplicates.items():
                print("\t{0}: {1}".format(str(year), str(len(ids))))

            connections.close_all()

        except KeyboardInterrupt:
            logger.warning('Attempting to abort...')
        finally:
            logger.info('Time: {:.2f} minutes.'
                .format(helpers.get_elapsed(begin, dt.now())))