def handle(self, *args, **options):
    """Tag sentences with sentiment metrics, optionally restricted by
    year (0 means all years) and by condition.
    """
    processes = options['processes']
    condition = options['condition']
    year = options['year']

    begin = dt.now()
    try:
        if year == 0:
            sents = qs.query_all('sentence', ids=False)
        else:
            sents = qs.query_by_year(year, 'sentence', ids=False)

        if condition == 'all':
            sents = sents.exclude(text='').iterator()
        elif condition in ('empty', 'failed'):
            sents = (sents.filter(metrics__sentiment={})
                          .exclude(text='').iterator())

        connections.close_all()

        tagger = taggers.SentimentTagger(settings, processes, sents)
        tagger.tag()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))

def handle(self, *args, **options):  # pragma: no cover
    """Tag comments with comment-level metrics, optionally restricted by
    year (0 means all years).
    """
    processes = options['processes']
    condition = options['condition']
    year = options['year']

    begin = dt.now()
    try:
        if year == 0:
            comms = qs.query_all('comment', ids=False)
        else:
            comms = qs.query_by_year(year, 'comment', ids=False)
        comms = comms.exclude(text='').iterator()

        connections.close_all()

        tagger = taggers.CommentLevelTagger(settings, processes, comms)
        tagger.tag()
    except KeyboardInterrupt:  # pragma: no cover
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))

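
# --- Usage sketch (hypothetical, not part of the codebase) ---
# The two handle() methods above are Django management commands, so they can
# also be driven programmatically with call_command(). The command names
# 'sentiment' and 'commentlevel' are assumptions; substitute the actual
# module names under management/commands/.
from django.core.management import call_command

def retag_examples():  # hypothetical driver
    # Re-tag only sentences whose sentiment metrics are still empty, for 2015.
    call_command('sentiment', processes=4, year=2015, condition='empty')
    # Tag comment-level metrics for all years (year=0 means "all years").
    call_command('commentlevel', processes=4, year=0)
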
def get_mean_yngve(treestrings):
    """Average all of the Yngve scores for the given input."""
    if not isinstance(treestrings, list):
        raise ValueError(
                'Input to get_mean_yngve() must be a list of strings.')

    total = 0
    count = 0
    for treestring in treestrings:
        # yngve_redux() returns a (score, count) pair per tree string.
        results = yngve_redux(treestring)
        total += results[0]
        count += results[1]

    try:
        score = float(total / count)
    except ZeroDivisionError:
        logger.warning('ZeroDivisionError for Yngve calculation.')
        score = 0.0

    return score

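
# --- Usage sketch (hypothetical) ---
# A minimal example of the contract get_mean_yngve() relies on: the
# accumulation above implies yngve_redux() returns a (total score, word
# count) pair per tree string, so the mean is summed scores divided by
# summed counts. The parse strings below are illustrative only.
def mean_yngve_example():  # hypothetical
    treestrings = [
        '(ROOT (S (NP (PRP It)) (VP (VBZ works)) (. .)))',
        '(ROOT (S (NP (PRP I)) (VP (VBP agree)) (. .)))',
    ]
    mean = get_mean_yngve(treestrings)
    print('mean Yngve: {:.3f}'.format(mean))
    # get_mean_yngve('not a list')  # would raise ValueError
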
def do(iqueue, cqueue):  # pragma: no cover
    while True:
        item = iqueue.get()
        if item == parallel.EOI:
            cqueue.put(parallel.DD)
            break

        (sent, metrics) = item
        with transaction.atomic():
            try:
                tokens = sent.token_set.all().values_list('token', 'pos')
                if metrics and 'baselines' not in sent.metrics:
                    sent.metrics['baselines'] = dict()

                if 'sent_length' in metrics:
                    sent.metrics['baselines']['length'] = tokens.count()
                if 'type_token_ratio' in metrics:
                    results = helpers.get_type_token_ratio(tokens)
                    sent.metrics['baselines']['type_token_ratio'] = results
                if 'pronoun_density' in metrics:
                    results = helpers.get_pronoun_density(tokens)
                    sent.metrics['baselines']['pronoun_density'] = results
                if 'flesch_kincaid' in metrics:
                    toks = [t[0] for t in tokens]
                    results = calc_flesch_kincaid(
                            # wordcount, sentcount, syllcount
                            len(toks), 1, helpers.get_syllable_count(toks)
                        )
                    sent.metrics['baselines']['flesch_kincaid'] = results
                if 'stop_word_ratio' in metrics:
                    logger.warning("NotImplemented: 'stop_word_ratio'")
                if 'question_ratio' in metrics:
                    logger.warning("NotImplemented: 'question_ratio'")
                if 'conceptual_similarity' in metrics:
                    logger.warning("NotImplemented: 'conceptual_similarity'")

                sent.save()
            except Error as err:  # pragma: no cover
                sys.stderr.write('Exception\n')
                sys.stderr.write('  Sentence {}\n'.format(sent.id))
                extype, exvalue, extrace = sys.exc_info()
                traceback.print_exception(extype, exvalue, extrace)

        cqueue.put((1, sent.id))

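
# --- Driver sketch (hypothetical) ---
# do() implements a worker loop over a pair of multiprocessing queues:
# (sentence, metrics) work items arrive on iqueue, per-item progress tuples
# (1, sent.id) leave on cqueue, parallel.EOI marks end of input, and
# parallel.DD acknowledges shutdown. A minimal driver under those
# assumptions (the real taggers module presumably handles this):
import multiprocessing

def run_workers(sentences, metrics, num_workers):  # hypothetical
    iqueue = multiprocessing.Queue()
    cqueue = multiprocessing.Queue()
    workers = [multiprocessing.Process(target=do, args=(iqueue, cqueue))
               for _ in range(num_workers)]
    for worker in workers:
        worker.start()

    for sent in sentences:
        iqueue.put((sent, metrics))
    for _ in workers:
        iqueue.put(parallel.EOI)  # one end-of-input sentinel per worker

    finished = 0
    while finished < len(workers):
        item = cqueue.get()
        if item == parallel.DD:   # a worker saw its sentinel and exited
            finished += 1
        # other items are (1, sent.id) progress reports; ignored here

    for worker in workers:
        worker.join()
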
def handle(self, *args, **options):
    """Report sentences that are orphaned (attached to neither a comment
    nor a message) or duplicated (attached to both), per year.
    """
    processes = options['processes']

    begin = dt.now()
    try:
        years = range(2008, 2017)

        review_ids = {year: [] for year in years}
        print("REVIEWS:")
        for year in review_ids.keys():
            review_ids[year] = list(qs.query_by_year(year, 'review', ids=True))
            print("\t{0}: {1}".format(year, len(review_ids[year])))
        connections.close_all()

        comment_ids = {year: [] for year in years}
        message_ids = {year: [] for year in years}
        for year in review_ids.keys():
            comment_ids[year] = list(qs.query_by_year(year, 'comment', ids=True))
            connections.close_all()
            message_ids[year] = list(qs.query_by_year(year, 'message', ids=True))
            connections.close_all()

        print("COMMENTS:")
        for year, ids in comment_ids.items():
            print("\t{0}: {1}".format(year, len(ids)))
        print("MESSAGES:")
        for year, ids in message_ids.items():
            print("\t{0}: {1}".format(year, len(ids)))

        comment_sentences_ids = {year: [] for year in years}
        message_sentences_ids = {year: [] for year in years}

        print("COMMENT_SENTENCES:")
        for year, ids in comment_ids.items():
            comments = Comment.objects.filter(id__in=ids)
            connections.close_all()
            for c in comments:
                # flat=True yields plain ids; without it, values_list()
                # returns 1-tuples and the membership tests below never match.
                comment_sentences_ids[year] += list(
                        c.sentences.values_list('id', flat=True))
            print("\t{0}: {1}".format(year, len(comment_sentences_ids[year])))
        # Alternative via the through model:
        # for year, ids in comment_ids.items():
        #     comment_sentences_ids[year] = list(CommentSentences.objects.filter(
        #         comment_id__in=ids).values_list('sentence_id', flat=True))
        #     connections.close_all()
        #     print("\t{0}: {1}".format(year, len(comment_sentences_ids[year])))

        print("MESSAGE_SENTENCES:")
        for year, ids in message_ids.items():
            messages = Message.objects.filter(id__in=ids)
            connections.close_all()
            for m in messages:
                message_sentences_ids[year] += list(
                        m.sentences.values_list('id', flat=True))
            print("\t{0}: {1}".format(year, len(message_sentences_ids[year])))
        # Alternative via the through model:
        # for year, ids in message_ids.items():
        #     message_sentences_ids[year] = list(MessageSentences.objects.filter(
        #         message_id__in=ids).values_list('sentence_id', flat=True))
        #     connections.close_all()
        #     print("\t{0}: {1}".format(year, len(message_sentences_ids[year])))

        sentences = list(
                qs.query_all('sentence', ids=False).values_list('id', 'text'))
        connections.close_all()

        orphans = {year: [] for year in years}
        duplicates = {year: [] for year in years}
        for year in review_ids.keys():
            print("YEAR: {0}".format(year))
            # Set lookups keep each membership test O(1); scanning the id
            # lists once per sentence would be quadratic.
            comment_set = set(comment_sentences_ids[year])
            message_set = set(message_sentences_ids[year])
            for sentence in sentences:
                if (sentence[0] not in comment_set
                        and sentence[0] not in message_set):
                    orphans[year].append(sentence[0])
                elif (sentence[0] in comment_set
                        and sentence[0] in message_set):
                    duplicates[year].append(sentence[0])

        print("================")
        print("ORPHANS:")
        for year, ids in orphans.items():
            print("\t{0}: {1}".format(year, len(ids)))
        print("DUPLICATES:")
        for year, ids in duplicates.items():
            print("\t{0}: {1}".format(year, len(ids)))
        connections.close_all()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'
                    .format(helpers.get_elapsed(begin, dt.now())))