def handle(self, *args, **options):
    """Run the sentiment tagger over sentences.

    Command-line options:
        processes: number of worker processes to use.
        condition: 'all' tags every sentence; 'empty' or 'failed' retags
            only sentences whose sentiment metrics are still empty.
        year: restrict tagging to sentences from this year; 0 means all.
    """
    processes = options['processes']
    condition = options['condition']
    year = options['year']

    begin = dt.now()
    try:
        # A year of 0 selects every year in the database.
        if year == 0:
            sentences = qs.query_all('sentence', ids=False)
        else:
            sentences = qs.query_by_year(year, 'sentence', ids=False)

        # Empty sentences carry no text worth tagging.
        if condition == 'all':
            sentences = sentences.exclude(text='').iterator()
        elif condition in ('empty', 'failed'):
            sentences = sentences.filter(
                    metrics__sentiment={}).exclude(text='').iterator()

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.SentimentTagger(settings, processes, sentences)
        tagger.tag()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
def handle(self, *args, **options):
    """Run the uncertainty tagger over sentences.

    Command-line options:
        processes: number of worker processes to use.
        population: population selector (passed through from the CLI).
        root: filesystem root handed to the tagger.
        year: restrict tagging to sentences from this year; None means all.
    """
    processes = options['processes']
    population = options['population']
    root = options['root']
    year = options['year']

    begin = dt.now()
    try:
        # No year supplied means "process every year".
        if year is None:
            sentences = qs.query_all('sentence', ids=False)
        else:
            sentences = qs.query_by_year(year, 'sentence', ids=False)
        sentences = sentences.iterator()

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.UncertaintyTagger(
                settings, processes, sentences, root)
        tagger.tag()
    except KeyboardInterrupt:
        warning('Attempting to abort.')
    finally:
        info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
def handle(self, *args, **options):
    """Run the source-code tagger over non-empty tokens.

    Command-line options:
        processes: number of worker processes to use.
        year: restrict tagging to tokens from this year; 0 means all years.
    """
    # NOTE(review): options['metrics'] was read but never used; dropped.
    processes = options['processes']
    year = options['year']

    begin = dt.now()
    try:
        # A year of 0 selects every year; empty tokens carry no text to tag.
        if year != 0:
            tokens = qs.query_by_year(
                    year, 'token', ids=False).exclude(token='').iterator()
        else:
            tokens = qs.query_all(
                    'token', ids=False).exclude(token='').iterator()

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.SourceCodeTagger(settings, processes, tokens)
        tagger.tag()
    except KeyboardInterrupt:  # pragma: no cover
        warning('Attempting to abort.')
    finally:
        info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
def handle(self, *args, **options):  # pragma: no cover
    """Run the comment-level tagger over non-empty comments.

    Command-line options:
        processes: number of worker processes to use.
        condition: condition selector (passed through from the CLI).
        year: restrict tagging to comments from this year; 0 means all.
    """
    processes = options['processes']
    condition = options['condition']
    year = options['year']

    begin = dt.now()
    try:
        # A year of 0 selects every year in the database.
        comments = (qs.query_all('comment', ids=False) if year == 0
                    else qs.query_by_year(year, 'comment', ids=False))
        # Empty comments carry no text worth tagging.
        comments = comments.exclude(text='').iterator()

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.CommentLevelTagger(settings, processes, comments)
        tagger.tag()
    except KeyboardInterrupt:  # pragma: no cover
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'.format(
                helpers.get_elapsed(begin, dt.now())))
def handle(self, *args, **options):
    """Run the metrics tagger over non-empty sentences.

    Command-line options:
        processes: number of worker processes to use.
        metrics: which metrics the tagger should compute.
        year: restrict tagging to sentences from this year; 0 means all.
    """
    processes = options['processes']
    metrics = options['metrics']
    year = options['year']

    begin = dt.now()
    try:
        # A year of 0 selects every year in the database.
        if year == 0:
            sentences = qs.query_all('sentence', ids=False)
        else:
            sentences = qs.query_by_year(year, 'sentence', ids=False)
        # Empty sentences carry no text worth measuring.
        sentences = sentences.exclude(text='')

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.MetricsTagger(
                settings, processes, sentences, metrics)
        tagger.tag()
    except KeyboardInterrupt:  # pragma: no cover
        warning('Attempting to abort.')
    finally:
        info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
def handle(self, *args, **options):
    """Run the politeness tagger over non-empty sentences.

    Command-line options:
        processes: number of worker processes to use.
        year: restrict tagging to sentences from this year; 0 means all.
    """
    # NOTE(review): options['population'] was read but never used; dropped.
    processes = options['processes']
    year = options['year']

    begin = dt.now()
    try:
        # A year of 0 selects every year; empty sentences have nothing
        # to tag.
        if year == 0:
            sents = qs.query_all(
                    'sentence', ids=False).exclude(text='').iterator()
        else:
            sents = qs.query_by_year(
                    year, 'sentence', ids=False).exclude(text='').iterator()

        # Drop inherited DB connections before the tagger forks workers.
        connections.close_all()
        tagger = taggers.PolitenessTagger(settings, processes, sents)
        tagger.tag()
    except KeyboardInterrupt:
        warning('Attempting to abort.')
    finally:
        info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
JSON_NULL = json.dumps(None)

# The two regular expressions that follow match components of the header text
# that is automatically inserted when a developer responds to a comment on
# Rietveld.
# E.g. On 2008/01/01 at 00:00:01, Raymond Reddington wrote:

# Match the date and time in header that is inserted to comment responses
# E.g. 2008/01/01 at 00:00:01
# Raw string: \d and \s in a non-raw literal are invalid escape sequences
# (a warning on modern Python).
DATE_TIME_RE = re.compile(
    r'(?P<date>\d{4}/\d{2}/\d{2})(?:\s|\sat\s)(?P<time>\d{2}:\d{2}:\d{2})')

# Match the name of the author in header that is inserted to comment response
# E.g. Raymond Reddington
AUTHOR_RE = re.compile(r', (.*) wrote:')

# All tokens in the database (lazy queryset; evaluated when iterated).
TOKENS = qs.query_all('token', ids=False)

# Uppercased first-, second-, and third-person pronouns used for
# pronoun-based lookups.
PRONOUNS_1 = [
    'I', 'ME', 'MYSELF', 'MY', 'MINE', 'WE', 'US', 'OURSELVES', 'OUR', 'OURS'
]
PRONOUNS_2 = ['YOU', 'YOURSELF', 'YOUR', 'YOURS', 'YOURSELVES']
PRONOUNS_3 = [
    'SHE', 'ITSELF', 'HER', 'HE', 'ITS', 'HIM', 'IT', 'HIMSELF', 'HERSELF',
    'HERS', 'HIS', 'THEY', 'THEM', 'THEMSELVES', 'THEIR', 'THEIRS'
]


def get_syllable_count(tokens):
    # NOTE(review): this function continues beyond the visible source;
    # the visible prefix is preserved verbatim.
    total = 0
    pron = []
    for token in tokens:
        word = token.strip("\n")
def handle(self, *args, **options):
    """Load the database: bugs, vulnerabilities, reviews, comments,
    messages, sentences, and tokens, then refresh the materialized views.

    Command-line options:
        processes: number of worker processes passed to loaders/taggers.
        year: restrict loading to this year; 0 means all configured years.
    """
    processes = options['processes']
    year = options['year']

    begin = dt.now()
    try:
        info('loaddb Command')
        info(' Years: {}'.format(settings.YEARS))

        # A non-zero year narrows the configured year list to that year.
        if year != 0:
            settings.YEARS = [year]

        # Load bugs, vulnerabilities, and reviews, then tag reviews that
        # missed a vulnerability. Order matters: later loaders depend on
        # the rows created by earlier ones.
        loader = loaders.BugLoader(settings, processes)
        count = loader.load()
        info(' {:,} bugs loaded'.format(count))

        loader = loaders.VulnerabilityLoader(settings, processes)
        count = loader.load()
        info(' {:,} vulnerabilities loaded'.format(count))

        loader = loaders.ReviewLoader(settings, processes)
        count = loader.load()
        info(' {:,} reviews loaded'.format(count))

        tagger = taggers.MissedVulnerabilityTagger(settings, processes)
        count = tagger.tag()
        info(' {:,} reviews missed a vulnerability'.format(count))

        # Collect the ids of the reviews just loaded; the remaining
        # loaders operate per-review.
        if year != 0:
            ids = qs.query_by_year(year, 'review', True)
        else:
            ids = qs.query_all('review', True)

        # close_all() before each multiprocess step so workers do not
        # inherit (and corrupt) the parent's DB connections.
        connections.close_all()  # Hack

        # Comments
        loader = loaders.CommentLoader(settings, processes, ids)
        count = loader.load()
        info(' {:,} comments loaded'.format(count))
        connections.close_all()  # Hack
        loader = loaders.SentenceCommentLoader(settings, processes, ids)
        count = loader.load()
        info(' {:,} sentences loaded'.format(count))
        connections.close_all()  # Hack
        tagger = taggers.UsefulCommentTagger(settings, processes, ids)
        count = tagger.tag()
        info(' {:,} comments were useful'.format(count))

        # Messages
        connections.close_all()  # Hack
        loader = loaders.MessageLoader(settings, processes, ids)
        count = loader.load()
        info(' {:,} messages loaded'.format(count))
        connections.close_all()  # Hack
        loader = loaders.SentenceMessageLoader(settings, processes, ids)
        count = loader.load()
        info(' {:,} sentences loaded'.format(count))
        connections.close_all()  # Hack

        # Tokens
        loader = loaders.TokenLoader(settings, processes, ids)
        count = loader.load()
        info(' {:,} tokens loaded'.format(count))

        # Token/lemma views aggregate the freshly loaded tokens; refresh
        # them so queries see the new data.
        with connection.cursor() as cursor:
            cursor.execute(
                'REFRESH MATERIALIZED VIEW {};'.format('vw_review_token'))
            cursor.execute(
                'REFRESH MATERIALIZED VIEW {};'.format('vw_review_lemma'))
    except KeyboardInterrupt:  # pragma: no cover
        warning('Attempting to abort.')
    finally:
        info('Time: {:.2f} mins'.format(get_elapsed(begin, dt.now())))
def handle(self, *args, **options):
    """Report per-year counts of reviews, comments, messages, and their
    sentences, plus sentences that are orphaned (linked to neither a
    comment nor a message) or duplicated (linked to both) for each year.

    Command-line options:
        processes: number of worker processes (unused by this report).
    """
    processes = options['processes']
    begin = dt.now()
    try:
        years = list(range(2008, 2017))

        review_ids = {y: [] for y in years}
        print("REVIEWS:")
        for year in years:
            review_ids[year] = list(
                    qs.query_by_year(year, 'review', ids=True))
            print("\t{0}: {1}".format(str(year), str(len(review_ids[year]))))
        connections.close_all()

        comment_ids = {y: [] for y in years}
        message_ids = {y: [] for y in years}
        for year in years:
            comment_ids[year] = list(
                    qs.query_by_year(year, 'comment', ids=True))
            connections.close_all()
            message_ids[year] = list(
                    qs.query_by_year(year, 'message', ids=True))
            connections.close_all()
        print("COMMENTS:")
        for k, v in comment_ids.items():
            print("\t{0}: {1}".format(str(k), str(len(v))))
        print("MESSAGES:")
        for k, v in message_ids.items():
            print("\t{0}: {1}".format(str(k), str(len(v))))

        # Sets (not lists): membership is tested once per sentence per
        # year below, and list membership would make that loop quadratic.
        comment_sentences_ids = {y: set() for y in years}
        message_sentences_ids = {y: set() for y in years}

        print("COMMENT_SENTENCES:")
        for year, ids in comment_ids.items():
            comments = Comment.objects.filter(id__in=ids)
            connections.close_all()
            for c in comments:
                # flat=True yields plain ids. Without it values_list
                # returns 1-tuples, which can never equal a sentence id,
                # so every sentence was reported as an orphan.
                comment_sentences_ids[year].update(
                        c.sentences.values_list('id', flat=True))
            print("\t{0}: {1}".format(
                    str(year), str(len(comment_sentences_ids[year]))))

        print("MESSAGE_SENTENCES:")
        for year, ids in message_ids.items():
            messages = Message.objects.filter(id__in=ids)
            connections.close_all()
            for m in messages:
                message_sentences_ids[year].update(
                        m.sentences.values_list('id', flat=True))
            print("\t{0}: {1}".format(
                    str(year), str(len(message_sentences_ids[year]))))

        sentences = list(
                qs.query_all('sentence', ids=False).values_list('id', 'text'))
        connections.close_all()

        orphans = {y: [] for y in years}
        duplicates = {y: [] for y in years}
        # Year is the outer loop so the progress line prints once per
        # year instead of once per sentence per year.
        for year in years:
            print("YEAR: {0}".format(str(year)))
            in_comments = comment_sentences_ids[year]
            in_messages = message_sentences_ids[year]
            for sentence in sentences:
                sid = sentence[0]
                if sid not in in_comments and sid not in in_messages:
                    orphans[year].append(sid)
                elif sid in in_comments and sid in in_messages:
                    duplicates[year].append(sid)

        print("================")
        print("ORPHANS:")
        for year, ids in orphans.items():
            print("\t{0}: {1}".format(str(year), str(len(ids))))
        print("DUPLICATES:")
        for year, ids in duplicates.items():
            print("\t{0}: {1}".format(str(year), str(len(ids))))
        connections.close_all()
    except KeyboardInterrupt:
        logger.warning('Attempting to abort...')
    finally:
        logger.info('Time: {:.2f} minutes.'
                    .format(helpers.get_elapsed(begin, dt.now())))