def _update_total_counts(self):
    print 'Update total counts:'
    self.StatsModel.all().update(count=0)
    total_counts = defaultdict(int)
    print 'Collecting new counts...'
    for ngram, count in print_progress(self.Model.values_list('ngram', 'count'), 10):
        total_counts[ngram] += count
    print 'Updating total counts...'
    for ngram, count in print_progress(total_counts.items(), 5):
        self.StatsModel.filter(ngram=ngram).update(count=count)
        self.Model.filter(ngram=ngram).update(total_count=count)
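
# A minimal, ORM-free sketch of the recount pattern above: aggregate counts
# per ngram in memory, then write the totals back in a second pass. The data
# here is made up; only the defaultdict aggregation mirrors the real code.
def _demo_update_total_counts():
    from collections import defaultdict
    rows = [('hash table', 3), ('hash table', 2), ('binary tree', 4)]
    total_counts = defaultdict(int)
    for ngram, count in rows:
        total_counts[ngram] += count
    for ngram, count in sorted(total_counts.items()):
        print ngram, count  # binary tree 4 / hash table 5
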
def global_collocations_rejoin(cls, cluster_id, all_collocs):
    """Re-populate TestCollocations for a cluster and rebuild their counts."""
    import json
    from axel.libs import nlp
    from axel.libs.nlp import _update_ngram_counts
    from axel.libs.utils import print_progress

    print 'Global re-population...'
    # add existing collocations if they do not exist yet;
    # TestCollocations is assumed to be imported at module level
    for article in cls.objects.filter(cluster_id=cluster_id):
        index = json.loads(article.index)
        for colloc in all_collocs.intersection(index.keys()):
            # get_or_create because we are not filtering out old ones
            TestCollocations.objects.get_or_create(ngram=colloc, article=article,
                                                   defaults={'count': index[colloc]})

    # we could have screwed up the counts completely, so update them
    print 'Starting updates...'
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        ngrams = sorted(article.testcollocations_set.values_list('ngram', 'count'),
                        key=lambda x: (x[1], x[0]))
        if not ngrams:
            continue
        index = json.loads(article.index)
        new_ngrams = nlp._generate_possible_ngrams(
            [tuple(c.split()) for c in zip(*ngrams)[0]], index)
        new_ngrams = _update_ngram_counts(new_ngrams, index)
        new_ngrams = sorted(new_ngrams.items(), key=lambda x: (x[1], x[0]))
        new_ngrams = [k for k in new_ngrams if k[1] > 0]
        if new_ngrams != ngrams:
            obsolete_ngrams = set(ngrams).difference(new_ngrams)
            if obsolete_ngrams:
                article.testcollocations_set \
                    .filter(ngram__in=zip(*obsolete_ngrams)[0]).delete()
            for ngram, score in set(new_ngrams).difference(ngrams):
                TestCollocations.objects.create(ngram=ngram, count=score, article=article)
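
# A hedged sketch of the diff step at the end of global_collocations_rejoin:
# given old and new (ngram, count) pairs, derive which rows to delete and
# which to create. Pure-Python stand-in, no database involved.
def _demo_ngram_diff():
    ngrams = [('hash', 2), ('hash table', 3)]
    new_ngrams = [('hash table', 3), ('open addressing', 1)]
    obsolete = set(ngrams).difference(new_ngrams)  # {('hash', 2)} -> delete
    created = set(new_ngrams).difference(ngrams)   # {('open addressing', 1)} -> create
    print sorted(obsolete), sorted(created)
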
def populate_article_dict_ML(model, cutoff=1):
    """
    :type model: Model
    """
    article_dict = defaultdict(dict)
    article_rel_dict = defaultdict(dict)
    for key, is_rel in model.judged_data.iteritems():
        ngram, article_id = key.split(',')
        is_rel = int(is_rel)
        article_rel_dict[article_id][ngram] = is_rel

    for article in print_progress(Article.objects.filter(cluster_id=model.CLUSTER_ID)):
        text = article.stemmed_text
        all_ngrams = list(model.objects.filter(article=article).values_list('ngram', flat=True))
        for ngram in sorted(model.objects.filter(article=article),
                            key=lambda x: len(x.ngram.split())):
            # participation: how many longer ngrams contain this one
            part_count = 0
            for p_ngram in all_ngrams:
                if p_ngram != ngram.ngram and ngram.ngram in p_ngram:
                    part_count += 1
            try:
                is_rel = article_rel_dict[unicode(article)][ngram.ngram]
            except KeyError:
                continue
            ngram_abs_count = text.count(ngram.ngram)
            if ngram_abs_count <= cutoff:
                continue
            collection_ngram = model.COLLECTION_MODEL.objects.get(ngram=ngram.ngram)
            article_dict[article][ngram.ngram] = {'is_rel': is_rel,
                                                  'ngram': ngram,
                                                  'collection_ngram': collection_ngram,
                                                  'participation_count': part_count}
    return article_dict
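
# Toy illustration of the judged_data key convention assumed above: keys look
# like '<ngram>,<article_id>' and map to a 0/1 relevance judgement.
def _demo_judged_key():
    from collections import defaultdict
    article_rel_dict = defaultdict(dict)
    for key, is_rel in {'hash table,42': '1'}.iteritems():
        ngram, article_id = key.split(',')
        article_rel_dict[article_id][ngram] = int(is_rel)
    print article_rel_dict['42']  # {'hash table': 1}
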
def get_scores(cls, queryset):
    """
    :param queryset: QuerySet
    :returns: accuracy scores for each method
    :rtype: defaultdict
    """
    relevant_names = set(queryset.filter(tags__is_relevant=True)
                         .values_list('ngram', flat=True))
    irrelevant_names = set(queryset.filter(tags__is_relevant=False)
                           .values_list('ngram', flat=True))
    unjudged = defaultdict(int)
    orderings = defaultdict(lambda: {'relevant': defaultdict(int),
                                     'irrelevant': defaultdict(int)})
    print 'Starting article processing...'
    df_dict = dict(queryset.values_list('ngram', '_df_score'))
    total_docs = Article.objects.filter(cluster_id=queryset.model.CLUSTER_ID).count()
    for article in print_progress(Article.objects.filter(cluster_id=queryset.model.CLUSTER_ID)):
        index = json.loads(article.index)
        # add TF-IDF score, forcing float division to avoid integer truncation
        ngrams = article.articlecollocation_set.values_list('ngram', 'count')
        tfidf_ordering = [(ngram, score * math.log(float(total_docs) / df_dict[ngram]))
                          for ngram, score in ngrams if ngram in df_dict]
        tfidf_ordering.sort(key=lambda x: x[1], reverse=True)
        cur_orderings = list(cls.collocations(article.stemmed_text, index, MEASURES))
        if tfidf_ordering:
            cur_orderings.append(('tf-idf', zip(*tfidf_ordering)[0]))
        for order_name, ordering in cur_orderings:
            for i, ngram in enumerate(ordering):
                if ngram in relevant_names:
                    orderings[order_name]['relevant'][i] += 1
                elif ngram in irrelevant_names:
                    orderings[order_name]['irrelevant'][i] += 1
                else:
                    # ngram was not judged at all
                    unjudged[i] += 1
    print 'End article processing...'
    print 'Starting result formatting...'
    graph_results = defaultdict(list)
    for order_name, results in orderings.iteritems():
        total_relevant = 0
        total_irrelevant = 0
        # walk ranks in order so the accuracy is cumulative over the ranking
        for rank in sorted(set(results['relevant']) | set(results['irrelevant'])):
            total_relevant += results['relevant'][rank]
            total_irrelevant += results['irrelevant'][rank]
            graph_results[order_name].append(
                (rank, round(float(total_relevant) /
                             (total_irrelevant + total_relevant), 3)))
    return graph_results
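
# Worked toy example of the cumulative accuracy-at-rank computed above: at
# each rank, accuracy = judged-relevant so far / all judged so far. The
# counts here are invented.
def _demo_accuracy_at_rank():
    relevant = {0: 2, 1: 1}    # rank -> relevant hits across articles
    irrelevant = {0: 0, 1: 1}  # rank -> irrelevant hits across articles
    total_rel = total_irrel = 0
    for (rank, rel), (_, irrel) in zip(sorted(relevant.items()),
                                       sorted(irrelevant.items())):
        total_rel += rel
        total_irrel += irrel
        print rank, round(float(total_rel) / (total_rel + total_irrel), 3)
    # prints: 0 1.0, then 1 0.75
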
def _punct_calculation(self):
    print "Calculating contingency tables for punctuation before/after ngrams"
    # cell layout: [valid & punct, invalid & punct, valid & no punct, invalid & no punct]
    pos_tag_prev = [0, 0, 0, 0]
    pos_tag_after = [0, 0, 0, 0]
    punct = {".", ",", ":", ";"}
    for article in print_progress(Article.objects.filter(cluster_id=self.cluster_id)):
        for ngram in self.Model.objects.filter(article=article):
            if ngram.ngram in self.article_rel_dict[unicode(article)][1]:
                if punct.intersection(zip(*ngram.pos_tag_prev)[0]):
                    pos_tag_prev[0] += 1
                else:
                    pos_tag_prev[2] += 1
                if punct.intersection(zip(*ngram.pos_tag_after)[0]):
                    pos_tag_after[0] += 1
                else:
                    pos_tag_after[2] += 1
            elif ngram.ngram in self.article_rel_dict[unicode(article)][0]:
                if punct.intersection(zip(*ngram.pos_tag_prev)[0]):
                    pos_tag_prev[1] += 1
                else:
                    pos_tag_prev[3] += 1
                if punct.intersection(zip(*ngram.pos_tag_after)[0]):
                    pos_tag_after[1] += 1
                else:
                    pos_tag_after[3] += 1

    print "Contingency table BEFORE:"
    print "       | Valid | Invalid | Total |"
    print "+punct | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_prev[0], pos_tag_prev[1], pos_tag_prev[0] + pos_tag_prev[1])
    print "-punct | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_prev[2], pos_tag_prev[3], pos_tag_prev[2] + pos_tag_prev[3])
    print "Totals | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_prev[0] + pos_tag_prev[2], pos_tag_prev[1] + pos_tag_prev[3],
        sum(pos_tag_prev))
    print
    print "Contingency table AFTER:"
    print "       | Valid | Invalid | Total |"
    print "+punct | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_after[0], pos_tag_after[1], pos_tag_after[0] + pos_tag_after[1])
    print "-punct | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_after[2], pos_tag_after[3], pos_tag_after[2] + pos_tag_after[3])
    print "Totals | {0:>5} | {1:>7} | {2:>5} |".format(
        pos_tag_after[0] + pos_tag_after[2], pos_tag_after[1] + pos_tag_after[3],
        sum(pos_tag_after))
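
# Toy version of the punctuation test above. pos_tag_prev is assumed to be a
# list of (token, POS-tag) pairs around an ngram occurrence; the check asks
# whether any of the surrounding tokens is a punctuation mark.
def _demo_punct_check():
    pos_tag_prev = [('.', '.'), ('The', 'DT')]
    has_punct = bool({'.', ',', ':', ';'}.intersection(zip(*pos_tag_prev)[0]))
    print has_punct  # True
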
def populate_article_dict(model, score_func, cutoff=1):
    """
    :type model: Model
    """
    article_dict = defaultdict(dict)
    article_rel_dict = defaultdict(dict)
    for key, is_rel in model.judged_data.iteritems():
        ngram, article_id = key.split(',')
        is_rel = int(is_rel)
        article_rel_dict[article_id][ngram] = is_rel

    for article in print_progress(Article.objects.filter(cluster_id=model.CLUSTER_ID)):
        text = article.stemmed_text
        # create correspondence dicts between bigram components
        corr_dict1 = defaultdict(set)
        corr_dict2 = defaultdict(set)
        all_ngrams = list(model.objects.filter(article=article).values_list('ngram', flat=True))
        for ngram in all_ngrams:
            if len(ngram.split()) == 2:
                w1, w2 = ngram.split()
                corr_dict1[w2].add(w1)
                corr_dict2[w1].add(w2)
        for ngram in sorted(model.objects.filter(article=article),
                            key=lambda x: len(x.ngram.split())):
            part_count = 0
            for p_ngram in all_ngrams:
                if p_ngram != ngram.ngram and ngram.ngram in p_ngram:
                    part_count += 1
            try:
                is_rel = article_rel_dict[unicode(article)][ngram.ngram]
            except KeyError:
                continue
            ngram_abs_count = text.count(ngram.ngram)
            if ngram_abs_count <= cutoff:
                continue
            collection_ngram = model.COLLECTION_MODEL.objects.get(ngram=ngram.ngram)
            score, ddict1, ddict2 = score_func(collection_ngram, ngram, text,
                                               article_dict[article], ngram_abs_count,
                                               corr_dict1, corr_dict2)
            nl_ngrams = [' '.join(n) for n in nltk.ngrams(ngram.ngram.split(), 2)]
            support_len = len(set(all_ngrams).intersection(nl_ngrams))
            article_dict[article][ngram.ngram] = {'abs_count': ngram_abs_count,
                                                  'score': score,
                                                  'is_rel': is_rel,
                                                  'count': ngram.count,
                                                  'ddict1': ddict1,
                                                  'ddict2': ddict2,
                                                  'collection_ngram': collection_ngram,
                                                  'ngram': ngram,
                                                  'len': support_len,
                                                  'participation_count': part_count}
    return article_dict
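
# Illustration of the participation and support statistics computed above,
# using plain strings instead of model instances (the ngrams are made up).
def _demo_participation():
    import nltk
    all_ngrams = ['hash', 'hash table', 'distributed hash table']
    ngram = 'distributed hash table'
    # participation: how many other extracted ngrams contain this one
    part_count = sum(1 for p in all_ngrams if p != ngram and ngram in p)  # 0
    # support: how many of this ngram's component bigrams were also extracted
    bigrams = [' '.join(n) for n in nltk.ngrams(ngram.split(), 2)]
    # bigrams == ['distributed hash', 'hash table'] -> overlap is {'hash table'}
    support_len = len(set(all_ngrams).intersection(bigrams))  # 1
    print part_count, support_len
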
def populate_wiki_index(cls, cluster_id):
    import networkx as nx
    from axel.stats.models import STATS_CLUSTERS_DICT
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        text = ''
        dbpedia_graph = article.dbpedia_graph(redirects=True)
        # take the largest connected component
        max_comp = max(nx.connected_components(dbpedia_graph), key=len)
        nodes = [node for node in max_comp if 'Category' not in node]
        stats_model = STATS_CLUSTERS_DICT[cluster_id]
        ngrams = stats_model.objects.filter(ngram__in=nodes)
        for ngram in ngrams:
            text += ngram.wikipedia_text + '\n'
        article.wiki_text_index = nlp.build_ngram_index(nlp.Stemmer.stem_wordnet(text))
        article.save()
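
# Minimal networkx example of the largest-component selection above; the
# graph contents are invented stand-ins for DBpedia nodes.
def _demo_largest_component():
    import networkx as nx
    g = nx.Graph()
    g.add_edges_from([('Hash_table', 'Hash_function'),
                      ('Hash_function', 'Category:Hashing')])
    g.add_node('Orphan')
    max_comp = max(nx.connected_components(g), key=len)
    nodes = [node for node in max_comp if 'Category' not in node]
    print sorted(nodes)  # ['Hash_function', 'Hash_table']
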
def _update_max_pos_tags(self):
    print 'Update max POS tags'
    self.StatsModel.all().update(_max_pos_tag=None)
    for c in print_progress(self.StatsModel.all(), 5):
        # accessing the property recomputes and caches the cleared value
        _ = c.max_pos_tag
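
# Sketch of the cache-warming pattern used above, assuming max_pos_tag is a
# property that computes once and stores its result in _max_pos_tag. The
# class and values here are illustrative, not the real model.
class _DemoNgram(object):
    _max_pos_tag = None

    @property
    def max_pos_tag(self):
        if self._max_pos_tag is None:
            self._max_pos_tag = 'NN'  # stand-in for the real computation
        return self._max_pos_tag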