def global_collocations_rejoin(cls, cluster_id, all_collocs):
    """
    Re-populate TestCollocations for every article in a cluster.

    :param cls: the article model class
    :param cluster_id: id of the cluster to re-populate
    :param all_collocs: set of collocation strings valid for the cluster
    """
    # `json`, `nlp` and `TestCollocations` are assumed to be module-level
    # imports, as elsewhere in this file.
    print 'Global re-population...'
    # Add collocations that exist in the article index but not in the table yet.
    for article in cls.objects.filter(cluster_id=cluster_id):
        index = json.loads(article.index)
        for colloc in all_collocs.intersection(index.keys()):
            # get_or_create because old collocations are not filtered out
            TestCollocations.objects.get_or_create(
                ngram=colloc, article=article,
                defaults={'count': index[colloc]})

    # The step above can leave counts stale, so recompute them per article.
    print 'Starting updates...'
    from axel.libs.utils import print_progress
    from axel.libs.nlp import _update_ngram_counts
    for article in print_progress(cls.objects.filter(cluster_id=cluster_id)):
        ngrams = sorted(article.testcollocations_set.values_list('ngram', 'count'),
                        key=lambda x: (x[1], x[0]))
        if not ngrams:
            continue
        index = json.loads(article.index)
        new_ngrams = nlp._generate_possible_ngrams(
            [tuple(c.split()) for c in zip(*ngrams)[0]], index)
        new_ngrams = _update_ngram_counts(new_ngrams, index)
        new_ngrams = sorted(new_ngrams.items(), key=lambda x: (x[1], x[0]))
        # Drop zero counts, matching the filtering done at extraction time.
        new_ngrams = [k for k in new_ngrams if k[1] > 0]
        if new_ngrams != ngrams:
            obsolete_ngrams = set(ngrams).difference(new_ngrams)
            # Guard against an empty set: zip(*[]) has no element 0.
            if obsolete_ngrams:
                article.testcollocations_set \
                    .filter(ngram__in=zip(*obsolete_ngrams)[0]).delete()
            for ngram, score in set(new_ngrams).difference(ngrams):
                TestCollocations.objects.create(ngram=ngram, count=score,
                                                article=article)
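# Usage sketch (not from the original module): `cls`, `cluster_id` and
# `all_collocs` were free variables in the original function, so the version
# above takes them as explicit parameters. `ArticleCollocations` is a
# hypothetical stand-in for the article model class, and building `all_collocs`
# as the union of every article's indexed n-grams is an assumption.
import json

def rejoin_cluster(cluster_id):
    all_collocs = set()
    for article in ArticleCollocations.objects.filter(cluster_id=cluster_id):
        all_collocs.update(json.loads(article.index).keys())
    global_collocations_rejoin(ArticleCollocations, cluster_id, all_collocs)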
def collocations(text, index, measures):
    """
    Extract collocations from an n-gram index.
    :type index: dict
    :rtype: generator of (measure name, ranked collocations) pairs
    """
    def filter_punkt(word):
        return _PUNKT_RE.match(word)

    def filter_len(word):
        # Drop short words unless they are all-caps (likely acronyms).
        return len(word) < 3 and not word.isupper()

    # Keep only bigrams with frequency > 2.
    bigram_index = dict([(tuple(k.split()), v) for k, v in index.iteritems()
                         if len(k.split()) == 2 and v > 2])

    # Use the abstract finder because we already have an index.
    finder = nltk.collocations.AbstractCollocationFinder(None, bigram_index)
    # Remove collocations consisting of two identical words.
    finder.apply_ngram_filter(lambda x, y: x == y)
    # Remove purely numeric collocations.
    finder.apply_ngram_filter(lambda x, y: nlp._DIGIT_RE.match(x)
                              and nlp._DIGIT_RE.match(y))
    # Remove punctuation, short words and stopwords.
    finder.apply_word_filter(filter_punkt)
    finder.apply_word_filter(filter_len)
    finder.apply_word_filter(lambda w: w in nlp._STOPWORDS)

    # Build the word frequency distribution (NLTK 2.x FreqDist API).
    from nltk.probability import FreqDist
    word_fd = FreqDist()
    for word in text.split():
        word_fd.inc(word)
    finder_big = nltk.collocations.BigramCollocationFinder(word_fd, finder.ngram_fd)

    filtered_collocs = _update_ngram_counts(
        _generate_possible_ngrams(finder.ngram_fd, index), index).items()
    filtered_collocs.sort(key=lambda col: col[1], reverse=True)
    # Drop zero scores so they are excluded from the other rankings as well.
    filtered_collocs = [col for col, score in filtered_collocs if score > 0]
    yield 'raw', filtered_collocs

    # Map every bigram to the longer n-grams that contain it.
    corr_dict = defaultdict(list)
    for ngram in filtered_collocs:
        for bigram in nltk.ngrams(ngram.split(), 2):
            corr_dict[bigram].append(ngram)

    # Score each n-gram with the best score among its constituent bigrams.
    for measure_name, measure_func in measures:
        bigrams = finder_big.score_ngrams(measure_func)
        scored_ngrams = defaultdict(lambda: 0)
        for bigram, score in bigrams:
            for ngram in corr_dict[bigram]:
                if scored_ngrams[ngram] < score:
                    scored_ngrams[ngram] = score
        scored_ngrams = scored_ngrams.items()
        scored_ngrams.sort(key=lambda col: col[1], reverse=True)
        # Guard against an empty result: zip(*[]) has no element 0.
        yield measure_name, (zip(*scored_ngrams)[0] if scored_ngrams else ())
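# Usage sketch (not from the original module): feed one article's text and
# JSON n-gram index through `collocations` with two standard NLTK association
# measures. `stripped_text` is a hypothetical attribute; the original only
# shows that `article.index` holds a JSON-encoded n-gram index.
import json
import nltk

def rank_article_collocations(article):
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    measures = [('pmi', bigram_measures.pmi),
                ('chi_sq', bigram_measures.chi_sq)]
    text = article.stripped_text        # hypothetical plain-text attribute
    index = json.loads(article.index)   # e.g. {'machine learning': 5, ...}
    for measure_name, ranked in collocations(text, index, measures):
        print measure_name, list(ranked)[:10]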