def set_position(self, x, y):
    """Record the offset from the widget's current position to (x, y).

    The offset is stored in self.config.delta; nothing is returned.
    """
    target = [x, y]
    current = self.rect[:2]
    if self.grid():
        # Snap-to-grid mode: compare both positions in grid units, not pixels.
        # NOTE(review): assumes mul_lists/sub_lists are element-wise
        # multiply/subtract helpers — confirm against their definitions.
        target = mul_lists(target, self.config.grid)
        current = mul_lists(current, self.config.grid)
    self.config.delta = sub_lists(target, current)
def call_rank(ranking_index, flattened, n_large, start_words=[], model=None):
    """ranking_index: 0 = TFIDF; 1 = C-value; 2 = C-value + Unigrams; 3 = TF

    Returns (top_phrases, filtered_frequencies, scored_phrases).
    """
    # NOTE(review): an identical call_rank definition appears again later in
    # this file; at import time the later one shadows this one — one copy
    # should be deleted.
    # NOTE(review): start_words=[] is a shared mutable default; it is only
    # read here, so the bug is latent rather than active.
    ranking_fn = ranking_fns[ranking_index]
    ranking_fn_name = ranking_fn_names[ranking_index]
    set_status('ranking with %s' % ranking_fn_name, model=model)
    if debug: print 'ranking with %s' % ranking_fn_name
    # ranking_fn maps the flattened phrase lists to
    # (phrase -> score, phrase -> frequency).
    scored_phrases, phrase_frequencies = ranking_fn(flattened)
    set_status('ordering', model=model)
    if debug: print 'ordering'
    # Sort phrases by score, best first.
    ordered_phrases = sorted(scored_phrases.iteritems(), key=lambda p: p[1],
                             reverse=True)
    # ordered_fname ='../phrase_lists/%s.phrases' % ranking_index
    # print 'writing ordered phrases to file %s' % ordered_fname
    # with open(ordered_fname, 'w') as f:
    #     for o in ordered_phrases[:n_large]:
    #         f.write('%s\n' % str(o))
    if debug: print 'mapping'
    ranked_phrases = [p[0] for p in ordered_phrases]
    if debug: print 'trimming large'
    large_phrases = ranked_phrases[:n_large]
    if start_words:
        if debug: print 'looking for start words', start_words
        # Surface the best-ranked phrase containing each start word, even
        # when it falls outside the top n_large.
        found_start_words = []
        for start_word in start_words:
            matches = (ranked_phrase for ranked_phrase in ranked_phrases
                       if start_word in sub_lists(ranked_phrase, proper=False))
            try:
                word = matches.next()
                if word not in large_phrases:
                    found_start_words.append(word)
            except StopIteration:
                if debug: print 'start word %s not found' % start_word
        if debug: print 'found start words', found_start_words
        top_phrases = found_start_words + large_phrases
    else:
        top_phrases = large_phrases
    # Restrict the frequency table to the phrases being returned.
    filtered_frequencies = dict(
        (phrase, freq) for (phrase, freq) in phrase_frequencies.items()
        if phrase in top_phrases)
    return top_phrases, filtered_frequencies, scored_phrases
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False,
        weight_by_length=True):
    """given a list of phrases, run the cnc algorithm and return a dictionary
    of word, c-value (ranking) pairs"""
    # NOTE(review): an identical cnc definition appears again later in this
    # file and shadows this one at import time — one copy should be deleted.
    # Bucket phrase counts by phrase length: length -> FreqDist of phrases.
    frequency_dists_by_length = {}
    for phrase in phrase_lists:
        l = len(phrase)
        if l not in frequency_dists_by_length:
            frequency_dists_by_length[l] = FreqDist()
        frequency_dists_by_length[l].inc(tuple(phrase))
    # word -> C-value(word)
    phrase_scores = {}
    # word -> num occurrences(word)
    phrase_frequencies = FreqDist()
    # word -> (t(word), c(word)): total nested frequency and number of
    # containing phrases, per the C-value formulation
    sub_phrase_scores = {}
    # traverse from longest phrases to shortest
    for length, frequency_dist in sorted(frequency_dists_by_length.items(), \
            key=lambda pair: pair[0], reverse=True):
        # update global frequency counts with all counts of this length
        phrase_frequencies.update(frequency_dist)
        # within each phrase length, traverse from most common phrases to least
        for phrase, frequency in frequency_dist.iteritems():
            if phrase in sub_phrase_scores:
                # Discount frequency contributed by longer containing phrases:
                # t/c = average nested frequency.
                t, c = sub_phrase_scores[phrase]
                subtractive = 1.0 / c * t
            else:
                subtractive = 0
            if weight_by_length:
                # log2 length weight; the +1 variant keeps unigrams from
                # collapsing to weight 0 (log(1, 2) == 0).
                if include_unigrams:
                    weight = log(length + 1, 2)
                else:
                    weight = log(length, 2)
            else:
                weight = 1
            c_value = weight * (frequency - subtractive)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                # Charge this phrase's frequency to every sub-phrase so later
                # (shorter) phrases get discounted.
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in sub_phrase_scores:
                        t, c = sub_phrase_scores[sub_phrase]
                    else:
                        t, c = 0, 0
                    sub_phrase_scores[sub_phrase] = t + frequency, c + 1
    return phrase_scores, phrase_frequencies
def cnc(phrase_lists, c_value_threshold=0, include_unigrams=False, weight_by_length=True):
    """Score candidate phrases with the C-value termhood measure.

    Returns (phrase_scores, phrase_frequencies): a dict mapping each phrase
    tuple to its C-value, and a FreqDist of raw phrase counts.
    """
    # Bucket phrase counts by phrase length: length -> FreqDist of phrases.
    by_length = {}
    for raw_phrase in phrase_lists:
        key = tuple(raw_phrase)
        size = len(key)
        if size not in by_length:
            by_length[size] = FreqDist()
        by_length[size].inc(key)

    phrase_scores = {}               # phrase -> C-value
    phrase_frequencies = FreqDist()  # phrase -> raw occurrence count
    nested_stats = {}                # phrase -> (total nested freq t, container count c)

    # Longest phrases first, so containers are scored before their sub-phrases.
    for length in sorted(by_length, reverse=True):
        dist = by_length[length]
        # Fold this length's counts into the global frequency table.
        phrase_frequencies.update(dist)
        # Most frequent first within a length (FreqDist iteration order).
        for phrase, frequency in dist.iteritems():
            if phrase in nested_stats:
                # Discount the average frequency this phrase owes to its
                # containing (longer) phrases.
                t, c = nested_stats[phrase]
                discount = 1.0 / c * t
            else:
                discount = 0
            if not weight_by_length:
                weight = 1
            elif include_unigrams:
                # +1 keeps unigrams from collapsing to weight 0.
                weight = log(length + 1, 2)
            else:
                weight = log(length, 2)
            c_value = weight * (frequency - discount)
            if c_value >= c_value_threshold:
                phrase_scores[phrase] = c_value
                # Charge this phrase's frequency to every sub-phrase so the
                # shorter phrases processed later get discounted.
                for sub_phrase in utils.sub_lists(phrase):
                    if sub_phrase in nested_stats:
                        t, c = nested_stats[sub_phrase]
                    else:
                        t, c = 0, 0
                    nested_stats[sub_phrase] = (t + frequency, c + 1)
    return phrase_scores, phrase_frequencies
def call_rank(ranking_index, flattened, n_large, start_words=[], model=None): """ranking_index: 0 = TFIDF; 1 = C-value; 2 = C-value + Unigrams; 3 = TF""" ranking_fn = ranking_fns[ranking_index] ranking_fn_name = ranking_fn_names[ranking_index] set_status('ranking with %s' % ranking_fn_name, model=model) if debug: print 'ranking with %s' % ranking_fn_name scored_phrases, phrase_frequencies = ranking_fn(flattened) set_status('ordering', model=model) if debug: print 'ordering' ordered_phrases = sorted(scored_phrases.iteritems(), key=lambda p: p[1], reverse=True) # ordered_fname ='../phrase_lists/%s.phrases' % ranking_index # print 'writing ordered phrases to file %s' % ordered_fname # with open(ordered_fname, 'w') as f: # for o in ordered_phrases[:n_large]: # f.write('%s\n' % str(o)) if debug: print 'mapping' ranked_phrases = [p[0] for p in ordered_phrases] if debug: print 'trimming large' large_phrases = ranked_phrases[:n_large] if start_words: if debug: print 'looking for start words', start_words found_start_words = [] for start_word in start_words: matches = (ranked_phrase for ranked_phrase in ranked_phrases if start_word in sub_lists(ranked_phrase, proper=False)) try: word = matches.next() if word not in large_phrases: found_start_words.append(word) except StopIteration: if debug: print 'start word %s not found' % start_word if debug: print 'found start words', found_start_words top_phrases = found_start_words + large_phrases else: top_phrases = large_phrases filtered_frequencies = dict((phrase, freq) for (phrase, freq) in phrase_frequencies.items() if phrase in top_phrases) return top_phrases, filtered_frequencies, scored_phrases