def __init__(self):
    self.unimass = deft(float)
    self.bimass = deft(float)
    self.unigrams = Counter()
    self.bigrams = Counter()
    self.umass = 0.0
    self.bmass = 0.0

def __init__(
    self,
    min_cluster=3,
    min_wfreq=3,
    max_wfreq=0.2,
    baselines=10,
    window=10,
    novel_ratio=0.75,
    doc_overlap=0.25,
    min_assoc_multiplier=1,
    verbose=False,
):
    self.min_cluster = min_cluster
    self.min_wfreq = min_wfreq
    self.max_wfreq = max_wfreq
    self.baselines = baselines
    # Note: the min_assoc_multiplier argument is stored as min_assoc.
    self.min_assoc = min_assoc_multiplier
    self.novel_ratio = novel_ratio
    self.doc_overlap = doc_overlap
    self.window = window
    self.bows = []
    self.window_by_wid = deft(set)
    self.is_feature = deft(bool)
    self.wfreq = Counter()
    self.ti = TermIndex('')
    self.bows_by_wid = deft(list)
    self.wids_by_bow = deft(list)
    self.assignments = deft(list)
    self.bowid_by_docid = dict([])
    self.verbose = verbose
    self.log = sys.stderr
    if self.verbose:
        self.log.write('%s\n' % str(self))

def clear(self):
    # Reset all per-corpus state so the instance can be reused.
    self.bows = []
    self.window_by_wid = deft(set)
    self.is_feature = deft(bool)
    self.wfreq = Counter()
    self.ti = TermIndex('')
    self.bows_by_wid = deft(list)
    self.wids_by_bow = deft(list)
    self.assignments = deft(list)
    self.bowid_by_docid = dict([])

def __init__(self, OUT, TMP, CONFIDENCE):
    self.src = OUT
    self.temp = '%s.smoothing.temp' % '.'.join(TMP.split('.')[:-2])
    self.confidence = CONFIDENCE
    self.posterior = Counter()
    self.prior = Counter()
    self.i = 0
    self.minimum = 0
    self.head = dict([])
    self.grams_by_line = deft(set)
    self.smoothed = deft(bool)
    self.regexs = dict([])

def make_inverted_index(mat):
    docids_by_dimid = deft(set)
    for docid, v in enumerate(mat):
        for dimid, w in enumerate(v):
            if w:
                docids_by_dimid[dimid].add(docid)
    return docids_by_dimid

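# Usage sketch (the toy 2x3 matrix is made up for illustration): every
# nonzero cell maps its column (dimension id) back to the set of row
# indices (document ids) that activate it.
#
#     idx = make_inverted_index([[1, 0, 2], [0, 0, 3]])
#     sorted(idx[2])    # -> [0, 1]: dimension 2 is nonzero in docs 0 and 1
#     sorted(idx[1])    # -> []: dimension 1 carries no weight anywhere
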
def unigram_frequencies(preprocessor, WORK, MAX_F, _n):
    freqDist = Counter()
    print 'Collecting unigram frequencies...'
    ndocs = 0
    for line in Streamer(WORK, n=_n):
        freqDist.update(set(preprocessor(line).split()))
        ndocs += 1
    freqBand = FrequencyBand(freqDist)
    print 'Determining frequency thresholds...'
    if isinstance(MAX_F, float):
        maxfreq = int(ndocs * MAX_F)
    elif isinstance(MAX_F, int):
        maxfreq = MAX_F
    else:
        maxfreq = freqBand.max_f()
    unigram_f = deft(bool)
    most_freq = freqDist.most_common()
    for i, (w, f) in enumerate(most_freq):
        # The system ignores the top k most frequent words: a True value
        # is stored only for words below the maximum frequency, since
        # those are the words that carry relevant, informative content.
        if f < maxfreq:
            unigram_f[w] = True
    print 'Done!'
    print freqBand
    print '%d words below maxfreq' % (
        len([w for w, boolean in unigram_f.items() if boolean])
    )
    return unigram_f, freqBand

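# A worked example of the threshold logic above (numbers invented):
# with MAX_F = 0.2 and ndocs = 1000, maxfreq = int(1000 * 0.2) = 200,
# so any word seen in 200 or more documents is treated as uninformative
# and never marked True in unigram_f. Passing MAX_F = 200 as an int
# pins the same cutoff directly; any other type defers to the
# FrequencyBand heuristic.
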
def __init__(
    self,
    streamer,
    nn=[1, 2, 3]
):
    self.nn = nn
    self.streamer = streamer
    self.words = TermIndex('words')
    self.lefts = deft(Counter)
    self.rights = deft(Counter)
    self.orders = dict([])
    for n in self.nn:
        self.orders[n] = deft(Counter)
        self.orders[-n] = deft(Counter)
    self.mass = deft(float)
    self.priors = Counter()

def __init__(
    self,
    stream,
    ngrams=[
        (1, False),
        (2, False),
        (3, False),
        (4, False),
        (5, False),
        (3, True),
        (4, True),
        (5, True)
    ],
    min_f=2,
    ratio_within=0.8,
    transversal_filter=None,
    wrapping_filter=None
):
    self.min_f = min_f
    self.ratio_within = ratio_within
    self.words = TermIndex('')
    self.ngrams = ngrams
    self.stream = stream
    self.priors = Counter()
    self.mass = 0.0
    self.posteriors = deft(Counter)
    self.masses = deft(float)
    self.skipped = deft(Counter)
    self.skippers = deft(Counter)
    # A defaultdict used directly as a default argument would be shared
    # (and mutated) across all instances; build a fresh one per
    # instance instead.
    if transversal_filter is None:
        transversal_filter = deft(bool)
    if wrapping_filter is None:
        wrapping_filter = deft(bool)
    self.transversal_filter = transversal_filter
    self.wrapping_filter = wrapping_filter
    self.rewrites = deft(tuple)
    self.relations = Counter()
    self.relaheads = Counter()

def __init__(self, ngrams=[], ch_ngrams=[], skip_ngrams=[], lang='en'):
    self.lang = lang
    self._ngrams = ngrams
    self._ch_ngrams = ch_ngrams
    self._skip_ngrams = skip_ngrams
    self.stopwords = deft(bool)
    for w in STOPWORDS[self.lang]:
        if w not in EXCEPTIONS[self.lang]:
            self.stopwords[w] = True

def __inside_extract(self):
    for n in [5, 4, 3]:
        processed = deft(bool)
        posteriors = [
            (gids, freq)
            for gids, freq in self.posteriors[(n, False)].items()
            if freq > 4
        ]
        for gids, freq in tqdm(posteriors):
            self.__extract_relations(processed, n, gids, freq)

def __init__(self, max_df=20000):
    self.df = Counter()
    self.tf = dict([])
    self.docs = 0.0
    self.mass = 0.0
    self.dmass = deft(float)
    self.word_by_id = dict([])
    self.id_by_word = dict([])
    self.max_df = max_df

def __count(self, X, features_to_deduct):
    # F counts how many items each new feature occurs in; I records the
    # indices of the items containing it.
    F = Counter()
    I = deft(list)
    for i, x in enumerate(X):
        all_features = set(x)
        new_features = all_features - features_to_deduct
        for f in new_features:
            F[f] += 1
            I[f].append(i)
    return F, I

def __init__(self):
    self.sentiment = deft(float)
    with open(SENTIWORDNET, 'rb') as rd:
        for l in rd:
            if not RECORD.match(l):
                continue
            row = l.decode('utf-8').strip().split('\t')
            # SentiWordNet columns: row[2] is PosScore, row[3] is
            # NegScore, row[4] holds the synset's terms; keep the first
            # lemma without its sense number.
            w = row[4].split('#')[0]
            pos = float(row[2])
            neg = float(row[3])
            self.sentiment[w] = pos - neg

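# Reading of the score (a sketch; the numbers are illustrative, not an
# actual SentiWordNet entry): a synset row with PosScore 0.625 and
# NegScore 0.125 yields sentiment 0.625 - 0.125 = 0.5 for its first
# lemma, so positive words end up above zero and negative words below.
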
def __init__(
    self,
    streamer,
    window=5,
    n=1000,
    prune_at=5000
):
    self.streamer = streamer
    self.window = window
    self.n = n
    self.prune_at = prune_at
    self.words = TermIndex('')
    self.features = TermIndex('')
    self.mass = 0.0
    self.cc_by_w = deft(list)
    self.qq_by_w = dict([])
    self.post_mass = deft(float)
    self.priors = Counter()
    self.ww_by_f = deft(set)
    self.ff_by_w = deft(set)
    self.types = deft(str)

def update(self, id_concept, vector):
    # Skip very short vectors; they are too sparse to be informative.
    if len(vector) < 20:
        return
    if not self.seen[id_concept]:
        self.vectors[id_concept] = deft(bool)
        self.seen[id_concept] = True
    _vector = self.vectors[id_concept]
    for word, freq in vector:
        _id = self.words(word)
        _vector[_id] += freq

def __call__(self, bow):
    hits = Counter()
    matched = deft(list)
    for w in set(bow):
        for i in self.aa_by_w[w]:
            if not self.invprob:
                hits[i] += 1
            else:
                hits[i] += self.invprob[w]
            matched[i].append(w)
    return [
        (f, i, self.labels[i], matched[i])
        for i, f in hits.most_common(10)
        if f
    ]

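# Usage sketch (the construction below is hypothetical; the class name
# and data are not from this file): called on a bag of words, the
# matcher returns up to ten (score, id, label, matched_words) tuples,
# where the score is a plain hit count unless an inverse-probability
# table was supplied, in which case rarer words weigh more.
#
#     m = Matcher(invprob)             # hypothetical construction
#     m(['acid', 'rain', 'cloud'])     # -> [(score, i, label, words), ...]
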
def __group_records(self, records):
    answers = deft(list)
    counts = Counter()
    for fields in records:
        score = int(fields[-1])
        topic = fields[3]
        question = fields[1]
        answer = fields[-2]
        # Keep at most two answers per question, and only scored ones.
        if counts[question] == 2:
            continue
        if score:
            answers[question].append(answer)
            counts[question] += 1
    return answers

def __init__(self, min_size=1, max_size=1, max_freq=0.5, min_freq=2,
             sorted=True, format='yx', itemtext=False):
    self.min_size = min_size
    self.max_size = max_size
    self.min_freq = min_freq
    self.max_freq = max_freq
    self.sorted = sorted
    self.format = format
    self.itemtext = itemtext
    self.cross_cluster_penalty = dict([])
    self.is_feature = deft(bool)

def __sort(self):
    out = []
    scores = deft(set)
    for key, value in self.assignments.items():
        scores[value[0]].add(key)
        feature_set = tuple(
            [self.ti[feature] for feature in cat_dedup(tuple(value))]
        )
        triple = (feature_set, key, value)
        out.append(triple)
    if not scores.values():
        upperbound = 0
    else:
        upperbound = max([len(x) for x in scores.values()])
    # Largest clusters first; the feature set breaks ties deterministically.
    out.sort(key=lambda x: (upperbound - len(scores[x[-1][0]]), x[0]))
    remainder = self.__get_remainder(set(self.assignments.keys()))
    return [
        (len(scores[z[0]]), x, self.bowid_by_docid[y])
        for x, y, z in out
    ] + remainder

def __update_cross_cluster_penalties(self, clusters):
    # Once penalties exist, recompute on only ~10% of calls.
    if self.cross_cluster_penalty.keys() \
            and random.random() >= 0.1:
        return
    n = 100
    V = set([])
    penalties = deft(set)
    if len(clusters) > n:
        _clusters = random.sample(clusters, n)
    else:
        _clusters = clusters
        n = len(clusters)
    for i, (_, _, X) in enumerate(_clusters):
        for x in X:
            for w in x:
                penalties[w].add(i)
    # float(n) guards against Python 2 integer division, which would
    # truncate every penalty to 0.
    self.cross_cluster_penalty = {
        feat: len(feat_clusters) / float(n)
        for feat, feat_clusters in penalties.items()
    }

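# Worked example of the penalty (numbers invented): if a feature turns
# up in 40 of the (at most) 100 sampled clusters, its cross-cluster
# penalty is 40 / 100.0 = 0.4, while a feature confined to a single
# cluster gets 0.01; features shared across many clusters are thus
# flagged as the least discriminative.
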
def __to_out(self, rX, X, clusters):
    positions_by_element = deft(set)
    for i, x in enumerate(X):
        positions_by_element[tuple(x)].add(i)
    original_by_position = {i: x for i, x in enumerate(rX)}
    out = self.__flatten_and_add_initial_positions(
        clusters, positions_by_element, original_by_position
    )
    if not self.sorted:
        out.sort(key=lambda x: x[1])
    if self.format == 'x':
        return [e for _, _, e in out]
    elif self.format == 'y':
        return [h for h, _, _ in out]
    elif self.format == 'xy':
        return [(e, h) for h, _, e in out]
    elif self.format == 'yx':
        return [(h, e) for h, _, e in out]
    else:
        exit('FATAL: unrecognized argument for \'format\' parameter.')

def __init__(
    self,
    dimensions=100,
    window=5,
    min_count=1,
    workers=1,
    # workers=cpu_count(),
    max_vocab_size=20000000,
    iter=5,
    sg=0
):
    self.dimensions = dimensions
    self.window = window
    self.min_count = min_count
    self.workers = workers
    self.max_vocab_size = max_vocab_size
    self.iter = iter
    self.sg = sg
    self.model = None
    self.observed = deft(bool)

def extract(self, merge=True):
    nn = sorted([n for n, skip in self.ngrams if not skip], reverse=True)
    snn = sorted([n for n, skip in self.ngrams if skip], reverse=True)
    covered = deft(bool)
    accepted = []
    for n in [5, 4, 3, 2]:
        if n <= 1:
            break
        for gids, freq in tqdm(self.posteriors[(n, False)].items()):
            if freq < self.min_f:
                continue
            if not self.__wrapping_filter(gids):
                continue
            # Probability of the n-gram and of its two (n-1)-gram
            # constituents (prefix and suffix).
            pxy = freq / self.masses[n]
            fx = self.posteriors[(n - 1, False)][gids[:n - 1]]
            fy = self.posteriors[(n - 1, False)][gids[-(n - 1):]]
            px = fx / self.masses[n - 1]
            py = fy / self.masses[n - 1]
            px_y = px * py
            gxy = [self.words[i] for i in gids]
            gx = [self.words[i] for i in gids[:n - 1]]
            gy = [self.words[i] for i in gids[-(n - 1):]]
            if covered[gids]:
                continue
            if (pxy >= px * self.ratio_within
                    and pxy >= py * self.ratio_within):
                _n = n - 1
                self.rewrites[tuple(gxy)] = tuple(gxy)
                # Mark every prefix/suffix split as covered so that
                # shorter grams subsumed by this one are skipped later.
                while _n:
                    _gids = gids[:_n]
                    gids_ = gids[_n:]
                    covered[_gids] = True
                    covered[gids_] = True
                    _n -= 1
    self.__inside_extract()

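# The acceptance test above keeps an n-gram only when it retains at
# least ratio_within of the probability mass of *both* of its
# (n-1)-gram constituents. Toy numbers (invented): with
# ratio_within = 0.8, the trigram "new york times" is accepted when
#
#     p("new york times") >= 0.8 * p("new york")
#     p("new york times") >= 0.8 * p("york times")
#
# i.e. when the shorter grams rarely occur outside the longer one.
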
def __init__(self, freqDist):
    self.bands = deft(set)
    for word, freq in freqDist.items():
        self.bands[freq].add(word)
    for i, band in enumerate(sorted(self.bands.keys(), reverse=True)):
        l = len(self.bands[band])
        # print i, band, len(self.bands[band])
        # metric 1
        # if i >= band:
        #     self.k = i
        #     self.f = band
        #     # break
        # metric 2
        # if len(self.bands[band]) >= band:
        #     self.k = i
        #     self.f = band
        #     break
        # metric 3: squared frequency depth
        if l >= band:
            # self.k = l
            self.f = i
            break

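# Worked example of metric 3 (an invented distribution): walking the
# frequency bands from highest to lowest, suppose band 500 holds 3
# words, band 120 holds 40, and band 90 holds 95. The loop stops at the
# first band whose size is at least the frequency itself (95 >= 90) and
# records that band's depth i as the cutoff self.f.
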
def __init__(self, categories_by_concept, terms, categories, tfidf,
             max_depth=5, min_df=20):
    self.min_df = min_df
    self.path_categories_by_concept = categories_by_concept
    self.path_categories = categories
    self.path_terms = terms
    self.max_depth = max_depth
    self.observed_category = deft(bool)
    self.id_by_concept = dict([])
    self.concept_by_id = dict([])
    self.term_is_category = deft(bool)
    self.parents_by_category = dict([])
    self.parents_by_concept = deft(list)
    self.id_by_term = dict([])
    self.term_by_id = dict([])
    self.has_parents = deft(bool)
    self.tfidf = tfidf
    self.pulling = set([])
    self.vector_by_category = deft(Counter)
    self.contributors_by_category = deft(set)
    self.projected = Counter()

        for w, f in freqs.items():
            vector[w] += f
        return {w: f for w, f in vector.items() if f > 2}

    # def __remove_header(self, section):
    #     return section


if __name__ == '__main__':
    streamer = Streamer()
    dist = Distribution('vector')
    category_index = TermIndex('category')
    variant_index = TermIndex('term')
    variants_by_concept = deft(list)
    categories_by_concept = deft(list)
    start = time.time()
    for i, article in enumerate(streamer.articles()):
        # if i <= 380000:
        #     if not i % 1000:
        #         print '...', i
        #     continue
        concept = article['concept']
        variant = article['variant']
        categories = article['categories']
        id_concept = variant_index(concept)

def __init__(self, invprob):
    self.aa_by_w = deft(list)
    self.labels = []
    self.n = 0
    self.invprob = invprob

import nltk
from nltk.corpus import wordnet as wn
from collections import defaultdict as deft

# High-level WordNet synsets excluded from processing.
EXCLUDED = deft(bool)
for w in [
    'entity.n.01', 'abstraction.n.06', 'physical_entity.n.01',
    'psychological_feature.n.01', 'event.n.01', 'abstraction.n.06',
    'state.n.02', 'communication.n.02', 'act.n.02', 'whole.n.02',
    'content.n.05', 'message.n.02', 'group.n.01', 'quality.n.01',
    'attribute.n.02', 'relation.n.01', 'cognition.n.01',
    'instrumentality.n.03', 'object.n.01', 'location.n.01',
    'fundamental_quantity.n.01', 'measure.n.02', 'artifact.n.01',
    'cognition.n.01', 'property.n.02', 'relation.n.01',
    'vertebrate.n.01', 'medium_of_exchange.n.01', 'legality.n.01',
    'change.n.03', 'fraction.n.03', 'process.n.06', 'action.n.01',
    'definite_quantity.n.01', 'chordate.n.01',
    'representational_process.n.01', 'sidereal_day.n.01', 'ideal.n.01',
    'repeat.n.01', 'one.n.01', 'grammatical_category.n.01', 'part.n.03',
    'psychological_state.n.01', 'assignment.n.01', 'out.n.01',
    'kind.n.01', 'type.n.06'
]:
    EXCLUDED[w] = True


class WordNet:

    def __init__(self):
def __init__(self, name):
    self.name = name
    self.words = TermIndex('vector.term')
    self.vectors = dict([])
    self.seen = deft(bool)

        elif not alnum:
            add_token(tokens, token, alnum)
            token = ''
        else:
            add_token(tokens, token, alnum)
            token = char
    if token:
        add_token(tokens, token, alnum)
    return tokens


def normalize(string):
    return unicode2ascii(string.strip().lower())


stopwords = deft(bool)
for w in STOPWORDS['en']:
    stopwords[w] = True


def words(text):
    tokens = [w for w in tokenizer(text.lower()) if w != 'PUNCT']
    # return tokens
    return [w for w in tokens if not stopwords[w]]


def f1(prec, recall):
    return (2 * prec * recall) / (prec + recall)


def average(values):
import re
from collections import defaultdict as deft

from nltk import (
    ngrams,
    sent_tokenize as splitter,
    wordpunct_tokenize as tokenizer
)
from nltk.corpus import stopwords
from nltk.probability import FreqDist

NUM = re.compile('[0-9]')
NON_ALPHA = re.compile('[^a-z]', re.IGNORECASE)

STOPWORDS = deft(bool)
for w in stopwords.words('english'):
    STOPWORDS[w] = True


def decode(string):
    try:
        return string.decode('utf-8')
    except Exception:
        return string


def encode(string):
    try:
        return string.encode('utf-8')
    except Exception: