def __iter__(self):
    mode, kwargs = self._itermode, self._itermode_kwargs
    if mode == 'sent':
        if self.sentences is None:
            for i, item in messages.pbar(
                    enumerate(self._iter_sent(**kwargs))):
                yield item
            self.sentences = i + 1
            self._save_info()
        else:
            for item in messages.pbar(self._iter_sent(**kwargs),
                                      total=self.sentences):
                yield item
    elif mode == 'word':
        if self.tokens is None:
            for i, item in messages.pbar(
                    enumerate(self._iter_words(**kwargs))):
                yield item
            self.tokens = i + 1
            self._save_info()
        else:
            for item in messages.pbar(self._iter_words(**kwargs),
                                      total=self.tokens):
                yield item
    else:
        raise ValueError(mode)
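# Usage sketch (hypothetical setup): the attribute names below are the ones read
# by __iter__, but how _itermode is configured depends on the rest of the class
# (e.g. a setter or constructor arguments). On the first full pass the iterator
# counts items and caches the total (self.sentences / self.tokens), so later
# passes can show a bounded progress bar.
#
#   corpus._itermode, corpus._itermode_kwargs = 'sent', {}
#   for sentence in corpus:   # first pass: counts sentences, then calls _save_info()
#       process(sentence)
#   for sentence in corpus:   # later passes: pbar knows total=self.sentences
#       process(sentence)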
def add_heading(filename):
    # Prepend a "<n_vectors> <dimensions>" header line to an embeddings file.
    with codecs.open(filename, 'r', 'utf8') as filein:
        dim = len(next(filein).split(" ")) - 1
        words = 1
        for line in messages.pbar(filein):
            words += 1
    with codecs.open(filename+'_', 'w', 'utf8') as fileout:
        fileout.write(str(words) + ' ' + str(dim) + '\n')
        with codecs.open(filename, 'r', 'utf8') as filein:
            for line in messages.pbar(filein, total=words):
                fileout.write(line)
    os.remove(filename)
    os.rename(filename+'_', filename)
def _counts_to_db(self, counter, sep, commit_each=10000):
    messages.msg("Saving counter to sql database...")
    self.connect()
    rows = list()
    for i, (ngram, freq) in messages.pbar(enumerate(counter.items())):
        ngram = ngram.split(' ')
        n = len(ngram)
        row = [n, freq]
        for feat in self.feats:
            v = ' '.join([self._get_feat(w, feat, sep) for w in ngram])
            row.append(v)
        rows.append(row)
    self.insert_data(messages.pbar(rows), 'ngram_counts')
def lex_fixedness(ngram_db, variant_dict: dict, table: str, field: str,
                  m_field: str, save_every: int = 10000):
    # Lexical fixedness: how far an n-gram's association score deviates from the
    # scores of its lexical variants. save_every is an assumed flush interval
    # for partial writes.
    in_fld = ['rowid', field, m_field]
    output_db, n_max = initialize(ngram_db, table, in_fld)
    new_fld = [('lex_fixedness_' + m_field, 'float')]
    output_db.new_fields(table, new_fld)
    for n in range(2, n_max+1):
        messages.msg("Collecting statistics from {}grams...".format(n))
        lemma_pmis = ngram_db.to_dict('lemma_counts', ['lemma'], ['pmi'], n)
        ngram_freqs, N = ngram_db[n]
        fixedness_list = list()
        for i, (rowid, ngram, pmi) in enumerate(messages.pbar(ngram_freqs,
                                                              total=N)):
            V = sim(ngram, variant_dict)
            V_pmis = list()
            for v in V:
                try:
                    V_pmis.append(lemma_pmis[v])
                except KeyError:
                    pass
            if len(V_pmis) > 1:
                fixedness = (pmi - np.mean(V_pmis)) / np.var(V_pmis)
            else:
                fixedness = 0
            fixedness_list.append((rowid, fixedness))
            if (i+1) % save_every == 0:
                output_db.update_data(fixedness_list, table, new_fld[0][0])
                fixedness_list = list()
        output_db.update_data(fixedness_list, table, new_fld[0][0])
    ngram_db.disconnect()
    output_db.disconnect()
    del_and_rename(ngram_db, output_db)
    messages.done()
def compute(self, ngram_db, embeddings_db, table: str, field: str, save_every=-1):
    in_fld = ['rowid', field]
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    embeddings_db.connect()
    new_fld = [('comp_' + self.fun.__name__, 'float')]
    db_manager.new_fields(new_fld)
    self.sw_vecs = embeddings_db.to_dict(1)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(new_fld[0][0], n)
        ngrams, _ = db_manager.get_iterator(n)
        self.ngram_vecs = embeddings_db.to_dict(n)
        for i, (rowid, ngram) in enumerate(ngrams):
            self._upd(rowid, ngram)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    embeddings_db.disconnect()
    db_manager.finalize(self)
def _get_list(self, **dict_kwargs):
    if self.has_dict:
        out_list = self._dict_to_list(**dict_kwargs)
    else:
        out_list = self.default_list
    for item in messages.pbar(out_list):
        yield item
    self.default_list = list()
def remove_heading(filename):
    with codecs.open(filename, 'r', 'utf8') as filein:
        with codecs.open(filename+'_', 'w', 'utf8') as fileout:
            total = int(next(filein).split(' ')[0])
            for line in messages.pbar(filein, total=total):
                fileout.write(line)
    os.remove(filename)
    os.rename(filename+'_', filename)
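# Illustrative sketch (assumed file layout): each data line is "<entry> v1 ... vd".
# add_heading prepends a "<n_vectors> <dimensions>" header (the word2vec text
# format); remove_heading strips it again. The file name is hypothetical:
#
#   add_heading('vectors.txt')     # first line becomes e.g. "250000 300"
#   remove_heading('vectors.txt')  # back to headerless plain-text vectors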
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    indexer = AnnoyIndexer(vectors, num_trees=num_trees)
    sim_dict = dict()
    for w in messages.pbar(vectors.vocab):
        sim = indexer.most_similar(vectors.get_vector(w), topn)
        sim_dict[w] = [s for s in sim if s[1] > min_sim]
    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
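# Usage sketch, assuming gensim 3.x KeyedVectors (which expose the .vocab mapping
# iterated above); file names are illustrative:
#
#   from gensim.models import KeyedVectors
#   vectors = KeyedVectors.load_word2vec_format('vectors.txt')
#   create_sim_dict('sim_dict.pkl', vectors, min_sim=0.55, topn=10)
#   with open('sim_dict.pkl', 'rb') as filein:
#       sim_dict = pickle.load(filein)  # word -> [(neighbour, similarity), ...]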
def join_ngrams(filename, n, sep='_'):
    with codecs.open(filename, 'r', 'utf8') as filein:
        with codecs.open(filename+'_', 'w', 'utf8') as fileout:
            for line in messages.pbar(filein):
                line = line.split(' ')
                entry, vector = line[:n], line[n:]
                entry = sep.join(entry)
                line = ' '.join([entry] + vector)
                fileout.write(line)
    os.remove(filename)
    os.rename(filename+'_', filename)
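# Example of the rewrite performed by join_ngrams on a single line (n=2, sep='_'):
#
#   "strong tea 0.12 -0.30 0.07"  ->  "strong_tea 0.12 -0.30 0.07"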
def compute(self, ngram_db, table: str, save_every=-1):
    in_table = 'lex_context_counts'
    in_fld = ['max_' + self.measure]
    db_manager = DatabaseManager(ngram_db, in_table, in_fld, save_every=save_every)
    new_fields = [(self.agg_fun + '_' + self.measure, 'float')]
    db_manager.new_fields(new_fields, table=table)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(new_fields[0][0], n)
        ngrams_id = db_manager.to_list(table, ['rowid'], n)
        self.w_lpr, N = db_manager.get_iterator(n, pbar=False)
        self.n = n
        ngrams_id = messages.pbar(ngrams_id)
        for i, rowid in enumerate(ngrams_id):
            self._upd(rowid[0])
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, v_table: str):
    in_fld = [field, 'freq']
    db_manager = DatabaseManager(ngram_db, v_table, in_fld)
    new_fld = [('syn_entropy', 'float')]
    db_manager.new_fields(new_fld, table=table)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(new_fld[0][0], n)
        self.base_frqs = ngram_db.to_dict(table, [field], ['freq', 'rowid'], n)
        variant_freqs, _ = db_manager.get_iterator(n)
        for i, (v_base, v_freq) in enumerate(variant_freqs):
            self._upd(v_base, v_freq)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    in_fld = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every)
    new_fld = [('c_value', 'float')]
    db_manager.new_fields(new_fld)
    for n in messages.pbar(range(db_manager.n_max, 1, -1)):
        messages.computing_measure('c_value', n)
        ngram_frqs, N = db_manager.get_iterator(n)
        self.n = n
        for i, (rowid, ngram, freq) in enumerate(ngram_frqs):
            self._upd(rowid, ngram, freq)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
        self._swich_super()
    db_manager.finalize(self)
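# Presumed measure (the per-ngram update itself lives in self._upd, defined
# elsewhere): the classic C-value termhood score, which is why n-grams are
# processed from longest to shortest and super-term statistics are switched in
# after every length:
#
#   C-value(a) = log2(|a|) * f(a)                                      if a is not nested
#   C-value(a) = log2(|a|) * (f(a) - (1/|T_a|) * sum_{b in T_a} f(b))  otherwise
#
# with |a| the n-gram length, f(.) the corpus frequency and T_a the set of
# longer candidate terms containing a.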
def sub_separator(filename, old_sep, new_sep):
    with codecs.open(filename, 'r', 'utf8') as filein:
        heading = next(filein)
        total, dim = heading.split(' ')
        total, dim = int(total), int(dim)
        with codecs.open(filename+'_', 'w', 'utf8') as fileout:
            fileout.write(heading)
            for line in messages.pbar(filein, total=total):
                line = line.split(' ')
                entry, vector = ' '.join(line[:-dim]), line[-dim:]
                entry = re.sub('{}(?=[A-Z])'.format(old_sep), new_sep, entry)
                line = ' '.join([entry] + vector)
                fileout.write(line)
    os.remove(filename)
    os.rename(filename+'_', filename)
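# Example (illustrative): with old_sep='_' and new_sep='+', the lookahead
# '(?=[A-Z])' only rewrites separators that are followed by an upper-case
# character, so "New_York 0.1 0.2" becomes "New+York 0.1 0.2" while
# "ice_cream 0.1 0.2" is left unchanged.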
def entropy(ngram_db, variant_dict: dict, table: str, field: str):
    in_fld = ['rowid', field]
    output_db, n_max = initialize(ngram_db, table, in_fld)
    new_fld = [('lex_entropy', 'float')]
    output_db.new_fields(table, new_fld)
    calculator = LexEntCalculator(variant_dict)
    for n in range(2, n_max+1):
        messages.computing_measure(new_fld[0][0], n)
        var_frqs = ngram_db.to_dict(table, [field], ['freq'], n)
        calculator.set_var_frqs(var_frqs)
        base_freqs, N = ngram_db[n]
        for i, (rowid, ngram) in enumerate(messages.pbar(base_freqs, total=N)):
            calculator.upd_stats(rowid, ngram)
            save_every(output_db, calculator.get_list(), table, new_fld[0][0], i)
        output_db.update_data(calculator.get_list(), table, new_fld[0][0])
    finalize(ngram_db, output_db)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    in_fld = ['rowid', 'word', 'skipgram', field]
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    new_fields = [('max_' + field, 'float')]
    db_manager.new_fields(new_fields)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(new_fields[0][0], n)
        self.pre = db_manager.to_dict(table, ['word', 'skipgram'],
                                      ['max_' + field], n - 1)
        current, _ = db_manager.get_iterator(n)
        for i, (rowid, w, skipgram, value) in enumerate(current):
            self._upd(rowid, w, skipgram, value)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str):
    in_fld = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld)
    new_fld = [('fdp_' + m.__name__, 'float') for m in self.measure]
    db_manager.new_fields(new_fld)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        fld_str = ', '.join([f[0] for f in new_fld])
        messages.computing_measure(fld_str, n)
        self.n = n
        for i in range(1, n // 2 + 1):
            freq_12, N = db_manager.get_iterator(n)
            self.N = N
            to_dict_args = [table, [field], ['freq']]
            self._set_freq_dicts(db_manager, i, to_dict_args)
            for rowid, ngram, freq in freq_12:
                self._upd(rowid, ngram, freq, i)
            out_list = self._get_list()
            db_manager.save(out_list)
    db_manager.finalize(self)
def compute(self, ngram_db, save_every=-1):
    in_table = 'lex_context_counts'
    in_fld = ['rowid', 'word', 'skipgram', 'pred']
    db_manager = DatabaseManager(ngram_db, in_table, in_fld, save_every=save_every)
    new_fields = [('lpr', 'float')]
    db_manager.new_fields(new_fields)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('word lpr', n)
        self.syn_prob = db_manager.to_dict('syn_context_counts',
                                           ['word', 'skipos'], ['pred'], n)
        lex_prob, _ = db_manager.get_iterator(n)
        for i, (rowid, word, skipgram, prob) in enumerate(lex_prob):
            self._upd(rowid, word, skipgram, prob)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, save_every=-1):
    in_table = 'syn_context_counts'
    in_fld = ['rowid', 'skipos', 'freq']
    db_manager = DatabaseManager(ngram_db, in_table, in_fld, save_every=save_every)
    new_fields = [('pred', 'float')]
    db_manager.new_fields(new_fields)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('syntactic predictability', n)
        self.skipos_freqs = db_manager.to_dict('skipos_counts', ['skipos'],
                                               ['freq'], n)
        syn_c_freqs, self.N = db_manager.get_iterator(n)
        for i, (rowid, skipos, freq) in enumerate(syn_c_freqs):
            self._upd(rowid, skipos, freq)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    in_fld = [field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    new_table = 'syn_context_counts'
    new_fields = [('word', 'text'), ('skipos', 'text'),
                  ('length', 'int'), ('freq', 'int')]
    db_manager.new_table(new_table, new_fields)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('syntactic context freq', n)
        ngram_freqs, _ = db_manager.get_iterator(n)
        self.n = n
        for i, (ngram, freq) in enumerate(ngram_freqs):
            self._upd(ngram, freq)
            db_manager.save_every(self._get_list(add_n=True), i)
        db_manager.save(self._get_list(add_n=True))
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    in_fld = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    new_fld = [(m.__name__, 'float') for m in self.measure]
    db_manager.new_fields(new_fld)
    self.freq_1 = db_manager.to_dict(table, [field], ['freq'], 1)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        fld_str = ', '.join([f[0] for f in new_fld])
        messages.computing_measure(fld_str, n)
        freq_12, N = db_manager.get_iterator(n)
        self.n, self.N = n, N
        for i, (rowid, ngram, freq) in enumerate(freq_12):
            self._upd(rowid, ngram, freq)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    in_fld = [field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    new_table = 'lex_context_counts'
    new_fields = [('word', 'text'), ('skipgram', 'text'),
                  ('length', 'int'), ('pred', 'float')]
    db_manager.new_table(new_table, new_fields)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('lexical predictability', n)
        ngram_freqs, self.N = db_manager.get_iterator(n)
        self.n = n
        self.skipgram_freqs = db_manager.to_dict('skipgram_counts',
                                                 ['skipgram'], ['freq'], n)
        for i, (ngram, freq) in enumerate(ngram_freqs):
            self._upd(ngram, freq)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def save_as_text(self, filename: str, keys='all', join_values='\t',
                 ignore_compound=False):
    """Save the sentences yielded by this iterator in a text file.

    Parameters
    ----------
    filename: str
        Path of the output file.
    keys: list or str, optional
        The features of a word that are returned (default is 'all', which
        means that all the elements passed to the 'idx_dict' argument of
        the constructor are returned).
    join_values: str, default='\t'
        Each word is written as a string in which its features are joined
        by the value passed to this argument.
    """
    assert join_values
    with codecs.open(filename, 'w', 'utf8') as fileout:
        sentences = self._iter_sent(keys, join_values, ignore_compound)
        for sent in messages.pbar(sentences, total=self.sentences):
            fileout.write(' '.join(sent) + '\n')
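# Usage sketch (hypothetical corpus object and feature names):
#
#   corpus.save_as_text('corpus.txt', keys=['form', 'lemma', 'pos'], join_values='|')
#   # writes one sentence per line, e.g.:
#   #   The|the|DET cats|cat|NOUN sleep|sleep|VERB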
def get_iterator(self, n: int, pbar=True):
    it, N = self.ngram_db[n]
    if pbar:
        it = messages.pbar(it, total=N)
    return it, N