def compute(self, ngram_db, embeddings_db, table: str, field: str, save_every=-1):
    """Fill the ``'comp_' + self.fun.__name__`` column for n = 2 .. n_max.

    A ``DatabaseManager`` is opened on ``table`` reading ``rowid`` and
    ``field``.  Vectors come from ``embeddings_db``: ``to_dict(1)`` is loaded
    once into ``self.sw_vecs`` and ``to_dict(n)`` is loaded into
    ``self.ngram_vecs`` for every ``n``.  Each row is handed to
    ``self._upd`` and the accumulated list from ``self._get_list()`` is
    passed to the manager's ``save_every`` / ``save``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        embeddings_db: object exposing ``connect``, ``to_dict`` and
            ``disconnect``.
        table: table the rows are read from.
        field: column read together with ``rowid``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    in_fld = ['rowid', field]
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    embeddings_db.connect()
    try:
        new_fld = [('comp_' + self.fun.__name__, 'float')]
        db_manager.new_fields(new_fld)
        self.sw_vecs = embeddings_db.to_dict(1)
        for n in messages.pbar(range(2, db_manager.n_max + 1)):
            messages.computing_measure(new_fld[0][0], n)
            ngrams, _ = db_manager.get_iterator(n)
            self.ngram_vecs = embeddings_db.to_dict(n)
            for i, (rowid, ngram) in enumerate(ngrams):
                self._upd(rowid, ngram)
                db_manager.save_every(self._get_list(), i)
            db_manager.save(self._get_list())
    finally:
        # Release the embeddings connection even when a step above raises;
        # previously an exception left it connected.
        embeddings_db.disconnect()
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, v_table: str):
    """Fill the ``syn_entropy`` column of ``table`` for n = 2 .. n_max.

    Rows ``(field, 'freq')`` are read from ``v_table`` through a
    ``DatabaseManager``; the new float column is registered on ``table``.
    For every ``n`` a lookup built by ``ngram_db.to_dict`` (keyed on
    ``field``, carrying ``freq`` and ``rowid``) is stored in
    ``self.base_frqs`` before the rows are fed to ``self._upd``.

    Args:
        ngram_db: database handle; also queried directly via ``to_dict``.
        table: table that receives the new column.
        field: column used as key / first input column.
        v_table: table the manager iterates over.
    """
    source_columns = [field, 'freq']
    db_manager = DatabaseManager(ngram_db, v_table, source_columns)

    target_columns = [('syn_entropy', 'float')]
    db_manager.new_fields(target_columns, table=table)

    lengths = range(2, db_manager.n_max + 1)
    for n in messages.pbar(lengths):
        messages.computing_measure(target_columns[0][0], n)
        self.base_frqs = ngram_db.to_dict(table, [field], ['freq', 'rowid'], n)
        variant_rows, _ = db_manager.get_iterator(n)
        for v_base, v_freq in variant_rows:
            self._upd(v_base, v_freq)
        # One write per n, after every variant row has been folded in.
        db_manager.save(self._get_list())

    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    """Fill the ``c_value`` column, walking n from n_max down to 2.

    Rows ``(rowid, field, freq)`` are read from ``table`` through a
    ``DatabaseManager``.  For each ``n`` the current value is stored in
    ``self.n``, every row goes through ``self._upd``, results from
    ``self._get_list()`` are written, and ``self._swich_super()`` is called
    before moving on to the next (smaller) ``n``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table read from and extended with the new column.
        field: column read together with ``rowid`` and ``freq``.
        save_every: passed positionally as the manager's fourth argument.
    """
    columns = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, columns, save_every)
    db_manager.new_fields([('c_value', 'float')])

    descending = range(db_manager.n_max, 1, -1)
    for n in messages.pbar(descending):
        messages.computing_measure('c_value', n)
        rows, _ = db_manager.get_iterator(n)
        self.n = n
        for idx, row in enumerate(rows):
            rowid, ngram, freq = row
            self._upd(rowid, ngram, freq)
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())
        self._swich_super()

    db_manager.finalize(self)
def entropy(ngram_db, variant_dict: dict, table: str, field: str):
    """Fill the ``lex_entropy`` column of ``table`` for n = 2 .. n_max.

    ``initialize`` supplies the output database and ``n_max``.  A
    ``LexEntCalculator`` built from ``variant_dict`` receives, for each
    ``n``, the frequency lookup from ``ngram_db.to_dict`` and then every
    ``(rowid, ngram)`` row obtained from ``ngram_db[n]``.

    Args:
        ngram_db: input database; indexed by ``n`` and queried via ``to_dict``.
        variant_dict: mapping handed to ``LexEntCalculator``.
        table: table read from and written to.
        field: column read together with ``rowid``.
    """
    input_columns = ['rowid', field]
    output_db, n_max = initialize(ngram_db, table, input_columns)

    output_columns = [('lex_entropy', 'float')]
    output_db.new_fields(table, output_columns)

    calculator = LexEntCalculator(variant_dict)
    for n in range(2, n_max + 1):
        messages.computing_measure(output_columns[0][0], n)
        calculator.set_var_frqs(ngram_db.to_dict(table, [field], ['freq'], n))

        base_freqs, N = ngram_db[n]
        progress = messages.pbar(base_freqs, total=N)
        for idx, row in enumerate(progress):
            rowid, ngram = row
            calculator.upd_stats(rowid, ngram)
            save_every(output_db, calculator.get_list(), table,
                       output_columns[0][0], idx)

        # Final write for this n.
        output_db.update_data(calculator.get_list(), table, output_columns[0][0])

    finalize(ngram_db, output_db)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    """Fill the ``'max_' + field`` column of ``table`` for n = 2 .. n_max.

    Rows ``(rowid, word, skipgram, field)`` are read through a
    ``DatabaseManager``.  Before each pass, the ``max_<field>`` values stored
    for ``n - 1`` (keyed on ``word`` and ``skipgram``) are loaded into
    ``self.pre``; each row is then given to ``self._upd``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table read from and extended with the new column.
        field: name of the value column; also the suffix of the new column.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    columns = ['rowid', 'word', 'skipgram', field]
    db_manager = DatabaseManager(ngram_db, table, columns, save_every=save_every)

    out_columns = [('max_' + field, 'float')]
    db_manager.new_fields(out_columns)

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(out_columns[0][0], n)
        # Values written for the previous n are read back here.
        self.pre = db_manager.to_dict(
            table, ['word', 'skipgram'], ['max_' + field], n - 1
        )
        rows, _ = db_manager.get_iterator(n)
        for idx, row in enumerate(rows):
            rowid, w, skipgram, value = row
            self._upd(rowid, w, skipgram, value)
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())

    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str):
    """Fill one ``fdp_<measure name>`` column per entry of ``self.measure``.

    For every ``n`` in 2 .. n_max and every ``i`` in 1 .. n // 2 the rows
    ``(rowid, field, freq)`` are re-read, ``self.N`` is set to the count
    returned by ``get_iterator``, ``self._set_freq_dicts`` is called for
    ``i``, and each row is passed to ``self._upd`` along with ``i``.  The
    list from ``self._get_list()`` is saved once per ``n``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table read from and extended with the new columns.
        field: column read together with ``rowid`` and ``freq``.
    """
    in_fld = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld)
    new_fld = [('fdp_' + m.__name__, 'float') for m in self.measure]
    db_manager.new_fields(new_fld)
    # The label depends only on the column names, so build it once rather
    # than on every n.
    fld_str = ', '.join(name for name, _ in new_fld)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(fld_str, n)
        self.n = n
        for i in range(1, n // 2 + 1):
            freq_12, N = db_manager.get_iterator(n)
            self.N = N
            # Rebuilt per iteration on purpose: the callee receives a fresh
            # list each time, as in the original.
            to_dict_args = [table, [field], ['freq']]
            self._set_freq_dicts(db_manager, i, to_dict_args)
            for rowid, ngram, freq in freq_12:
                self._upd(rowid, ngram, freq, i)
        out_list = self._get_list()
        db_manager.save(out_list)
    db_manager.finalize(self)
def compute(self, ngram_db, save_every=-1):
    """Fill the ``lpr`` column of ``lex_context_counts`` for n = 2 .. n_max.

    Rows ``(rowid, word, skipgram, pred)`` are read from
    ``lex_context_counts``.  For each ``n`` the ``pred`` values of
    ``syn_context_counts`` (keyed on ``word`` and ``skipos``) are loaded
    into ``self.syn_prob`` and every row is passed to ``self._upd``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    source_table = 'lex_context_counts'
    source_columns = ['rowid', 'word', 'skipgram', 'pred']
    db_manager = DatabaseManager(
        ngram_db, source_table, source_columns, save_every=save_every
    )
    db_manager.new_fields([('lpr', 'float')])

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('word lpr', n)
        self.syn_prob = db_manager.to_dict(
            'syn_context_counts', ['word', 'skipos'], ['pred'], n
        )
        rows, _ = db_manager.get_iterator(n)
        for idx, row in enumerate(rows):
            rowid, word, skipgram, prob = row
            self._upd(rowid, word, skipgram, prob)
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())

    db_manager.finalize(self)
def compute(self, ngram_db, save_every=-1):
    """Fill the ``pred`` column of ``syn_context_counts`` for n = 2 .. n_max.

    Rows ``(rowid, skipos, freq)`` are read from ``syn_context_counts``.
    For each ``n`` the ``freq`` values of ``skipos_counts`` (keyed on
    ``skipos``) are loaded into ``self.skipos_freqs``, ``self.N`` is set to
    the count returned by ``get_iterator``, and every row is passed to
    ``self._upd``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    source_table = 'syn_context_counts'
    source_columns = ['rowid', 'skipos', 'freq']
    db_manager = DatabaseManager(
        ngram_db, source_table, source_columns, save_every=save_every
    )
    db_manager.new_fields([('pred', 'float')])

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('syntactic predictability', n)
        self.skipos_freqs = db_manager.to_dict(
            'skipos_counts', ['skipos'], ['freq'], n
        )
        rows, total = db_manager.get_iterator(n)
        self.N = total
        for idx, row in enumerate(rows):
            rowid, skipos, freq = row
            self._upd(rowid, skipos, freq)
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())

    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    """Populate a new ``syn_context_counts`` table for n = 2 .. n_max.

    The table is created with columns ``word`` (text), ``skipos`` (text),
    ``length`` (int) and ``freq`` (int).  Rows ``(field, freq)`` are read
    from ``table``; for each ``n`` the value is stored in ``self.n`` and
    every row is passed to ``self._upd``.  Results are fetched with
    ``self._get_list(add_n=True)``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table the rows are read from.
        field: column read together with ``freq``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    source_columns = [field, 'freq']
    db_manager = DatabaseManager(
        ngram_db, table, source_columns, save_every=save_every
    )

    schema = [
        ('word', 'text'),
        ('skipos', 'text'),
        ('length', 'int'),
        ('freq', 'int'),
    ]
    db_manager.new_table('syn_context_counts', schema)

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('syntactic context freq', n)
        rows, _ = db_manager.get_iterator(n)
        self.n = n
        for idx, row in enumerate(rows):
            ngram, freq = row
            self._upd(ngram, freq)
            db_manager.save_every(self._get_list(add_n=True), idx)
        db_manager.save(self._get_list(add_n=True))

    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    """Fill one column per entry of ``self.measure`` for n = 2 .. n_max.

    Each new float column is named after the measure's ``__name__``.  The
    ``freq`` lookup for ``n = 1`` (keyed on ``field``) is loaded once into
    ``self.freq_1``.  For each ``n``, ``self.n`` and ``self.N`` are set
    (``N`` being the count returned by ``get_iterator``) and every
    ``(rowid, field, freq)`` row is passed to ``self._upd``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table read from and extended with the new columns.
        field: column read together with ``rowid`` and ``freq``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    in_fld = ['rowid', field, 'freq']
    db_manager = DatabaseManager(ngram_db, table, in_fld, save_every=save_every)
    new_fld = [(m.__name__, 'float') for m in self.measure]
    db_manager.new_fields(new_fld)
    self.freq_1 = db_manager.to_dict(table, [field], ['freq'], 1)
    # The label depends only on the column names, so build it once rather
    # than on every n.
    fld_str = ', '.join(name for name, _ in new_fld)
    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(fld_str, n)
        freq_12, N = db_manager.get_iterator(n)
        self.n, self.N = n, N
        for i, (rowid, ngram, freq) in enumerate(freq_12):
            self._upd(rowid, ngram, freq)
            db_manager.save_every(self._get_list(), i)
        db_manager.save(self._get_list())
    db_manager.finalize(self)
def compute(self, ngram_db, table: str, save_every=-1):
    """Fill the ``<agg_fun>_<measure>`` column of ``table`` for n = 2 .. n_max.

    The manager reads ``'max_' + self.measure`` from ``lex_context_counts``;
    the new float column is registered on ``table``.  For each ``n`` the
    ``rowid`` list of ``table`` is fetched, the iterator over the input
    column is stored in ``self.w_lpr`` (requested with ``pbar=False``),
    ``self.n`` is set, and ``self._upd`` is called with the first element of
    each fetched rowid entry.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table whose rowids are walked and that receives the column.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    source_table = 'lex_context_counts'
    source_columns = ['max_' + self.measure]
    db_manager = DatabaseManager(
        ngram_db, source_table, source_columns, save_every=save_every
    )

    out_columns = [(self.agg_fun + '_' + self.measure, 'float')]
    db_manager.new_fields(out_columns, table=table)

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure(out_columns[0][0], n)
        id_rows = db_manager.to_list(table, ['rowid'], n)
        self.w_lpr, _ = db_manager.get_iterator(n, pbar=False)
        self.n = n
        # The progress bar wraps the rowid list, not the value iterator.
        for idx, id_row in enumerate(messages.pbar(id_rows)):
            self._upd(id_row[0])
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())

    db_manager.finalize(self)
def compute(self, ngram_db, table: str, field: str, save_every=-1):
    """Populate a new ``lex_context_counts`` table for n = 2 .. n_max.

    The table is created with columns ``word`` (text), ``skipgram`` (text),
    ``length`` (int) and ``pred`` (float).  Rows ``(field, freq)`` are read
    from ``table``.  For each ``n``, ``self.N`` is set to the count returned
    by ``get_iterator``, ``self.n`` to ``n``, and the ``freq`` values of
    ``skipgram_counts`` (keyed on ``skipgram``) are loaded into
    ``self.skipgram_freqs`` before every row is passed to ``self._upd``.

    Args:
        ngram_db: database handle given to ``DatabaseManager``.
        table: table the rows are read from.
        field: column read together with ``freq``.
        save_every: forwarded to ``DatabaseManager`` as ``save_every``.
    """
    source_columns = [field, 'freq']
    db_manager = DatabaseManager(
        ngram_db, table, source_columns, save_every=save_every
    )

    schema = [
        ('word', 'text'),
        ('skipgram', 'text'),
        ('length', 'int'),
        ('pred', 'float'),
    ]
    db_manager.new_table('lex_context_counts', schema)

    for n in messages.pbar(range(2, db_manager.n_max + 1)):
        messages.computing_measure('lexical predictability', n)
        rows, total = db_manager.get_iterator(n)
        self.N = total
        self.n = n
        self.skipgram_freqs = db_manager.to_dict(
            'skipgram_counts', ['skipgram'], ['freq'], n
        )
        for idx, row in enumerate(rows):
            ngram, freq = row
            self._upd(ngram, freq)
            db_manager.save_every(self._get_list(), idx)
        db_manager.save(self._get_list())

    db_manager.finalize(self)