import os
from collections import Counter
from shutil import copyfile

import numpy as np
from bounter import bounter

# Project-internal names used below (messages, sql, utils, NgramDb,
# NgramCounter, initialize, del_and_rename) are assumed to be importable
# from the surrounding package.


def frequency_threshold(self, table, threshold):
    messages.msg("Filtering ngrams...")
    delete = sql.DELETE.format(table=table)
    self.execute(delete, threshold)
    self.commit()
    messages.done()
    self.upd_info(table)

def lex_fixedness(ngram_db, variant_dict: dict, table: str, field: str,
                  m_field: str, save_every: int = 10000):
    in_fld = ['rowid', field, m_field]
    output_db, n_max = initialize(ngram_db, table, in_fld)
    new_fld = [('lex_fixedness_' + m_field, 'float')]
    output_db.new_fields(table, new_fld)
    # Assumed: per-length row counts come from the database's .info.json
    # (see _gen_info), with utils.load_json as the counterpart of
    # utils.save_json. JSON object keys are strings, hence str(n) below.
    num_rows = utils.load_json(ngram_db.info_file)['num_rows'][table]
    for n in range(2, n_max + 1):
        messages.msg("Collecting statistics from {}grams...".format(n))
        lemma_pmis = ngram_db.to_dict('lemma_counts', ['lemma'], ['pmi'], n)
        ngram_freqs = ngram_db[n]
        lex_fixedness = list()
        for i, (ngram, pmi, rowid) in enumerate(
                messages.pbar(ngram_freqs, total=num_rows[str(n)])):
            # Look up the PMI of each lexical variant of the ngram.
            V = sim(ngram, variant_dict)
            V_pmis = list()
            for v in V:
                try:
                    V_pmis.append(lemma_pmis[v])
                except KeyError:
                    pass
            # Fixedness: how far the ngram's PMI deviates from the PMIs
            # of its lexical variants.
            if len(V_pmis) > 1:
                fixedness = (pmi - np.mean(V_pmis)) / np.var(V_pmis)
            else:
                fixedness = 0
            lex_fixedness.append((rowid, fixedness))
            # Flush the buffer to the database every 'save_every' rows
            # ('save_every' was a free variable in the original and is
            # exposed here as a keyword argument).
            if (i + 1) % save_every == 0:
                output_db.update_data(lex_fixedness, table, new_fld[0][0])
                lex_fixedness = list()
        # Write whatever remains for this ngram length.
        output_db.update_data(lex_fixedness, table, new_fld[0][0])
    ngram_db.disconnect()
    output_db.disconnect()
    del_and_rename(ngram_db, output_db)
    messages.done()

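
# The 'sim' helper used by lex_fixedness is not shown in this module. The
# function below is a hypothetical stand-in, assuming variant_dict maps a
# lemma to a list of lexical variants (e.g. near-synonyms) and that ngrams
# are space-joined lemma strings: each position is substituted in turn.
def sim(ngram, variant_dict):
    """Hypothetical sketch: single-substitution lexical variants of ngram."""
    words = ngram.split(' ')
    variants = list()
    for i, w in enumerate(words):
        for v in variant_dict.get(w, []):
            variants.append(' '.join(words[:i] + [v] + words[i + 1:]))
    return variants

# e.g. sim('shoot the breeze', {'breeze': ['wind', 'gust']})
#      -> ['shoot the wind', 'shoot the gust']
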
def copy_db(ngram_db):
    messages.msg('Copying database...')
    outroot = ngram_db.fileroot + '_'
    if not os.path.exists(outroot + '.info.json'):
        copyfile(ngram_db.db, outroot + '.db')
        copyfile(ngram_db.info_file, outroot + '.info.json')
    messages.done()
    return NgramDb.load(outroot)

def _aggregate_tmp(self, sum_fld: list, n_keys=1):
    messages.msg('Aggregating values...')
    for f in sum_fld:
        self.new_flds.remove(f)
    grp_f, other_f = self.new_flds[:n_keys], self.new_flds[n_keys:]
    # Aggregate the temporary table into its final name (the first four
    # characters, assumed to be a 'tmp_' prefix, are stripped), then drop
    # the temporary table.
    self.output_db.aggregate_by(self.new_tb[4:], self.new_tb, sum_fld,
                                other_f, grp_f)
    self.output_db.drop_table(self.new_tb)
    self.new_tb = self.new_tb[4:]

def gen_skipos(ngram_db, display=True):
    if display:
        messages.new_display()
    messages.msg('Generating skipos statistics...')
    ngram_db.connect()
    ngram_db.aggregate_by('skipos_counts', 'skipgram_counts', ['freq'],
                          ['length'], ['skipos'])
    ngram_db.upd_info('skipos_counts')
    ngram_db.disconnect()
    messages.done()

def _gen_info(self, dim: int):
    messages.msg("Generating info file...")
    num_rows = dict()
    table = self.get_tables()[0]
    n_max = self.query(sql.MAX_LEN.format(table=table))[0][0]
    for n in range(1, n_max + 1):
        n_rows = self.query(sql.ROW_COUNT.format(table=table), n)
        num_rows.setdefault(table, dict())[n] = n_rows[0][0]
    info = {'n_max': n_max, 'dim': dim, 'num_rows': num_rows}
    utils.save_json(info, self.info_file)
    messages.done()

def _counts_to_db(self, counter, sep, commit_each=10000):
    messages.msg("Saving counter to sql database...")
    self.connect()
    rows = list()
    for i, (ngram, freq) in messages.pbar(enumerate(counter.items())):
        ngram = ngram.split(' ')
        n = len(ngram)
        row = [n, freq]
        for feat in self.feats:
            v = ' '.join([self._get_feat(w, feat, sep) for w in ngram])
            row.append(v)
        rows.append(row)
        # 'commit_each' was accepted but unused in the original; it is
        # applied here to flush the row buffer periodically and keep
        # memory usage bounded.
        if (i + 1) % commit_each == 0:
            self.insert_data(rows, 'ngram_counts')
            self.commit()
            rows = list()
    self.insert_data(rows, 'ngram_counts')
    self.commit()
    messages.done()

def _gen_info(self):
    messages.msg("Generating info file...")
    num_rows = dict()
    tables = self.get_tables()
    if tables:
        n_max = self.query(sql.MAX_LEN.format(table=tables[0]))[0][0]
        for t in tables:
            for n in range(1, n_max + 1):
                n_rows = self.query(sql.ROW_COUNT.format(table=t), n)
                num_rows.setdefault(t, dict())[n] = n_rows[0][0]
    else:
        n_max, num_rows = 0, None
    info = {'n_max': n_max, 'num_rows': num_rows}
    utils.save_json(info, self.info_file)
    messages.done()

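
# For reference, the info file written by _gen_info has the following shape
# once round-tripped through JSON (object keys become strings; the numbers
# below are made up for illustration). The variant of _gen_info that takes
# a 'dim' argument additionally stores a top-level "dim" key.
#
#     {
#         "n_max": 3,
#         "num_rows": {
#             "ngram_counts": {"1": 50000, "2": 210000, "3": 480000}
#         }
#     }
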
def count_ngrams(self, sentences, n, use_bounter=True, sep='\t',
                 **bounterargs):
    r"""Count n-gram occurrences in a corpus.

    Counts n-gram occurrences in a corpus and inserts the output into an
    SQLite database.

    Parameters
    ----------
    sentences: Iterable
        Iterable of sentences. Each sentence must be a list of strings
        representing word features separated by the character passed to
        the 'sep' argument of this function.
    n: int or list of int
        Length of the n-grams.
    use_bounter: bool, default=True
        If True, counting is done with bounter, a probabilistic and
        memory-efficient counter. If False, it is done with a regular
        Counter. Using bounter is strongly recommended when working with
        a large corpus.
    sep: str, default '\t'
        The character that separates the features of each word in the
        input.
    **bounterargs
        Keyword arguments passed to the bounter constructor, if used.
    """
    messages.msg("Counting ngrams of length {}...".format(n))
    if use_bounter:
        bounterargs.setdefault('size_mb', 1024)
        counter = bounter(**bounterargs)
    else:
        counter = Counter()
    for sent in sentences:
        if isinstance(n, list):
            ngrams = list()
            for i in n:
                ngrams += NgramCounter._gen_ngrams(sent, i)
        else:
            ngrams = NgramCounter._gen_ngrams(sent, n)
        counter.update(ngrams)
    messages.done()
    self._counts_to_db(counter, sep)

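
# NgramCounter._gen_ngrams is referenced above but not defined in this
# module. The function below is a hypothetical module-level sketch of the
# same idea, assuming a sentence is a list of word strings and that ngrams
# are space-joined (as _counts_to_db splits them on ' ').
def _gen_ngrams_sketch(sent, n):
    """Hypothetical sketch: all contiguous n-grams of a sentence."""
    return [' '.join(sent[i:i + n]) for i in range(len(sent) - n + 1)]

# e.g. _gen_ngrams_sketch(['the', 'big', 'dog'], 2) -> ['the big', 'big dog']
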
def aggregate_by(self, key: str):
    """Sum frequency counts over a given field.

    Parameters
    ----------
    key: str
        The field over which to compute the sum.
    """
    messages.msg("Aggregating values...")
    new_tb = '{key}_counts'.format(key=key)
    self.connect()
    super().aggregate_by(new_tb, 'ngram_counts', ['freq'],
                         other_f=['length'], grp_f=[key])
    self.disconnect()
    messages.done()

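
# Example use of the aggregation above, assuming an open NgramCounter whose
# 'ngram_counts' table has a 'lemma' field; this is presumably how the
# 'lemma_counts' table read by lex_fixedness is first created (its PMI
# values would be computed in a later step):
#
#     counter.aggregate_by('lemma')   # creates and fills 'lemma_counts'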