Ejemplo n.º 1
0
 def __iter__(self):
     """Yield items from the corpus in the configured iteration mode.

     Dispatches on ``self._itermode``: ``'sent'`` yields items from
     ``_iter_sent`` and ``'word'`` yields items from ``_iter_words``.
     On the first complete pass the item count is cached on the instance
     (``self.sentences`` / ``self.tokens``) and persisted via
     ``_save_info``; subsequent passes reuse the cached total so
     ``messages.pbar`` can display a percentage.

     Raises
     ------
     ValueError
         If ``self._itermode`` is neither ``'sent'`` nor ``'word'``.
     """
     mode, kwargs = self._itermode, self._itermode_kwargs
     if mode == 'sent':
         if self.sentences is None:
             # Fix: start i at -1 so an empty iterator caches a count of
             # 0 instead of raising NameError on the unbound loop var.
             i = -1
             for i, item in messages.pbar(
                     enumerate(self._iter_sent(**kwargs))):
                 yield item
             self.sentences = i + 1
             self._save_info()
         else:
             for item in messages.pbar(self._iter_sent(**kwargs),
                                       total=self.sentences):
                 yield item
     elif mode == 'word':
         if self.tokens is None:
             # Same empty-iterator guard as the 'sent' branch.
             i = -1
             for i, item in messages.pbar(
                     enumerate(self._iter_words(**kwargs))):
                 yield item
             self.tokens = i + 1
             self._save_info()
         else:
             for item in messages.pbar(self._iter_words(**kwargs),
                                       total=self.tokens):
                 yield item
     else:
         raise ValueError(mode)
Ejemplo n.º 2
0
def add_heading(filename):
    """Prepend a word2vec-style '<word_count> <dim>' header to a vectors
    file, in place.

    The dimensionality is inferred from the first line (tokens after the
    entry), the total line count is obtained with a first full pass, and
    the file is then rewritten through a temporary '<filename>_' file.
    """
    tmp = filename + '_'
    with codecs.open(filename, 'r', 'utf8') as src:
        # One token is the entry; the rest are vector components.
        dim = len(next(src).split(" ")) - 1
        n_words = 1
        for _ in messages.pbar(src):
            n_words += 1
    with codecs.open(tmp, 'w', 'utf8') as dst:
        dst.write(str(n_words) + ' ' + str(dim) + '\n')
        with codecs.open(filename, 'r', 'utf8') as src:
            for line in messages.pbar(src, total=n_words):
                dst.write(line)
    os.remove(filename)
    os.rename(tmp, filename)
Ejemplo n.º 3
0
 def _counts_to_db(self, counter, sep, commit_each=10000):
     """Dump an ngram counter into the 'ngram_counts' SQL table.

     Each counter key is a space-separated ngram string; each produced
     row holds the ngram length, its frequency, and one joined-feature
     column per feature in ``self.feats``.

     Note: the ``commit_each`` parameter is currently unused in this
     body.
     """
     messages.msg("Saving counter to sql database...")
     self.connect()
     rows = []
     for _, (ngram, freq) in messages.pbar(enumerate(counter.items())):
         words = ngram.split(' ')
         row = [len(words), freq]
         for feat in self.feats:
             row.append(' '.join(self._get_feat(w, feat, sep)
                                 for w in words))
         rows.append(row)
     self.insert_data(messages.pbar(rows), 'ngram_counts')
Ejemplo n.º 4
0
def lex_fixedness(ngram_db, variant_dict: dict, table: str, field: str,
                  m_field: str):
    """Compute a lexical-fixedness score for every ngram in *table*.

    For each ngram of every length n, the score compares the ngram's PMI
    with the PMI distribution of its lexical variants:
    ``(pmi - mean(V_pmis)) / var(V_pmis)`` when at least two variant
    PMIs are found, else 0. Scores are flushed to the output database in
    batches and the databases are swapped at the end.

    NOTE(review): several names used below are not defined in this
    scope and must come from module globals — or are bugs. Confirm each:
    ``num_rows``, ``sim``, ``neighbours_dict``, ``save_every``,
    ``in_table``. In particular ``in_table`` looks like it should be the
    ``table`` parameter, the ``variant_dict`` parameter is never used
    (``neighbours_dict`` is used instead), and ``update_data`` targets a
    'lex_fixedness' column while the column created above is named
    'lex_fixedness_<m_field>' — verify.

    NOTE(review): dividing by ``np.var`` (variance, not std) is unusual
    for a z-score-like measure — confirm against the measure definition.
    """
    in_fld = ['rowid', field, m_field]
    output_db, n_max = initialize(ngram_db, table, in_fld)
    new_fld = [('lex_fixedness_' + m_field, 'float')]
    output_db.new_fields(table, new_fld)
    for n in range(2, n_max+1):
        messages.msg("Collecting statistics from {}grams...".format(n))
        # PMI of every length-n lemma ngram, keyed by the lemma string.
        lemma_pmis = ngram_db.to_dict('lemma_counts', ['lemma'], ['pmi'], n)
        # NOTE(review): elsewhere in this file ngram_db[n] returns an
        # (iterator, count) pair; here it is iterated directly — verify.
        ngram_freqs = ngram_db[n]
        lex_fixedness = list()
        for i, (ngram, pmi, rowid) in enumerate(messages.pbar(ngram_freqs,
                                                       total=num_rows[str(n)])):
            # Lexical variants (neighbours) of this ngram.
            V = sim(ngram, neighbours_dict)
            V_pmis = list()
            for v in V:
                try:
                    V_pmis.append(lemma_pmis[v])
                except KeyError:
                    # Variant not attested in the corpus: skip it.
                    pass
            if len(V_pmis) > 1:
                fixedness = (pmi - np.mean(V_pmis)) / np.var(V_pmis)
            else:
                fixedness = 0
            lex_fixedness.append((rowid, fixedness))
            # Periodically flush accumulated scores to the database.
            if (i+1) % save_every == 0:
                output_db.update_data(lex_fixedness, in_table, 'lex_fixedness')
                lex_fixedness = list()
    output_db.update_data(lex_fixedness, in_table, 'lex_fixedness')
    ngram_db.disconnect()
    output_db.disconnect()
    del_and_rename(ngram_db, output_db)
    messages.done()
Ejemplo n.º 5
0
 def compute(self,
             ngram_db,
             embeddings_db,
             table: str,
             field: str,
             save_every=-1):
     """Compute 'comp_<fun>' compositionality scores for every ngram.

     Loads single-word vectors once, then for each ngram length n loads
     the length-n ngram vectors and updates each row via ``_upd``,
     saving results in batches through the database manager.
     """
     manager = DatabaseManager(ngram_db, table, ['rowid', field],
                               save_every=save_every)
     embeddings_db.connect()
     out_field = 'comp_' + self.fun.__name__
     manager.new_fields([(out_field, 'float')])
     # Single-word vectors are shared across all ngram lengths.
     self.sw_vecs = embeddings_db.to_dict(1)
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(out_field, n)
         ngrams, _ = manager.get_iterator(n)
         self.ngram_vecs = embeddings_db.to_dict(n)
         for idx, (rowid, ngram) in enumerate(ngrams):
             self._upd(rowid, ngram)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     embeddings_db.disconnect()
     manager.finalize(self)
Ejemplo n.º 6
0
 def _get_list(self, **dict_kwargs):
     """Yield the pending output items, then reset the default list.

     When ``self.has_dict`` is set the items come from
     ``_dict_to_list(**dict_kwargs)``; otherwise the accumulated
     ``self.default_list`` is drained. The reset only happens once the
     generator has been fully consumed.
     """
     if self.has_dict:
         pending = self._dict_to_list(**dict_kwargs)
     else:
         pending = self.default_list
     yield from messages.pbar(pending)
     self.default_list = list()
Ejemplo n.º 7
0
def remove_heading(filename):
    """Strip the first (header) line from *filename*, in place.

    The header's first token is read as the expected number of remaining
    lines (used only for the progress-bar total). The file is rewritten
    through a temporary '<filename>_' file.
    """
    tmp = filename + '_'
    with codecs.open(filename, 'r', 'utf8') as src, \
            codecs.open(tmp, 'w', 'utf8') as dst:
        expected = int(next(src).split(' ')[0])
        for line in messages.pbar(src, total=expected):
            dst.write(line)
    os.remove(filename)
    os.rename(tmp, filename)
Ejemplo n.º 8
0
def create_sim_dict(file, vectors, min_sim=0.55, topn=10, num_trees=200):
    """Build and pickle a word -> similar-words dictionary.

    For every word in the vocabulary, queries an Annoy index for the
    *topn* nearest neighbours and keeps only those whose similarity
    exceeds *min_sim*. The resulting dict is pickled to *file*.
    """
    indexer = AnnoyIndexer(vectors, num_trees=num_trees)
    sim_dict = {}
    for word in messages.pbar(vectors.vocab):
        neighbours = indexer.most_similar(vectors.get_vector(word), topn)
        sim_dict[word] = [pair for pair in neighbours if pair[1] > min_sim]
    with open(file, 'wb') as fileout:
        pickle.dump(sim_dict, fileout)
Ejemplo n.º 9
0
def join_ngrams(filename, n, sep='_'):
    """Join the first *n* tokens of every line with *sep*, in place.

    Each line is split on spaces; the first *n* tokens (the ngram entry)
    are fused into one token with *sep*, and the rest of the line (the
    vector) is kept unchanged. The file is rewritten through a temporary
    '<filename>_' file.
    """
    tmp = filename + '_'
    with codecs.open(filename, 'r', 'utf8') as src:
        with codecs.open(tmp, 'w', 'utf8') as dst:
            for raw in messages.pbar(src):
                parts = raw.split(' ')
                fused = sep.join(parts[:n])
                dst.write(' '.join([fused] + parts[n:]))
    os.remove(filename)
    os.rename(tmp, filename)
Ejemplo n.º 10
0
 def compute(self, ngram_db, table: str, save_every=-1):
     """Aggregate the per-word 'max_<measure>' values over each ngram.

     Writes an '<agg_fun>_<measure>' float column into *table*; for each
     length n it iterates the rowids of *table* while exposing the
     length-n lexical-context iterator on ``self.w_lpr`` for ``_upd``.
     """
     manager = DatabaseManager(ngram_db, 'lex_context_counts',
                               ['max_' + self.measure],
                               save_every=save_every)
     out_field = self.agg_fun + '_' + self.measure
     manager.new_fields([(out_field, 'float')], table=table)
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(out_field, n)
         row_ids = manager.to_list(table, ['rowid'], n)
         self.w_lpr, N = manager.get_iterator(n, pbar=False)
         self.n = n
         for idx, rowid in enumerate(messages.pbar(row_ids)):
             # to_list yields one-element tuples; unwrap the rowid.
             self._upd(rowid[0])
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 11
0
 def compute(self, ngram_db, table: str, field: str, v_table: str):
     """Compute a 'syn_entropy' column on *table* from variant counts.

     For each length n, loads base-ngram frequencies (and rowids) from
     *table* into ``self.base_frqs``, then feeds every (variant, freq)
     pair from *v_table* to ``_upd`` before saving the batch.
     """
     manager = DatabaseManager(ngram_db, v_table, [field, 'freq'])
     out_field = 'syn_entropy'
     manager.new_fields([(out_field, 'float')], table=table)
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(out_field, n)
         self.base_frqs = ngram_db.to_dict(table, [field],
                                           ['freq', 'rowid'], n)
         variant_freqs, _ = manager.get_iterator(n)
         for v_base, v_freq in variant_freqs:
             self._upd(v_base, v_freq)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 12
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Compute the 'c_value' column for every ngram in *table*.

     Iterates lengths from ``n_max`` downwards because the C-value of an
     ngram depends on statistics of the longer ngrams that contain it
     (propagated via ``_swich_super`` after each length).
     """
     manager = DatabaseManager(ngram_db, table, ['rowid', field, 'freq'],
                               save_every)
     manager.new_fields([('c_value', 'float')])
     # Longest ngrams first — see docstring.
     for n in messages.pbar(range(manager.n_max, 1, -1)):
         messages.computing_measure('c_value', n)
         ngram_frqs, _ = manager.get_iterator(n)
         self.n = n
         for idx, (rowid, ngram, freq) in enumerate(ngram_frqs):
             self._upd(rowid, ngram, freq)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
         self._swich_super()
     manager.finalize(self)
Ejemplo n.º 13
0
def sub_separator(filename, old_sep, new_sep):
    """Replace *old_sep* with *new_sep* in each line's entry, in place.

    The substitution applies only to the entry (non-vector) part of each
    line, and only where *old_sep* is immediately followed by an
    uppercase letter (regex lookahead). The header line is copied
    verbatim; the file is rewritten through a temporary '<filename>_'
    file.
    """
    tmp = filename + '_'
    with codecs.open(filename, 'r', 'utf8') as src:
        heading = next(src)
        n_words, dim = (int(tok) for tok in heading.split(' '))
        with codecs.open(tmp, 'w', 'utf8') as dst:
            dst.write(heading)
            # Hoisted out of the loop; result is identical per line.
            pattern = '{}(?=[A-Z])'.format(old_sep)
            for raw in messages.pbar(src, total=n_words):
                parts = raw.split(' ')
                entry = ' '.join(parts[:-dim])
                entry = re.sub(pattern, new_sep, entry)
                dst.write(' '.join([entry] + parts[-dim:]))
    os.remove(filename)
    os.rename(tmp, filename)
Ejemplo n.º 14
0
def entropy(ngram_db, variant_dict: dict, table: str, field: str):
    """Compute a 'lex_entropy' column for every base ngram in *table*.

    For each length n, loads the variant frequencies into the
    ``LexEntCalculator`` and updates its statistics for every base
    ngram, flushing results in batches via the module-level
    ``save_every`` helper and finalizing with ``finalize``.
    """
    output_db, n_max = initialize(ngram_db, table, ['rowid', field])
    fld_name = 'lex_entropy'
    output_db.new_fields(table, [(fld_name, 'float')])
    calculator = LexEntCalculator(variant_dict)
    for n in range(2, n_max+1):
        messages.computing_measure(fld_name, n)
        calculator.set_var_frqs(ngram_db.to_dict(table, [field], ['freq'], n))
        base_freqs, N = ngram_db[n]
        for i, (rowid, ngram) in enumerate(messages.pbar(base_freqs, total=N)):
            calculator.upd_stats(rowid, ngram)
            save_every(output_db, calculator.get_list(), table, fld_name,
                       i)
    output_db.update_data(calculator.get_list(), table, fld_name)
    finalize(ngram_db, output_db)
Ejemplo n.º 15
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Compute a running 'max_<field>' column over ngram lengths.

     For each length n, loads the maxima already computed for length
     n-1 into ``self.pre`` and updates every length-n row from them,
     saving in batches through the database manager.
     """
     manager = DatabaseManager(ngram_db,
                               table,
                               ['rowid', 'word', 'skipgram', field],
                               save_every=save_every)
     out_field = 'max_' + field
     manager.new_fields([(out_field, 'float')])
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(out_field, n)
         # Maxima from length n-1 seed the update at length n.
         self.pre = manager.to_dict(table, ['word', 'skipgram'],
                                    ['max_' + field], n - 1)
         current, _ = manager.get_iterator(n)
         for idx, (rowid, w, skipgram, value) in enumerate(current):
             self._upd(rowid, w, skipgram, value)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 16
0
 def compute(self, ngram_db, table: str, field: str):
     """Compute one 'fdp_<measure>' column per configured measure.

     For each length n, the ngrams are re-iterated once per split point
     i (1 .. n//2) with the frequency dictionaries for that split loaded
     via ``_set_freq_dicts``; results are saved once per length.
     """
     manager = DatabaseManager(ngram_db, table, ['rowid', field, 'freq'])
     out_fields = [('fdp_' + m.__name__, 'float') for m in self.measure]
     manager.new_fields(out_fields)
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(', '.join(f[0] for f in out_fields), n)
         self.n = n
         # One full pass over the length-n ngrams per split point i.
         for i in range(1, n // 2 + 1):
             freq_12, N = manager.get_iterator(n)
             self.N = N
             self._set_freq_dicts(manager, i, [table, [field], ['freq']])
             for rowid, ngram, freq in freq_12:
                 self._upd(rowid, ngram, freq, i)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 17
0
 def compute(self, ngram_db, save_every=-1):
     """Compute the 'lpr' column of the lexical-context table.

     For each length n, loads syntactic-context predictabilities into
     ``self.syn_prob`` and updates every lexical-context row from its
     (word, skipgram, pred) triple, saving in batches.
     """
     manager = DatabaseManager(ngram_db,
                               'lex_context_counts',
                               ['rowid', 'word', 'skipgram', 'pred'],
                               save_every=save_every)
     manager.new_fields([('lpr', 'float')])
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure('word lpr', n)
         self.syn_prob = manager.to_dict('syn_context_counts',
                                         ['word', 'skipos'], ['pred'], n)
         lex_prob, _ = manager.get_iterator(n)
         for idx, (rowid, word, skipgram, prob) in enumerate(lex_prob):
             self._upd(rowid, word, skipgram, prob)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 18
0
 def compute(self, ngram_db, save_every=-1):
     """Compute the 'pred' column of the syntactic-context table.

     For each length n, loads skip-POS pattern frequencies into
     ``self.skipos_freqs`` and updates every syntactic-context row from
     its (skipos, freq) pair, saving in batches.
     """
     manager = DatabaseManager(ngram_db,
                               'syn_context_counts',
                               ['rowid', 'skipos', 'freq'],
                               save_every=save_every)
     manager.new_fields([('pred', 'float')])
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure('syntactic predictability', n)
         self.skipos_freqs = manager.to_dict('skipos_counts', ['skipos'],
                                             ['freq'], n)
         syn_c_freqs, self.N = manager.get_iterator(n)
         for idx, (rowid, skipos, freq) in enumerate(syn_c_freqs):
             self._upd(rowid, skipos, freq)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 19
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Populate a new 'syn_context_counts' table from ngram counts.

     Creates the table with (word, skipos, length, freq) columns and,
     for each length n, feeds every (ngram, freq) pair to ``_upd``;
     rows are emitted via ``_get_list(add_n=True)`` so the length column
     is included.
     """
     manager = DatabaseManager(ngram_db,
                               table,
                               [field, 'freq'],
                               save_every=save_every)
     manager.new_table('syn_context_counts',
                       [('word', 'text'), ('skipos', 'text'),
                        ('length', 'int'), ('freq', 'int')])
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure('syntactic context freq', n)
         ngram_freqs, _ = manager.get_iterator(n)
         self.n = n
         for idx, (ngram, freq) in enumerate(ngram_freqs):
             self._upd(ngram, freq)
             manager.save_every(self._get_list(add_n=True), idx)
         manager.save(self._get_list(add_n=True))
     manager.finalize(self)
Ejemplo n.º 20
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Compute one float column per association measure in *table*.

     Unigram frequencies are loaded once into ``self.freq_1``; for each
     length n the (rowid, ngram, freq) rows are fed to ``_upd`` with the
     current n and total N exposed on the instance, saving in batches.
     """
     manager = DatabaseManager(ngram_db,
                               table,
                               ['rowid', field, 'freq'],
                               save_every=save_every)
     out_fields = [(m.__name__, 'float') for m in self.measure]
     manager.new_fields(out_fields)
     # Unigram frequencies are shared across all ngram lengths.
     self.freq_1 = manager.to_dict(table, [field], ['freq'], 1)
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure(', '.join(f[0] for f in out_fields), n)
         freq_12, N = manager.get_iterator(n)
         self.n, self.N = n, N
         for idx, (rowid, ngram, freq) in enumerate(freq_12):
             self._upd(rowid, ngram, freq)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 21
0
 def compute(self, ngram_db, table: str, field: str, save_every=-1):
     """Populate a new 'lex_context_counts' table with predictabilities.

     Creates the table with (word, skipgram, length, pred) columns and,
     for each length n, loads the skipgram frequencies into
     ``self.skipgram_freqs`` before feeding every (ngram, freq) pair to
     ``_upd``, saving in batches.
     """
     manager = DatabaseManager(ngram_db,
                               table,
                               [field, 'freq'],
                               save_every=save_every)
     manager.new_table('lex_context_counts',
                       [('word', 'text'), ('skipgram', 'text'),
                        ('length', 'int'), ('pred', 'float')])
     for n in messages.pbar(range(2, manager.n_max + 1)):
         messages.computing_measure('lexical predictability', n)
         ngram_freqs, self.N = manager.get_iterator(n)
         self.n = n
         self.skipgram_freqs = manager.to_dict('skipgram_counts',
                                               ['skipgram'], ['freq'], n)
         for idx, (ngram, freq) in enumerate(ngram_freqs):
             self._upd(ngram, freq)
             manager.save_every(self._get_list(), idx)
         manager.save(self._get_list())
     manager.finalize(self)
Ejemplo n.º 22
0
    def save_as_text(self,
                     filename: str,
                     keys='all',
                     join_values='\t',
                     ignore_compound=False):
        """Save the sentences yielded by this iterator in a text file.

        Each sentence becomes one line of space-separated word strings.

        Parameters
        ----------
        filename: str
            Path of the output file (written as UTF-8).
        keys: optional
            The list of the features of a word that are returned (default is
            'all' which implies that all the elements passed to the 'idx_dict'
            argument of the constructor are returned)
        join_values: str, default='\t'
            Each word will be represented as a string in which the features of
            the word are joined with the value passed to this argument. Must
            be non-empty.
        ignore_compound: bool, default=False
            Forwarded to ``_iter_sent``; presumably controls skipping of
            compound tokens — TODO confirm against ``_iter_sent``.

        """
        assert join_values
        with codecs.open(filename, 'w', 'utf8') as fileout:
            sentences = self._iter_sent(keys, join_values, ignore_compound)
            for sent in messages.pbar(sentences, total=self.sentences):
                fileout.write(' '.join(sent) + '\n')
Ejemplo n.º 23
0
 def get_iterator(self, n: int, pbar=True):
     """Return the (iterator, row_count) pair for length-*n* ngrams.

     When *pbar* is true the iterator is wrapped in a progress bar
     whose total is the row count.
     """
     iterator, total = self.ngram_db[n]
     if pbar:
         iterator = messages.pbar(iterator, total=total)
     return iterator, total