Esempio n. 1
0
 def delete_term(self, term):
     """Remove ``term`` from the bigram index once it is no longer in use.

     Nothing happens while ``term`` is still a key of
     ``self.positional_index``. Otherwise ``term`` is removed from the
     postings of each of its bigrams, and a bigram whose postings become
     empty is dropped from ``self.index``.

     :param term: the term to purge from the bigram index
     """
     if term in self.positional_index:
         return

     for bigram in get_bigrams(term):
         # Look the bigram up defensively: if get_bigrams yields the same
         # bigram twice, the first pass may already have popped it, and a
         # term that was never indexed has no entry at all. Either case
         # used to raise KeyError.
         postings = self.index.get(bigram)
         if postings is None:
             continue
         if term in postings:
             postings.remove(term)
             if not postings:
                 self.index.pop(bigram)
Esempio n. 2
0
    def add_doc(self, doc: str):
        """
        Index every token of a document under each of its bigrams.

        :param doc: string; it is passed through
            ``self.preprocessor.normalize`` and the result is iterated as
            the document's tokens
        """
        for word in self.preprocessor.normalize(doc):
            for gram in get_bigrams(word):
                # Postings are sets, so a token is stored once per bigram.
                self.index.setdefault(gram, set()).add(word)
Esempio n. 3
0
    def suggest_word(self, token: str):
        """Suggest the indexed word that ``token`` most likely stands for.

        Candidates are the words that share at least one bigram with
        ``token`` in ``self.bigram_index.index``. They are ranked by the
        Jaccard similarity of their bigram sets, the five best are kept, and
        the one with the smallest edit distance to ``token`` is returned.

        :param token: the word to find a suggestion for
        :return: the suggested word, or ``token`` itself when no indexed
            word shares a bigram with it
        """
        bigrams_token = get_bigrams(token)
        index = self.bigram_index.index

        candidates = set()
        for bigram in bigrams_token:
            # A misspelled token can contain bigrams the index has never
            # seen; they contribute no candidates instead of raising KeyError.
            candidates.update(index.get(bigram, ()))

        if not candidates:
            # Nothing to compare against, so leave the token unchanged
            # (previously this ended in an IndexError).
            return token

        # Rank the candidates by Jaccard similarity to the token, best first.
        scored = [
            (word,
             jaccard_similarity(set(bigrams_token), set(get_bigrams(word))))
            for word in candidates
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)

        # Among the five most similar words, pick the smallest edit distance.
        # min() keeps the first of equally distant words, which matches the
        # stable sort the ranking used before.
        shortlist = [word for word, _ in scored[:5]]
        return min(shortlist, key=lambda word: edit_distance(token, word))
Esempio n. 4
0
def from_raw_text_new(chars, vocabs, w_list, number_normalized=False):
    """Build and index a single-sentence lattice dataset from raw characters.

    The sentence is wrapped in a one-instance fastNLP ``DataSet`` stored under
    the ``'train'`` key. Every word of ``w_list`` is inserted into a ``Trie``,
    and the trie's ``get_lexicon`` result for the joined sentence is stored
    per instance. From it the fields ``lex_s``, ``lex_e``, ``lex_num``,
    ``lattice``, ``pos_s`` and ``pos_e`` are derived, and the fields
    ``chars``, ``bigrams``, ``target`` and ``lattice`` are finally replaced
    by their vocabulary indices.

    :param chars: the characters of the sentence, as a sequence of strings
    :param vocabs: mapping that must provide the keys ``'char'``,
        ``'bigram'``, ``'label'`` and ``'lattice'``; each value is used
        through its ``index_dataset`` method
    :param w_list: iterable of lexicon words inserted into the trie
    :param number_normalized: number-normalisation mode. Modes 1, 2 and 3 are
        not supported: the function prints a notice and terminates the
        process. Note that ``True`` compares equal to 1 and is rejected too.
    :return: tuple ``(datasets, vocabs)`` where ``datasets`` is
        ``{'train': dataset}``
    :raises SystemExit: when ``number_normalized`` is 1, 2 or 3
    """
    # All number-normalisation modes are rejected. Do it up front, before any
    # project import or trie construction, and through sys.exit rather than
    # the site-provided interactive ``exit`` helper. The code that used to
    # follow each exit call was unreachable and has been removed.
    if number_normalized in (1, 2, 3):
        import sys
        print('not support exit!')
        sys.exit()

    import copy

    from fastNLP.core import DataSet
    from utils import get_bigrams
    from V0.utils_ import Trie

    seq_len = len(chars)
    dataset = DataSet({
        'chars': [chars],
        'bigrams': [get_bigrams(chars)],
        'seq_len': [seq_len],
        # No gold labels exist for raw text: every position is tagged 'O'.
        'target': [['O'] * seq_len],
    })
    datasets = {'train': dataset}

    w_trie = Trie()
    for word in w_list:
        w_trie.insert(word)

    def get_lexicons(sentence_chars):
        """Return the trie's lexicon matches for the joined characters."""
        return w_trie.get_lexicon(''.join(sentence_chars))

    # Each lexicon entry is indexed at positions 0, 1 and 2 below; presumably
    # (start, end, word) -- confirm against Trie.get_lexicon.
    def concat(ins):
        """Return the characters followed by item 2 of every lexicon entry."""
        return ins['chars'] + [lex[2] for lex in ins['lexicons']]

    def get_pos_s(ins):
        """Return 0..seq_len-1 followed by the ``lex_s`` values."""
        return list(range(ins['seq_len'])) + ins['lex_s']

    def get_pos_e(ins):
        """Return 0..seq_len-1 followed by the ``lex_e`` values."""
        return list(range(ins['seq_len'])) + ins['lex_e']

    for ds in datasets.values():
        ds.apply_field(get_lexicons, 'chars', 'lexicons')
        # Keep an unindexed copy of the characters before 'chars' is
        # overwritten with vocabulary indices further down.
        ds.apply_field(copy.copy, 'chars', 'raw_chars')
        ds.add_seq_len('lexicons', 'lex_num')
        ds.apply_field(lambda lexs: [lex[0] for lex in lexs], 'lexicons',
                       'lex_s')
        ds.apply_field(lambda lexs: [lex[1] for lex in lexs], 'lexicons',
                       'lex_e')

    for ds in datasets.values():
        ds.apply(concat, new_field_name='lattice')
        ds.set_input('lattice')
        ds.apply(get_pos_s, new_field_name='pos_s')
        ds.apply(get_pos_e, new_field_name='pos_e')
        ds.set_input('pos_s', 'pos_e')

    # Replace the raw fields by their indices, each with its own vocabulary.
    for vocab_key, field in (('char', 'chars'),
                             ('bigram', 'bigrams'),
                             ('label', 'target'),
                             ('lattice', 'lattice')):
        vocabs[vocab_key].index_dataset(*datasets.values(),
                                        field_name=field,
                                        new_field_name=field)

    return datasets, vocabs