import json

# load_alchemy, get_golden and load_json_file are assumed to be the
# project's own helpers, imported elsewhere in this module.


def analyze_golden():
    """Split the golden-standard synset words into multi-word collocations
    and single words that are absent from the YARN dictionary."""
    alchemy = load_alchemy('data.db')
    golden = get_golden('golden.txt',
                        drop_bad_synsets=True,
                        drop_unsure_words=True)
    absent_words = set(load_json_file('yarn_absent.json'))
    clean_synsets_with_origin_ids = [(key, value)
                                     for key, value in golden.items() if value]
    golden_absent = set()
    collocations = set()
    for clean_synset, origin_ids in clean_synsets_with_origin_ids:
        concatenated_words = alchemy.get_concatenated_synsets_by_yarn_ids(
            origin_ids)
        for word in concatenated_words:
            if len(word.split()) > 1:
                collocations.add(word)
            elif word in absent_words:
                golden_absent.add(word)

    print('Missing {} words'.format(len(golden_absent)))
    print('{} collocations found'.format(len(collocations)))

    with open('golden_absent.json', 'w') as f:
        json.dump(list(golden_absent), f)
    with open('golden_collocations.json', 'w') as f:
        json.dump(list(collocations), f)
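
A minimal driver for this pass (a sketch; it just runs the analysis and re-reads the dumps as a sanity check):

if __name__ == '__main__':
    analyze_golden()
    with open('golden_absent.json') as f:
        print('{} absent words dumped'.format(len(json.load(f))))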
Example 2
import json
from typing import List

import tqdm

# load_alchemy, read_pandas and the ORM models (Word, SynsetWord, Definition,
# WordDefinitionRelation) are assumed to come from the surrounding project.


def process_stripped_synset_words():
    """Strip surrounding whitespace from every synset word and re-link it
    to the matching dictionary Word row, if any."""
    session = load_alchemy('data.db').get_session()
    for s_word in tqdm.tqdm(session.query(SynsetWord).all()):
        stripped = s_word.word.strip()
        if stripped != s_word.word:
            # Point the synset word at the dictionary entry for its
            # stripped form, when such an entry exists.
            corresponding_dict_word = session.query(Word).filter(
                Word.word == stripped).one_or_none()
            if corresponding_dict_word:
                s_word.word_id = corresponding_dict_word.id
            s_word.word = stripped
            session.commit()
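
A quick post-run check (a sketch reusing the same project helpers): after the pass, no synset word should differ from its stripped form.

process_stripped_synset_words()
session = load_alchemy('data.db').get_session()
dirty = [w.word for w in session.query(SynsetWord).all()
         if w.word != w.word.strip()]
assert not dirty, 'some synset words still carry surrounding whitespace'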
def add_absent_words_from_dictionary(absent_words: List[str], dict_path: str):
    """
    Adds the words from absent_words to the database.
    :param absent_words: list of the missing words
    :param dict_path: path to the dictionary file
    :return: list of the words that were actually added
    """
    alchemy = load_alchemy('data.db')
    session = alchemy.get_session()
    session_counter = 0
    absent_words = set(absent_words)
    added_word_id = {}
    with open(dict_path) as f:
        for line in tqdm.tqdm(f):
            if session_counter == 100:
                # Commit in batches of 100 processed dictionary words.
                session.commit()
                session_counter = 0
            dict_entry = json.loads(line)
            if 'definition' in dict_entry and dict_entry['word']:
                word_entry = dict_entry['word'][0]
                if word_entry in absent_words:
                    definitions = [
                        Definition(d) for d in dict_entry['definition']
                    ]
                    if word_entry not in added_word_id:
                        word = Word(word_entry, dict_entry['POS'])
                        session.add(word)
                        session.flush()  # flush so the new row gets its id
                        word_id = word.id
                        added_word_id[word_entry] = word_id
                    else:
                        word_id = added_word_id[word_entry]
                    for d in definitions:
                        session.add(d)
                        session.flush()  # d.id is needed for the relation below
                        session.add(WordDefinitionRelation(word_id, d.id))
                    session_counter += 1
    session.commit()
    return list(added_word_id.keys())
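
From the parsing above, each line of dict_path is a JSON object with word, POS and definition fields; the sample below is inferred from this snippet, not taken from a spec.

# One line of the dict_path file, as the loop above expects it:
sample_line = ('{"word": ["пример"], "POS": "noun",'
               ' "definition": ["first sense", "second sense"]}')
entry = json.loads(sample_line)
assert entry['word'][0] == 'пример' and len(entry['definition']) == 2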
def find_absent_words():
    alchemy = load_alchemy('data.db')
    session = alchemy.get_session()
    frame = read_pandas('yarn-synsets.csv')

    dictionary = set()
    words_with_no_defs = set()

    # Collect every distinct word across the YARN synsets
    # (words inside a synset are ';'-separated).
    for synset in frame.words:
        dictionary.update(synset.split(';'))
    print('{} distinct words in YARN synsets'.format(len(dictionary)))

    for word in tqdm.tqdm(dictionary):
        # .first() is enough: we only need to know whether a row exists.
        if not session.query(Word).filter(Word.word == word).first():
            words_with_no_defs.add(word)

    print('{} words have no dictionary entry'.format(len(words_with_no_defs)))

    with open('absent_words.json', 'w') as fp:
        json.dump(list(words_with_no_defs), fp)
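
Chained together, the two passes above form a small repair loop (a sketch; 'dictionary.jsonl' is a placeholder, not a file name from the project):

find_absent_words()  # writes absent_words.json
with open('absent_words.json') as fp:
    absent = json.load(fp)
added = add_absent_words_from_dictionary(absent, 'dictionary.jsonl')
print('added dictionary entries for {} words'.format(len(added)))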
Example 5
                print('Original synset: {}'.format(', '.join(words)))
                print('-------------------------------------')
                return [NewSynset(words, [])]
        return synsets

    def _matrix_processing(self, matrix):
        # Indices of the rows that have at least one similarity
        # value at or above the threshold.
        rows, _ = np.where(matrix >= self._threshold)
        return np.unique(rows)
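
For intuition, a standalone run of the same numpy call (assuming numpy is imported as np in the truncated part of this example):

sim = np.array([[0.0, 0.2],
                [0.9, 0.1],
                [0.0, 0.0]])
rows, _ = np.where(sim >= 0.5)  # indices of entries at or above the threshold
print(np.unique(rows))          # [1] -- only row 1 clears it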


if __name__ == '__main__':
    # partial is functools.partial; general_metric, jacard_metric,
    # remove_stop_words_tokens and LayerModel come from the truncated
    # top of this example.
    metric = partial(general_metric,
                     sim_metric=jacard_metric,
                     processing=remove_stop_words_tokens)
    model = LayerModel(0.00001, metric)
    alchemy = load_alchemy('data.db')

    yarn_ids, synset_definitions = alchemy.get_synsets_definitions((508, 508))
    for p in synset_definitions:
        print(list(p.keys()))
    print()
    for s in model.extract_new_synsets(synset_definitions[0]):
        print('New synset: {}'.format(', '.join(s.words)))
        if s.definitions:
            print('Definitions:')
            for d in s.definitions:
                print('\t{}'.format(d))
        else:
            print('No definitions in the dictionary')
        print('-------------------')
Example 6
    def __init__(self):
        self.__dictionary = {}
        self.__alchemy = load_alchemy('data.db')