Example no. 1
def process_batches_for_lemma():
    # Build and persist a lemma map for each of the 19 pickled token/entity/tag batches.
    for i in range(0, 19):
        print(saved_data_file_tokens_entities_tags.format(i))
        data = get_pickled(saved_data_file_tokens_entities_tags.format(i))
        lemma_map = get_lemma_map(data)
        save_to_file("lemma_map-{}".format(i), lemma_map)
Example no. 2
def process_batches(sentences_out):
    # Collect the flattened sentence lists for batches 5 and 6 only.
    for i in range(5, 7):
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)
Example no. 3
def process_batches(sentences_out):
    for i in range(0, 19):
        print(i)
        t = time()
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)
        print('Process batch: {} mins'.format(round((time() - t) / 60, 2)))
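The sentence batches collected by process_batches presumably feed the "all-sentences-word2vec-m3.model" loaded in later examples. A hedged sketch of that training step, assuming gensim 4.x and that the flattened sentences are already token lists; the vector size, window, and min_count values are illustrative, not taken from the original code:

from gensim.models import Word2Vec

sentences_out = []
process_batches(sentences_out)
# Flatten the per-batch lists into a single corpus of tokenized sentences.
all_sentences = [sentence for batch in sentences_out for sentence in batch]

model = Word2Vec(sentences=all_sentences, vector_size=100, window=5, min_count=3)
model.save("all-sentences-word2vec-m3.model")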
Example no. 4
def process_batches(_i):
    # Validate one chunk of test tuples against the entity maps and persist the results.
    print("Now processing test_tuples_chunk_{}".format(_i))
    _test_tuples = get_pickled("chunks/test_tuples_chunk_{}".format(_i))
    _ply = len(_test_tuples)
    print("#test_tuples_chunk: {}".format(_ply))
    t = time()
    _found, _not_found, _errors = validate(_test_tuples, entity_tuples, pl_map,
                                           prefix_map, lemma_map, _ply)
    print('Time to process test_tuples-{} : {} mins'.format(
        _i, round((time() - t) / 60, 2)))
    save_to_file("results_chunks-{}".format(_i), (_found, _not_found, _errors))
Example no. 5
from poleval.lib.entity.definitions import saved_data_file_test_set, input_file_test_set
from poleval.lib.poleval import data_object_map, get_pickled, get_test_data

data_object_map(input_file_test_set, saved_data_file_test_set)
data = get_pickled(saved_data_file_test_set)
test_tuples = get_test_data(data)

(category_map, entity_tuples, pl_map, en_map, disambiguation,
 prefix_map) = get_pickled("mapping-objects_ext")
lemma_map = get_pickled("lemma_map_ext")
Example no. 6
def process_test_tuples(_i):
    # Extract the test tuples from one pickled batch and save them under a per-batch name.
    print("Now processing file: {}".format(saved_data_file.format(_i)))
    _data = get_pickled(saved_data_file.format(_i))
    _test_tuples = get_test_data(_data)
    save_to_file("test_tuples-{}".format(_i), _test_tuples)
Example no. 7
# entity_tuples.sort(key=lambda x: x.cleaned_original_entity)
# prefix_map = get_prefix_map(entity_tuples)
# save_to_file("mapping-objects", (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map))

saved_data_file = "test_tuples-{}"
# data = get_pickled(saved_data_file.format(5))
#
# test_tuples = get_test_data(data)
# save_to_file("test_tuples", test_tuples)
# test_tuples = get_pickled("test_tuples")

# _found, _not_found, _errors = get_pickled("results")

# w2vec_model = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

lemma_map = get_pickled("lemma_map_ext")

(category_map, entity_tuples, pl_map, en_map, disambiguation,
 prefix_map) = get_pickled("mapping-objects_ext")
_t1 = get_label("rzymie", entity_tuples, pl_map, prefix_map, lemma_map,
                get_polish_stopwords())
# _t1 = get_label("system operacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t1 = get_label("system operacyjny systemów operacyjnych", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
#
# _t1 = get_label("asocjacyjny", entity_tuples, pl_map, prefix_map, lemma_map,
#                 get_polish_stopwords())
# _t2 = get_label("energię", entity_tuples, pl_map, prefix_map, lemma_map)
# _found, _not_found, _errors = get_pickled("errors")
# validate_debug(_errors, entity_tuples, pl_map, prefix_map, lemma_map, 1000)
Example no. 8
    pool.join()



# entity_types = get_entity_types(entity_types_file)
# type_jsons = read_json_file(json_file)
# save_to_file(entity_types_file_output, type_jsons)
# type_jsons = get_pickled(entity_types_file_output)
#
#
# a = w2vec_model_3.wv.cosine_similarities((w2vec_model_3.wv.get_vector("kraj")+w2vec_model_3.wv.get_vector("federacja")+w2vec_model_3.wv.get_vector("niepodległy")+w2vec_model_3.wv.get_vector("kolonialny"))/4,  category_vectors)
#
# outsiders = get_outsiders(type_jsons, entity_types)
# save_to_file("outsiders", outsiders)

outsiders = get_pickled("outsiders")


BaseManager.register('GlobalMapClass', GlobalMapClass)
manager = BaseManager()
manager.start()
shared_map = manager.GlobalMapClass()

mp_handler(outsiders[0], shared_map)

i = 1
# with recursion_limit(1500):
#     global_map = {}
#     for _id in outsiders[:100000]:
#         get_mapping(_id, global_map)
#
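The GlobalMapClass proxy accumulates whatever mapping mp_handler builds across worker processes. A hedged sketch of what could follow once the workers finish; the get_map accessor and the output file name are hypothetical, since the real class is not shown in these excerpts:

# Hypothetical accessor: pull the accumulated mapping out of the manager proxy
# and persist it alongside the other pickled objects.
global_map = shared_map.get_map()
save_to_file("outsiders_global_map", global_map)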
Example no. 9
    for category in _categories:
        if len(category.split()) > 1:
            vec = (_model.wv.get_vector(category.split()[0]) +
                   _model.wv.get_vector(category.split()[1])) / 2
        else:
            vec = _model.wv.get_vector(category)
        _category_vectors.append(vec)
    return _category_vectors


w2vec_model_3 = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

category_vectors = categories_to_vectors(w2vec_model_3, categories)

entity_types = get_entity_types(entity_types_file)
# type_jsons = read_json_file(json_file)
# save_to_file(entity_types_file_output, type_jsons)
type_jsons = get_pickled(entity_types_file_output)

a = w2vec_model_3.wv.cosine_similarities(
    (w2vec_model_3.wv.get_vector("kraj") +
     w2vec_model_3.wv.get_vector("federacja") +
     w2vec_model_3.wv.get_vector("niepodległy") +
     w2vec_model_3.wv.get_vector("kolonialny")) / 4, category_vectors)

extract_main_entity_category(type_jsons, entity_types, category_vectors,
                             w2vec_model_3, categories, categories_dict)

data = get_pickled(saved_data_file_tokens_entities_tags)
data_map_of_sentences = map_docs_to_sentences(data)
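The similarity vector a computed a few lines above is index-aligned with category_vectors, and therefore with categories. A minimal sketch of reducing it to the best-matching category, assuming numpy is available (cosine_similarities returns a numpy array):

import numpy as np

best_idx = int(np.argmax(a))
print("Best category: {} (similarity {:.3f})".format(categories[best_idx], a[best_idx]))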
Example no. 10
    return [item for sublist in list for item in sublist]


def process_batches(sentences_out):
    for i in range(5, 7):
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)


# data = get_pickled(saved_data_file)
# data = get_pickled(saved_data_file.format(5))
# data_filtered = filter_empty_docs(data)
# data_filtered_text = map_docs_to_text(data_filtered)
# data_map_of_sentences = map_docs_to_sentences(data)

# sentences_for_docs = get_list_sentences(data_map_of_sentences)
# sentences = flatten_list(sentences_for_docs)
# sentences = [item for sublist in sentences_for_docs for item in sublist]

# data_filtered_longer_tokens = filter_longer_tokens(data)

article_parents = get_pickled(article_parents_output_file)
pages = get_pickled(pages_output_file)
category_parents = get_pickled(category_parents_output_file)
child_articles = get_pickled(child_articles_output_file)
child_categories = get_pickled(child_categories_output_file)
# link_by_sources = get_pickled(link_by_source_output_file)
Example no. 11
    _i = 0
    _scrapped_not_found_errors = []
    for __t in _tuples:
        _l = []
        for _id in __t[1]:
            _l.append(get_json(_id))
        _scrapped_not_found_errors.append((__t[0], _l))
        print(_i)
        _i += 1
    return _scrapped_not_found_errors


# scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")

i = 1  # chunk index used in the pickled file names below
not_found_errors_list_of_tuples_chunk1 = get_pickled(
    "not_mapped_found_candidates-{}".format(i))
(_error_not_found_candidates, _found_candidates) = get_pickled("candidates")
scrap_found_map = get_pickled("scrap_found_map")


scrapped_not_found_errors = scrap_not_found2(_found_candidates)
save_to_file("scrapped_not_found_errors", scrapped_not_found_errors)

to_scrap = []
for _t in not_found_errors_list_of_tuples_chunk1:
    to_scrap.append((_t[0][0], _t[0][1], list(_t[1].keys())))

scrapped_not_found = scrap_not_found(to_scrap)

save_to_file("scrapped_not_found-{}".format(i), scrapped_not_found)
Example no. 12
from poleval.lib.poleval import get_pickled

lemma_map = get_pickled("lemma_map_ext")
(category_map, entity_tuples, pl_map, en_map, disambiguation,
 prefix_map) = get_pickled("mapping-objects_ext")
(_found, _not_found, _errors) = get_pickled("results_chunks_try")
Example no. 13
                    _new_list.append(_tuple)
        if len(_new_list) > 0:
            _clean_map[_key] = clean_tuples(_new_list, _lemma_map, _stopwords)
    return _clean_map


def list_to_map(_list):
    _map = {}
    for _tuple in _list:
        if _tuple[0] not in _map:
            _map[_tuple[0]] = _tuple[1]
    return _map


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

scrapped_not_found_errors = get_pickled("scrapped_not_found_errors")
_map_0 = list_to_map(scrapped_not_found_errors)
_map = remove_disamb_pages(_map_0, lemma_map, stopwords)

save_to_file("scrapped_not_found_errors_clean", _map)

tuple_map0 = remove_disamb_pages(
    get_pickled("tuple_map_scrap-{}".format(2)), lemma_map, stopwords)
for i in range(3, 37):
    if i != 6:
        tuple_map1 = remove_disamb_pages(
            get_pickled("tuple_map_scrap-{}".format(i)), lemma_map, stopwords)
        merge_tuple_map(tuple_map0, tuple_map1)

save_to_file("wikidata_context_map", tuple_map0)
Example no. 14
        if _t:
            _ee = get_entity(_t)
            if _ee:
                _l1.append(_ee)
    return _l1


def map_scrap_found(_scrap_found_and_not_found):
    _scrap_found_map = {}
    for _t in _scrap_found_and_not_found:
        _scrap_found_map[_t[0]] = _t[1]
    return _scrap_found_map


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

found_with_error = get_pickled("merged_map_found_with_error")
found = get_pickled("merged_map_found")
# (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
wikidata_context_map = get_pickled("wikidata_context_map")
entity_valid_map = get_pickled("entity_map-valid")

scrap_found_map = get_pickled("scrap_found_map")

filtered_found_with_error = filter_found_with_error(found_with_error)
filtered_found_with_error_more = filter_found_with_error_more(found_with_error)

w2vec = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

# mapped, not_mapped = map_to_valid(entity_valid_map, found, filtered_found_with_error, wikidata_context_map, w2vec, scrap_found_map)
Example no. 15
import sys

from poleval.lib.poleval import get_pickled, get_soup, get_text, save_to_file

entity_types_file_output = 'entity-types'

type_jsons = get_pickled(entity_types_file_output)

text_mapping = {}

__disambiguation = {}
__disambiguation_helper = {}

cnt = 0
multi = int(sys.argv[1])
ply = 100000

# (text_mapping, __disambiguation) = get_pickled("data-scraped-{}".format(multi))

cnt = ply * (multi - 1)
for json in type_jsons[(multi - 1) * ply:multi * ply]:
    cnt += 1
    if json['wiki']['pl']:
        to_exclude = [' ']
        # Replace the excluded characters (spaces) with underscores to build the page slug.
        _entity = json['wiki']['pl'].lower().translate(
            {ord(i): '_' for i in to_exclude})
        text = get_text(get_soup(_entity))
        if cnt % (ply // 10) == 0:
            print("Progress {}".format(cnt))