Example no. 1
def process_batches_for_lemma():
    # Build a lemma map for each pickled batch of tokens/entities/tags
    # and save it as "lemma_map-<batch index>".
    for i in range(0, 19):
        print(saved_data_file_tokens_entities_tags.format(i))
        data = get_pickled(saved_data_file_tokens_entities_tags.format(i))
        lemma_map = get_lemma_map(data)
        save_to_file("lemma_map-{}".format(i), lemma_map)
Example no. 2
def process_batches(_i):
    # Validate one chunk of test tuples against the mapping objects and save
    # the (found, not_found, errors) triple for that chunk.
    print("Now processing test_tuples_chunk_{}".format(_i))
    _test_tuples = get_pickled("chunks/test_tuples_chunk_{}".format(_i))
    _ply = len(_test_tuples)
    print("#test_tuples_chunk: {}".format(_ply))
    t = time()
    _found, _not_found, _errors = validate(_test_tuples, entity_tuples, pl_map,
                                           prefix_map, lemma_map, _ply)
    print('Time to process test_tuples-{} : {} mins'.format(
        _i, round((time() - t) / 60, 2)))
    save_to_file("results_chunks-{}".format(_i), (_found, _not_found, _errors))
Example no. 3
def process_test_tuples(_i):
    # Extract the test tuples from one pickled data batch and save them.
    print("Now processing file: {}".format(saved_data_file.format(_i)))
    _data = get_pickled(saved_data_file.format(_i))
    _test_tuples = get_test_data(_data)
    save_to_file("test_tuples-{}".format(_i), _test_tuples)
Example no. 4
    save_to_file("lemma_map", lemma_map)


def process_batches(sentences_out):
    # Collect the flattened sentence list of every batch into sentences_out.
    for i in range(0, 19):
        data = get_pickled(saved_data_file_tokens_entities_tags.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)


# page_object_map(pages_input_file, pages_output_file)
# article_parent_object_map(article_parents_input_file, article_parents_output_file)
# category_parent_object_map(category_parents_input_file, category_parents_output_file)
# child_article_object_map(child_articles_input_file, child_articles_output_file)
# child_category_object_map(child_categories_input_file, child_categories_output_file)
# link_by_source_object_map(link_by_source_input_file, link_by_source_output_file)

# saved_data_file = "1mln_tokens"

# save_to_file(saved_data_file, data)

process_batches_for_lemma()

sentences_pred = []
process_batches(sentences_pred)
sentences = flatten_list(sentences_pred)

save_to_file("all-sentences", sentences)
Example no. 5
        _scrapped_not_found_errors.append((__t[0], _l))
        print(_i)
        _i += 1
    return _scrapped_not_found_errors


# scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")

i = 1  # chunk index; used in the file names below
not_found_errors_list_of_tuples_chunk1 = get_pickled("not_mapped_found_candidates-{}".format(i))
(_error_not_found_candidates, _found_candidates) = get_pickled("candidates")
scrap_found_map = get_pickled("scrap_found_map")


scrapped_not_found_errors = scrap_not_found2(_found_candidates)
save_to_file("scrapped_not_found_errors", scrapped_not_found_errors)

to_scrap = []
for _t in not_found_errors_list_of_tuples_chunk1:
    to_scrap.append((_t[0][0], _t[0][1], list(_t[1].keys())))

scrapped_not_found = scrap_not_found(to_scrap)

save_to_file("scrapped_not_found-{}".format(i), scrapped_not_found)

# _found, _not_found, _errors = get_pickled("chunks/bin/results_chunks-83")
# lemma_map = get_pickled("lemma_map_ext")
# (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
# stopwords = get_polish_stopwords()

merged_map = get_pickled("merged_map_not_found_errors")
Example no. 6
def list_to_map(_list):
    # Build a dict from (key, value) tuples, keeping the first value per key.
    _map = {}
    for _tuple in _list:
        if _tuple[0] not in _map:
            _map[_tuple[0]] = _tuple[1]
    return _map
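
# As written, list_to_map keeps the first value seen for a repeated key;
# a small illustration (the key/value pairs below are made up):
_example_pairs = [("warszawa", "page_1"), ("krakow", "page_2"), ("warszawa", "page_3")]
assert list_to_map(_example_pairs) == {"warszawa": "page_1", "krakow": "page_2"}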


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

scrapped_not_found_errors = get_pickled("scrapped_not_found_errors")
_map_0 = list_to_map(scrapped_not_found_errors)
_map = remove_disamb_pages(_map_0, lemma_map, stopwords)

save_to_file("scrapped_not_found_errors_clean", _map)

# Assuming remove_disamb_pages needs lemma_map and stopwords here as well,
# as in the call above.
tuple_map0 = remove_disamb_pages(
    get_pickled("tuple_map_scrap-{}".format(2)), lemma_map, stopwords)
for i in range(3, 37):
    if i != 6:
        tuple_map1 = remove_disamb_pages(
            get_pickled("tuple_map_scrap-{}".format(i)), lemma_map, stopwords)
        merge_tuple_map(tuple_map0, tuple_map1)

save_to_file("wikidata_context_map", tuple_map0)

# lemma_map = get_pickled("lemma_map-{}".format(0))
merged_map = {}
not_found_list_of_tuples = []
# global_map1 = get_pickled("global_map.10")
# global_map2 = get_pickled("global_map.13")
Example no. 7
(mapped, not_mapped) = get_pickled("map_test_set_with_scrap_filled")

import csv

# "dir" is assumed to be an output directory path defined elsewhere
# (note that it shadows the built-in dir()).
with open(dir + 'roziewski-poleval-task3.tsv', 'w') as tsvfile:
    writer = csv.writer(tsvfile, delimiter='=')
    for _t in mapped:
        writer.writerow([_t[0], _t[1]])

(mapped, not_mapped) = get_pickled("map_test_set_with_scrap")
# (mapped, not_mapped) = get_pickled("map_test_set")

clean_scrapped_not_found = get_pickled("clean_scrapped_not_found")
scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")

scrap_found_map = map_scrap_found(scrap_mapped)
save_to_file("scrap_found_map", scrap_found_map)

# scrap_mapped, scrap_not_mapped = map_scrapped_not_found(clean_scrapped_not_found, w2vec)
# save_to_file("scrap_found_and_not_found", (scrap_mapped, scrap_not_mapped))

i = 1
# clean_scrapped_not_found = []
# for _t in scrapped_not_found:
#     _clean_tuple = (_t[0], _t[1], clean_tuples(_t[2]))
#     clean_scrapped_not_found.append(_clean_tuple)
#
# save_to_file("clean_scrapped_not_found", clean_scrapped_not_found)

# not_found_errors_list_of_tuples_chunk1 = get_pickled("not_mapped_found_candidates-5")

# _to_scrap = []
Example no. 8
import sys

cnt = 0
multi = int(sys.argv[1])  # 1-based slice index passed on the command line
ply = 100000  # number of JSON records per slice

# (text_mapping, __disambiguation) = get_pickled("data-scraped-{}".format(multi))

cnt = ply * (multi - 1)
for json in type_jsons[(multi - 1) * ply:multi * ply]:
    cnt += 1
    if json['wiki']['pl']:
        to_exclude = [' ']
        # Build the page slug by replacing the excluded characters with underscores.
        _entity = json['wiki']['pl'].lower().translate(
            {ord(c): '_' for c in to_exclude})
        text = get_text(get_soup(_entity))
        if cnt % (ply // 10) == 0:
            print("Progress {}".format(cnt))

        if _entity in text_mapping.keys():
            # Entity seen before: treat it as a disambiguation page and keep
            # every scraped text for it.
            if _entity not in __disambiguation:
                __disambiguation[_entity] = [__disambiguation_helper[_entity]]
                del __disambiguation_helper[_entity]
            __disambiguation[_entity].append(text)
        else:
            text_mapping[_entity] = text
            __disambiguation_helper[_entity] = text
    if cnt % (ply // 10) == 0:
        # Checkpoint every ply // 10 records, then reset the text mapping.
        save_to_file("data-scraped-{}-{}".format(multi, cnt),
                     (text_mapping, __disambiguation))
        text_mapping = {}
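
# get_soup and get_text are also external to these snippets. A minimal sketch,
# assuming the entity slug is fetched from Polish Wikipedia with requests and
# the paragraph text is extracted with BeautifulSoup (the URL pattern and the
# parsing are assumptions):
import requests
from bs4 import BeautifulSoup


def get_soup(entity):
    url = "https://pl.wikipedia.org/wiki/{}".format(entity)
    return BeautifulSoup(requests.get(url).text, "html.parser")


def get_text(soup):
    return " ".join(p.get_text() for p in soup.find_all("p"))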