def process_batches_for_lemma():
    # Build and persist a lemma map for each of the 19 pickled token/entity/tag batches.
    for i in range(0, 19):
        t = time()
        print(saved_data_file_tokens_entities_tags.format(i))
        data = get_pickled(saved_data_file_tokens_entities_tags.format(i))
        lemma_map = get_lemma_map(data)
        save_to_file("lemma_map-{}".format(i), lemma_map)
        print('Process batch: {} mins'.format(round((time() - t) / 60, 2)))
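# Hedged sketch, not part of the original script: the 19 per-batch "lemma_map-{i}" files
# written above could later be folded into a single map. This assumes each pickled object
# is a plain dict; merge_lemma_maps is a hypothetical helper name.
def merge_lemma_maps(batch_count=19):
    merged = {}
    for i in range(0, batch_count):
        merged.update(get_pickled("lemma_map-{}".format(i)))
    return merged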
def process_batches(sentences_out):
    for i in range(5, 7):
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)
def process_batches(sentences_out):
    for i in range(0, 19):
        print(i)
        t = time()
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)
        print('Process batch: {} mins'.format(round((time() - t) / 60, 2)))
def process_batches(_i):
    # Validate one pickled chunk of test tuples against the entity mappings and save the results.
    print("Now processing test_tuples_chunk_{}".format(_i))
    _test_tuples = get_pickled("chunks/test_tuples_chunk_{}".format(_i))
    _ply = len(_test_tuples)
    print("#test_tuples_chunk: {}".format(_ply))
    t = time()
    _found, _not_found, _errors = validate(_test_tuples, entity_tuples, pl_map, prefix_map, lemma_map, _ply)
    print('Time to process test_tuples-{} : {} mins'.format(_i, round((time() - t) / 60, 2)))
    save_to_file("results_chunks-{}".format(_i), (_found, _not_found, _errors))
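# Hedged driver sketch (not in the original file): the chunks above could be validated in
# parallel with a multiprocessing.Pool, relying on entity_tuples / pl_map / prefix_map /
# lemma_map being loaded at module level before the workers fork. The chunk count of 10
# is an assumption for illustration only.
from multiprocessing import Pool

if __name__ == '__main__':
    with Pool(processes=4) as _pool:
        _pool.map(process_batches, range(10))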
from poleval.lib.entity.definitions import saved_data_file_test_set, input_file_test_set
from poleval.lib.poleval import data_object_map, get_pickled, get_test_data

data_object_map(input_file_test_set, saved_data_file_test_set)
data = get_pickled(saved_data_file_test_set)
test_tuples = get_test_data(data)

(category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
lemma_map = get_pickled("lemma_map_ext")
def process_test_tuples(_i):
    print("Now processing file: {}".format(saved_data_file.format(_i)))
    _data = get_pickled(saved_data_file.format(_i))
    _test_tuples = get_test_data(_data)
    save_to_file("test_tuples-{}".format(_i), _test_tuples)
# entity_tuples.sort(key=lambda x: x.cleaned_original_entity)
# prefix_map = get_prefix_map(entity_tuples)
# save_to_file("mapping-objects", (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map))

saved_data_file = "test_tuples-{}"

# data = get_pickled(saved_data_file.format(5))
# test_tuples = get_test_data(data)
# save_to_file("test_tuples", test_tuples)
# test_tuples = get_pickled("test_tuples")
# _found, _not_found, _errors = get_pickled("results")
# w2vec_model = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")

lemma_map = get_pickled("lemma_map_ext")
(category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")

_t1 = get_label("rzymie", entity_tuples, pl_map, prefix_map, lemma_map, get_polish_stopwords())

# _t1 = get_label("system operacyjny", entity_tuples, pl_map, prefix_map, lemma_map, get_polish_stopwords())
# _t1 = get_label("system operacyjny systemów operacyjnych", entity_tuples, pl_map, prefix_map, lemma_map, get_polish_stopwords())
# _t1 = get_label("asocjacyjny", entity_tuples, pl_map, prefix_map, lemma_map, get_polish_stopwords())
# _t2 = get_label("energię", entity_tuples, pl_map, prefix_map, lemma_map)
# _found, _not_found, _errors = get_pickled("errors")
# validate_debug(_errors, entity_tuples, pl_map, prefix_map, lemma_map, 1000)
pool.join()

# entity_types = get_entity_types(entity_types_file)
# type_jsons = read_json_file(json_file)
# save_to_file(entity_types_file_output, type_jsons)
# type_jsons = get_pickled(entity_types_file_output)
# a = w2vec_model_3.wv.cosine_similarities((w2vec_model_3.wv.get_vector("kraj") + w2vec_model_3.wv.get_vector("federacja") + w2vec_model_3.wv.get_vector("niepodległy") + w2vec_model_3.wv.get_vector("kolonialny")) / 4, category_vectors)
# outsiders = get_outsiders(type_jsons, entity_types)
# save_to_file("outsiders", outsiders)

outsiders = get_pickled("outsiders")

# Share a single GlobalMapClass instance between worker processes through a BaseManager proxy.
BaseManager.register('GlobalMapClass', GlobalMapClass)
manager = BaseManager()
manager.start()
shared_map = manager.GlobalMapClass()

mp_handler(outsiders[0], shared_map)
i = 1

# with recursion_limit(1500):
#     global_map = {}
#     for _id in outsiders[:100000]:
#         get_mapping(_id, global_map)
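# Hedged, self-contained sketch of the sharing pattern used above (illustration only;
# GlobalMapSketch and its put/get_all methods are hypothetical stand-ins for
# GlobalMapClass): a dict-like object is registered with BaseManager so that every
# worker mutates the same server-side instance through a picklable proxy.
from multiprocessing import Pool
from multiprocessing.managers import BaseManager


class GlobalMapSketch:
    def __init__(self):
        self._map = {}

    def put(self, key, value):
        self._map[key] = value

    def get_all(self):
        return dict(self._map)


def _worker(args):
    key, shared = args
    shared.put(key, key * 2)  # each worker writes into the same managed map


if __name__ == '__main__':
    BaseManager.register('GlobalMapSketch', GlobalMapSketch)
    _manager = BaseManager()
    _manager.start()
    _shared = _manager.GlobalMapSketch()
    with Pool(processes=4) as _pool:
        _pool.map(_worker, [(k, _shared) for k in range(10)])
    print(_shared.get_all())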
def categories_to_vectors(_model, _categories):
    # Map each category name to a word2vec vector; two-word categories get the
    # average of their two word vectors.
    _category_vectors = []
    for category in _categories:
        if len(category.split()) > 1:
            vec = (_model.wv.get_vector(category.split()[0]) + _model.wv.get_vector(category.split()[1])) / 2
        else:
            vec = _model.wv.get_vector(category)
        _category_vectors.append(vec)
    return _category_vectors


w2vec_model_3 = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")
category_vectors = categories_to_vectors(w2vec_model_3, categories)

entity_types = get_entity_types(entity_types_file)
# type_jsons = read_json_file(json_file)
# save_to_file(entity_types_file_output, type_jsons)
type_jsons = get_pickled(entity_types_file_output)

a = w2vec_model_3.wv.cosine_similarities(
    (w2vec_model_3.wv.get_vector("kraj") + w2vec_model_3.wv.get_vector("federacja")
     + w2vec_model_3.wv.get_vector("niepodległy") + w2vec_model_3.wv.get_vector("kolonialny")) / 4,
    category_vectors)

extract_main_entity_category(type_jsons, entity_types, category_vectors, w2vec_model_3, categories, categories_dict)

data = get_pickled(saved_data_file_tokens_entities_tags)
data_map_of_sentences = map_docs_to_sentences(data)
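# Hedged illustration (not in the original script): the similarity vector `a` computed above
# can be reduced to a single best-matching category by taking the argmax over the cosine
# scores, since category_vectors follows the order of `categories`.
import numpy as np

best_idx = int(np.argmax(a))
print("closest category: {} (score {:.3f})".format(categories[best_idx], a[best_idx]))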
def flatten_list(_list):
    # Flatten a list of lists into a single list.
    return [item for sublist in _list for item in sublist]


def process_batches(sentences_out):
    for i in range(5, 7):
        data = get_pickled(saved_data_file.format(i))
        data_map_of_sentences = map_docs_to_sentences(data)
        sentences_for_docs = get_list_sentences(data_map_of_sentences)
        sentences = flatten_list(sentences_for_docs)
        sentences_out.append(sentences)


# data = get_pickled(saved_data_file)
# data = get_pickled(saved_data_file.format(5))
# data_filtered = filter_empty_docs(data)
# data_filtered_text = map_docs_to_text(data_filtered)
# data_map_of_sentences = map_docs_to_sentences(data)
# sentences_for_docs = get_list_sentences(data_map_of_sentences)
# sentences = flatten_list(sentences_for_docs)
# sentences = [item for sublist in sentences_for_docs for item in sublist]
# data_filtered_longer_tokens = filter_longer_tokens(data)

article_parents = get_pickled(article_parents_output_file)
pages = get_pickled(pages_output_file)
category_parents = get_pickled(category_parents_output_file)
child_articles = get_pickled(child_articles_output_file)
child_categories = get_pickled(child_categories_output_file)
# link_by_sources = get_pickled(link_by_source_output_file)
def scrap_not_found2(_tuples):
    # For every (entity, candidate ids) tuple, fetch the JSON of each candidate id
    # and keep the results alongside the original entity.
    _i = 0
    _scrapped_not_found_errors = []
    for __t in _tuples:
        _l = []
        for _id in __t[1]:
            _l.append(get_json(_id))
        _scrapped_not_found_errors.append((__t[0], _l))
        print(_i)
        _i += 1
    return _scrapped_not_found_errors


# scrap_mapped, scrap_not_mapped = get_pickled("scrap_found_and_not_found")

i = 1
not_found_errors_list_of_tuples_chunk1 = get_pickled("not_mapped_found_candidates-{}".format(i))
(_error_not_found_candidates, _found_candidates) = get_pickled("candidates")
scrap_found_map = get_pickled("scrap_found_map")

scrapped_not_found_errors = scrap_not_found2(_found_candidates)
save_to_file("scrapped_not_found_errors", scrapped_not_found_errors)

to_scrap = []
for _t in not_found_errors_list_of_tuples_chunk1:
    to_scrap.append((_t[0][0], _t[0][1], list(_t[1].keys())))

scrapped_not_found = scrap_not_found(to_scrap)
save_to_file("scrapped_not_found-{}".format(i), scrapped_not_found)
from poleval.lib.poleval import get_pickled

lemma_map = get_pickled("lemma_map_ext")
(category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
(_found, _not_found, _errors) = get_pickled("results_chunks_try")
            _new_list.append(_tuple)
        if len(_new_list) > 0:
            _clean_map[_key] = clean_tuples(_new_list, _lemma_map, _stopwords)
    return _clean_map


def list_to_map(_list):
    # Keep only the first value seen for each key.
    _map = {}
    for _tuple in _list:
        if _tuple[0] not in _map:
            _map[_tuple[0]] = _tuple[1]
    return _map


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

scrapped_not_found_errors = get_pickled("scrapped_not_found_errors")
_map_0 = list_to_map(scrapped_not_found_errors)
_map = remove_disamb_pages(_map_0, lemma_map, stopwords)
save_to_file("scrapped_not_found_errors_clean", _map)

tuple_map0 = remove_disamb_pages(get_pickled("tuple_map_scrap-{}".format(2)), lemma_map, stopwords)
for i in range(3, 37):
    if i != 6:
        tuple_map1 = remove_disamb_pages(get_pickled("tuple_map_scrap-{}".format(i)), lemma_map, stopwords)
        merge_tuple_map(tuple_map0, tuple_map1)
save_to_file("wikidata_context_map", tuple_map0)
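# Hypothetical sketch only: merge_tuple_map is not shown in this excerpt. Assuming both
# arguments map a key to a list of tuples and the second map is folded into the first in
# place, it could look roughly like this.
def merge_tuple_map_sketch(_target, _other):
    for _key, _tuples in _other.items():
        _target.setdefault(_key, []).extend(_tuples)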
        if _t:
            _ee = get_entity(_t)
            if _ee:
                _l1.append(_ee)
    return _l1


def map_scrap_found(_scrap_found_and_not_found):
    # Index the scraped (key, value) results by key.
    _scrap_found_map = {}
    for _t in _scrap_found_and_not_found:
        _scrap_found_map[_t[0]] = _t[1]
    return _scrap_found_map


stopwords = get_polish_stopwords()
lemma_map = get_pickled("lemma_map_ext")

found_with_error = get_pickled("merged_map_found_with_error")
found = get_pickled("merged_map_found")
# (category_map, entity_tuples, pl_map, en_map, disambiguation, prefix_map) = get_pickled("mapping-objects_ext")
wikidata_context_map = get_pickled("wikidata_context_map")
entity_valid_map = get_pickled("entity_map-valid")
scrap_found_map = get_pickled("scrap_found_map")

filtered_found_with_error = filter_found_with_error(found_with_error)
filtered_found_with_error_more = filter_found_with_error_more(found_with_error)

w2vec = Word2Vec.load(dir + "all-sentences-word2vec-m3.model")
# mapped, not_mapped = map_to_valid(entity_valid_map, found, filtered_found_with_error, wikidata_context_map, w2vec, scrap_found_map)
import sys

from poleval.lib.poleval import get_pickled, get_soup, get_text, save_to_file

entity_types_file_output = 'entity-types'
type_jsons = get_pickled(entity_types_file_output)

text_mapping = {}
__disambiguation = {}
__disambiguation_helper = {}
cnt = 0

multi = int(sys.argv[1])
ply = 100000
# (text_mapping, __disambiguation) = get_pickled("data-scraped-{}".format(multi))
cnt = ply * (multi - 1)
for json in type_jsons[(multi - 1) * ply:multi * ply]:
    cnt += 1
    if json['wiki']['pl']:
        # Turn the Polish wiki title into a URL slug (spaces -> underscores) and scrape its text.
        to_exclude = [' ']
        _entity = json['wiki']['pl'].lower().translate({ord(i): '_' for i in to_exclude})
        text = get_text(get_soup(_entity))
        if cnt % (ply // 10) == 0:
            print("Progress {}".format(cnt))
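# Hedged usage note: the slice logic above expects this script to be launched once per
# 100k-record block, with the 1-based block index passed on the command line, e.g.
# (script name assumed for illustration):
#   python scrape_entity_texts.py 3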