''' author: Eleanor Bill @eljne '''
''' create vectors for additional training data - +ve - CONTINUED'''
from kg.EB_classes import unpickle, pickl
from kg.EB_classes import cal_average, find_vector_kge
from gensim.models import KeyedVectors
import numpy as np

new_positive_samples = unpickle('training_vectors/12_train_new_positive_samples')
print('unpickled')
# new_positive_samples = pd.DataFrame(new_positive_samples)

# combine each original column with its "additional" counterpart
# (presumably list-valued columns, so `+` concatenates per row - TODO confirm)
new_positive_samples['new nps2'] = (new_positive_samples['np list']
                                    + new_positive_samples['additional np list'])
new_positive_samples['new nouns'] = (new_positive_samples['noun list']
                                     + new_positive_samples['additional noun list'])
print('new fields created')

# get pre-trained word embeddings
fastTextfile = 'data/wiki-news-300d-1M.vec'
loaded_model = KeyedVectors.load_word2vec_format(fastTextfile)
print('models loaded')


# find word embedding vector
def find_vector_we(word_or_phrase):
    """Look up the pre-trained embedding for ``word_or_phrase``.

    :param word_or_phrase: key to look up in ``loaded_model``.
    :return: the vector stored in ``loaded_model``; when the key cannot be
        looked up, a length-1 zero array (``np.zeros(1)``) is returned as a
        placeholder and a ``.`` is printed to mark the miss.
    """
    try:
        vector = loaded_model.word_vec(word_or_phrase)
    except (KeyError, TypeError):
        # KeyError: key not in the vocabulary; TypeError: key is not usable as
        # a vocabulary key (e.g. unhashable).  A bare ``except`` would also
        # swallow KeyboardInterrupt and hide unrelated failures as zeros.
        vector = np.zeros(1)
        print('.')
    return vector
''' author: Eleanor Bill @eljne ''' ''' concatenate vectors and export to dataframe/csv ''' import pandas as pd import numpy as np from kg.EB_classes import unpickle, pickl dbpedia_train_wh = unpickle('training_vectors/09_dbpedia_train_wh') ''' we_wh_vector - First position could be for the embedding of the wh question word (we can create our own embedding/encoding). we_nouns_vector - Second position for the WE of the sentence or set of nouns. entities_KGE_vector - Third position (up to 3 or 4 vector positions) for noun phrases with a good correspondence in KG (KGE of entity representing noun phrase) we_type_vector - Fourth position (up to 3 or 4 vector positions) for the WE of the types of the KG entities above. ''' dbpedia_train_wh = pd.DataFrame(dbpedia_train_wh) dbpedia_train_wh = dbpedia_train_wh.fillna(0) dbpedia_train_wh['entities_KGE_vector_2'] = dbpedia_train_wh['we_wh_vector'].copy() pd.set_option('mode.chained_assignment', None) # make sure all the same length (if returned zeros, replace with array of zeroes that is correct length) for a in range(0, len(dbpedia_train_wh)): try: if len(dbpedia_train_wh['we_wh_vector'][a]) == 1: dbpedia_train_wh['we_wh_vector'][a] = np.zeros(300) except: try: if dbpedia_train_wh['we_wh_vector'][a] == 0: dbpedia_train_wh['we_wh_vector'][a] = np.zeros(300) except:
''' author: Eleanor Bill @eljne ''' ''' augment positive samples to create more positive samples ''' ''' about 20 mins ''' import pandas as pd from kg.EB_classes import pickl, unpickle, get_last from matching.kg_matching import Endpoint from kg.endpoints import DBpediaEndpoint from ontology.onto_access import DBpediaOntology # unpickle # load = unpickle('training_vectors/final_original_training_vectors') # when we have training data from task to eval load = unpickle('training_vectors/final_original_training_vectors_minus_tests' ) # created own testing data from splitting train df_positive = pd.DataFrame(load) df_positive['polarity'] = "1" '''create more positive samples do this by: - getting a different but similar entity using SPARQLEndpoint ''' onto_access = DBpediaOntology() onto_access.loadOntology(True) ep = DBpediaEndpoint() # getEntitiesForType def get_alt_entities(entity_types): lis = [] for ls in entity_types: # print('ls', ls) enty = get_last(ls) # only get finest entity
''' author: Eleanor Bill @eljne '''
''' create vectors for negative samples '''
from kg.EB_classes import pickl, unpickle
from kg.EB_classes import cal_average
from gensim.models import KeyedVectors
import numpy as np

# negative - shuffled types and categories affect the types and
# - sibling type affects type vectors
# 3 more sets of -ve data:
neg = unpickle('training_vectors/21_train_new_negative_samples')
df_negative_st = neg.copy()  # - shuffled type - change __ embeddings
df_negative_sc = neg.copy()  # - shuffled category - change __ embeddings
df_negative_sb = neg.copy()  # - sibling type - change __ embeddings
print('done copied')

# how do the new samples affect the vector?
fastTextfile = 'data/wiki-news-300d-1M.vec'
loaded_model = KeyedVectors.load_word2vec_format(fastTextfile)


# find word embedding vector
def find_vector_we(word_or_phrase):
    """Look up the pre-trained embedding for ``word_or_phrase``.

    :param word_or_phrase: key to look up in ``loaded_model``.
    :return: the vector stored in ``loaded_model``; when the key cannot be
        looked up, a length-1 zero array (``np.zeros(1)``) is returned as a
        placeholder.
    """
    try:
        vector = loaded_model.word_vec(word_or_phrase)
    except (KeyError, TypeError):
        # KeyError: key not in the vocabulary; TypeError: key is not usable as
        # a vocabulary key (e.g. unhashable).  A bare ``except`` would also
        # swallow KeyboardInterrupt and hide unrelated failures as zeros.
        vector = np.zeros(1)
    return vector
''' author: Eleanor Bill @eljne ''' ''' reformats results to be run through evaluation script''' from kg.EB_classes import unpickle, heuristics_2 import re import pandas as pd import json results_concat = unpickle('results/finals/results_OGTD_concat') results_KGE = unpickle('results/finals/results_OGTD_KGE') # results = unpickle('results/results_ALLTD') ''' REFORMATTING FOR EVAL SCRIPT ''' ''' export results to be used in evaluation script ''' ''' - `system_output_json` is a JSON file with the (participating) system's category and type predictions. The format is a list of dictionaries with keys `id`, `category`, and `type`, holding the question ID, predicted category, and ranked list of up to 10 types, respectively.''' def get_first(value): val = value['category_scores'] val = val.split(',') val = val[0] re.sub('[^A-Za-z0-9]+', '', val) val = val.replace("'", "") val = val.replace("(", "") return str(val) def get_first_list(value):
'''change depending on vector component to test'''
# Both settings take one of the component names listed below
# (presumably names of vector columns in the test dataframe - TODO confirm).
vector_component_category = 'concatenated_vector'
vector_component_type = 'concatenated_vector'
# available components:
# we_wh_vector
# we_nouns_vector
# entities_KGE_vector
# we_type_vector
# con_wh_nouns
# con_wh_kge
# con_nouns_KGE
# con_wh_nouns_kge
# con_wh_kge_types
# concatenated_vector
'''unpickle classifiers'''
# Exactly one of the two configurations below should be active; each one sets
# the category classifiers, the type classifiers and the matching results path.
# use classifiers trained on all training data
classifiers_cat = unpickle('classifiers/classifiers_all_cat_ALL')
classifiers_typ = unpickle('classifiers/classifiers_all_typ_ALL')
results_path = 'results/results_ALLTD'
# use classifiers trained on original training data
# classifiers_cat = unpickle('classifiers/classifiers_all_cat_OGTD')
# classifiers_typ = unpickle('classifiers/classifiers_all_typ_OGTD')
# results_path = 'results/results_OGTD'
'''load test data vectors'''
# NOTE(review): the active setup pairs the "ALL" classifiers with the test
# split taken from the original training data - confirm this is intended.
# dbpedia_test_final = unpickle('testing_vectors/10_dbpedia_test_fin')  # when using provided by task
dbpedia_test_final = unpickle(
    'testing_vectors/11_testing_vectors_from_og_training_data'
)  # when using og training
test_data = pd.DataFrame(dbpedia_test_final)
# data split
# test_data['con_wh_nouns_2'] = test_data.apply(reformat_2, axis=1)
''' author: Eleanor Bill @eljne '''
''' reformat test data to evaluate '''
from kg.EB_classes import unpickle, replace_Location_2
import json
import pandas as pd

test_truth = pd.DataFrame(
    unpickle('testing_vectors/11_testing_vectors_from_og_training_data'))

# swap the raw 'type' column for the version produced by replace_Location_2
test_truth['type2'] = test_truth['type'].apply(replace_Location_2)
test_truth2 = test_truth.drop(columns=['type'])
test_truth3 = test_truth2.rename(columns={'type2': 'type'})
test_truth3 = test_truth3[['question', 'category', 'type', 'id']]

# filled by reform(), one dictionary per row
test_truth_json = []


def reform(value):
    """Append one row to ``test_truth_json`` as a plain dictionary.

    :param value: row providing 'id', 'category', 'type' and 'question'.
    :return: always 0; the useful output is the appended (and printed) record.
    """
    record = {key: value[key] for key in ("id", "category", "type", "question")}
    print(record)
    test_truth_json.append(record)
    return 0


test_truth3.apply(reform, axis=1)
json_string = json.dumps(test_truth_json)
''' author: Eleanor Bill @eljne ''' ''' join all data - lots of renaming and dropping fields''' '''takes about 20 minutes''' from kg.EB_classes import pickl, unpickle import numpy as np import pandas as pd all_td = unpickle('training_vectors/30_all_td') all_td = pd.DataFrame(all_td).reset_index() all_td = all_td.fillna(0) all_td['entities_KGE_vector_2'] = all_td['we_wh_vector'].copy() pd.set_option('mode.chained_assignment', None) # make sure all the same length (if returned zeros, replace with array of zeroes that is correct length) for a in range(0, len(all_td)): try: if len(all_td['we_wh_vector'][a]) == 1: all_td['we_wh_vector'][a] = np.zeros(300) except: try: if all_td['we_wh_vector'][a] == 0: all_td['we_wh_vector'][a] = np.zeros(300) except: print('1', all_td['we_wh_vector'][a]) try: if len(all_td['we_nouns_vector'][a]) == 1: all_td['we_nouns_vector'][a] = np.zeros(300) except: try:
''' author: Eleanor Bill @eljne '''
''' require splitting original training data into training and test data as it's the data we have with correct types + categories, can measure accuracy '''
from kg.EB_classes import unpickle, pickl

training_data = unpickle('training_vectors/final_original_training_vectors')

# training part: a reproducible 80% sample (fixed random_state), re-indexed from 0
training_data2 = training_data.sample(frac=0.8, random_state=1)
training_data3 = training_data2.reset_index(drop=True)

# testing part: every row that was not sampled above, re-indexed from 0
testing_data = training_data.drop(index=training_data2.index)
testing_data2 = testing_data.reset_index(drop=True)

# store both parts
pickl('training_vectors/final_original_training_vectors_minus_tests',
      training_data3)
pickl('testing_vectors/11_testing_vectors_from_og_training_data',
      testing_data2)
vector_component_type = 'concatenated_vector' # we_wh_vector # we_nouns_vector # entities_KGE_vector # we_type_vector # con_wh_nouns # con_wh_kge # con_nouns_KGE # con_wh_nouns_kge # con_wh_kge_types # concatenated_vector '''load training data''' # use all training data file_path_cat = 'classifiers/classifiers_all_cat_ALL' file_path_typ = 'classifiers/classifiers_all_typ_ALL' all_td = unpickle('training_vectors/31_all_td_fin') # use all training data td = pd.DataFrame(all_td) # use only original training data # file_path_cat = 'classifiers/classifiers_all_cat_OGTD' # file_path_typ = 'classifiers/classifiers_all_typ_OGTD' # og_td = unpickle('training_vectors/final_original_training_vectors') # td = pd.DataFrame(og_td) # td['polarity'] = "1" # td['con_wh_nouns_2'] = td.apply(reformat_2, axis=1) # td2 = td.drop(['con_wh_nouns'], axis=1) # td = td2.rename(columns={'con_wh_nouns_2': 'con_wh_nouns'}) # td['con_wh_kge_2'] = td.apply(reformat_3, axis=1) # td2 = td.drop(['con_wh_kge'], axis=1)
''' author: Eleanor Bill @eljne ''' ''' create vectors for test data ''' import numpy as np import pandas as pd from kg.EB_classes import pickl, unpickle dbpedia_test = unpickle('testing_vectors/09_dbpedia_test') # check for arrays length 1 dbpedia_test = pd.DataFrame(dbpedia_test) dbpedia_test = dbpedia_test.fillna(0) dbpedia_test['entities_KGE_vector_2'] = dbpedia_test['we_wh_vector'].copy() pd.set_option('mode.chained_assignment', None) # make sure all the same length (if returned zeros, replace with array of zeroes that is correct length) for a in range(0, len(dbpedia_test)): try: if len(dbpedia_test['we_wh_vector'][a]) == 1: dbpedia_test['we_wh_vector'][a] = np.zeros(300) except: try: if dbpedia_test['we_wh_vector'][a] == 0: dbpedia_test['we_wh_vector'][a] = np.zeros(300) except: print('1', dbpedia_test['we_wh_vector'][a]) try: if len(dbpedia_test['we_nouns_vector'][a]) == 1: dbpedia_test['we_nouns_vector'][a] = np.zeros(300) except:
''' author: Eleanor Bill @eljne ''' ''' reformats results to be run through evaluation script''' from kg.EB_classes import unpickle, heuristics_2 import re import json # results = unpickle('results/results_OGTD') results = unpickle('results/results_ALLTD') ''' REFORMATTING FOR EVAL SCRIPT ''' ''' export results to be used in evaluation script ''' ''' - `system_output_json` is a JSON file with the (participating) system's category and type predictions. The format is a list of dictionaries with keys `id`, `category`, and `type`, holding the question ID, predicted category, and ranked list of up to 10 types, respectively.''' def get_first(value): val = value['category_scores'] val = val.split(',') val = val[0] re.sub('[^A-Za-z0-9]+', '', val) val = val.replace("'", "") val = val.replace("(", "") return str(val) def get_first_list(value):