def get_alt_entities(entity_types):
    """Fetch candidate entities that share each question entity's finest type."""
    lis = []
    for ls in entity_types:
        enty = get_last(ls)  # only get finest entity type
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100) - slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
        except Exception:
            pass  # skip types the endpoint cannot resolve
    return lis


df_positive['similar_entities'] = df_positive['entity_types'].apply(
    get_alt_entities)  # column by column
print('done got similar finest entities')

# separate positive and negative samples
df_positive_final = df_positive[[
    "category", "type", "question", "wh", "id", "similar_entities",
    "polarity", "noun list", "np list"
]]  # subset of df

# pickle
pickl('training_vectors/10_train_new_positive_samples', df_positive_final)
print('done pickled')
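# `get_last` and `pickl` come from kg.EB_classes, and `ep` is an external
# endpoint wrapper. Below are minimal sketches of the two local helpers,
# assuming each entry in `entity_types` is an ordered coarse-to-fine list of
# ontology types; the _sketch names are hypothetical, not the repo's code.
import pickle

def get_last_sketch(type_list):
    # the finest type is the last element of the ordered type path
    return type_list[-1] if type_list else None

def pickl_sketch(path, obj):
    # serialise an object to <path>.pickle
    with open(path + '.pickle', 'wb') as f:
        pickle.dump(obj, f)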
    return vector


new_positive_samples['we_wh_vector'] = new_positive_samples['wh'].apply(find_vector_we)
print('done 0')
new_positive_samples['new we_nouns_vector'] = new_positive_samples['new nouns'].apply(find_vector_we)
print('done 1')
new_positive_samples['new avg we_nouns_vector'] = new_positive_samples['new we_nouns_vector'].apply(cal_average)
print('done 2')
new_positive_samples['new we_type_vector'] = new_positive_samples['new entity types'].apply(find_vector_we)
print('done 5')
new_positive_samples['new avg we_type_vector'] = new_positive_samples['new we_type_vector'].apply(cal_average)
print('done 6')

del loaded_model  # free the word-embedding model from memory

new_positive_samples['new entities_KGE_vector'] = new_positive_samples['new nps2'].apply(find_vector_kge)
print('done 7')
new_positive_samples['new avg entities_KGE_vector'] = new_positive_samples['new entities_KGE_vector'].apply(cal_average)
print('done 8')

# create positive vectors: [wh, avg nouns, avg entity KGE, avg types]
new_positive_samples['new_concatenated_vector'] = new_positive_samples.apply(
    lambda x: [x['we_wh_vector'], x['new avg we_nouns_vector'],
               x['new avg entities_KGE_vector'], x['new avg we_type_vector']],
    axis=1)

pickl('training_vectors/13_train_new_positive_samples_fin', new_positive_samples)
print('done pickled 2')
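# `find_vector_we` and `cal_average` are defined earlier in the script; the
# sketches below show one plausible shape, assuming a gensim KeyedVectors
# word-embedding model with 300-d vectors. Model, dimension, and the _sketch
# names are assumptions for illustration, not the project's actual code.
import numpy as np

def find_vector_we_sketch(tokens, model):
    # look up one embedding per token, skipping out-of-vocabulary words
    if isinstance(tokens, str):
        tokens = [tokens]
    return [model[t] for t in tokens if t in model]

def cal_average_sketch(vectors, dim=300):
    # element-wise mean of a list of equal-length vectors; zeros if empty
    if vectors is None or len(vectors) == 0:
        return np.zeros(dim)
    return np.mean(np.asarray(vectors), axis=0)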
# remove None questions - clean data (filtering with a comprehension avoids
# the skipped-element bug of list.remove() inside iteration)
dbpedia_train = [entry for entry in dbpedia_train if entry['question'] is not None]

'''PARSING AND EXTRACTION'''
re_list = []
for a in dbpedia_train:
    temp = find_w(a['question'])
    a.update({'wh': str(temp[0])})
    re_list.append(a)
print('done find wh')
dbpedia_train_wh = re_list
pickl('training_vectors/01_dbpedia_train_wh', dbpedia_train_wh)

re_list = []
for entry in dbpedia_train_wh:
    question = entry['question']
    noun_list = nouns(question)
    entry.update({'noun list': noun_list})
    re_list.append(entry)
dbpedia_train_wh = re_list
pickl('training_vectors/02_dbpedia_train_wh', dbpedia_train_wh)
print('done nouns parsed')

re_list = []
for entry in dbpedia_train_wh:
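# `find_w` and `nouns` come from kg.EB_classes; the sketches below show one
# plausible NLTK-based shape. The _sketch names are hypothetical, and NLTK's
# punkt/tagger models must be downloaded before use.
import nltk

WH_WORDS = {'who', 'what', 'when', 'where', 'which', 'why', 'whom', 'whose', 'how'}

def find_w_sketch(question):
    # return the wh-words found in the question via lowercased token match
    tokens = nltk.word_tokenize(question.lower())
    return [t for t in tokens if t in WH_WORDS] or ['none']

def nouns_sketch(question):
    # POS-tag the question and keep the noun tokens (NN, NNS, NNP, NNPS)
    tagged = nltk.pos_tag(nltk.word_tokenize(question))
    return [word for word, tag in tagged if tag.startswith('NN')]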
"type": row_column['type'], "question": row_column['question'], "wh": row_column['wh'], "id": row_column['id'], "entity": entity, "polarity": "1", "noun list": row_column['noun list'], "np list": row_column['np list'] } df = df.append(new_row, ignore_index=True) else: return df return df # convert similar entities into new samples in dataframe new_positive_samples = pd.DataFrame(columns=['category', 'type', 'question', 'wh', 'id', 'entity', 'polarity', 'np list', 'noun list']) for i in range(len(df_positive)): # iterate through questions t = df_positive.iloc[i] # might need to change back to loc positive_samples = new_samples(t) # create new row for each similar entity - df of length 100 new_positive_samples = new_positive_samples.append(positive_samples) # append to overall df print("question", i, "/", len(df_positive), new_positive_samples.shape) print('samples created') print('test', new_positive_samples) pickl('training_vectors/11_train_new_positive_samples', new_positive_samples) print('pickled')
df_negative['polarity'] = "0"

'''columns in df_negative:
category
concatenated_vector
entities
entities_KGE_vector
entity_types
found category
found type
id
noun list
np list
question
type
we_nouns_vector
we_type_vector
we_wh_vector
wh
sibling_type
shuffled_type
shuffled_category
'''

# check each sample is actually negative: drop rows where the shuffled type
# or category happens to match the original
df_negative2 = df_negative[(df_negative.shuffled_type != df_negative.type)]
df_negative3 = df_negative2[(df_negative2.shuffled_category != df_negative2.category)]

pickl('training_vectors/21_train_new_negative_samples', df_negative3)
print('done pickled')
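# The shuffled_* columns filtered above are produced earlier in the pipeline;
# a minimal sketch of how such candidate negatives can be generated
# (an assumption for illustration, not the repo's actual code):
import numpy as np

def make_shuffled_labels_sketch(df, seed=1):
    # give each question another question's gold type/category; accidental
    # matches are removed afterwards by the inequality filters above
    rng = np.random.default_rng(seed)
    out = df.copy()
    out['shuffled_type'] = rng.permutation(out['type'].to_numpy())
    out['shuffled_category'] = rng.permutation(out['category'].to_numpy())
    return out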
# print('entities_KGE_vector', len(dbpedia_train_wh['entities_KGE_vector_2'][a]))  # 200
# print('we_type_vector', len(dbpedia_train_wh['we_type_vector'][a]))  # 300

dbpedia_train_wh['concatenated_vector'] = dbpedia_train_wh.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector_2'], x['we_type_vector']], axis=1)
dbpedia_train_wh['con_wh_nouns'] = dbpedia_train_wh.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector']], axis=1)
dbpedia_train_wh['con_wh_kge'] = dbpedia_train_wh.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_nouns_KGE'] = dbpedia_train_wh.apply(
    lambda x: [x['we_nouns_vector'], x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_wh_nouns_kge'] = dbpedia_train_wh.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'], x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_wh_kge_types'] = dbpedia_train_wh.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector_2'], x['we_type_vector']], axis=1)

dbpedia_train_wh2 = dbpedia_train_wh.drop(['entities_KGE_vector'], axis=1)
dbpedia_train_wh3 = dbpedia_train_wh2.rename(columns={'entities_KGE_vector_2': 'entities_KGE_vector'})
print('done concatenate vector')

pickl('training_vectors/final_original_training_vectors', dbpedia_train_wh3)
print('done pickled')

df_sample = dbpedia_train_wh3[0:10]
df_sample.to_csv('data/test code/concat_initialTD_vectors.csv')
print('done sampled to csv')
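# Each con_* cell built above is a Python list of component vectors (300-d
# word embeddings, 200-d KGE, per the commented length checks). Before a
# scikit-learn estimator can consume them, each list must be flattened into
# a single numeric row; a minimal sketch, assuming 1-D array components:
import numpy as np

def flatten_feature_sketch(components):
    # concatenate the component vectors into one 1-D feature vector
    return np.concatenate([np.ravel(np.asarray(c, dtype=float)) for c in components])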
def typ_scores(value):
    """Score one test row against every per-type classifier; return the top ten."""
    type_scores = {}
    test = value[vector_component_type]
    wh = value['wh']
    predict = list(test)
    for item in classifiers_typ:  # for each classifier
        typ = item  # get type label
        c = classifiers_typ[item]  # get classifier
        proba = c.predict_proba([predict])
        try:
            pred_typ = float(proba[0][1])  # probability of the positive class
        except IndexError:
            pred_typ = 0.0  # classifier saw only one class during training
        type_scores.update({typ: pred_typ})  # store label and score in dictionary
    sorted_typ = heuristics(type_scores, wh, 'type')
    sorted_typ2 = sorted(sorted_typ.items(), key=operator.itemgetter(1), reverse=True)
    sorted_typ3 = replace_Location(list(sorted_typ2))
    sorted_typ_top_ten = sorted_typ3[0:10]
    print('..')
    return str(sorted_typ_top_ten)


test_data = test_data.drop_duplicates(subset=['id'])
test_data['category_scores'] = test_data.apply(cat_scores, axis=1)
test_data['type_scores'] = test_data.apply(typ_scores, axis=1)
pickl(results_path, test_data)
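# Note: predict_proba returns one column per class, ordered by c.classes_, so
# indexing column 1 assumes binary classifiers whose positive label sorts
# last. A sketch of a variant that locates the positive label explicitly
# (the positive label value here is an assumption):
import numpy as np

def positive_proba_sketch(clf, x, positive_label=1):
    # find the positive class's column via classes_ instead of assuming order
    proba = clf.predict_proba([x])[0]
    matches = np.where(clf.classes_ == positive_label)[0]
    return float(proba[matches[0]]) if len(matches) else 0.0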
# df_negative_sb - changed entity types
df_negative_sb['new we_type_vector'] = df_negative_sb['sibling_type'].apply(find_vector_we)
print('done 1')
df_negative_sb['new avg we_type_vector'] = df_negative_sb['new we_type_vector'].apply(cal_average)
print('done 2')

for col in df_negative_sb.columns:
    print(col)

df_negative_sb['concatenated_vector'] = df_negative_sb.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['new avg we_type_vector']], axis=1)
print('done 3')

# df_negative_st - changed types: the vectors are unchanged, only the assigned type differs
# df_negative_sc - changed categories: the vectors are unchanged, only the assigned category differs

# append the three negative-sample variants together
df_neg_all = pd.concat([df_negative_sb, df_negative_st])
df_neg_all2 = pd.concat([df_neg_all, df_negative_sc])
print('done 5')

# pickle
pickl('training_vectors/22_train_new_negative_samples_fin', df_neg_all2)
print('pickled')
# print('we_nouns_vector', len(dbpedia_train_wh['we_nouns_vector'][a]))  # 300
# print('entities_KGE_vector', len(dbpedia_train_wh['entities_KGE_vector_2'][a]))  # 200
# print('we_type_vector', len(dbpedia_train_wh['we_type_vector'][a]))  # 300
print('...')

# rebuild concatenated vectors
all_td2 = all_td.drop(['concatenated_vector', 'entities_KGE_vector'], axis=1)
all_td3 = all_td2.rename(columns={'entities_KGE_vector_2': 'entities_KGE_vector'})
all_td3['con_wh_nouns'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector']], axis=1)
all_td3['con_wh_kge'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector']], axis=1)
all_td3['con_nouns_KGE'] = all_td3.apply(
    lambda x: [x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
all_td3['con_wh_nouns_kge'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
all_td3['con_wh_kge_types'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector'], x['we_type_vector']], axis=1)
all_td3['concatenated_vector'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['we_type_vector']], axis=1)

pickl('training_vectors/31_all_td_fin', all_td3)
print('pickled')
# remove None questions - clean data (filtering with a comprehension avoids
# the skipped-element bug of list.remove() inside iteration)
dbpedia_test = [entry for entry in dbpedia_test if entry['question'] is not None]

"""PARSING AND EXTRACTION"""
re_list = []
for a in dbpedia_test:
    temp = find_w(a['question'])
    a.update({'wh': str(temp[0])})
    re_list.append(a)
print('done find wh')
dbpedia_test = re_list
pickl('testing_vectors/01_dbpedia_test', dbpedia_test)

re_list = []
for entry in dbpedia_test:
    question = entry['question']
    noun_list = nouns(question)
    entry.update({'noun list': noun_list})
    re_list.append(entry)
dbpedia_test = re_list
pickl('testing_vectors/02_dbpedia_test', dbpedia_test)
print('done nouns parsed')

re_list = []
for entry in dbpedia_test:
    question = entry['question']
'''
author: Eleanor Bill @eljne
'''
'''
requires splitting the original training data into training and test sets:
it is the only data we have with correct types and categories, so accuracy
can be measured against it
'''
from kg.EB_classes import unpickle, pickl

training_data = unpickle('training_vectors/final_original_training_vectors')

# 80/20 split with a fixed seed for reproducibility
training_data2 = training_data.sample(frac=0.8, random_state=1)
testing_data = training_data.drop(training_data2.index)
training_data3 = training_data2.reset_index(drop=True)
testing_data2 = testing_data.reset_index(drop=True)

pickl('training_vectors/final_original_training_vectors_minus_tests', training_data3)
pickl('testing_vectors/11_testing_vectors_from_og_training_data', testing_data2)
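# Quick sanity check on the split (a sketch; assumes the 'id' column used
# elsewhere in the pipeline uniquely identifies questions):
assert len(training_data3) + len(testing_data2) == len(training_data)
assert set(training_data3['id']).isdisjoint(testing_data2['id'])
print('split ok:', len(training_data3), 'train /', len(testing_data2), 'test')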
    train_set = random_sample_ratioed(copy_df, 0.80, 1, 1)  # split according to pos/neg balance
    X = train_set[vector_component_category]
    y = train_set["y"]
    print(X.shape)
    print(y.shape)
    classifier = train_classifier_category(X, y)  # converts vector from list of arrays to matrix
    classifiers_cat[cat_label] = classifier

# all types
for typ_label in types_all_unique:
    copy_df2 = training_data.copy()
    print('type label', typ_label)
    copy_df2["y"] = copy_df2.apply(
        lambda row: label_polarity_all_typs(row, typ_label, 'type'), axis=1)  # label -/+
    train_set = random_sample_ratioed(copy_df2, 0.80, 1, 1)  # split according to pos/neg balance
    X = train_set[vector_component_type]
    y = train_set["y"]
    classifier = train_classifier(X, y)  # converts vector from list of arrays to matrix
    classifiers_typ[typ_label] = classifier

print('classifiers_cat', classifiers_cat)
pickl(file_path_cat, classifiers_cat)
print('classifiers_typ', classifiers_typ)
pickl(file_path_typ, classifiers_typ)
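# train_classifier / train_classifier_category are defined elsewhere in the
# repo; the sketch below shows one plausible shape, flattening each
# list-of-arrays cell into a single row before fitting a logistic regression.
# The _sketch name and the model choice are assumptions, not the project's
# actual code.
import numpy as np
from sklearn.linear_model import LogisticRegression

def train_classifier_sketch(X, y):
    # convert the Series of list-of-vectors cells into a 2-D feature matrix
    X_matrix = np.stack([np.concatenate([np.ravel(v) for v in cell]) for cell in X])
    clf = LogisticRegression(max_iter=1000)
    clf.fit(X_matrix, list(y))
    return clf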
          entry['entities_KGE_vector_2'], entry['we_type_vector']]
    return cv


dbpedia_test['concatenated_vector'] = dbpedia_test.apply(concatenate_vector, axis=1)

dbpedia_test2 = dbpedia_test.drop(['entities_KGE_vector'], axis=1)
dbpedia_test3 = dbpedia_test2.rename(columns={'entities_KGE_vector_2': 'entities_KGE_vector'})

dbpedia_test3['con_wh_nouns'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector']], axis=1)
dbpedia_test3['con_wh_kge'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector']], axis=1)
dbpedia_test3['con_nouns_KGE'] = dbpedia_test3.apply(
    lambda x: [x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
dbpedia_test3['con_wh_nouns_kge'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
dbpedia_test3['con_wh_kge_types'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector'], x['we_type_vector']], axis=1)
dbpedia_test3['concatenated_vector'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['we_type_vector']], axis=1)

pickl('testing_vectors/10_dbpedia_test_fin', dbpedia_test3)