Example #1
def get_alt_entities(entity_types):
    lis = []
    for ls in entity_types:
        # print('ls', ls)
        enty = get_last(ls)  # only get finest entity
        # print('entity:', enty)
        try:
            # simty = ep.getEntitiesForDBPediaClass(enty, 100) - slower version
            simty = ep.getEntitiesForType(enty, 0, 10)
            lis.append(simty)
            # print('similar entity', simty)
        except Exception:  # skip entities the endpoint cannot resolve
            pass
    return lis
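
# get_last is not defined in these snippets; going by the "only get finest
# entity" comment it just takes the finest class in a type path. A minimal
# sketch, assuming each entity_types entry is ordered coarse -> fine:
def get_last(ls):
    return ls[-1] if ls else None
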


df_positive['similar_entities'] = df_positive['entity_types'].apply(
    get_alt_entities)  # applied to each row's entity_types list
print('done got similar finest entities')

# positive samples: keep only the columns needed downstream
df_positive_final = df_positive[[
    "category", "type", "question", "wh", "id", "similar_entities", "polarity",
    "noun list", "np list"
]]  # subset of df

# pickle
pickl('training_vectors/10_train_new_positive_samples', df_positive_final)
print('done pickled')
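
# pickl (and its counterpart unpickle, imported from kg.EB_classes in a later
# snippet) is presumably a thin wrapper around the standard pickle module.
# A minimal sketch; the file naming is an assumption:
import pickle

def pickl(path, obj):
    with open(path + '.pickle', 'wb') as f:  # extension is a guess
        pickle.dump(obj, f)

def unpickle(path):
    with open(path + '.pickle', 'rb') as f:
        return pickle.load(f)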
Example #2
    return vector


new_positive_samples['we_wh_vector'] = new_positive_samples['wh'].apply(find_vector_we)
print('done 0')
new_positive_samples['new we_nouns_vector'] = new_positive_samples['new nouns'].apply(find_vector_we)
print('done 1')
new_positive_samples['new avg we_nouns_vector'] = new_positive_samples['new we_nouns_vector'].apply(cal_average)
print('done 2')
new_positive_samples['new we_type_vector'] = new_positive_samples['new entity types'].apply(find_vector_we)
print('done 5')
new_positive_samples['new avg we_type_vector'] = new_positive_samples['new we_type_vector'].apply(cal_average)
print('done 6')

del loaded_model  # free the word-embedding model before the KGE lookups

new_positive_samples['new entities_KGE_vector'] = new_positive_samples['new nps2'].apply(find_vector_kge)
print('done 7')
new_positive_samples['new avg entities_KGE_vector'] = new_positive_samples['new entities_KGE_vector'].apply(cal_average)
print('done 8')
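
# cal_average is not shown either; it presumably collapses a list of
# per-token vectors into a single fixed-length vector. A minimal numpy
# sketch, assuming equal-length vectors and guessing a 300-dimensional
# zero vector for empty input:
import numpy as np

def cal_average(vectors):
    if len(vectors) == 0:
        return np.zeros(300)  # hypothetical embedding width
    return np.mean(np.stack(vectors), axis=0)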

# create positive vectors
new_positive_samples['new_concatenated_vector'] = new_positive_samples.apply(lambda x: [x['we_wh_vector'],
                                                                                        x['new avg we_nouns_vector'],
                                                                                        x['new avg entities_KGE_vector'],
                                                                                        x['new avg we_type_vector']],
                                                                             axis=1)

pickl('training_vectors/13_train_new_positive_samples_fin', new_positive_samples)
print('done pickled 2')
Example #3
# drop entries with no question text; filter into a new list rather than
# removing items from the list while iterating over it
dbpedia_train = [entry for entry in dbpedia_train
                 if entry['question'] is not None]


'''PARSING AND EXTRACTION'''

re_list = []
for a in dbpedia_train:
    temp = find_w(a['question'])
    a.update({'wh': str(temp[0])})
    re_list.append(a)

print('done find wh')
dbpedia_train_wh = re_list
pickl('training_vectors/01_dbpedia_train_wh', dbpedia_train_wh)


re_list = []
for entry in dbpedia_train_wh:
    question = entry['question']
    noun_list = nouns(question)
    entry.update({'noun list': noun_list})
    re_list.append(entry)

dbpedia_train_wh = re_list
pickl('training_vectors/02_dbpedia_train_wh', dbpedia_train_wh)
print('done nouns parsed')
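
# Neither find_w nor nouns is defined in these snippets. Hypothetical
# sketches, assuming NLTK-style tokenising and POS tagging (the real helpers
# in kg.EB_classes may differ):
import nltk

WH_WORDS = {'who', 'what', 'when', 'where', 'which', 'why', 'whose', 'how'}

def find_w(question):
    # assumed: return the wh-word(s) found in the question
    tokens = nltk.word_tokenize(question.lower())
    return [t for t in tokens if t in WH_WORDS] or ['none']

def nouns(question):
    # assumed: return the noun tokens via POS tagging
    tagged = nltk.pos_tag(nltk.word_tokenize(question))
    return [word for word, tag in tagged if tag.startswith('NN')]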

re_list = []
for entry in dbpedia_train_wh:
    question = entry['question']
    # assumed continuation (the middle of this snippet is missing from the
    # source): noun-phrase extraction via a hypothetical noun_phrases helper,
    # mirroring the noun step above
    entry.update({'np list': noun_phrases(question)})
    re_list.append(entry)

dbpedia_train_wh = re_list


def new_samples(row_column):
    # function head inferred from the call below: build one positive row per
    # similar entity found for this question (nesting of 'similar_entities'
    # assumed to match what get_alt_entities returns, a list of entity lists)
    df = pd.DataFrame(columns=['category', 'type', 'question', 'wh', 'id',
                               'entity', 'polarity', 'np list', 'noun list'])
    if row_column['similar_entities']:
        for entity_list in row_column['similar_entities']:
            for entity in entity_list:
                new_row = {"category": row_column['category'],
                           "type": row_column['type'],
                           "question": row_column['question'],
                           "wh": row_column['wh'],
                           "id": row_column['id'],
                           "entity": entity,
                           "polarity": "1",
                           "noun list": row_column['noun list'],
                           "np list": row_column['np list']
                           }
                df = pd.concat([df, pd.DataFrame([new_row])],
                               ignore_index=True)
    return df


# convert similar entities into new samples in dataframe
new_positive_samples = pd.DataFrame(
    columns=['category', 'type', 'question', 'wh', 'id', 'entity',
             'polarity', 'np list', 'noun list'])

for i in range(len(df_positive)):  # iterate through questions
    t = df_positive.iloc[i]  # positional row access
    positive_samples = new_samples(t)  # one new row per similar entity
    new_positive_samples = pd.concat([new_positive_samples,
                                      positive_samples])  # grow the overall df
    print("question", i, "/", len(df_positive), new_positive_samples.shape)

print('samples created')

print('test', new_positive_samples)

pickl('training_vectors/11_train_new_positive_samples', new_positive_samples)
print('pickled')
Example #5
df_negative['polarity'] = "0"

'''
Columns at this point:
category
concatenated_vector
entities
entities_KGE_vector
entity_types
found category
found type
id
noun list
np list
question
type
we_nouns_vector
we_type_vector
we_wh_vector
wh
sibling_type
shuffled_type
shuffled_category
'''

# check sample is actually negative
df_negative2 = df_negative[(df_negative.shuffled_type != df_negative.type)]
df_negative3 = df_negative2[(df_negative2.shuffled_category != df_negative2.category)]
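
# the two chained filters above are equivalent to one boolean mask, which
# makes the "both must differ" intent explicit:
_mask = ((df_negative.shuffled_type != df_negative.type)
         & (df_negative.shuffled_category != df_negative.category))
assert df_negative[_mask].equals(df_negative3)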

pickl('training_vectors/21_train_new_negative_samples', df_negative3)
print('done pickled')
    # print('entities_KGE_vector', len(dbpedia_train_wh['entities_KGE_vector_2'][a]))  # 200
    # print('we_type_vector', len(dbpedia_train_wh['we_type_vector'][a]))  # 300

dbpedia_train_wh['concatenated_vector'] = dbpedia_train_wh.apply(lambda x: [x['we_wh_vector'],
                                                                            x['we_nouns_vector'],
                                                                            x['entities_KGE_vector_2'],
                                                                            x['we_type_vector']], axis=1)
dbpedia_train_wh['con_wh_nouns'] = dbpedia_train_wh.apply(lambda x: [x['we_wh_vector'],
                                                                     x['we_nouns_vector']], axis=1)
dbpedia_train_wh['con_wh_kge'] = dbpedia_train_wh.apply(lambda x: [x['we_wh_vector'],
                                                                   x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_nouns_KGE'] = dbpedia_train_wh.apply(lambda x: [x['we_nouns_vector'],
                                                                      x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_wh_nouns_kge'] = dbpedia_train_wh.apply(lambda x: [x['we_wh_vector'],
                                                                         x['we_nouns_vector'],
                                                                         x['entities_KGE_vector_2']], axis=1)
dbpedia_train_wh['con_wh_kge_types'] = dbpedia_train_wh.apply(lambda x: [x['we_wh_vector'],
                                                                         x['entities_KGE_vector_2'],
                                                                         x['we_type_vector']], axis=1)

dbpedia_train_wh2 = dbpedia_train_wh.drop(['entities_KGE_vector'], axis=1)
dbpedia_train_wh3 = dbpedia_train_wh2.rename(columns={'entities_KGE_vector_2': 'entities_KGE_vector'})
print('done concatenate vector')

pickl('training_vectors/final_original_training_vectors', dbpedia_train_wh3)
print('done pickled')

df_sample = dbpedia_train_wh3[0:10]
df_sample.to_csv('data/test code/concat_initialTD_vectors.csv')
print('done sampled to csv')
Example #7
def typ_scores(value):
    type_scores = {}
    test = value[vector_component_type]
    wh = value['wh']
    predict = list(test)
    for typ, c in classifiers_typ.items():  # each type label and its classifier
        pred = c.predict_proba([predict])  # shape (1, n_classes)
        try:
            pred_typ = float(pred[0][1])  # probability of the positive class
        except IndexError:
            pred_typ = 0.0  # classifier saw only one class during training
        type_scores[typ] = pred_typ  # store label and score in dictionary
    sorted_typ = heuristics(type_scores, wh, 'type')
    sorted_typ2 = sorted(sorted_typ.items(),
                         key=operator.itemgetter(1),
                         reverse=True)
    sorted_typ3 = replace_Location(list(sorted_typ2))
    sorted_typ_top_ten = sorted_typ3[0:10]
    print('..')
    return str(sorted_typ_top_ten)
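
# For reference: predict_proba on a fitted binary scikit-learn classifier
# returns one row per sample and one column per class, so [0][1] is the
# positive-class probability. A toy demonstration (LogisticRegression stands
# in for whatever classifiers_typ actually holds):
from sklearn.linear_model import LogisticRegression
import numpy as np

_demo_clf = LogisticRegression().fit(np.array([[0.0], [1.0]]), [0, 1])
_demo_proba = _demo_clf.predict_proba([[0.9]])   # e.g. array([[0.3, 0.7]])
_demo_positive = float(_demo_proba[0][1])        # the value typ_scores keeps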


test_data = test_data.drop_duplicates(subset=['id'])
test_data['category_scores'] = test_data.apply(cat_scores, axis=1)
test_data['type_scores'] = test_data.apply(typ_scores, axis=1)
pickl(results_path, test_data)
# df_negative_sb - changed entity types
df_negative_sb['new we_type_vector'] = df_negative_sb['sibling_type'].apply(
    find_vector_we)
print('done 1')
df_negative_sb['new avg we_type_vector'] = df_negative_sb[
    'new we_type_vector'].apply(cal_average)
print('done 2')

for col in df_negative_sb.columns:
    print(col)

df_negative_sb['concatenated_vector'] = df_negative_sb.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['new avg we_type_vector']],
    axis=1)
print('done 3')

# df_negative_st - changed types - doesn't affect anything, just changes assigned type
# df_negative_sc - changed categories - doesn't affect anything, just changes assigned category

# append together (pd.concat replaces the removed DataFrame.append)
df_neg_all2 = pd.concat([df_negative_sb, df_negative_st, df_negative_sc])
print('done 5')

# pickle
pickl('training_vectors/22_train_new_negative_samples_fin', df_neg_all2)
print('pickled')
Example #9
    # print('we_nouns_vector', len(dbpedia_train_wh['we_nouns_vector'][a]))  # 300
    # print('entities_KGE_vector', len(dbpedia_train_wh['entities_KGE_vector_2'][a]))  # 200
    # print('we_type_vector', len(dbpedia_train_wh['we_type_vector'][a]))  # 300
    print('...')

# rebuild concatenated vectors
all_td2 = all_td.drop(['concatenated_vector', 'entities_KGE_vector'], axis=1)
all_td3 = all_td2.rename(
    columns={'entities_KGE_vector_2': 'entities_KGE_vector'})
all_td3['con_wh_nouns'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector']], axis=1)
all_td3['con_wh_kge'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector']], axis=1)
all_td3['con_nouns_KGE'] = all_td3.apply(
    lambda x: [x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
all_td3['con_wh_nouns_kge'] = all_td3.apply(
    lambda x:
    [x['we_wh_vector'], x['we_nouns_vector'], x['entities_KGE_vector']],
    axis=1)
all_td3['con_wh_kge_types'] = all_td3.apply(
    lambda x:
    [x['we_wh_vector'], x['entities_KGE_vector'], x['we_type_vector']],
    axis=1)
all_td3['concatenated_vector'] = all_td3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['we_type_vector']],
    axis=1)
pickl('training_vectors/31_all_td_fin', all_td3)
print('pickled')
# remove none questions - clean data (filter into a new list rather than
# removing items from the list while iterating over it)
dbpedia_test = [entry for entry in dbpedia_test
                if entry['question'] is not None]

"""PARSING AND EXTRACTION"""

re_list = []
for a in dbpedia_test:
    temp = find_w(a['question'])
    a.update({'wh': str(temp[0])})
    re_list.append(a)

print('done find wh')
dbpedia_test = re_list
pickl('testing_vectors/01_dbpedia_test', dbpedia_test)

re_list = []
for entry in dbpedia_test:
    question = entry['question']
    noun_list = nouns(question)
    entry.update({'noun list': noun_list})
    re_list.append(entry)

dbpedia_test = re_list
pickl('testing_vectors/02_dbpedia_test', dbpedia_test)
print('done nouns parsed')

re_list = []
for entry in dbpedia_test:
    question = entry['question']
    # assumed continuation (the rest of this snippet is missing from the
    # source): noun-phrase extraction via the hypothetical noun_phrases
    # helper, mirroring the training-data step
    entry.update({'np list': noun_phrases(question)})
    re_list.append(entry)

dbpedia_test = re_list

''' author: Eleanor Bill @eljne '''
''' requires splitting the original training data into training and test
sets: it is the only data with correct types + categories, so accuracy can
be measured against it '''
from kg.EB_classes import unpickle, pickl

training_data = unpickle('training_vectors/final_original_training_vectors')
training_data2 = training_data.sample(frac=0.8, random_state=1)
testing_data = training_data.drop(training_data2.index)

training_data3 = training_data2.reset_index(drop=True)
testing_data2 = testing_data.reset_index(drop=True)

pickl('training_vectors/final_original_training_vectors_minus_tests',
      training_data3)
pickl('testing_vectors/11_testing_vectors_from_og_training_data',
      testing_data2)
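
# random_sample_ratioed is not shown; from the call sites below the arguments
# look like (df, fraction, positive_ratio, negative_ratio). A hypothetical
# sketch of such a balanced split - every detail here is an assumption:
import pandas as pd

def random_sample_ratioed(df, frac, pos_ratio, neg_ratio, seed=1):
    # sample positives and negatives separately so the returned training
    # set keeps the requested pos/neg balance (labels assumed to be in "y")
    pos = df[df["y"] == 1].sample(frac=min(frac * pos_ratio, 1.0),
                                  random_state=seed)
    neg = df[df["y"] == 0].sample(frac=min(frac * neg_ratio, 1.0),
                                  random_state=seed)
    return pd.concat([pos, neg]).sample(frac=1, random_state=seed)  # shuffle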
# all categories (loop head inferred from the parallel "all types" loop
# below; the original opening lines are missing from the source)
for cat_label in categories_all_unique:  # 'categories_all_unique' is assumed
    copy_df = training_data.copy()
    print('category label', cat_label)
    copy_df["y"] = copy_df.apply(
        lambda row: label_polarity_all_typs(row, cat_label, 'category'),
        axis=1)  # label -/+ (assumed to mirror the type labelling below)
    train_set = random_sample_ratioed(
        copy_df, 0.80, 1, 1)  # split differently according to pos/neg balance
    X = train_set[vector_component_category]
    y = train_set["y"]
    print(X.shape)
    print(y.shape)
    classifier = train_classifier_category(
        X, y)  # need to convert vector from list of arrays to matrix
    classifiers_cat[cat_label] = classifier

# all types
for typ_label in types_all_unique:
    copy_df2 = training_data.copy()
    print('type label', typ_label)
    copy_df2["y"] = copy_df2.apply(
        lambda row: label_polarity_all_typs(row, typ_label, 'type'),
        axis=1)  # label -/+
    train_set = random_sample_ratioed(
        copy_df2, 0.80, 1, 1)  # split differently according to pos/neg balance
    X = train_set[vector_component_type]
    y = train_set["y"]
    classifier = train_classifier(
        X, y)  # need to convert vector from list of arrays to matrix
    classifiers_typ[typ_label] = classifier

print('classifiers_cat', classifiers_cat)
pickl(file_path_cat, classifiers_cat)

print('classifiers_typ', classifiers_typ)
pickl(file_path_typ, classifiers_typ)
def concatenate_vector(entry):
    # function head inferred from the apply() call below: collect the four
    # component vectors for one row
    cv = [
        entry['we_wh_vector'], entry['we_nouns_vector'],
        entry['entities_KGE_vector_2'], entry['we_type_vector']
    ]
    return cv


dbpedia_test['concatenated_vector'] = dbpedia_test.apply(concatenate_vector,
                                                         axis=1)
dbpedia_test2 = dbpedia_test.drop(['entities_KGE_vector'], axis=1)
dbpedia_test3 = dbpedia_test2.rename(
    columns={'entities_KGE_vector_2': 'entities_KGE_vector'})
dbpedia_test3['con_wh_nouns'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector']], axis=1)
dbpedia_test3['con_wh_kge'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['entities_KGE_vector']], axis=1)
dbpedia_test3['con_nouns_KGE'] = dbpedia_test3.apply(
    lambda x: [x['we_nouns_vector'], x['entities_KGE_vector']], axis=1)
dbpedia_test3['con_wh_nouns_kge'] = dbpedia_test3.apply(
    lambda x:
    [x['we_wh_vector'], x['we_nouns_vector'], x['entities_KGE_vector']],
    axis=1)
dbpedia_test3['con_wh_kge_types'] = dbpedia_test3.apply(
    lambda x:
    [x['we_wh_vector'], x['entities_KGE_vector'], x['we_type_vector']],
    axis=1)
dbpedia_test3['concatenated_vector'] = dbpedia_test3.apply(
    lambda x: [x['we_wh_vector'], x['we_nouns_vector'],
               x['entities_KGE_vector'], x['we_type_vector']],
    axis=1)
pickl('testing_vectors/10_dbpedia_test_fin', dbpedia_test3)