Example 1
''' author: Eleanor Bill @eljne '''
''' create vectors for additional training data - +ve - CONTINUED'''

from kg.EB_classes import unpickle, pickl
from kg.EB_classes import cal_average, find_vector_kge
from gensim.models import KeyedVectors
import numpy as np

new_positive_samples = unpickle('training_vectors/12_train_new_positive_samples')
print('unpickled')
# new_positive_samples = pd.DataFrame(new_positive_samples)

new_positive_samples['new nps2'] = new_positive_samples['np list'] + new_positive_samples['additional np list']
new_positive_samples['new nouns'] = new_positive_samples['noun list'] + new_positive_samples['additional noun list']
print('new fields created')

# get pre-trained word embeddings
fastTextfile = 'data/wiki-news-300d-1M.vec'
loaded_model = KeyedVectors.load_word2vec_format(fastTextfile)
print('model loaded')


# find the word embedding vector for a word or phrase
def find_vector_we(word_or_phrase):
    try:
        vector = loaded_model.word_vec(word_or_phrase)
    except KeyError:  # out of vocabulary: return a length-1 sentinel, fixed up later
        vector = np.zeros(1)
    print('.')  # progress marker
    return vector
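
# --- sketch (not in the original file): how these per-word vectors are
# plausibly combined, assuming cal_average (imported above) averages a list
# of vectors; the helper name embed_noun_list is hypothetical ---
def embed_noun_list(nouns):
    vectors = [find_vector_we(n) for n in nouns]
    vectors = [v for v in vectors if v.shape == (300,)]  # drop OOV sentinels
    if not vectors:
        return np.zeros(300)  # same zero convention as find_vector_we
    return np.mean(vectors, axis=0)  # same idea as cal_average

# e.g. new_positive_samples['we_nouns_vector'] = new_positive_samples['new nouns'].apply(embed_noun_list)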
Example 2

''' author: Eleanor Bill @eljne '''
''' concatenate vectors and export to dataframe/csv '''

import pandas as pd
import numpy as np
from kg.EB_classes import unpickle, pickl

dbpedia_train_wh = unpickle('training_vectors/09_dbpedia_train_wh')

'''
Layout of the concatenated feature vector:
we_wh_vector - first position: could hold the embedding of the wh question word (we can create our own embedding/encoding).
we_nouns_vector - second position: the word embedding of the sentence or of the set of nouns.
entities_KGE_vector - third position (up to 3 or 4 vector positions): noun phrases with a good correspondence in the KG (the KGE of the entity representing the noun phrase).
we_type_vector - fourth position (up to 3 or 4 vector positions): the word embeddings of the types of the KG entities above.
'''
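
# --- sketch (not in the original file) of the concatenation this layout
# implies, assuming each component column holds a 300-d array; the column
# names above and 'concatenated_vector' appear elsewhere in this collection ---
def concatenate_components(row):
    return np.concatenate([row['we_wh_vector'], row['we_nouns_vector'],
                           row['entities_KGE_vector'], row['we_type_vector']])

# dbpedia_train_wh['concatenated_vector'] = dbpedia_train_wh.apply(concatenate_components, axis=1)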

dbpedia_train_wh = pd.DataFrame(dbpedia_train_wh)
dbpedia_train_wh = dbpedia_train_wh.fillna(0)
dbpedia_train_wh['entities_KGE_vector_2'] = dbpedia_train_wh['we_wh_vector'].copy()

pd.set_option('mode.chained_assignment', None)
# make sure all the same length (if returned zeros, replace with array of zeroes that is correct length)
for a in range(0, len(dbpedia_train_wh)):
    try:
        if len(dbpedia_train_wh['we_wh_vector'][a]) == 1:  # length-1 OOV sentinel
            dbpedia_train_wh['we_wh_vector'][a] = np.zeros(300)
    except TypeError:  # cell holds a scalar rather than an array
        try:
            if dbpedia_train_wh['we_wh_vector'][a] == 0:
                dbpedia_train_wh['we_wh_vector'][a] = np.zeros(300)
        except Exception:
            print('1', dbpedia_train_wh['we_wh_vector'][a])
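
# --- possible refactor (not the author's code): the same length-fix pattern
# recurs in several files below and could be expressed once ---
def ensure_300d(cell):
    # coerce a cell to a 300-d array, replacing scalars and length-1 sentinels
    arr = np.asarray(cell)
    return arr if arr.shape == (300,) else np.zeros(300)

# dbpedia_train_wh['we_wh_vector'] = dbpedia_train_wh['we_wh_vector'].apply(ensure_300d)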
Example 3
''' author: Eleanor Bill @eljne '''
''' augment positive samples to create more positive samples '''
''' about 20 mins '''

import pandas as pd
from kg.EB_classes import pickl, unpickle, get_last
from matching.kg_matching import Endpoint
from kg.endpoints import DBpediaEndpoint
from ontology.onto_access import DBpediaOntology

# unpickle
# load = unpickle('training_vectors/final_original_training_vectors')  # when we have training data from the task to evaluate
load = unpickle('training_vectors/final_original_training_vectors_minus_tests')  # created our own testing data by splitting train
df_positive = pd.DataFrame(load)
df_positive['polarity'] = "1"
'''create more positive samples
do this by:
- getting a different but similar entity using SPARQLEndpoint
'''

onto_access = DBpediaOntology()
onto_access.loadOntology(True)
ep = DBpediaEndpoint()  # getEntitiesForType


def get_alt_entities(entity_types):
    lis = []
    for ls in entity_types:
        # print('ls', ls)
        enty = get_last(ls)  # only keep the finest-grained type
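        # hypothetical continuation (the file is cut off here): fetch other
        # entities sharing this type; getEntitiesForType is named in the
        # comment above, but its exact signature is an assumption
        try:
            lis.append(ep.getEntitiesForType(enty))
        except Exception:
            lis.append([])  # keep positions aligned if a lookup fails
    return lis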
Example 4

''' author: Eleanor Bill @eljne '''
''' create vectors for negative samples '''

from kg.EB_classes import pickl, unpickle
from kg.EB_classes import cal_average
from gensim.models import KeyedVectors
import numpy as np

# negative samples: shuffled types and categories affect the type and category
# embeddings; sibling types affect the type vectors
# 3 more sets of -ve data:

neg = unpickle('training_vectors/21_train_new_negative_samples')

df_negative_st = neg.copy()  # - shuffled type - change __ embeddings
df_negative_sc = neg.copy()  # - shuffled category - change __ embeddings
df_negative_sb = neg.copy()  # - sibling type - change __ embeddings
print('done copying')

# how do the new samples affect the vector?
fastTextfile = 'data/wiki-news-300d-1M.vec'
loaded_model = KeyedVectors.load_word2vec_format(fastTextfile)


# find the word embedding vector for a word or phrase
def find_vector_we(word_or_phrase):
    try:
        vector = loaded_model.word_vec(word_or_phrase)
    except KeyError:  # out of vocabulary: return a length-1 sentinel, fixed up later
        vector = np.zeros(1)
    return vector
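
# --- sketch (not in the original file) of the first strategy above, assuming
# the unpickled samples form a DataFrame with 'type' and 'polarity' columns:
# permuting 'type' pairs each question with another row's type, producing a
# plausible-but-wrong negative sample ---
df_negative_st['type'] = (
    df_negative_st['type'].sample(frac=1, random_state=1).reset_index(drop=True)
)
df_negative_st['polarity'] = "0"  # negative label, mirroring polarity = "1" for positives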
Example 5
''' author: Eleanor Bill @eljne '''
''' reformats results to be run through evaluation script'''

from kg.EB_classes import unpickle, heuristics_2
import re
import pandas as pd
import json

results_concat = unpickle('results/finals/results_OGTD_concat')
results_KGE = unpickle('results/finals/results_OGTD_KGE')

# results = unpickle('results/results_ALLTD')
''' REFORMATTING FOR EVAL SCRIPT '''
''' export results to be used in evaluation script '''
'''    - `system_output_json` is a JSON file with the (participating) system's
      category and type predictions. The format is a list of dictionaries with
      keys `id`, `category`, and `type`, holding the question ID, predicted
      category, and ranked list of up to 10 types, respectively.'''


def get_first(value):
    # take the top-ranked category from the stringified (category, score) tuples
    val = value['category_scores'].split(',')[0]
    # re.sub returns a new string, so its result must be assigned
    val = re.sub('[^A-Za-z0-9]+', '', val)
    return str(val)


def get_first_list(value):
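    # hypothetical completion (the file is cut off here), guided by the
    # docstring above: keep the top 10 type names from the stringified
    # (type, score) tuples; the 'type_scores' key mirrors 'category_scores'
    # above but is an assumption
    vals = value['type_scores'].split('),')
    types = [re.sub('[^A-Za-z0-9:_]+', '', v.split(',')[0]) for v in vals]
    return types[:10]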
Example 6
# imports needed by this excerpt (the file is cut off above this point)
from kg.EB_classes import unpickle
import pandas as pd

'''change depending on vector component to test'''
vector_component_category = 'concatenated_vector'
vector_component_type = 'concatenated_vector'
# we_wh_vector
# we_nouns_vector
# entities_KGE_vector
# we_type_vector
# con_wh_nouns
# con_wh_kge
# con_nouns_KGE
# con_wh_nouns_kge
# con_wh_kge_types
# concatenated_vector
'''unpickle classifiers'''
# use classifiers trained on all training data
classifiers_cat = unpickle('classifiers/classifiers_all_cat_ALL')
classifiers_typ = unpickle('classifiers/classifiers_all_typ_ALL')
results_path = 'results/results_ALLTD'

# use classifiers trained on original training data
# classifiers_cat = unpickle('classifiers/classifiers_all_cat_OGTD')
# classifiers_typ = unpickle('classifiers/classifiers_all_typ_OGTD')
# results_path = 'results/results_OGTD'
'''load test data vectors'''
# dbpedia_test_final = unpickle('testing_vectors/10_dbpedia_test_fin')    # when using provided by task
dbpedia_test_final = unpickle(
    'testing_vectors/11_testing_vectors_from_og_training_data'
)  # when using og training
test_data = pd.DataFrame(dbpedia_test_final)  # data split

# test_data['con_wh_nouns_2'] = test_data.apply(reformat_2, axis=1)
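
# --- sketch (not in the original fragment) of the prediction step that
# presumably follows, assuming the unpickled classifiers are an iterable of
# fitted scikit-learn estimators and each vector column holds fixed-length
# arrays ---
import numpy as np

X_cat = np.stack(test_data[vector_component_category].to_numpy())
for clf in classifiers_cat:
    category_predictions = clf.predict(X_cat)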
Example 7
''' author: Eleanor Bill @eljne '''
''' reformat test data to evaluate '''

from kg.EB_classes import unpickle, replace_Location_2
import json
import pandas as pd

test_truth = unpickle('testing_vectors/11_testing_vectors_from_og_training_data')
test_truth = pd.DataFrame(test_truth)

test_truth['type2'] = test_truth['type'].apply(replace_Location_2)
test_truth2 = test_truth.drop(['type'], axis=1)
test_truth3 = test_truth2.rename(columns={'type2': 'type'})
test_truth3 = test_truth3[['question', 'category', 'type', 'id']]
test_truth_json = []


def reform(value):
    # avoid shadowing the built-in dict
    record = {"id": value['id'], "category": value['category'],
              "type": value['type'], "question": value['question']}
    print(record)
    test_truth_json.append(record)
    return 0


test_truth3.apply(reform, axis=1)
json_string = json.dumps(test_truth_json)
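
# --- sketch (not in the original fragment): json_string presumably gets
# written to disk next; the output path is an assumption ---
with open('testing_vectors/test_truth.json', 'w') as f:
    f.write(json_string)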
Example 8
''' author: Eleanor Bill @eljne '''
''' join all data - lots of renaming and dropping fields'''
'''takes about 20 minutes'''

from kg.EB_classes import pickl, unpickle
import numpy as np
import pandas as pd

all_td = unpickle('training_vectors/30_all_td')
all_td = pd.DataFrame(all_td).reset_index()
all_td = all_td.fillna(0)
all_td['entities_KGE_vector_2'] = all_td['we_wh_vector'].copy()

pd.set_option('mode.chained_assignment', None)
# make sure all the same length (if returned zeros, replace with array of zeroes that is correct length)
for a in range(0, len(all_td)):
    try:
        if len(all_td['we_wh_vector'][a]) == 1:  # length-1 OOV sentinel
            all_td['we_wh_vector'][a] = np.zeros(300)
    except TypeError:  # cell holds a scalar rather than an array
        try:
            if all_td['we_wh_vector'][a] == 0:
                all_td['we_wh_vector'][a] = np.zeros(300)
        except Exception:
            print('1', all_td['we_wh_vector'][a])

    try:
        if len(all_td['we_nouns_vector'][a]) == 1:
            all_td['we_nouns_vector'][a] = np.zeros(300)
    except TypeError:
        try:
            if all_td['we_nouns_vector'][a] == 0:
                all_td['we_nouns_vector'][a] = np.zeros(300)
        except Exception:
            print('2', all_td['we_nouns_vector'][a])
Example 9

''' author: Eleanor Bill @eljne '''
''' requires splitting the original training data into training and test sets, as it is
the only data we have with correct types + categories, so accuracy can be measured '''
from kg.EB_classes import unpickle, pickl

training_data = unpickle('training_vectors/final_original_training_vectors')
training_data2 = training_data.sample(frac=0.8, random_state=1)
testing_data = training_data.drop(training_data2.index)

training_data3 = training_data2.reset_index(drop=True)
testing_data2 = testing_data.reset_index(drop=True)

pickl('training_vectors/final_original_training_vectors_minus_tests',
      training_data3)
pickl('testing_vectors/11_testing_vectors_from_og_training_data',
      testing_data2)
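
# --- quick sanity check (not in the original file): the two halves should
# partition the original rows ---
assert len(training_data3) + len(testing_data2) == len(training_data)
print(len(training_data3), 'training rows /', len(testing_data2), 'testing rows')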
Example 10

# imports needed by this excerpt (the file is cut off above this point)
from kg.EB_classes import unpickle
import pandas as pd

vector_component_type = 'concatenated_vector'
# we_wh_vector
# we_nouns_vector
# entities_KGE_vector
# we_type_vector
# con_wh_nouns
# con_wh_kge
# con_nouns_KGE
# con_wh_nouns_kge
# con_wh_kge_types
# concatenated_vector
'''load training data'''
# use all training data
file_path_cat = 'classifiers/classifiers_all_cat_ALL'
file_path_typ = 'classifiers/classifiers_all_typ_ALL'
all_td = unpickle('training_vectors/31_all_td_fin')  # use all training data
td = pd.DataFrame(all_td)

# use only original training data
# file_path_cat = 'classifiers/classifiers_all_cat_OGTD'
# file_path_typ = 'classifiers/classifiers_all_typ_OGTD'
# og_td = unpickle('training_vectors/final_original_training_vectors')
# td = pd.DataFrame(og_td)
# td['polarity'] = "1"

# td['con_wh_nouns_2'] = td.apply(reformat_2, axis=1)
# td2 = td.drop(['con_wh_nouns'], axis=1)
# td = td2.rename(columns={'con_wh_nouns_2': 'con_wh_nouns'})

# td['con_wh_kge_2'] = td.apply(reformat_3, axis=1)
# td2 = td.drop(['con_wh_kge'], axis=1)
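
# --- sketch (not in the original fragment) of the fitting step this file
# builds towards; the model choice and label column are assumptions ---
import numpy as np
from sklearn.neighbors import KNeighborsClassifier

X = np.stack(td[vector_component_type].to_numpy())
y = td['category']  # assumed label column
clf = KNeighborsClassifier().fit(X, y)
# pickl(file_path_cat, clf)  # persist with the project's pickle helper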
Example 11

''' author: Eleanor Bill @eljne '''
''' create vectors for test data '''
import numpy as np
import pandas as pd
from kg.EB_classes import pickl, unpickle

dbpedia_test = unpickle('testing_vectors/09_dbpedia_test')

# check for arrays length 1

dbpedia_test = pd.DataFrame(dbpedia_test)
dbpedia_test = dbpedia_test.fillna(0)
dbpedia_test['entities_KGE_vector_2'] = dbpedia_test['we_wh_vector'].copy()

pd.set_option('mode.chained_assignment', None)
# make sure all the same length (if returned zeros, replace with array of zeroes that is correct length)
for a in range(0, len(dbpedia_test)):
    try:
        if len(dbpedia_test['we_wh_vector'][a]) == 1:  # length-1 OOV sentinel
            dbpedia_test['we_wh_vector'][a] = np.zeros(300)
    except TypeError:  # cell holds a scalar rather than an array
        try:
            if dbpedia_test['we_wh_vector'][a] == 0:
                dbpedia_test['we_wh_vector'][a] = np.zeros(300)
        except Exception:
            print('1', dbpedia_test['we_wh_vector'][a])

    try:
        if len(dbpedia_test['we_nouns_vector'][a]) == 1:
            dbpedia_test['we_nouns_vector'][a] = np.zeros(300)
    except TypeError:
        try:
            if dbpedia_test['we_nouns_vector'][a] == 0:
                dbpedia_test['we_nouns_vector'][a] = np.zeros(300)
        except Exception:
            print('2', dbpedia_test['we_nouns_vector'][a])
Example 12
''' author: Eleanor Bill @eljne '''
''' reformats results to be run through evaluation script'''

from kg.EB_classes import unpickle, heuristics_2
import re
import json

# results = unpickle('results/results_OGTD')

results = unpickle('results/results_ALLTD')

''' REFORMATTING FOR EVAL SCRIPT '''
''' export results to be used in evaluation script '''

'''    - `system_output_json` is a JSON file with the (participating) system's
      category and type predictions. The format is a list of dictionaries with
      keys `id`, `category`, and `type`, holding the question ID, predicted
      category, and ranked list of up to 10 types, respectively.'''


def get_first(value):
    # take the top-ranked category from the stringified (category, score) tuples
    val = value['category_scores'].split(',')[0]
    # re.sub returns a new string, so its result must be assigned
    val = re.sub('[^A-Za-z0-9]+', '', val)
    return str(val)


def get_first_list(value):