Example 1
def get_name_fome_id_mysql(mid, table_name):
    db = MySQL(ip='10.61.2.166', port=3306, user='******',pw='zengyutao', db_name='wikidata')
    query = "select name from %s where mid = '%s' " % (table_name, mid)
    print(query)
    name = db.search(query)
    if name is not None and len(name) >= 1:
        return name[0]
    return None
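The MySQL helper constructed in every example is not shown. Below is a minimal sketch, assuming a thin wrapper over pymysql whose search() returns the first result row as a tuple (or None when nothing matches); the connect attribute matches the db_conn.connect.close() call in Example 8. The details are assumptions, not the original class.

import pymysql

class MySQL(object):
    """Hypothetical sketch of the wrapper assumed by these examples."""
    def __init__(self, ip, port, user, pw, db_name):
        # pymysql's connect() takes `password` and `database` keywords
        self.connect = pymysql.connect(host=ip, port=port, user=user,
                                       password=pw, database=db_name)

    def search(self, query):
        # return the first matching row as a tuple of columns, or None
        with self.connect.cursor() as cursor:
            cursor.execute(query)
            return cursor.fetchone()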
Example 2
def get_single_relation_tensor(data, max_cand, word_dict, i, triples, relation_dict):
    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                       pw='zengyutao', db_name='wikidata')
    question = data.loc[data['qid'] == i, 'question'].iloc[0]
    seq_len = len(word_tokenize(question))
    rel_tensor = {}
    cand_rels = get_cand_rel(triples, [max_cand])
    cand_tw = []
    cand_te = []
    cand_ti = []
    for rel in cand_rels:
        cand_ti.append(Variable(torch.LongTensor([relation_dict[rel]]).cuda()))
        rel_re = rel.replace('/', ' ').replace('_', ' ')
        print (word_tokenize(rel_re))
        rel_len = len(word_tokenize(rel_re))
        r_tw = line_to_word_tensor(rel_re, word_dict, rel_len)
        cand_tw.append(Variable(r_tw))
        rel_vec = get_relation_vector(db_conn, rel)
        rel_vec = [float(x) for x in str(rel_vec).split(',')]
        #r_te = Variable(torch.Tensor([rel_vec])).cuda()
        r_tensor = torch.rand(1, r_tw.size(1), len(rel_vec))
        for x in range(r_tw.size(1)):
            r_tensor[0][x] = torch.Tensor(rel_vec)
        cand_te.append(Variable(r_tensor.cuda()))
        #cand_te.append(r_te)

    gold_rel = data.loc[data['qid'] == i, 'relation'].iloc[0]

    rel_tensor['candrw_tensor'] = cand_tw
    rel_tensor['candre_tensor'] = cand_te
    rel_tensor['candri_tensor'] = cand_ti
    rel_tensor['golden_relation'] = gold_rel
    rel_tensor['cand_rels'] = cand_rels
    return rel_tensor
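get_relation_vector is not defined in these examples; from the way its result is parsed (str(rel_vec).split(',')), it is assumed to return a comma-separated embedding string stored in the database. A sketch under that assumption (the table and column names here are hypothetical):

def get_relation_vector(db_conn, relation):
    # hypothetical table/column names; the stored value is assumed to be
    # a comma-separated embedding string such as "0.12,-0.03,..."
    query = "select vector from relation2vec where relation = '%s' " % relation
    row = db_conn.search(query)
    if row is not None and len(row) >= 1:
        return row[0]
    return None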
Example 3
def get_type(df):
    db_conn = MySQL(ip='10.61.2.166',
                    port=3306,
                    user='******',
                    pw='zengyutao',
                    db_name='wikidata')
    df['type'] = df.apply(lambda x: mid_type(db_conn, x['subject_id']), axis=1)
    df['type_name'] = df.apply(
        lambda x: get_mid_to_name_mysql(db_conn, x['type']), axis=1)
    return df
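Example 3 calls module-level helpers mid_type and get_mid_to_name_mysql, whose bodies are not shown here. Example 8 contains method versions with the exact queries, so free-function equivalents can be sketched (get_mid_type in Examples 9 and 10 is presumably the same lookup under another name):

def mid_type(db_conn, mid):
    # mirrors DataReader.get_mid_to_type_mysql in Example 8
    row = db_conn.search("select notable_type from mid2type where mid = '%s' " % mid)
    if row is not None and len(row) >= 1:
        return row[0]
    return None

def get_mid_to_name_mysql(db_conn, mid):
    # mirrors DataReader.get_mid_to_name_mysql in Example 8
    row = db_conn.search("select name from mid2name where mid = '%s' " % mid)
    if row is not None and len(row) >= 1:
        return row[0]
    return None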
Example 4
def load_relation_embedding(relation_dict, dim):
    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                       pw='zengyutao', db_name='wikidata')
    embedding = torch.randn(len(relation_dict), dim)

    for e in relation_dict:
        vec = get_relation_vector(db_conn, e)
        vec_s = [float(x) for x in str(vec).split(',')]
        embedding[relation_dict[e]] = torch.Tensor(vec_s)
    return embedding.cuda()  # embedding is already a Tensor; no need to re-wrap
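A usage sketch for load_relation_embedding. The relation strings are illustrative; dim=50 matches the 50 vec_* columns built in Example 6, and .cuda() requires a CUDA-capable GPU:

# illustrative relation_dict mapping relation strings to row indices
relation_dict = {'/people/person/place_of_birth': 0,
                 '/film/film/directed_by': 1}
embedding = load_relation_embedding(relation_dict, dim=50)
print(embedding.size())  # torch.Size([2, 50])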
Example 5
def __init__(self, mid_name_file=None,
             mid_qid_file=None,
             topic_words_file=None,
             sq_data_file=None):
    self.sq_dataset = pd.DataFrame()
    self.subject_ids = {}
    self.relations = {}
    self.object_ids = {}
    self.questions = {}
    self.subject_names = {}
    self.object_names = {}
    self.fb_dict = FbDictionary()
    self.fb_entities = {}
    self.fb_relations = set()
    self.word_dict = Dictionary()
    self.mid_name_file = "../datas/mid2name.tsv" if mid_name_file is None else mid_name_file
    self.mid_qid_file = "../datas/fb2w.nt" if mid_qid_file is None else mid_qid_file
    self.topic_words_file = topic_words_file
    self.sq_data_file = sq_data_file
    #self.tp_replace_file = tp_replace_file
    # use the resolved path so the default file is read when the argument is None
    self.mid_name_dict, self.name_mid_dict = self.mid_name_convert(self.mid_name_file)
    self.db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                         pw='zengyutao', db_name='wikidata')
Example 6
def feature_select(sq_datas):
    feature_columns = ['lcs_pq', 'lcs_pe', 'lcw_pq', 'lcw_pe', 'word_score']
    sq_datas['question'] = sq_datas['question'].fillna(value='none')
    sq_datas['topic_words_names'] = sq_datas['topic_words_names'].fillna(value='none')
    sq_datas['lcs_pq'] = sq_datas.apply(get_lcs_pq, axis=1)
    sq_datas['lcs_pe'] = sq_datas.apply(get_lcs_pe, axis=1)
    sq_datas['lcw_pq'] = sq_datas.apply(get_lcw_pq, axis=1)
    sq_datas['lcw_pe'] = sq_datas.apply(get_lcw_pe, axis=1)
    # sq_datas['word_score'] = sq_datas['word_score'].replace(to_replace="[a-zA-Z]", value='', regex=True, inplace=True)
    # sq_datas['word_score'] = sq_datas['word_score'].apply(pd.to_numeric, errors='coerce')


    # sq_datas['qid'] = sq_datas.index
    # sq_datas['token_question'] = sq_datas['question'].apply(lambda x: word_tokenize(x))
    # tf = sq_datas.token_question.apply(pd.value_counts).fillna(0)
    # idf = np.log((len(sq_datas) + 1) / (tf.gt(0).sum() + 1))
    # tf_idf = tf * idf
    # sq_datas['tf_idf'] = sq_datas.apply(lambda x: get_tf_idf(x.topic_words_names,
    #                                                          x.qid,
    #                                                          tf_idf), axis=1)

    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                  pw='zengyutao', db_name='wikidata')

    sq_datas['vector'] = sq_datas.apply(lambda x: get_entity_vector(db_conn, x['topic_words']), axis=1)
    vec_cols = []
    for i in range(50):
        feature_columns.append("vec_" + str(i))
        vec_cols.append("vec_" + str(i))
    # print (sq_datas)
    print (len(sq_datas))
    #sq_datas[vec_cols] = sq_datas.dropna(axis=0, how='any')

    print (len(sq_datas))
    sq_datas[vec_cols] = pd.DataFrame(sq_datas['vector'].str.split(',').values.tolist())
    sq_datas[vec_cols] = sq_datas[vec_cols].astype(float)
    sq_datas = sq_datas.fillna(value=0)

    #sq_datas['type'] = sq_datas.apply(lambda x: mid_type(db_conn, x['topic_words']), axis=1)
    #dummy = pd.get_dummies(sq_datas['type'], prefix='type')
    #sq_datas = pd.concat([sq_datas, dummy], axis=1)
    #for col in dummy.columns:
    #    feature_columns.append(col)
    print (sq_datas[0: 50])
    #sq_datas = sq_datas.sample(frac=0.6)
    return sq_datas[feature_columns], sq_datas['label']
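The feature helpers get_lcs_pq, get_lcs_pe, get_lcw_pq, and get_lcw_pe are not shown. As an illustration only, here is one plausible reading of lcs_pq: the longest common substring between a candidate topic-word name and the question (the real features may be defined differently):

from difflib import SequenceMatcher

def get_lcs_pq(row):
    # hypothetical: longest common substring between any candidate
    # topic-word name and the question, applied row-wise with axis=1
    question = str(row['question']).lower()
    names = row['topic_words_names']
    if not isinstance(names, list):
        names = [names]
    best = 0
    for name in names:
        name = str(name or '').lower()
        m = SequenceMatcher(None, name, question)
        match = m.find_longest_match(0, len(name), 0, len(question))
        best = max(best, match.size)
    return best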
Example 7
def prepair_test_data(data):
    data = data.sort_values(['qid', 'predict'], ascending=[True, False])
    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                       pw='zengyutao', db_name='wikidata')
    test = {'question':[], 'cand_ents':[], 'pos':[], 'entities_vec':[],
            'predict':[], 'cand_vecs': []}
    qid_max = max(data['qid'].tolist())
    for i in range(qid_max + 1):  # qids start at 0, so include the last one
        question = data.loc[data['qid'] == i, 'question'].iloc[0]
        test['question'].append(question)

        pos = data.loc[data['qid'] == i, 'pos'].iloc[0]
        test['pos'].append(pos)

        predict = data.loc[data['qid'] == i, 'predict']
        test['predict'].append(predict.tolist())

        cand_mid = data.loc[data['qid'] == i, 'topic_words']
        test['cand_ents'].append(cand_mid.tolist())  # key must match the dict defined above

        vectors = []
        cand_vec_dict = {}
        for cand in cand_mid:
            #print cand
            vec = get_entity_vector(db_conn, cand)
            #print vec
            vec = [float(x) for x in str(vec).split(',')]
            vectors.append(vec)
            cand_vec_dict[cand] = vec

        entities_vec = np.matmul(np.array(predict), np.array(vectors))
        test['entities_vec'].append(entities_vec)

        cand_vec = [cand_vec_dict[c] for c in cand_mid]
        test['cand_vecs'].append(cand_vec)
    test_data = pd.DataFrame()
    for key in test:
        test_data[key] = test.get(key)
    test_data['qid'] = test_data.index
    return test_data
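get_entity_vector is the entity counterpart of get_relation_vector: its result is parsed the same way, so it is assumed to return a comma-separated embedding string for a Freebase mid. A sketch under that assumption (table and column names are hypothetical):

def get_entity_vector(db_conn, mid):
    # hypothetical table/column names, mirroring get_relation_vector
    query = "select vector from entity2vec where mid = '%s' " % mid
    row = db_conn.search(query)
    if row is not None and len(row) >= 1:
        return row[0]
    return None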
Example 8
class DataReader(object):
    def __init__(self, mid_name_file=None,
                 mid_qid_file=None,
                 topic_words_file=None,
                 sq_data_file=None):
        self.sq_dataset = pd.DataFrame()
        self.subject_ids = {}
        self.relations = {}
        self.object_ids = {}
        self.questions = {}
        self.subject_names = {}
        self.object_names = {}
        self.fb_dict = FbDictionary()
        self.fb_entities = {}
        self.fb_relations = set()
        self.word_dict = Dictionary()
        self.mid_name_file = "../datas/mid2name.tsv" if mid_name_file is None else mid_name_file
        self.mid_qid_file = "../datas/fb2w.nt" if mid_qid_file is None else mid_qid_file
        self.topic_words_file = topic_words_file
        self.sq_data_file = sq_data_file
        #self.tp_replace_file = tp_replace_file
        # use the resolved path so the default file is read when the argument is None
        self.mid_name_dict, self.name_mid_dict = self.mid_name_convert(self.mid_name_file)
        self.db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                  pw='zengyutao', db_name='wikidata')

    def read_sq_data(self):
        cur_idx = 0
        datas = open(self.sq_data_file, 'r')

        for line in datas.readlines():
            line = line.strip('\n').split('\t')  # drop the trailing newline from the question field
            assert len(line) == 4, "invalid input format " + str(cur_idx)
            sub_id, rel, obj_id, question = line[0:4]
            self.subject_ids[cur_idx] = sub_id.replace("www.freebase.com","")
            self.relations[cur_idx] = rel
            self.object_ids[cur_idx] = obj_id.replace("www.freebase.com", "")
            self.questions[cur_idx] = question
            # mid_name_dict = self.mid_name_convert('../datas/mid2name.tsv')[0]
            self.subject_names[cur_idx] = self.get_mid_to_name_mysql(sub_id.replace("www.freebase.com", ""))
            self.object_names[cur_idx] = self.get_mid_to_name_mysql(obj_id.replace("www.freebase.com", ""))
            cur_idx += 1

        self.sq_dataset['subject_ids'] = self.subject_ids.values()
        self.sq_dataset['relations'] = self.relations.values()
        self.sq_dataset['object_ids'] = self.object_ids.values()
        self.sq_dataset['questions'] = self.questions.values()
        self.sq_dataset['subject_names'] = self.subject_names.values()
        self.sq_dataset['object_names'] = self.object_names.values()
        self.sq_dataset['qid'] = self.sq_dataset.index
        datas.close()
        print ("read sq_data done!")

    def read_sq_data_pd(self):
        print ("read data start")
        datas = pd.read_csv(self.sq_data_file, header=None, sep='\t', skip_blank_lines=False)
        datas.columns = ['subject_id', 'relation', 'object_id', 'question']
        for c in ['subject_id', 'relation', 'object_id']:
            datas[c] = datas[c].apply(lambda x: x.replace("www.freebase.com", ""))
        datas['subject_name'] = datas['subject_id'].apply(lambda x: self.get_mid_to_name_mysql(x))
        datas['object_name'] = datas['object_id'].apply(lambda x: self.get_mid_to_name_mysql(x))
        self.sq_dataset = datas
        self.sq_dataset['qid'] = self.sq_dataset.index
        #print self.sq_dataset[len(self.sq_dataset) - 10:]
        print ("load sq data with df done!")

    def load_topic_word_pos(self, sq_datas):
        #data = pd.read_csv(self.tp_replace_file, header=None, sep='\t')
        #data.columns = ['sid', 'canid', 'qr']
        #new_data = pd.concat([sq_datas, data], axis=1)
        sq_datas['pos'] = sq_datas.apply(lambda x: self.get_topic_word_pos_mid(x['question'],
                                                                           x['subject_name'],
                                                                           x['object_name']),
                                         axis=1)
        #print (sq_datas['pos'])
        return sq_datas
    @staticmethod
    def get_topic_word_pos_mid(question, sub_name, obj_name):
        idxs = []

        if question is None or sub_name is None or obj_name is None:
            return idxs
        question = question.lower()
        sub_name = sub_name.lower()
        obj_name = obj_name.lower()
        part_in = False
        #print question + "," + sub_name + ", " + obj_name
        if sub_name in question:
            golden_word = sub_name
        elif obj_name in question:
            golden_word = obj_name
        else:
            for word in question.split():  # compare word by word, not character by character
                if word in sub_name:
                    golden_word = sub_name
                    part_in = True
                    break
                elif word in obj_name:
                    golden_word = obj_name
                    part_in = True
                    break
                else:
                    #print question + ', ' + sub_name + ',' + obj_name
                    return idxs
        question = word_tokenize(question)
        golden_word = word_tokenize(golden_word)

        gl = len(golden_word)
        if part_in:
            #print question, golden_word
            cross = [w for w in golden_word if w in question]
            #obj_cross = [w for w in obj_name if w in question]
            #print len(sub_cross), len(obj_cross)
            #print len(cross)
            if len(cross) == 0:
                return idxs
            else:
                for i, word in enumerate(question):
                    if word in cross:
                        idxs.append(i)
            return idxs
        if gl == 1:
            #print golden_word
            for i, word in enumerate(question):
                if word == golden_word[0]:
                    idxs.append(i)
                    #print idxs
                    return idxs
        for i, word in enumerate(question):
            if i + 1 < len(question) and word == golden_word[0] and question[i + 1] == golden_word[1]:
                idx = i
                for inc in range(0, gl):
                    pos = idx + inc
                    idxs.append(pos)
                return idxs
        #print (question + "," + sub_name + ", " + obj_name)
        print ('=========================')
        return idxs

    @staticmethod
    def get_topic_word_pos(question_str, marked_str):
        s = question_str.strip().split(' ')
        s1 = marked_str.strip().split(' ')
        leng = max(0, len(s) - len(s1))
        #print (leng)
        idxs = []
        for idx, w in enumerate(s1):
            if len(w) == 0 or len(w) == 1:
                continue
            if w[0] == '#' and w[len(w)-1] == '#':
                for i in range(0, leng+1):
                    idxs.append(idx + i)
                #print (idxs)
                return idxs
        return idxs

    @staticmethod
    def mid_name_convert(mid_name_file):
        mid_name = pd.read_csv(mid_name_file, sep='\t', header=None)
        mid_name.columns = ['mid', 'name']
        # print (mid_name[0:10])
        mid_name_dict = dict(zip(mid_name.mid, mid_name.name))
        # print ({k: mid_name_dict[k] for k in list(mid_name_dict)[:20]})
        name_mid_dict = dict(zip(mid_name.name, mid_name.mid))
        return mid_name_dict, name_mid_dict

    @staticmethod
    def get_mid_to_name(mid, mid_name_dict):
        # (mid_name_dict, _) = self.mid_name_convert(self.mid_name_file)
        if mid in mid_name_dict:
            return mid_name_dict[mid]
        return None

    def get_mid_to_name_mysql(self, mid):
        table_name = 'mid2name'
        query = "select name from %s where mid = '%s' " % (table_name, mid)
        #print query
        name = self.db_conn.search(query)
        if name is not None and len(name) >= 1:
            return name[0]
        return None

    def get_mid_to_type_mysql(self, mid):
        table_name = 'mid2type'
        query = "select notable_type from %s where mid = '%s' " % (table_name, mid)
        #print query
        name = self.db_conn.search(query)
        if name is not None and len(name) >= 1:
            return name[0]
        return None

    @staticmethod
    def get_name_to_mid(name, name_mid_dict):
        # (_, name_mid_dict) = self.mid_name_convert(self.mid_name_file)
        if name in name_mid_dict:
            return name_mid_dict[name]
        return None

    @staticmethod
    def mid_qid_convert(mid_qid_file):
        mid_qid = pd.read_csv(mid_qid_file, sep='\t', header=None)
        mid_qid.columns=['fb', 'rel', 'wiki']
        print (mid_qid[0:10])
        mid_qid['fb'] = mid_qid['fb'].apply(
            lambda x: x.replace("<http://rdf.freebase.com/ns", "").replace(">","").replace(".", "/"))
        mid_qid['wiki'] = mid_qid['wiki'].apply(
            lambda x: x.replace("<", "").replace("> .", ""))
        mid_qid_dict = dict(zip(mid_qid.fb, mid_qid.wiki))
        qid_mid_dict = dict(zip(mid_qid.wiki, mid_qid.fb))
        # print ({k: qid_mid_dict[k] for k in list(qid_mid_dict)[:20]})
        return mid_qid_dict, qid_mid_dict

    def load_fb(self, fb_file, rp_dict):
        fb = pd.read_csv(fb_file, sep='\t', header=None)
        fb.columns = ['sub', 'rel', 'obj']
        for c in fb.columns:
            fb[c] = fb[c].apply(lambda x: x.replace("www.freebase.com", ""))

        # load entities
        for sub in fb['sub']:
            sub_name = self.get_mid_to_name(sub, self.mid_name_dict)
            if sub not in self.fb_entities:
                self.fb_entities[sub] = sub_name
            if sub in rp_dict:
                vocab = rp_dict[sub]
            else:
                vocab = np.random.rand(1, 50)
            if ('ent', sub) in self.fb_dict:  # keys are ('ent', mid) tuples
                continue
            self.fb_dict[('ent', sub)] = vocab

        for obj in fb['obj']:
            obj_name = self.get_mid_to_name(obj, self.mid_name_dict)
            if obj not in self.fb_entities:
                self.fb_entities[obj] = obj_name
            if obj_name in rp_dict:
                vocab = rp_dict[obj_name]
            else:
                vocab = np.random.rand(1, 50)
            if ('ent', obj) in self.fb_dict:
                continue
            self.fb_dict[('ent', obj)] = vocab

        # load relation
        for rel in fb['rel']:
            self.fb_relations.add(rel)
            if rel in rp_dict:
                vocab = rp_dict[rel]
            else:
                vocab = np.random.rand(1, 50)
            if ('rel', rel) in self.fb_dict:
                continue
            self.fb_dict[('rel', rel)] = vocab

    # load topic words
    def load_topic_words(self, stage, sq_datas):
        with codecs.open(self.topic_words_file, 'r', encoding='utf-8', errors='ignore') as tw_data:
            data = {"label": [], "word_list": [],
                    "label_name": [], "word_name_list": [],
                    "word_score": []}
            index = 0
            if stage == 'train':
                for line in tw_data.readlines():
                    line = line.strip().split('\t')
                    assert len(line) >= 1, "line should contain at least one item " + str(index)
                    label = line[0].replace('m.', '/m/')
                    label_name = self.get_mid_to_name_mysql(label)
                    word_list = list()
                    word_name_list = list()
                    score_list = list()
                    word_score_dict = dict()
                    for i in range(1, len(line)):
                        word_score = line[i].strip().split(' ')
                        assert len(word_score) == 2, "each item should contain 2 parts " + str(index)
                        word_list.append(word_score[0].replace('m.', '/m/'))
                        word_score_dict[word_score[0].replace('m.', '/m/')] = float(word_score[1])

                    if len(word_list) <= 10:
                        if label not in word_list:
                            word_list.append(label)
                            word_score_dict[label] = 2.0
                        data["word_list"].append(word_list)
                        for i in range(len(word_list)):
                            word_name_list.append(self.get_mid_to_name_mysql(word_list[i]))
                            score_list.append(word_score_dict[word_list[i]])
                        data["word_name_list"].append(word_name_list)
                        data['word_score'].append(score_list)
                    else:
                        word_list_sample = [word_list[i] for i in sorted(random.sample(range(len(word_list)), 10))]
                        if label not in word_list_sample:
                            word_list_sample.append(label)
                            word_score_dict[label] = 2.0
                        data["word_list"].append(word_list_sample)
                        for i in range(len(word_list_sample)):
                            word_name_list.append(self.get_mid_to_name_mysql(word_list_sample[i]))
                            score_list.append(word_score_dict[word_list_sample[i]])
                        data["word_name_list"].append(word_name_list)
                        data['word_score'].append(score_list)
                    data["label"].append(label)
                    data["label_name"].append(label_name)
                    index += 1
            elif stage == "valid" or stage == "test":
                for line in tw_data.readlines():
                    line = line.strip().split('\t')
                    assert len(line) >= 1, "line should contain at least one item " + str(index)
                    label = line[0].replace('m.', '/m/')
                    label_name = self.get_mid_to_name_mysql(label)
                    word_list = list()
                    word_name_list = list()
                    score_list = list()
                    word_score_dict = dict()

                    for i in range(1, len(line)):
                        word_score = line[i].strip().split(' ')
                        assert len(word_score) == 2, "each item should contain 2 parts " + str(index)
                        word_list.append(word_score[0].replace('m.', '/m/'))
                        score_list.append(float(word_score[1]))
                        word_name_list.append(self.get_mid_to_name_mysql(word_score[0].replace('m.', '/m/')))
                    if len(score_list) == 0:
                        word_list.append(label)
                        score_list.append(2.0)
                        word_name_list.append(label_name)
                    if label not in word_list:
                        word_list.append(label)
                        score_list.append(1.5)
                        word_name_list.append(label_name)
                    data["word_list"].append(word_list)
                    data["word_name_list"].append(word_name_list)
                    data['word_score'].append(score_list)
                    data["label"].append(label)
                    data["label_name"].append(label_name)
                    index += 1
        print (index, len(data['label']), len(sq_datas))
        assert len(data['label']) == len(self.sq_dataset), "the two datasets must have the same number of lines"
        sq_datas['topic_words'] = data.get('word_list')
        sq_datas['topic_words_names'] = data.get('word_name_list')
        sq_datas['golden_word'] = data.get('label')
        sq_datas['golden_word_name'] = data.get('label_name')
        sq_datas['word_score'] = data.get('word_score')
        sq_datas['word_score'] = sq_datas['word_score'].fillna(value=0.0)
        self.db_conn.connect.close()
        print (type(sq_datas.loc[50, 'word_score']))
        print (sq_datas.loc[50, 'word_score'])
        print ("read topic words done!")
        return sq_datas
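A usage sketch tying the DataReader pieces together. The file paths are illustrative; the SimpleQuestions-style TSV must have subject, relation, object, and question columns, and the topic-words file must have one line per question:

# hypothetical file paths; defaults cover mid_name_file and mid_qid_file
reader = DataReader(sq_data_file='../datas/sq_train.txt',
                    topic_words_file='../datas/topic_words_train.txt')
reader.read_sq_data_pd()                            # builds reader.sq_dataset
sq = reader.load_topic_word_pos(reader.sq_dataset)  # adds the 'pos' column
sq = reader.load_topic_words('train', sq)           # adds candidate topic words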
Example 9
def gen_simgle_test_data(data, word_dict, char_dict, i, entity_dim, entity_dict):
    data = data.sort_values(['qid', 'predict'], ascending=[True, False])
    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                       pw='zengyutao', db_name='wikidata')
    test_tensor = {}

    question = data.loc[data['qid'] == i, 'question'].iloc[0]
    max_len = len(word_tokenize(question))
    qw_tensor = Variable(line_to_word_tensor(question, word_dict, max_len))
    qc_tensor = Variable(line_to_char_tensor(question, char_dict, max_len))
    test_tensor['qw_tensor'] = qw_tensor
    test_tensor['qc_tensor'] = qc_tensor

    pos = data.loc[data['qid'] == i, 'pos'].iloc[0]
    predict = data.loc[data['qid'] == i, 'predict'].tolist()

    cand_mid = data.loc[data['qid'] == i, 'topic_words'].tolist()
    test_tensor['cand_mid'] = cand_mid
    vectors = []
    cand_vec_dict = {}
    cand_indice = []
    cant_types = []
    for cand in cand_mid:
        #print cand
        vec = get_entity_vector(db_conn, cand)
        #print vec
        vec = [float(x) for x in str(vec).split(',')]
        vectors.append(vec)
        cand_vec_dict[cand] = vec
        cand_indice.append(entity_dict[cand])
        type_id = get_mid_type(db_conn, cand)
        type_name = get_mid_to_name_mysql(db_conn, type_id)
        cant_types.append(type_name)

    entities_vec = np.matmul(np.array(predict), np.array(vectors))
    test_tensor['cand_indices'] = Variable(torch.LongTensor(cand_indice).cuda())
    test_tensor['predict'] = Variable(torch.Tensor([predict]).cuda())
    # test_tensor['cand_type'] = lines_to_word_tensor(cant_types, word_dict)

    seq_len = len(word_tokenize(question))

    entity_tensor = torch.rand(1, seq_len, entity_dim)
    pos_vec = torch.rand(1, seq_len).fill_(1)
    pos = ast.literal_eval(pos)

    for p in pos:
        entity_tensor[0][p] = torch.from_numpy(entities_vec)
        pos_vec[0][p] = 2

    test_tensor['entity_tensor'] = Variable(entity_tensor.cuda(), requires_grad=False)
    test_tensor['position_tensor'] = Variable(pos_vec.cuda(), requires_grad=False)

    #cand_tensor = [Variable(torch.Tensor([cand_vec_dict[c]]).cuda(), requires_grad=False)for c in cand_mid]
    cand_tensor = []
    # for c in cand_mid:
    #     c_tensor = torch.rand(1, 1, entity_dim)
    #     for x in range(seq_len):
    #         c_tensor[0][0] = torch.Tensor(cand_vec_dict[c])
    #     cand_tensor.append(Variable(c_tensor.cuda()))
    # test_tensor['cande_tensor'] = cand_tensor
    cand_names = [get_mid_to_name_mysql(db_conn, mid) for mid in cand_mid]

    cand_tw = []
    cand_tc = []
    cand_ti = []
    cand_tt = []
    assert (len(cand_names) == len(cant_types) and
            len(cand_names) == len(cand_indice)), "len cands should be equal"
    for idx, line in enumerate(cand_names):
        print (line)
        if line is None and cant_types[idx] is None:
            max_line_len = 0
        elif line is None and cant_types[idx] is not None:
            max_line_len = len(word_tokenize(cant_types[idx]))
        elif line is not None and cant_types[idx] is None:
            max_line_len = len(word_tokenize(line))
        else:
            max_line_len = max(len(word_tokenize(line)), len(word_tokenize(cant_types[idx])))
        l_tw = line_to_word_tensor(line, word_dict, max_line_len)

        l_tc = line_to_char_tensor(line, char_dict, max_line_len)
        print (l_tc.size())
        cand_tw.append(Variable(l_tw))
        cand_tc.append(Variable(l_tc))

        l_tt = line_to_word_tensor(cant_types[idx], word_dict, max_line_len)
        cand_tt.append(Variable(l_tt))
        cand_ti.append(Variable(torch.LongTensor([cand_indice[idx]]).cuda()))

        c_tensor = torch.rand(1, max_line_len, entity_dim)
        for x in range(max_line_len):
            c_tensor[0][x] = torch.Tensor(cand_vec_dict[cand_mid[idx]])
        cand_tensor.append(Variable(c_tensor.cuda()))
    test_tensor['cande_tensor'] = cand_tensor

    test_tensor['candw_tensor'] = cand_tw
    test_tensor['candc_tensor'] = cand_tc
    test_tensor['candi_tensor'] = cand_ti
    test_tensor['cand_type'] = cand_tt
    test_tensor['golden'] = data.loc[data['qid'] == i, 'golden_word'].iloc[0]
    #test_tensor['candw_tensor'] = [lines_to_word_tensor(line, word_dict) for line in cand_names]
    #test_tensor['candc_tensor'] = [lines_to_char_tensor(line, char_dict) for line in cand_names]

    return test_tensor
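line_to_word_tensor and line_to_char_tensor are not shown. From their use (r_tw.size(1) is the padded length in Example 2), line_to_word_tensor is assumed to return a LongTensor of shape (1, max_len). A sketch treating word_dict as a plain dict from token to index, with 0 as the unknown/padding id (the real Dictionary class may differ):

import torch
from nltk.tokenize import word_tokenize

def line_to_word_tensor(line, word_dict, max_len):
    # hypothetical: tokenize, map to indices, right-pad to max_len
    tokens = word_tokenize(line) if line else []
    ids = [word_dict.get(tok, 0) for tok in tokens[:max_len]]
    ids += [0] * (max_len - len(ids))
    return torch.LongTensor([ids])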
Example 10
def prepair_data(data, args, entity_dict, relation_dict):
    data = data.sort_values(['qid', 'predict'], ascending=[True, False])
    qid_max = max(data['qid'].tolist())
    print (qid_max)
    train = {'question':[], 'cand_ent':[], 'pos':[], 'entities_vec':[], 'positive_vec':[],
             'positive':[], 'negative':[], 'predict':[], 'negative_vec':[], 'cand_rel':[],
             'pos_rel':[], 'pos_rel_vec':[], 'neg_rel':[], 'neg_rel_vec':[], 'cand_indices':[],
             'positive_idx':[], 'positive_type':[], 'negative_idx':[], 'negative_type':[],
             'pos_rel_idx': [], 'neg_rel_idx': []}
    db_conn = MySQL(ip='10.61.2.166', port=3306, user='******',
                                       pw='zengyutao', db_name='wikidata')
    #data['vector'] = data.apply(lambda x: get_entity_vector(db_conn, x['topic_words']), axis=1)
    #print(data['vector'])

    for i in range(qid_max+1):
        nums = len(data[data['qid'] == i])
        question = data.loc[data['qid'] == i, 'question'].iloc[0]
        train['question'].append(question)

        pos = data.loc[data['qid'] == i, 'pos'].iloc[0]
        train['pos'].append(pos)

        predict = data.loc[data['qid'] == i, 'predict']
        train['predict'].append(predict.tolist())

        cand_mid = data.loc[data['qid']==i, 'topic_words']
        #data['vectors'] = data['vector'].apply(lambda x: x.split(','))
        train['cand_ent'].append(cand_mid)
        #print (data.loc[data['qid'] == i, 'vectors'])
        vectors = []
        cand_vec_dict = {}
        cand_indices = []
        for cand in cand_mid:
            #print cand
            vec = get_entity_vector(db_conn, cand)
            #print vec
            vec = [float(x) for x in str(vec).split(',')]
            vectors.append(vec)
            cand_vec_dict[cand] = vec
            cand_indices.append(entity_dict[cand])

        entities_vec = np.matmul(np.array(predict), np.array(vectors))
        #entities_vec = mul_vec(predict.tolist(), vectors)
        train['entities_vec'].append(entities_vec)
        train['cand_indices'].append(cand_indices)

        positive = data.loc[data['qid'] == i, 'golden_word_name'].iloc[0]
        train['positive'].append(positive)

        # cands_name = data.loc[data['qid']==i, 'topic_words_names']
        # neg_name = [w for w in cands_name if w != positive]
        # train['negtive'].append(neg_name)

        positive_mid = data.loc[data['qid'] == i, 'golden_word'].iloc[0]
        train['positive_idx'].append(entity_dict[positive_mid])
        ptype = get_mid_type(db_conn, positive_mid)
        if ptype is None:
            train['positive_type'].append("none")
        else:
            ptype_name = get_mid_to_name_mysql(db_conn, ptype)
            if ptype_name is None:
                train['positive_type'].append("none")
            else:
                train['positive_type'].append(ptype_name)
        positive_vec = cand_vec_dict[positive_mid]
        train['positive_vec'].append(positive_vec)

        neg_vec = [cand_vec_dict[neg_mid] for neg_mid in cand_mid if neg_mid != positive_mid]
        neg = [get_mid_to_name_mysql(db_conn, neg_mid) for neg_mid in cand_mid if neg_mid != positive_mid]
        neg_idx = [entity_dict[neg_mid] for neg_mid in cand_mid if neg_mid != positive_mid]
        #neg_type = [get_mid_type(db_conn, neg_mid) for neg_mid in cand_mid if neg_mid != positive_mid]
        train['negative'].append(neg)
        train['negative_vec'].append(neg_vec)
        train['negative_idx'].append(neg_idx)

        neg_type = []
        for neg_mid in cand_mid:
            if neg_mid == positive_mid:
                continue
            ntype = get_mid_type(db_conn, neg_mid)
            if ntype is None:
                neg_type.append("none")
            else:
                ntype_name = get_mid_to_name_mysql(db_conn, ntype)
                if ntype_name is None:
                    neg_type.append("none")
                else:
                    neg_type.append(ntype_name)
        train['negative_type'].append(neg_type)
        #print ("================gen pos rel ==================")
        positive_rel = data.loc[data['qid'] == i, 'relation'].iloc[0]
        pos_rel_idx = relation_dict[positive_rel]
        pos_rel_vec = get_relation_vector(db_conn, positive_rel)
        pos_rel_vec = [float(x) for x in str(pos_rel_vec).split(',')]
        positive_rel_re = positive_rel.replace('/', ' ').replace('_', ' ')

        train['pos_rel'].append(positive_rel_re)
        train['pos_rel_vec'].append(pos_rel_vec)
        train['pos_rel_idx'].append(pos_rel_idx)

        #print ("----------------get cand rel----------")
        cand_rel = get_cand_rel(args.triples, cand_mid)
        train['cand_rel'].append(cand_rel)

        #print ("--------------get neg rel --------------")
        neg_rel = [rel for rel in cand_rel if rel != positive_rel]
        neg_rel_idx = [relation_dict[rel] for rel in cand_rel if rel != positive_rel]
        neg_rel_vec = []
        neg_rel_re = []
        for n in neg_rel:
            vec = get_relation_vector(db_conn, n)
            neg_v = [float(x) for x in str(vec).split(',')]
            neg_rel_vec.append(neg_v)
            neg_rel_re.append(n.replace('/', ' ').replace('_', ' '))

        train['neg_rel_idx'].append(neg_rel_idx)
        train['neg_rel'].append(neg_rel_re)
        train['neg_rel_vec'].append(neg_rel_vec)


    train_data = pd.DataFrame(train)

    # for key in train:
    #     train_data[key] = train.get(key)
    train_data.to_csv('../datas/prepaired_train_data_1w.csv', index=False)
    return train_data
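get_cand_rel (Examples 2 and 10) is also not shown. Assuming triples is a DataFrame with 'sub' and 'rel' columns, as loaded by DataReader.load_fb, a sketch that collects every relation whose subject is one of the candidate mids:

def get_cand_rel(triples, cand_mids):
    # hypothetical: candidate relations are those attached to candidate subjects
    mask = triples['sub'].isin(list(cand_mids))
    return sorted(set(triples.loc[mask, 'rel']))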