Example #1
def simulate_walks(my_args, g, num_walks, walk_length, node_map):
    '''
    Repeatedly simulate random walks from each node and relation pair.
    '''
    walks, all_node_nbrs = [], {}
    nodes = list(g.nodes())
    for walk_iter in range(num_walks):
        print 'args.output=' + my_args.output
        fn = os.path.join(
            my_args.output,
            my_args.run_name + '-' + str(walk_iter) + '-' + '.txt')
        print 'output file name = ' + fn
        of = open(fn, 'w')
        print 'walk_iter=' + str(walk_iter)
        random.shuffle(nodes)
        node_ct, node_total = 0, len(nodes)
        for node in nodes:
            node_ct += 1
            walks, next_start_nodes = walk_rels(node_map, node)
            for i in range(0, len(walks)):
                ere_walk(g,
                         all_node_nbrs,
                         walk_length=walk_length,
                         start_node=next_start_nodes[i],
                         walk=walks[i],
                         op_file=of)
            bas_utils.print_status(
                '/ ' + str(node_total) + ' simulate_walks         ', node_ct,
                100)
        of.close()
        print 'ere.ere.learn_embeddings() - walks saved to - ' + my_args.output
    return walks
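
A minimal invocation sketch for simulate_walks(), assuming the project's helpers (walk_rels, ere_walk, bas_utils, mm) are importable; the args fields, file paths, and walk parameters below are illustrative assumptions, not the project's actual configuration.

# Hypothetical driver for simulate_walks(); paths and parameters are made up.
from argparse import Namespace
import networkx as nx

my_args = Namespace(input='graph.rdf', output='/tmp/walks', run_name='demo-run')
g = nx.read_edgelist('edges.txt', nodetype=str, create_using=nx.DiGraph())
node_map = load_node_map(my_args)  # see Example #2
walks = simulate_walks(my_args, g, num_walks=10, walk_length=80, node_map=node_map)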
Example #2
def load_node_map(my_args):
    """
    rel_2_obj - Key = relationship / predicate, Value = List of Objects
    :param my_args:
    :return:
    """
    ct = 0
    node_map = mm.my_map(-1)
    print 'load_node_map() - inp file = ' + my_args.input
    with open(my_args.input) as f:
        content = f.readlines()
    for next_line in content:
        toks = next_line.split()
        subj = toks[0]
        obj = toks[1]
        preds = toks[2]
        ct += 1
        for pred in preds.split('_'):
            if node_map.contains_key(subj):
                rel_2_obj = node_map.get(subj)
            else:
                rel_2_obj = dict()
            if pred in rel_2_obj.keys():
                obj_list = rel_2_obj[pred]
            else:
                obj_list = list()
            obj_list.append(obj)
            rel_2_obj[pred] = obj_list
            node_map.set(subj, rel_2_obj)
            bas_utils.print_status('load_node_map()                ', ct, 100)
    return node_map
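
The map built above is two-level: subject -> {predicate -> [objects]}, with the input line format 'subj obj pred1_pred2' inferred from the parsing code. A standalone sketch of the same construction using a plain dict (the triples are made up):

# Plain-dict illustration of the node_map layout; '_' separates multiple predicates.
lines = ['s1 o1 p1_p2', 's1 o2 p1', 's2 o1 p3']
node_map = {}
for line in lines:
    subj, obj, preds = line.split()
    for pred in preds.split('_'):
        node_map.setdefault(subj, {}).setdefault(pred, []).append(obj)
print(node_map)
# {'s1': {'p1': ['o1', 'o2'], 'p2': ['o1']}, 's2': {'p3': ['o1']}}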
Example #3
def print_questions(qs, new_file_path, rel_count=-1):
    print '\nstarted printing'
    op_file = open(new_file_path, 'w')
    ct = 0
    print 'len(qs) = ' + str(len(qs))
    for q in qs:
        next_q = ''
        ct += 1
        bas_utils.print_status('print_questions()', ct, 1)
        my_list = []
        for p in q:
            my_list.append(p['r1'])
            my_list.append(p['r2'])
            next_q = next_q + p['e1'] + ',' + p['r1'] + ',' + p['e2'] + ',' + p['r2'] + ',' + p['e3'] + ','
        if (rel_count > 0) and (len(set(my_list)) == rel_count):
            op_file.write(next_q + '\n')
        elif rel_count < 0:
            op_file.write(next_q + '\n')
        else:
            print next_q
            print '-----------'
            print my_list
            print '-----------'
            print set(my_list)
            print '================='
            raw_input("Press Enter to see more Examples...")
    op_file.close()
    print 'printing complete....'
Example #4
def pre_process_relations(word_index, isl):
    '''
    Prepares padded token-index sequences used for learning a representation
    of the relations from their tokens.
    '''
    ctr, max_sen_len = 0, 0
    pf = '/data/Work-Homes/LOD_HOME/subgraph_webq/input_data/split-3/rel-map-subg1.rdf'
    lc = bas_utils.get_line_count(pf)
    sqs, y, oov = list(), list(), list()
    with open(pf) as f:
        content = f.readlines()
    for next_line in content:
        toks = next_line.split(';')
        relid = toks[1].replace('\n', '')
        sq = list()
        max_sen_len = max(max_sen_len, len(toks[0].split()))
        for w in toks[0].split():
            if w not in word_index:
                sq.append(0)
            else:
                sq.append(word_index[w])
        sqs.append(sq)
        y.append(relid)
        ctr += 1
        bas_utils.print_status(
            ' / ' + str(lc) + ' pre_process_relations()                   ',
            ctr, 1)
    pad_seqs_of_indexes = pad_sequences(sqs, maxlen=isl)
    print '\nrelation token - seq len = ' + str(max_sen_len)
    print pad_seqs_of_indexes[0]
    print '\nExiting - pre_process_relations()'
    return pad_seqs_of_indexes, y
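
A standalone sketch of the sequence preparation above, assuming pad_sequences comes from keras.preprocessing.sequence (the import is not shown in the original); the word_index and 'tokens;relid' lines are made up.

# Illustrative only: map relation tokens to word indexes (0 for OOV words)
# and left-pad each sequence to a fixed length, as pre_process_relations() does.
from keras.preprocessing.sequence import pad_sequences

word_index = {'place': 7, 'of': 3, 'birth': 12}
lines = ['place of birth;rel_01', 'spouse of;rel_02']
sqs, y = [], []
for line in lines:
    tokens, relid = line.strip().split(';')
    sqs.append([word_index.get(w, 0) for w in tokens.split()])
    y.append(relid)
padded = pad_sequences(sqs, maxlen=5)
print(padded)
# [[ 0  0  7  3 12]
#  [ 0  0  0  0  3]]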
Example #5
def get_paths(l1, l2, l3, l4, r1, r2):
    """
    It takes an intersection of the list of objects of r1, and subjects of r2.
    whenever object of r1 and subject of r2 are equal, it appends a path in the path_list

    :param l1: List of subjects of r1 relationship
    :param l2: List of objects of r1 relationship
    :param l3: List of subjects of r2 relationship
    :param l4: List of objects of r2 relationship
    :param r1: first relationship
    :param r2: second relationship
    :return: path_list
    """
    max_ct1 = len(l2)
    max_ct2 = len(l3)
    print 'ct(' + r1 + ')=' + format(
        max_ct1, ',d') + ', ct(' + r2 + ')=' + format(max_ct2, ',d')
    path_list = []
    ct = 0
    for i in range(0, max_ct1, 1):
        for j in range(0, max_ct2, 1):
            ct += 1
            #print_status(ct, 1000000)
            bas_utils.print_status('get_paths()', ct, 1)
            if l2[i] == l3[j]:
                path_list.append(l1[i] + ',' + r1 + ',' + l2[i] + ',' + l3[j] +
                                 ',' + r2 + ',' + l4[j])
    return path_list
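
A tiny worked example of the join described in the docstring, assuming bas_utils is importable; the entities and relation names are made up.

# l1/l2 are the subject/object columns of r1 triples, l3/l4 of r2 triples.
l1 = ['a', 'b']
l2 = ['x', 'y']
l3 = ['x', 'z']
l4 = ['m', 'n']
paths = get_paths(l1, l2, l3, l4, 'r1', 'r2')
print(paths)
# ['a,r1,x,x,r2,m']  -- l2[0] == l3[0] == 'x', so the path a -r1-> x -r2-> m is kept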
Example #6
def eval_model(my_args,
               np_y_all_pred,
               y_all_true,
               src_ents,
               word_index,
               embed_mat,
               rmat,
               op_seq_len,
               json_obj,
               qid_seq,
               all_rel_ids,
               split_name='tst'):
    print '\nEntering - eval_model()<--- for ' + split_name + ', beam_size=' + str(
        my_args.p['beam_size'])
    print 'np_y_all_pred.shape=' + str(np_y_all_pred.shape)
    print 'Reading Edge List in networkx ... from ' + my_args.edge_list_pf + '\n'
    g = nx.read_edgelist(my_args.edge_list_pf,
                         nodetype=str,
                         data=(('relation', str), ),
                         create_using=nx.DiGraph())
    r, results, gt_lengths, neg_data = '', [], [], dict()
    op_file = bas_utils.open_file(
        os.path.join(my_args.job_folder, split_name + '_pred_distances.txt'))
    ct = 0
    for i in range(len(src_ents)):  # for every question
        ct += 1
        y_pred, y_true, my_ents = np_y_all_pred[i], y_all_true[i], src_ents[i]
        qid = qid_seq[i]
        jsonq = json_obj[qid]
        is_true, jq1, neg_data = eval_next_xy(my_args, g, qid, y_pred, y_true,
                                              my_ents, word_index, embed_mat,
                                              rmat, jsonq, i, op_file,
                                              neg_data, all_rel_ids,
                                              split_name)
        json_obj[qid] = jq1
        results.append(is_true)
        gt_lengths.append(len(y_true))
        bas_utils.print_status(
            ' / ' + str(len(src_ents)) + ' check_accuracy()             ', i,
            1)
    op_file.close()
    save_neg_data(my_args, neg_data, split_name)
    cum_seq_right, ind_seq_right, all_right, json2 = cal_stats(
        my_args, results, gt_lengths, op_seq_len, json_obj, qid_seq,
        split_name)
    print_results(my_args, cum_seq_right, ind_seq_right, all_right, results,
                  split_name)
    bas_utils.save_json_to_file(
        json2, os.path.join(my_args.job_folder, split_name + '_res.json'),
        True)
    sd_folder = os.path.abspath(
        os.path.join(my_args.job_folder, os.pardir, 'saved_data'))
    bas_utils.save_json_to_file(nbr_ent_cache,
                                os.path.join(sd_folder, 'nbr_ent_cache.json'))
    bas_utils.save_json_to_file(nbr_rel_cache,
                                os.path.join(sd_folder, 'nbr_rel_cache.json'))
    return neg_data
Example #7
def load_core_rels():
    my_rels, ct = set(), 0
    with open(core_rels_file) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        my_rels.add(next_line.replace('\n', ''))
        bas_utils.print_status(' load_core_rels()          ', ct, 1)
    print "\nCore Relations Loaded....."
    return my_rels
Example #8
def load_node_map():
    ct = 0
    node_map = mm.my_map(-1)
    with open(node_map_pf) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        toks = next_line.split()
        kg_node = toks[0]
        node_id = toks[1]
        node_map.set(kg_node, node_id)
        bas_utils.print_status(' load_node_map()          ', ct, 1)
    return node_map
Example #9
def load_rel_map():
    rel_map = dict()
    ct = 0
    with open(rel_map_pf) as f:
        content = f.readlines()
    for next_line in content:
        ct += 1
        toks = next_line.split()
        kg_rel_name = toks[0]
        rel_id = toks[1]
        rel_map[kg_rel_name] = rel_id
        bas_utils.print_status(' load_rel_map()          ', ct, 1)
    return rel_map
Example #10
def pre_process_inputs(my_args, x_all, word_index, max_sen_len, args,
                       json_objs, qs):
    '''
    Puts the final question into the 'fq' field of each question's JSON object.
    Also prepares padded sequences of word indexes for every question.
    '''
    op_path = os.path.abspath(
        os.path.join(args.job_folder, os.pardir, 'saved_data'))
    x_all_padded, f = list(), None
    pad_seqs_of_indexes = None
    for i in range(0, len(x_all)):
        print '\npre_process_inputs - Dataset = ' + str(i)
        seqs_of_indexes, ct, my_json_obj = list(), 0, json_objs[i]
        oov_file = open(os.path.join(op_path, 'oov-' + str(i) + '.txt'), 'w')
        for j in range(len(x_all[i])):
            qid = qs[i][j]
            qjson = my_json_obj[qid]
            q, qjson['OOV'] = x_all[i][j], ''
            sq = list()
            fq = q
            qjson['fq'] = fq
            ct += 1
            for wi in fq.split():
                if check_replace_src_ent(my_args, wi):
                    w = 'SRC_ENT'
                    qjson['fq'] = fq.replace(wi, w)
                else:
                    w = wi
                if w not in word_index:
                    qjson['OOV'] += w + ';'
                    sq.append(0)
                    oov_file.write(qs[i][j] + ',' + qjson['qid'] + ',' +
                                   qjson['qm'] + ',' + fq + ',' + w + '\n')
                else:
                    if word_index[w] > 5432:
                        f = create_imp_words_list(str(w), f)
                    sq.append(word_index[w])
            seqs_of_indexes.append(sq)
            bas_utils.print_status(
                ' / ' + str(len(x_all[i])) + ' pre_process_inputs()     ', j,
                1)
        oov_file.close()
        pad_seqs_of_indexes = pad_sequences(seqs_of_indexes,
                                            maxlen=max_sen_len)
        x_all_padded.append(pad_seqs_of_indexes)
    if f is not None:
        f.close()
    print('Shape of pad_seqs_of_indexes tensor:', pad_seqs_of_indexes.shape)
    return x_all_padded, json_objs
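
A standalone sketch of the per-question transformation described in the docstring: source-entity tokens are replaced by SRC_ENT, out-of-vocabulary words map to index 0, and the index sequences are padded. pad_sequences is assumed to come from keras.preprocessing.sequence; the question, vocabulary, and entity check are made up.

# Illustrative only: mirrors the fq / OOV / padding steps of pre_process_inputs().
from keras.preprocessing.sequence import pad_sequences

word_index = {'where': 4, 'was': 9, 'SRC_ENT': 2, 'born': 6}
q = 'where was m.0abc originally born'   # 'm.0abc' stands in for a source entity mention
fq, sq = q, []
for wi in q.split():
    w = 'SRC_ENT' if wi.startswith('m.') else wi   # hypothetical entity check
    if w == 'SRC_ENT':
        fq = fq.replace(wi, w)
    sq.append(word_index.get(w, 0))                # 0 marks an OOV word
padded = pad_sequences([sq], maxlen=6)
print(fq)        # where was SRC_ENT originally born
print(padded)    # [[0 4 9 2 0 6]]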
Example #11
def compare_files(f1, f2):
    print 'compare_files() - ' + f1 + ', ' + f2
    l1 = load_file(f1)
    l2 = load_file(f2)
    qs, ct = [], 0
    for i in l1:
        for j in l2:
            ct += 1
            bas_utils.print_status('compare_files()', ct, 1)
            if i['e3'] != j['e3']:
                continue
            q = list()
            q.append(i)
            q.append(j)
            qs.append(q)
    return qs
Example #12
def word_count_n_dim(args, wi, ct):
    word_count, embd_dim = 0, 0
    f = open(args.word_embedding)
    for a_line in f:
        toks = a_line.split(' ')
        if len(toks) < 3:
            continue
        word_count += 1
        if embd_dim == 0:
            embd_dim = len(toks) - 1
        if toks[0] in wi:
            continue
        ct += 1
        wi[toks[0]] = ct
        bas_utils.print_status(' word_count_n_dim()             ', ct, 1)
    f.close()
    word_count += 1  # Including 'None' as a new word.
    print('\nFound %s word vectors.' % word_count)
    print('Embed Mat Size - %s ' % ct)
    return word_count, embd_dim, wi, ct
Example #13
def get_ere_embed(wi, random_embeddings):
    print '\nEntering get_ere_embed() - len(wi)=' + str(len(wi.keys()))
    ct, i, idx, word_list = 0, 0, 0, set(wi.keys())
    embedding_matrix = np.zeros((len(wi) + 1, 300))
    with open(ere_embedding_file) as f:
        content = f.readlines()
    for next_line in content:
        i += 1
        msg = 'i=' + str(i) + ', idx=' + str(idx) + ' / ' + str(len(wi))
        bas_utils.print_status(msg + '  get_ere_embed()', ct, 1)
        toks = next_line.split()
        if len(toks) < 5:
            continue
        if toks[0] not in wi:
            continue
        word_list.remove(toks[0])
        ct += 1
        idx = wi[toks[0]]
        embedding_matrix[idx] = np.asarray(toks[1:], dtype=float)
    print '\nWord count not in ERE=' + str(len(word_list)) + ', e.g., ' + list(
        word_list)[0] + ', ' + list(word_list)[1]
    print '\n - Starting to add Random Embedding after ct=' + str(ct)
    final_pending = set(word_list)
    for w in word_list:
        if not str(w).startswith('r'):
            continue
        print 'Adding Random Embedding for - ' + w
        idx = wi[w]
        e = random_embeddings.pop()
        embedding_matrix[idx] = np.asarray(e, dtype=float)
        ct += 1
        final_pending.remove(w)
    print '\nWord count not in final_embd=' + str(
        len(final_pending)) + ' - ' + bas_utils.to_string(final_pending, ' ')
    print '\nct=' + str(ct) + ', i=' + str(i)
    return embedding_matrix
Example #14
def load_embeddings(my_args):
    print 'load_embeddings() Loading Embeddings from - ' + my_args.word_embedding
    word_index, ct = get_imp_words(my_args)
    word_count, embd_dim, word_index, ct = word_count_n_dim(
        my_args, word_index, ct)
    embedding_matrix = np.zeros((ct + 1, embd_dim))
    print 'embedding_matrix.shape = ' + str(embedding_matrix.shape)
    print 'len(word_index) = ' + str(len(word_index))
    print 'ct = ' + str(ct)
    lctr = 0
    with open(my_args.word_embedding) as f:
        content = f.readlines()
    for next_line in content:
        if len(next_line.split()) < 3:
            continue
        lctr += 1
        toks = next_line.split(' ')
        coefs = np.asarray(toks[1:], dtype='float32')
        embedding_matrix[word_index[toks[0]]] = coefs
        bas_utils.print_status(
            ' / ' + str(word_count) + ' load_embeddings()         ', lctr, 1)
    rel_embd_mat = get_rel_embd_mat(my_args, embedding_matrix, word_index)
    print '\nExiting - load_embeddings()'
    return embd_dim, word_index, embedding_matrix, rel_embd_mat