コード例 #1
0
# input: training set, validation set, test set
# output: the test and validation triples whose left-hand-side entity appears
# no more than max_freq times in the training set.

import sys
import io
import numpy as np
import batch

# select the rare-entity triples from the test and validation sets
if __name__ == "__main__":
    # Collect every test and validation triple whose left-hand-side entity is
    # either absent from the training set or occurs there at most max_freq
    # times, and write them as tab-separated lines to a single "rare" file.
    max_freq = 2  # selecting lhs that appear <= max_freq times in the training set

    # Only the per-entity counts of the training lhs column are needed; the
    # first value returned by build_entity_dictionary is not used here.
    # Assumes load_labeled_entities reads the whole file before returning, so
    # the handle can be closed as soon as the call completes.
    with io.open("../data/prescription-sparse2-train.txt") as f_in:
        lhs, _, _ = batch.load_labeled_entities(f_in)
    _, lhs_count = batch.build_entity_dictionary(lhs)

    buf = []
    # Test rows are emitted first, then validation rows.
    for path in ("../data/prescription-sparse2-test.txt",
                 "../data/prescription-sparse2-valid.txt"):
        with io.open(path) as f_in:
            lhs, rel, rhs = batch.load_labeled_entities(f_in)
        for head, relation, tail in zip(lhs, rel, rhs):
            # Keep the triple when its lhs is unseen in training or rare.
            if head not in lhs_count or lhs_count[head] <= max_freq:
                buf.append("{}\t{}\t{}\n".format(head, relation, tail))

    with open("../data/prescription-sparse2-rare-{}-test.txt".format(max_freq), "w") as f_out:
        f_out.writelines(buf)
コード例 #2
0
ファイル: main_nn.py プロジェクト: YuxingZhang/prescription
    # Load the (lhs, rel, rhs) columns for the training, validation (_v) and
    # test (_s) splits. Earlier data sources are kept below, commented out.
    # NOTE(review): the io.open(...) handles are passed inline and never
    # closed in this excerpt.
    #lhs, rel, rhs = batch.load_labeled_entities(io.open(sys.argv[1],'r'))
    #lhs_v, rel_v, rhs_v = batch.load_labeled_entities(io.open(sys.argv[2],'r'))
    #lhs_s, rel_s, rhs_s = batch.load_labeled_entities(io.open("../data/prescription-freq-test.txt"))
    #lhs, rel, rhs = batch.load_labeled_entities(io.open("../data/prescription-sparse2-train.txt")) # sparse 2 is by different train,valid,test ratio
    #lhs_v, rel_v, rhs_v = batch.load_labeled_entities(io.open("../data/prescription-sparse2-valid.txt"))
    #lhs_s, rel_s, rhs_s = batch.load_labeled_entities(io.open("../data/prescription-sparse2-test.txt"))
    lhs, rel, rhs = batch.load_labeled_entities(io.open("../data/yago-sparse-entity-train.txt"))
    lhs_v, rel_v, rhs_v = batch.load_labeled_entities(io.open("../data/yago-sparse-entity-valid.txt"))
    lhs_s, rel_s, rhs_s = batch.load_labeled_entities(io.open("../data/yago-sparse-entity-test.txt"))

    # All dictionaries below are built from the training split only and are
    # saved under save_path (defined outside this excerpt).

    # left hand side dictionaries, both character and entity
    chardict, charcount = batch.build_char_dictionary(lhs)
    # NOTE(review): the +1 presumably reserves one extra index (e.g. for a
    # padding or unknown character) -- confirm against the batch module.
    n_char = len(chardict.keys()) + 1
    batch.save_dictionary(chardict,charcount,'%s/dict.pkl' % save_path)

    lhs_dict, lhs_count = batch.build_entity_dictionary(lhs)
    n_lhs = len(lhs_dict.keys())
    batch.save_dictionary(lhs_dict,lhs_count,'%s/lhs_dict.pkl' % save_path)

    # build dictionary for relations
    rel_dict, rel_count = batch.build_entity_dictionary(rel)
    batch.save_dictionary(rel_dict, rel_count, '%s/rel_dict.pkl' % save_path)
    n_rel = len(rel_dict.keys())
    # NOTE(review): rel_count presumably holds the number of triples per
    # relation -- confirm against batch.build_entity_dictionary.

    # build dictionary for right hand side entities
    rhs_dict, rhs_count = batch.build_entity_dictionary(rhs)
    batch.save_dictionary(rhs_dict, rhs_count, '%s/rhs_dict.pkl' % save_path)
    n_rhs = len(rhs_dict.keys())

    # batches
コード例 #3
0
# input: training set, validation set, test set
# output: the test and validation triples whose left-hand-side entity appears
# no more than max_freq times in the training set.

import sys
import io
import numpy as np
import batch

# select the rare-entity triples from the test and validation sets
if __name__ == "__main__":
    max_freq = 2  # selecting lhs that appear <= max_freq times in the training set
    lhs, rel, rhs = batch.load_labeled_entities(
        io.open("../data/prescription-sparse2-train.txt"))
    lhs_dict, lhs_count = batch.build_entity_dictionary(lhs)

    lhs, rel, rhs = batch.load_labeled_entities(
        io.open("../data/prescription-sparse2-test.txt"))
    buf = []
    for i in range(len(lhs)):
        if lhs[i] not in lhs_count or lhs_count[lhs[i]] <= max_freq:
            buf.append("{}\t{}\t{}\n".format(lhs[i], rel[i], rhs[i]))

    lhs, rel, rhs = batch.load_labeled_entities(
        io.open("../data/prescription-sparse2-valid.txt"))
    for i in range(len(lhs)):
        if lhs[i] not in lhs_count or lhs_count[lhs[i]] <= max_freq:
            buf.append("{}\t{}\t{}\n".format(lhs[i], rel[i], rhs[i]))

    with open("../data/prescription-sparse2-rare-{}-test.txt".format(max_freq),
              "w") as f_out: