Example #1
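All of the snippets on this page come from the same knowledge-graph embedding project and share a preamble that the listing omits. A minimal sketch of those imports, assuming config and data_utils are project-local modules and that load_dict_from_txt and find_pos live in data_utils (Example #7 additionally uses an AttrDict helper defined elsewhere in the project):

import os

import numpy as np
import pandas as pd
import tensorflow as tf  # TF 1.x API (tf.Session, tf.train.import_meta_graph)
from tqdm import tqdm

import config  # project-local path constants (CHECKPOINT_PATH, E2ID, ...)
import data_utils  # project-local helpers
from data_utils import load_dict_from_txt, find_pos  # assumed location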
def get_complex_scores(model_name, output_path):
    # NB: output_path is currently unused.
    checkpoint_file = os.path.join(config.CHECKPOINT_PATH, model_name)
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        # Input placeholders and prediction op, recovered by name from the
        # restored graph.
        heads = graph.get_operation_by_name("head_entities").outputs[0]
        tails = graph.get_operation_by_name("tail_entities").outputs[0]
        relations = graph.get_operation_by_name("relations").outputs[0]
        pred = graph.get_operation_by_name("pred").outputs[0]

        # Map the HRERE relation ids (1..54) onto the FB3M relation ids.
        relation2id = load_dict_from_txt("../HRERE/data/relation2id.txt")
        id2r = {v: k for k, v in relation2id.items()}
        r2id = load_dict_from_txt(config.FB3M_R2ID)
        r = [r2id[id2r[i]] for i in range(1, 55)]

        # For each test pair, score every candidate relation and count how
        # often the top-scoring one matches the gold label.
        cnt = 0
        with open(config.FB1M_TEST) as infile:
            for line in infile:
                e1, l, e2 = map(int, line.strip().split(","))
                res = sess.run(pred,
                               feed_dict={
                                   heads: [e1] * len(r),
                                   tails: [e2] * len(r),
                                   relations: r
                               })
                if r[np.argmax(res)] == l:
                    cnt += 1
        print(cnt)
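A hypothetical call; the checkpoint name is a placeholder, and output_path is never used inside the function as written:

get_complex_scores("best_ComplEx_fb1m", output_path=None)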
Example #2
def reconstruct(input_file, output_file, kb_file, count_file, threshold=30):
    d = data_utils.load_dict_from_txt(input_file)
    count_dict = data_utils.load_dict_from_txt(count_file)
    # Seed entity set: mapped entities that occur at least threshold times.
    e = {
        v for v in d.values()
        if (v in count_dict) and (count_dict[v] >= threshold)
    }
    # linecount = data_utils.file_len(kb_file)
    linecount = 435406270  # hardcoded triple count for the Freebase dump
    # Pass 1: grow the set with frequent entities adjacent to a seed entity.
    with open(kb_file) as infile:
        for _ in tqdm(range(linecount)):
            e1, r, e2 = infile.readline().strip().split("\t")
            if not e1.startswith("m."):
                continue
            if not e2.startswith("m."):
                continue
            if (e1 in e) or (e2 in e):
                # .get avoids a KeyError on entities missing from count_dict.
                if count_dict.get(e1, 0) >= threshold:
                    e.add(e1)
                if count_dict.get(e2, 0) >= threshold:
                    e.add(e2)
    # Pass 2: keep only triples whose endpoints both made the cut.
    with open(kb_file) as infile, open(output_file, "w") as outfile:
        for _ in tqdm(range(linecount)):
            e1, r, e2 = infile.readline().strip().split("\t")
            if (e1 in e) and (e2 in e):
                outfile.write("%s\t%s\t%s\n" % (r, e1, e2))
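A sketch of an invocation with placeholder file names:

reconstruct("mid2entity.txt", "fb_subgraph.txt",
            "freebase_triples.txt", "entity_counts.txt", threshold=30)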
Example #3
def complex():
    # NB: the name shadows Python's built-in complex(); kept as in the source.
    # Pretrained embeddings with two components per entity/relation (real and
    # imaginary parts, as in ComplEx), plus the id mappings they were trained
    # under.
    entity1 = np.load(config.KG_PATH + "/entity1.npy")
    entity2 = np.load(config.KG_PATH + "/entity2.npy")
    relation1 = np.load(config.KG_PATH + "/relation1.npy")
    relation2 = np.load(config.KG_PATH + "/relation2.npy")
    e2id = load_dict_from_txt(config.KG_PATH + "/e2id.txt")
    r2id = load_dict_from_txt(config.KG_PATH + "/r2id.txt")

    # Target id spaces; rows with no pretrained vector keep a random uniform
    # initialization.
    entity2id = load_dict_from_txt(config.E2ID)
    relation2id = load_dict_from_txt(config.R2ID)
    e1_embeddings = np.random.uniform(0.0, 1.0, (len(entity2id), entity1.shape[1]))
    e2_embeddings = np.random.uniform(0.0, 1.0, (len(entity2id), entity2.shape[1]))
    r1_embeddings = np.random.uniform(0.0, 1.0, (len(relation2id), relation1.shape[1]))
    r2_embeddings = np.random.uniform(0.0, 1.0, (len(relation2id), relation2.shape[1]))
    # Copy each shared entity/relation vector into its row in the new id space.
    for e in entity2id:
        if e not in e2id:
            continue
        idx1 = entity2id[e]
        idx2 = e2id[e]
        e1_embeddings[idx1, :] = entity1[idx2, :]
        e2_embeddings[idx1, :] = entity2[idx2, :]
    for r in relation2id:
        if r not in r2id:
            continue
        idx1 = relation2id[r]
        idx2 = r2id[r]
        r1_embeddings[idx1, :] = relation1[idx2, :]
        r2_embeddings[idx1, :] = relation2[idx2, :]
    np.save(config.ENTITY_EMBEDDING1, e1_embeddings)
    np.save(config.ENTITY_EMBEDDING2, e2_embeddings)
    np.save(config.RELATION_EMBEDDING1, r1_embeddings)
    np.save(config.RELATION_EMBEDDING2, r2_embeddings)
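A quick shape check after running it (a sketch; whether np.load needs an explicit ".npy" suffix depends on how the config constants are spelled):

e1 = np.load(config.ENTITY_EMBEDDING1)  # may need a ".npy" suffix appended
e2 = np.load(config.ENTITY_EMBEDDING2)
print(e1.shape, e2.shape)  # both (n_entities, embedding_dim)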
Example #4
def main(options):
    if options.data_name == "bp":
        e2id = load_dict_from_txt(config.BP_E2ID)
        r2id = load_dict_from_txt(config.BP_R2ID)
        n_entities = len(e2id)
        i2e = {v: k for k, v in e2id.items()}
        i2r = {v: k for k, v in r2id.items()}
    else:
        # Without this branch the lookups below would hit a NameError.
        raise AttributeError("Invalid data name! (Valid data name: bp)")
    e1 = options.head
    e2 = options.tail
    r = options.relation
    if r == -1:
        raise AttributeError("Please specify the relation!")
    if (e1 == -1) and (e2 == -1):
        raise AttributeError("Please specify one entity!")
    if (e1 != -1) and (e2 != -1):
        raise AttributeError("Please specify only one entity!")
    # Build an (n_entities, 3) matrix of (head, relation, tail) candidates,
    # sweeping all entities through whichever slot was left unspecified.
    idx_mat = np.empty((n_entities, 3), dtype=np.int64)
    if e1 == -1:
        idx_mat[:, 1:] = np.tile((r, e2), (n_entities, 1))
        idx_mat[:, 0] = np.arange(n_entities)
    else:
        idx_mat[:, :2] = np.tile((e1, r), (n_entities, 1))
        idx_mat[:, 2] = np.arange(n_entities)

    checkpoint_file = os.path.join(config.CHECKPOINT_PATH,
                                   "best_TransE_L2_wn18")
    graph = tf.Graph()
    with graph.as_default():
        sess = tf.Session()
        saver = tf.train.import_meta_graph("{}.meta".format(checkpoint_file))
        saver.restore(sess, checkpoint_file)

        heads = graph.get_operation_by_name("head_entities").outputs[0]
        tails = graph.get_operation_by_name("tail_entities").outputs[0]
        relations = graph.get_operation_by_name("relations").outputs[0]
        # (The graph also defines a "labels" input; it is not needed here.)
        pred = graph.get_operation_by_name("pred").outputs[0]

        preds = sess.run(pred, {
            heads: idx_mat[:, 0],
            tails: idx_mat[:, 2],
            relations: idx_mat[:, 1]
        })
        # Rank every candidate entity by predicted score, highest first.
        print("Top 10 Candidates for (%s, %s, %s):" %
              (i2e.get(e1, "_"), i2r[r], i2e.get(e2, "_")))
        for w in np.argsort(preds)[::-1][:10]:
            print(i2e[w], preds[w])
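The options object is not constructed in this snippet; a hypothetical argparse setup matching the -1 sentinel defaults could look like:

import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--data_name", default="bp")
parser.add_argument("--head", type=int, default=-1)      # -1 = unspecified
parser.add_argument("--tail", type=int, default=-1)      # -1 = unspecified
parser.add_argument("--relation", type=int, default=-1)  # -1 = unspecified
main(parser.parse_args())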
Example #5
def convert_corpus(corpus, output_corpus, known, count_file, threshold=30):
    sen = pd.read_csv(corpus, sep="\t", names=["h", "t", "s", "p"])
    count_dict = data_utils.load_dict_from_txt(count_file)
    d = data_utils.load_dict_from_txt(known)
    # Keep only mappings whose target entity is frequent enough.
    known = {
        k: v
        for k, v in d.items()
        if (v in count_dict) and (count_dict[v] >= threshold)
    }
    # Map head/tail mentions to entity ids; rows with unknown or rare
    # entities become NaN and are dropped.
    sen.h = sen.h.map(known)
    sen.t = sen.t.map(known)
    sen.dropna(axis=0, how="any", inplace=True)
    sen.to_csv(output_corpus, sep="\t", header=False, index=False)
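A hypothetical call with placeholder file names:

convert_corpus("sentences.tsv", "sentences_filtered.tsv",
               "mention2mid.txt", "entity_counts.txt", threshold=30)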
Example #6
def real():
    # Same alignment as complex() in Example #3, for embeddings with a single
    # matrix per entity and relation.
    entity = np.load(config.KG_PATH + "/entity.npy")
    relation = np.load(config.KG_PATH + "/relation.npy")
    e2id = load_dict_from_txt(config.KG_PATH + "/e2id.txt")
    r2id = load_dict_from_txt(config.KG_PATH + "/r2id.txt")

    entity2id = load_dict_from_txt(config.E2ID)
    relation2id = load_dict_from_txt(config.R2ID)
    # Random uniform init for entries without a pretrained vector.
    e_embeddings = np.random.uniform(0.0, 1.0, (len(entity2id), entity.shape[1]))
    r_embeddings = np.random.uniform(0.0, 1.0, (len(relation2id), relation.shape[1]))
    for e in entity2id:
        if e not in e2id:
            continue
        e_embeddings[entity2id[e], :] = entity[e2id[e], :]
    for r in relation2id:
        if r not in r2id:
            continue
        r_embeddings[relation2id[r], :] = relation[r2id[r], :]
    np.save(config.ENTITY_EMBEDDING, e_embeddings)
    np.save(config.RELATION_EMBEDDING, r_embeddings)
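real() is the single-matrix counterpart of complex() above: one embedding matrix per entity and relation instead of two, presumably for real-valued models such as TransE.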
Example #7
    def __init__(self, model_name, data_name, cv_runs, params_dict, logger):
        # Dataset files keyed by name: (train, valid, test, e2id, r2id).
        datasets = {
            "wn18": (config.WN18_TRAIN, config.WN18_VALID, config.WN18_TEST,
                     config.WN18_E2ID, config.WN18_R2ID),
            "fb15k": (config.FB15K_TRAIN, config.FB15K_VALID, config.FB15K_TEST,
                      config.FB15K_E2ID, config.FB15K_R2ID),
            "bp": (config.BP_TRAIN, config.BP_VALID, config.BP_TEST,
                   config.BP_E2ID, config.BP_R2ID),
            "fb1m": (config.FB1M_TRAIN, config.FB1M_VALID, config.FB1M_TEST,
                     config.FB1M_E2ID, config.FB1M_R2ID),
        }
        if data_name not in datasets:
            raise AttributeError(
                "Invalid data name! (Valid data names: wn18, fb15k, bp, fb1m)")
        train, valid, test, e2id_file, r2id_file = datasets[data_name]
        # .values replaces the deprecated .as_matrix(), removed in pandas 1.0.
        self.train_triples = pd.read_csv(train, names=["e1", "r", "e2"]).values
        self.valid_triples = pd.read_csv(valid, names=["e1", "r", "e2"]).values
        self.test_triples = pd.read_csv(test, names=["e1", "r", "e2"]).values
        self.e2id = load_dict_from_txt(e2id_file)
        self.r2id = load_dict_from_txt(r2id_file)

        self.model_name = model_name
        self.data_name = data_name
        self.cv_runs = cv_runs
        self.params_dict = params_dict
        self.hparams = AttrDict(params_dict)
        self.logger = logger
        self.n_entities = len(self.e2id)
        self.n_relations = len(self.r2id)

        self.model = self._get_model()
        self.saver = tf.train.Saver(tf.global_variables())
        checkpoint_path = os.path.abspath(config.CHECKPOINT_PATH)
        if not os.path.exists(checkpoint_path):
            os.makedirs(checkpoint_path)
        self.checkpoint_prefix = os.path.join(checkpoint_path, self.__str__())
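The enclosing class is not shown in this snippet; assuming it is the project's task wrapper (called Task below purely for illustration), construction might look like:

import logging

params = {"embedding_size": 100, "learning_rate": 0.001}  # illustrative values
task = Task(model_name="TransE_L2", data_name="wn18", cv_runs=1,
            params_dict=params, logger=logging.getLogger(__name__))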
Example #8
def preprocess(raw_data, clean_data, if_test=False):
    e2id = load_dict_from_txt(config.E2ID)
    r2id = load_dict_from_txt(config.R2ID)

    if if_test:
        df = pd.read_csv(raw_data,
                         sep="\t",
                         names=["a1", "a2", "e1", "e2", "r", "s", "end"],
                         na_values=[],
                         keep_default_na=False)
    else:
        df = pd.read_csv(raw_data,
                         sep="\t",
                         names=["a1", "a2", "e1", "e2", "r", "s"],
                         na_values=[],
                         keep_default_na=False)
        # Drop the trailing token of each training sentence (likely an
        # end-of-sentence marker).
        df.s = df.s.map(lambda x: " ".join(x.split()[:-1]))

    # Sentence length with multi-word entities expanded: each entity occupies
    # one underscore-joined token in s, so add its word count and subtract
    # the placeholder token.
    df["len1"] = df.e1.map(lambda x: len(x.split('_')))
    df["len2"] = df.e2.map(lambda x: len(x.split('_')))
    df["len"] = df.s.map(lambda x: len(x.split())) + df.len1 + df.len2 - 2

    # find_pos (defined elsewhere in the project) returns the processed
    # sentence plus the start/end positions of both entities.
    df["info"] = df.s + "\t" + df.e1 + "\t" + df.e2
    df["info"] = df["info"].map(find_pos)
    df["s"] = df["info"].map(lambda x: x[0])
    df["x1"] = df["info"].map(lambda x: x[1])
    df["y1"] = df["info"].map(lambda x: x[2])
    df["x2"] = df["info"].map(lambda x: x[3])
    df["y2"] = df["info"].map(lambda x: x[4])
    df["e1"] = df.a1.map(e2id)
    df["e2"] = df.a2.map(e2id)

    # Collapse deprecated Freebase relations onto their replacements.
    relation_aliases = {
        "/business/company/industry": "/business/business_operation/industry",
        "/business/company/locations": "/organization/organization/locations",
        "/business/company/founders": "/organization/organization/founders",
        "/business/company/major_shareholders": "/organization/organization/founders",
        "/business/company/advisors": "/organization/organization/advisors",
        "/business/company_shareholder/major_shareholder_of":
            "/organization/organization_founder/organizations_founded",
        # The original checked "business/company/place_founded" without the
        # leading slash, so this alias could never fire; fixed here.
        "/business/company/place_founded": "/organization/organization/place_founded",
        "/people/person/place_lived": "/people/person/place_of_birth",
        "/business/person/company":
            "/organization/organization_founder/organizations_founded",
    }
    df.r = df.r.map(lambda x: relation_aliases.get(x, x))
    # Unknown relations fall back to id 0 (presumably the NA relation).
    df.r = df.r.map(lambda x: r2id.get(x, 0)).astype(int)
    # print(df.r.value_counts())

    df[["r", "e1", "x1", "y1", "e2", "x2", "y2", "s"]].to_csv(clean_data,
                                                              sep="\t",
                                                              index=False,
                                                              header=False)
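A hypothetical invocation; RAW_TRAIN, CLEAN_TRAIN, RAW_TEST, and CLEAN_TEST are placeholder names, not confirmed attributes of the config module:

preprocess(config.RAW_TRAIN, config.CLEAN_TRAIN)              # training corpus
preprocess(config.RAW_TEST, config.CLEAN_TEST, if_test=True)  # test corpus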