Ejemplo n.º 1
0
def compute_relklinker(G, relsim, subs, preds, objs):
    """
    Run `relclosure` on every input triple over a re-weighted graph.

    Every edge is weighted by the weighted in-degree of its target node
    ("specificity"). For each triple (s, p, o) the edges pointing at `o`
    get weight 1 and all edges are then scaled by the similarity of their
    relation to `p` before `relclosure` is called with `kind='metric'` and
    `linkpred=True`.

    Parameters:
    -----------
    G: rgraph
        See `datastructures`. `G.csr.data` is overwritten: on return it
        holds the specificity weights, not the values it had on entry.
    relsim: ndarray
        A square matrix containing relational similarity scores.
    subs, preds, objs: sequence
        Sequences representing the subject, predicate and object of
        input triples.

    Returns:
    --------
    scores, paths, rpaths, times: list
        One list each for the proximity scores (`rp.score`), the path in
        terms of nodes (`rp.path`), the path in terms of relations
        (`rp.relational_path`), and the wall-clock seconds per triple.
    """
    # Specificity: each edge is weighted by the weighted in-degree of the
    # node it points to.
    indegsim = weighted_degree(G.indeg_vec, weight=WTFN).reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    G.csr.data = indegsim[targets]  # fancy indexing already returns a copy

    # Relation id of every edge. Floor division keeps the ids integral, so
    # they can index `relsim` rows without a float round trip.
    relations_int = ((G.csr.indices - targets) // G.N).astype(int)

    # Snapshot of the specificity-weighted graph, restored after each triple.
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()

    scores, paths, rpaths, times = [], [], [], []
    for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
        print('{}. Working on {}..'.format(idx + 1, (s, p, o)), end=' ')
        ts = time()
        try:
            # No cost for reaching the target o => max. specificity.
            G.csr.data[targets == o] = 1
            relsimvec = relsim[p, :]  # specific to predicate p
            relsim_wt = relsimvec[relations_int]  # per-edge similarity
            G.csr.data = np.multiply(relsim_wt, G.csr.data)

            rp = relclosure(G, s, p, o, kind='metric', linkpred=True)
            tend = time()
        finally:
            # Always restore the snapshot, even when relclosure raises, so
            # the graph never keeps the weights of a single triple.
            G.csr.data = data.copy()
            G.csr.indices = indices.copy()
            G.csr.indptr = indptr.copy()

        print('time: {:.2f}s'.format(tend - ts))
        times.append(tend - ts)
        scores.append(rp.score)
        paths.append(rp.path)
        rpaths.append(rp.relational_path)
        sys.stdout.flush()
    log.info('')
    return scores, paths, rpaths, times
Ejemplo n.º 2
0
    def compute_klinker(self, G, sid, pid, oid):
        """
        Run `closure` on every input triple over a specificity-weighted graph.

        Every edge is weighted by the weighted in-degree of its target node
        (weighting function `self.WTFN`); `closure` is then called per
        triple with `kind='metric'` and `linkpred=True`.

        Parameters:
        -----------
        G: rgraph
            See `datastructures`. `G.csr.data` is overwritten: on return it
            holds the specificity weights, not the values it had on entry.
        sid, pid, oid: sequence
            Sequences representing the subject, predicate and object of
            input triples.

        Returns:
        --------
        scores, paths, rpaths, times: list
            One list each for the proximity scores (`rp.score`), the path
            in terms of nodes (`rp.path`), the path in terms of relations
            (`rp.relational_path`), and the wall-clock seconds per triple.
        """
        # Specificity: each edge is weighted by the weighted in-degree of
        # the node it points to.
        indegsim = weighted_degree(G.indeg_vec, weight=self.WTFN).reshape(
            (1, G.N))
        indegsim = indegsim.ravel()
        targets = G.csr.indices % G.N
        G.csr.data = indegsim[targets]  # fancy indexing already returns a copy

        # Snapshot of the weighted graph, restored after each triple.
        data = G.csr.data.copy()
        indices = G.csr.indices.copy()
        indptr = G.csr.indptr.copy()

        # compute closure
        scores, paths, rpaths, times = [], [], [], []
        for idx, (s, p, o) in enumerate(zip(sid, pid, oid)):
            # sys.stdout.write is valid on both Python 2 and 3 and emits the
            # same "<progress> <time>\n" line the print statements produced.
            sys.stdout.write(
                '{}. Working on {}..'.format(idx + 1, (s, p, o)) + ' ')
            ts = time()
            try:
                rp = closure(G, s, p, o, kind='metric', linkpred=True)
                tend = time()
            finally:
                # Restore the snapshot even when closure raises.
                G.csr.data = data.copy()
                G.csr.indices = indices.copy()
                G.csr.indptr = indptr.copy()

            sys.stdout.write('time: {:.2f}s'.format(tend - ts) + '\n')
            times.append(tend - ts)
            scores.append(rp.score)
            paths.append(rp.path)
            rpaths.append(rp.relational_path)
            sys.stdout.flush()
        log.info('')
        return scores, paths, rpaths, times
Ejemplo n.º 3
0
def compute_mincostflow(G, relsim, subs, preds, objs, flowfile):
    """
    Run `succ_shortest_path` on every input triple and record the flows.

    For each triple (s, p, o) the edge weights are set to the product of
    the target's specificity (weighted in-degree) and the similarity of the
    edge's relation to `p`; `succ_shortest_path` is then called with
    `npaths=5`. The CSR arrays of `G` and the cost vector are restored to
    their entry values after every triple.

    Parameters:
    -----------
    G: rgraph
        See `datastructures`. The attributes `G.sources` and `G.targets`
        are (re)assigned by this function.
    relsim: ndarray
        A square matrix containing relational similarity scores.
    subs, preds, objs: sequence
        Sequences representing the subject, predicate and object of
        input triples.
    flowfile: str
        Absolute path of the file where flow will be stored as JSON,
        one line per triple. The file is truncated on open.

    Returns:
    --------
    mincostflows: list
        Total flow (`mcflow.flow`) for each triple.
    times: list
        Wall-clock seconds taken for each triple.
    """
    # take graph backup
    G_bak = {
        'data': G.csr.data.copy(),
        'indices': G.csr.indices.copy(),
        'indptr': G.csr.indptr.copy()
    }
    cost_vec_bak = np.log(G.indeg_vec).copy()

    # some set up
    G.sources = np.repeat(np.arange(G.N), np.diff(G.csr.indptr))
    G.targets = G.csr.indices % G.N
    cost_vec = cost_vec_bak.copy()
    indegsim = weighted_degree(G.indeg_vec, weight=WTFN)
    specificity_wt = indegsim[G.targets]  # specificity
    # Floor division keeps relation ids integral on Python 3 as well; a
    # float array cannot be used to index `relsimvec` below.
    relations = (G.csr.indices - G.targets) // G.N
    mincostflows, times = [], []
    # Unbuffered text files (buffering=0) are rejected by Python 3, so the
    # file is opened normally and flushed after every record instead.
    with open(flowfile, 'w') as ff:
        for idx, (s, p, o) in enumerate(zip(subs, preds, objs)):
            s, p, o = int(s), int(p), int(o)
            ts = time()
            # sys.stdout.write works on Python 2 and 3; the extra ' ' is the
            # separator the trailing-comma print statement used to add.
            sys.stdout.write(
                '{}. Working on {} .. '.format(idx + 1, (s, p, o)) + ' ')
            sys.stdout.flush()

            try:
                # set weights
                relsimvec = np.array(relsim[p, :])  # specific to predicate p
                relsim_wt = relsimvec[relations]
                G.csr.data = np.multiply(relsim_wt, specificity_wt)

                # compute
                mcflow = succ_shortest_path(G,
                                            cost_vec,
                                            s,
                                            p,
                                            o,
                                            return_flow=False,
                                            npaths=5)
                mincostflows.append(mcflow.flow)
                ff.write(json.dumps(mcflow.stream) + '\n')
                ff.flush()
                tend = time()
                times.append(tend - ts)
                sys.stdout.write(
                    'mincostflow: {:.5f}, #paths: {}, time: {:.2f}s.'.format(
                        mcflow.flow, len(mcflow.stream['paths']), tend - ts)
                    + '\n')
            finally:
                # reset state of the graph, even if the solver raised
                np.copyto(G.csr.data, G_bak['data'])
                np.copyto(G.csr.indices, G_bak['indices'])
                np.copyto(G.csr.indptr, G_bak['indptr'])
                np.copyto(cost_vec, cost_vec_bak)
    return mincostflows, times
Ejemplo n.º 4
0
def _build_filtered_adjacency(G, relsim, pid):
    """
    Collapse the relational graph to one weighted edge per (source, target).

    Edge weights are the product of target specificity (weighted in-degree)
    and the similarity of the edge's relation to `pid`; edges whose relation
    equals `pid` get weight 0. For every (source, target) pair only the edge
    with the largest weight is kept, and pairs whose best weight is 0 are
    dropped.

    Side effect: `G.csr.data` is overwritten with the computed edge weights.

    Returns:
    --------
    Gr, Gv: csr_matrix
        Square matrices with one row/column per node: `Gr` holds the
        relation id of the kept edge, `Gv` its weight.
    """
    # set weights
    indegsim = weighted_degree(G.indeg_vec, weight=WTFN).reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    # Floor division keeps relation ids integral (usable as indices) on
    # Python 3 as well as Python 2.
    relations = (G.csr.indices - targets) // G.N
    relsimvec = np.array(relsim[int(pid), :])  # specific to predicate p
    relsim_wt = relsimvec[relations]  # one similarity value per edge
    specificity_wt = indegsim[targets]  # specificity, one value per edge

    # Removing all the edges with the predicate p in between any nodes.
    log.info('=> Removing predicate {} from KG.\n\n'.format(pid))
    eraseedges_mask = relations == pid
    specificity_wt[eraseedges_mask] = 0
    relsim_wt[eraseedges_mask] = 0
    G.csr.data = np.multiply(relsim_wt, specificity_wt)

    log.info("Constructing adjacency matrix for: {}".format(pid))
    adj_list_data, adj_list_s, adj_list_p, adj_list_o = [], [], [], []
    indptr = G.csr.indptr
    edge_wt = G.csr.data
    num_nodes = len(indptr) - 1
    for node in tqdm(range(num_nodes)):
        start, end = indptr[node], indptr[node + 1]
        # target -> (weight, relation) of the heaviest edge seen so far;
        # on ties the first edge encountered wins.
        best = {}
        for tgt, wt, rel in zip(targets[start:end], edge_wt[start:end],
                                relations[start:end]):
            if tgt not in best or best[tgt][0] < wt:
                best[tgt] = (wt, rel)
        for tgt, (wt, rel) in best.items():
            if wt != 0:
                adj_list_data.append(wt)
                adj_list_s.append(node)
                adj_list_p.append(rel)
                adj_list_o.append(tgt)
    Gr = csr_matrix((adj_list_p, (adj_list_s, adj_list_o)),
                    shape=(num_nodes, num_nodes))
    Gv = csr_matrix((adj_list_data, (adj_list_s, adj_list_o)),
                    shape=(num_nodes, num_nodes))
    return Gr, Gv


def train_model_sm(G,
                   triples,
                   relsim,
                   use_interpretable_features=False,
                   cv=10):
    """
    Entry point for building a fact-checking classifier.
    Performs three steps:
    1. Path extraction (features)
    2a. Path selection using information gain
    2b. Filtering most informative discriminative predicate paths
    3. Building logistic regression model

    The filtered adjacency matrices for the predicate of the first triple
    are cached as `G_fil_rel_<pid>.npz` / `G_fil_val_<pid>.npz` under
    `HOME/sm`; they are loaded when both files exist and built (and saved)
    otherwise. Building them overwrites `G.csr.data`.

    Parameters:
    -----------
    G: rgraph
        Knowledge graph.
    triples: dataframe
        A data frame consisting of at least four columns, including
        sid, pid, oid, class. Must contain at least one row; the `pid` of
        the first row selects the predicate.
    relsim: ndarray
        Matrix of relational similarity scores, indexed by predicate id.
    use_interpretable_features: bool
        Whether or not to perform 2b.
    cv: int
        Number of cross-validation folds.

    Returns:
    --------
    vec: DictVectorizer
        Useful for preprocessing future triples.
    model: dict
        A dictionary containing 'clf' as the built model,
        and two other key-value pairs, including best parameter
        and best AUROC score.
    """
    y = triples['class']  # ground truth
    triples = triples[['sid', 'pid', 'oid']].to_dict(orient='records')

    pid = triples[0]['pid']
    # `to_dict` may hand back plain Python ints, which have no `.dtype`.
    log.info('PID is: {}, with type: {}'.format(
        pid, getattr(pid, 'dtype', type(pid))))

    rel_path = join(HOME, 'sm', 'G_fil_rel_{}.npz'.format(int(pid)))
    val_path = join(HOME, 'sm', 'G_fil_val_{}.npz'.format(int(pid)))
    source = np.DataSource()
    if source.exists(val_path) and source.exists(rel_path):
        Gr = load_npz(rel_path)
        Gv = load_npz(val_path)
    else:
        Gr, Gv = _build_filtered_adjacency(G, relsim, pid)
        save_npz(rel_path, Gr)
        save_npz(val_path, Gv)

    ############# Path extraction ###################
    log.info('=> Path extraction..(this can take a while)')
    t1 = time()
    features, pos_features, neg_features, measurements = extract_paths_sm_par(
        Gv, Gr, triples, y)
    gc.collect()
    log.info('P: +:{}, -:{}, unique tot:{}'.format(len(pos_features),
                                                   len(neg_features),
                                                   len(features)))
    vec = DictVectorizer()
    X = vec.fit_transform(measurements)
    n, m = X.shape
    log.info('Time taken: {:.2f}s\n\n'.format(time() - t1))

    ########### Path selection ###############
    log.info('=> Path selection..')
    t1 = time()
    pathselect = SelectKBest(mutual_info_classif, k=min(100, m))
    X_select = pathselect.fit_transform(X, y)
    selectidx = pathselect.get_support(
        indices=True)  # selected feature indices
    vec = vec.restrict(selectidx, indices=True)
    select_pos_features, select_neg_features = set(), set()
    for feature in vec.get_feature_names():
        if feature in pos_features:
            select_pos_features.add(feature)
        if feature in neg_features:
            select_neg_features.add(feature)
    log.info('D: +:{}, -:{}, tot:{}'.format(len(select_pos_features),
                                            len(select_neg_features),
                                            X_select.shape[1]))
    log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    # Fact interpretation
    if use_interpretable_features and len(select_neg_features) > 0:
        log.info('=> Fact interpretation..')
        t1 = time()
        theta = 10  # negative features with a column total >= theta are dropped
        feature_names = vec.get_feature_names()
        select_neg_idx = [
            i for i, f in enumerate(feature_names)
            if f in select_neg_features
        ]
        # Column totals of the negative features. Summing a sparse matrix
        # yields a (1, k) matrix, so flatten it before pairing each total
        # with its feature index.
        neg_totals = np.asarray(
            X_select[:, select_neg_idx].sum(axis=0)).ravel()
        restrictidx = {
            i for i, total in zip(select_neg_idx, neg_totals)
            if total >= theta
        }
        keepidx = []
        for i, f in enumerate(feature_names):
            if i in restrictidx:
                select_neg_features.remove(f)
            else:
                keepidx.append(i)
        vec = vec.restrict(keepidx, indices=True)
        X_select = X_select[:, keepidx]
        log.info('D*: +:{}, -:{}, tot:{}'.format(len(select_pos_features),
                                                 len(select_neg_features),
                                                 X_select.shape[1]))
        log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    # Model creation
    log.info('=> Model building..')
    t1 = time()
    model = find_best_model(X_select, y, cv=cv)
    log.info('#Features: {}, best-AUROC: {:.5f}'.format(
        X_select.shape[1], model['best_score']))
    log.info('Time taken: {:.2f}s\n'.format(time() - t1))

    return vec, model
Ejemplo n.º 5
0
def test_graph2():
    """
    Check `relclosure` against a table of expected results on a toy graph.

    Builds a graph with shape (6, 6, 3) from nine weighted triples (with
    `sym=True`), weights each edge by the 'degree'-weighted in-degree of its
    target, and then calls `relclosure` for every (s, p, o) combination
    with s != o, comparing each result with the matching row of `expect`.
    """
    sym = True
    # Rows of `adj`: the first three columns are passed to `make_graph` as
    # the triples, the fourth column as `values`.
    adj = np.array([[0, 1, 0, 16], [2, 4, 0, 14], [4, 5, 0, 4], [0, 2, 1, 13],
                    [2, 1, 1, 4], [3, 5, 1, 20], [1, 3, 2, 12], [3, 2, 2, 9],
                    [4, 3, 2, 7]])
    shape = (6, 6, 3)
    G = make_graph(adj[:, :3], shape, values=adj[:, 3], sym=sym, display=False)
    print "Original graph:\n", G

    # set weights
    indegsim = weighted_degree(G.indeg_vec, weight='degree').reshape((1, G.N))
    indegsim = indegsim.ravel()
    targets = G.csr.indices % G.N
    specificity_wt = indegsim[targets]  # specificity
    G.csr.data = specificity_wt.copy()

    # back up
    data = G.csr.data.copy()
    indices = G.csr.indices.copy()
    indptr = G.csr.indptr.copy()

    # Closure
    # Each row mirrors `tmp` below:
    # [source, relation, target, score, path, relational_path].
    # Rows are listed in the iteration order of the s / p / o loops
    # (s outermost, o innermost, s == o skipped).
    expect = [[0, 0, 1, 0.20000000000000001, [0, 2, 1], [-1, 1, 1]],
              [0, 0, 2, 1.0, [0, 2], [-1, 1]],
              [0, 0, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
              [0, 0, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
              [0, 0, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
              [0, 1, 1, 1.0, [0, 1], [-1, 0]],
              [0, 1, 2, 0.25, [0, 1, 2], [-1, 0, 1]],
              [0, 1, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
              [0, 1, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
              [0, 1, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
              [0, 2, 1, 1.0, [0, 1], [-1, 0]], [0, 2, 2, 1.0, [0, 2], [-1, 1]],
              [0, 2, 3, 0.25, [0, 1, 3], [-1, 0, 2]],
              [0, 2, 4, 0.20000000000000001, [0, 2, 4], [-1, 1, 0]],
              [0, 2, 5, 0.125, [0, 1, 3, 5], [-1, 0, 2, 1]],
              [1, 0, 0, 0.20000000000000001, [1, 2, 0], [-1, 1, 1]],
              [1, 0, 2, 1.0, [1, 2], [-1, 1]], [1, 0, 3, 1.0, [1, 3], [-1, 2]],
              [1, 0, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
              [1, 0, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
              [1, 1, 0, 1.0, [1, 0], [-1, 0]],
              [1, 1, 2, 0.33333333333333331, [1, 0, 2], [-1, 0, 1]],
              [1, 1, 3, 1.0, [1, 3], [-1, 2]],
              [1, 1, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
              [1, 1, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
              [1, 2, 0, 1.0, [1, 0], [-1, 0]], [1, 2, 2, 1.0, [1, 2], [-1, 1]],
              [1, 2, 3, 0.20000000000000001, [1, 2, 3], [-1, 1, 2]],
              [1, 2, 4, 0.20000000000000001, [1, 3, 4], [-1, 2, 2]],
              [1, 2, 5, 0.20000000000000001, [1, 3, 5], [-1, 2, 1]],
              [2, 0, 0, 1.0, [2, 0], [-1, 1]], [2, 0, 1, 1.0, [2, 1], [-1, 1]],
              [2, 0, 3, 1.0, [2, 3], [-1, 2]],
              [2, 0, 4, 0.20000000000000001, [2, 3, 4], [-1, 2, 2]],
              [2, 0, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
              [2, 1, 0, 0.25, [2, 1, 0], [-1, 1, 0]],
              [2, 1, 1, 0.33333333333333331, [2, 0, 1], [-1, 1, 0]],
              [2, 1, 3, 1.0, [2, 3], [-1, 2]], [2, 1, 4, 1.0, [2, 4], [-1, 0]],
              [2, 1, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
              [2, 2, 0, 1.0, [2, 0], [-1, 1]], [2, 2, 1, 1.0, [2, 1], [-1, 1]],
              [2, 2, 3, 0.25, [2, 1, 3], [-1, 1, 2]],
              [2, 2, 4, 1.0, [2, 4], [-1, 0]],
              [2, 2, 5, 0.25, [2, 4, 5], [-1, 0, 0]],
              [3, 0, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
              [3, 0, 1, 1.0, [3, 1], [-1, 2]], [3, 0, 2, 1.0, [3, 2], [-1, 2]],
              [3, 0, 4, 1.0, [3, 4], [-1, 2]], [3, 0, 5, 1.0, [3, 5], [-1, 1]],
              [3, 1, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
              [3, 1, 1, 1.0, [3, 1], [-1, 2]], [3, 1, 2, 1.0, [3, 2], [-1, 2]],
              [3, 1, 4, 1.0, [3, 4], [-1, 2]],
              [3, 1, 5, 0.25, [3, 4, 5], [-1, 2, 0]],
              [3, 2, 0, 0.25, [3, 1, 0], [-1, 2, 0]],
              [3, 2, 1, 0.20000000000000001, [3, 2, 1], [-1, 2, 1]],
              [3, 2, 2, 0.25, [3, 4, 2], [-1, 2, 0]],
              [3, 2, 4, 0.33333333333333331, [3, 5, 4], [-1, 1, 0]],
              [3, 2, 5, 1.0, [3, 5], [-1, 1]],
              [4, 0, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
              [4, 0, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
              [4, 0, 2, 0.20000000000000001, [4, 3, 2], [-1, 2, 2]],
              [4, 0, 3, 1.0, [4, 3], [-1, 2]],
              [4, 0, 5, 0.20000000000000001, [4, 3, 5], [-1, 2, 1]],
              [4, 1, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
              [4, 1, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
              [4, 1, 2, 1.0, [4, 2], [-1, 0]], [4, 1, 3, 1.0, [4, 3], [-1, 2]],
              [4, 1, 5, 1.0, [4, 5], [-1, 0]],
              [4, 2, 0, 0.20000000000000001, [4, 2, 0], [-1, 0, 1]],
              [4, 2, 1, 0.20000000000000001, [4, 3, 1], [-1, 2, 2]],
              [4, 2, 2, 1.0, [4, 2], [-1, 0]],
              [4, 2, 3, 0.33333333333333331, [4, 5, 3], [-1, 0, 1]],
              [4, 2, 5, 1.0, [4, 5], [-1, 0]],
              [5, 0, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
              [5, 0, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
              [5, 0, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
              [5, 0, 3, 1.0, [5, 3], [-1, 1]],
              [5, 0, 4, 0.20000000000000001, [5, 3, 4], [-1, 1, 2]],
              [5, 1, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
              [5, 1, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
              [5, 1, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
              [5, 1, 3, 0.25, [5, 4, 3], [-1, 0, 2]],
              [5, 1, 4, 1.0, [5, 4], [-1, 0]],
              [5, 2, 0, 0.125, [5, 4, 2, 0], [-1, 0, 0, 1]],
              [5, 2, 1, 0.20000000000000001, [5, 3, 1], [-1, 1, 2]],
              [5, 2, 2, 0.25, [5, 4, 2], [-1, 0, 0]],
              [5, 2, 3, 1.0, [5, 3], [-1, 1]], [5, 2, 4, 1.0, [5, 4], [-1, 0]]]
    results = []  # every computed row is collected here; not read in this view
    itr = 0  # index of the next row of `expect` to compare against
    for s in xrange(G.N):
        for p in xrange(G.R):
            for o in xrange(G.N):
                if s == o:
                    continue
                # Edges pointing at the target o get weight 1.
                G.csr.data[targets == o] = 1
                rp = relclosure(G, s, p, o, kind='metric', linkpred=True)
                tmp = [
                    rp.source, rp.relation, rp.target, rp.score, rp.path,
                    rp.relational_path
                ]
                results.append(tmp)
                # NOTE(review): `allclose` is defined outside this view and
                # the rows hold nested lists of unequal length -- confirm the
                # helper compares such ragged rows as intended.
                assert allclose(expect[itr], tmp)
                itr += 1
                # Restore the backed-up graph before the next combination.
                G.csr.data = data.copy()
                G.csr.indices = indices.copy()
                G.csr.indptr = indptr.copy()