Ejemplo n.º 1
0
def semihin_experiment(scope, scope_name, count, X, newIds, label_num=5):
    """Run a semi-supervised HIN experiment on a document-feature bipartite graph.

    Builds a bipartite graph linking the n documents (rows of X) to their e
    features (columns of X), then runs label propagation with SSLClassifier
    over the pre-generated fixed train/test splits on disk.

    Parameters
    ----------
    scope : class labels to classify within.
    scope_name : str, dataset name; locates the dumped HIN and the splits.
    count : number of classes (passed to SSLClassifier as classCount).
    X : (n, e) feature matrix, dense ndarray or scipy sparse.
    newIds : document-id mapping used by the fixed experiment splits.
    label_num : labeled training examples per class (default 5).

    Returns
    -------
    Mean accuracy over the repeated experiments.
    """
    experiment_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    # Bipartite adjacency: documents occupy indices [0, n), features [n, n+e).
    # sparse.bmat assembles the block matrix directly, avoiding the dense
    # (n+e) x (n+e) zero matrix the naive construction would allocate.
    Xs = X if sparse.issparse(X) else sparse.csc_matrix(np.asarray(X))
    graph = sparse.bmat([[None, Xs], [Xs.transpose(), None]], format='csc')

    newLabel = GraphGenerator.getNewLabels(hin)
    lp_param = {'alpha': 0.98, 'normalization_factor': 5, 'method': 'variant'}

    ssl = SSLClassifier(graph,
                        newLabel,
                        scope,
                        lp_param,
                        repeatTimes=50,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=experiment_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
Ejemplo n.º 2
0
def knowsim_experiment(scope,
                       scope_name,
                       type_list,
                       count,
                       newLabels,
                       tau=1,
                       kNeighbors=10,
                       label_num=5):
    """Run label propagation over a KnowSim meta-path similarity graph.

    For each entity type in type_list, builds a symmetric kNN
    cosine-similarity graph over the typed TF vectors, weights it by
    exp(-tau * laplacian_score) as a meta-path importance, and accumulates
    the weighted graphs into a single 'knowsim' matrix used for
    semi-supervised classification.

    Parameters
    ----------
    scope : class labels to classify within.
    scope_name : str, dataset name; locates the dumped HIN and the splits.
    type_list : iterable of entity types, one meta-path graph per type.
    count : number of classes (classCount for SSLClassifier).
    newLabels : ground-truth labels for the graph nodes.
    tau : decay rate applied to the laplacian score (default 1).
    kNeighbors : neighbors kept per node in each similarity graph (default 10).
    label_num : labeled training examples per class (default 5).

    Returns
    -------
    Mean accuracy over the repeated experiments.
    """
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    repeats = 50
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]

    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)

        # Symmetric kNN graph: keep each row's kNeighbors most similar docs.
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            # argpartition finds the top-k in O(n) per row; order within the
            # top-k is irrelevant here.
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue  # no self loops
                graph[i, j] = cosX[i, j]
                graph[j, i] = cosX[i, j]

        # Laplacian score measures how well this meta-path preserves local
        # word-level structure; it becomes the meta-path weight below.
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # Accumulate the weighted meta-path graph into knowsim.
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print('running lp')
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}

    ssl = SSLClassifier(knowsim,
                        newLabels,
                        scope,
                        lp_param,
                        repeatTimes=repeats,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
Ejemplo n.º 3
0
def lp_experiment(scope, scope_name, count, graph, labels, newIds, label_num=5):
    """Run plain label propagation over a precomputed similarity graph.

    Parameters
    ----------
    scope : class labels to classify within.
    scope_name : str, dataset name; locates the pre-generated splits.
    count : number of classes (classCount for SSLClassifier).
    graph : precomputed (sparse) affinity graph.
    labels : ground-truth labels for the graph nodes.
    newIds : document-id mapping used by the fixed experiment splits.
    label_num : labeled training examples per class (default 5; previously
        hard-coded, now a parameter for consistency with the sibling
        experiment functions).

    Returns
    -------
    Mean accuracy over the repeated experiments.
    """
    experiment_path = 'data/local/split/' + scope_name + '/'
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}
    ssl = SSLClassifier(graph,
                        labels,
                        scope,
                        lp_param,
                        repeatTimes=50,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=experiment_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
Ejemplo n.º 4
0
def ensemble_cotrain_experiment(scope,
                                scope_name,
                                type_list,
                                threshold,
                                weight,
                                count,
                                label_num=5):
    """Co-training ensemble over meta-path feature views.

    Alternates for a fixed number of rounds between (1) running a
    label-propagation classifier per meta-path on Laplacian-score-weighted
    typed TF features, saving probabilistic predictions, and (2) fusing the
    per-path predictions (confidence-thresholded, weighted per path) into
    pseudo-labels that seed the next round.

    Parameters
    ----------
    scope : class labels to classify within.
    scope_name : str, dataset name; locates the dumped HIN, splits and the
        prediction cache directories.
    type_list : iterable of entity types; one feature view per type.
    threshold : dict mapping str(type) -> confidence cutoff for accepting a
        test prediction into the fused output.
    weight : dict mapping str(type) -> weight of that view in the fusion.
    count : number of classes (classCount for SSLClassifier).
    label_num : labeled training examples per class (default 5).

    Returns
    -------
    Best mean accuracy observed across all rounds and meta-paths.
    """

    pred_path = 'data/local/cotrain/' + scope_name + '/'
    split_path = 'data/local/split/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    c = len(scope)
    lb_cand = [label_num]
    repeats = 50

    # rounds for alternating optimization
    rounds = 2
    best_res = 0
    X_s = {}

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
    X_word, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)

    # Pre-build each view: typed TF vectors rescaled by their (cached)
    # Laplacian scores so structure-preserving features dominate.
    for t in type_list:
        if not os.path.exists(pred_path + str(t) + '/'):
            os.makedirs(pred_path + str(t) + '/')

        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores') as f:
            laplacian_score = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        D = sparse.diags(laplacian_score)
        X_typed = X_typed * D
        X_s[str(t)] = X_typed

    for rd in range(rounds):
        round_best_res = 0
        round_best_t = ''

        # step 1:
        # generate output of each meta-path
        for t in type_list:

            X = X_s[str(t)].toarray()
            n = X.shape[0]
            e = X.shape[1]
            # Bipartite doc-feature graph for this view: docs in [0, n),
            # features in [n, n+e).
            graph = np.zeros((n + e, n + e))
            graph[0:n, n:n + e] = X
            graph[n:n + e, 0:n] = X.transpose()
            graph = sparse.csc_matrix(graph)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {
                'alpha': 0.98,
                'normalization_factor': 5,
                'method': 'variant'
            }

            lb = label_num
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                # First round: run from the raw splits, cache probabilities.
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3))
            else:
                # Later rounds: seed with the fused predictions of round rd-1.
                inputPredPath = 'data/local/cotrain/' + scope_name + '/lb' + str(
                    lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + 'lb' + str(lb).zfill(3) + '_' +
                    str(t),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res
            if res > round_best_res:
                round_best_res = res
                round_best_t = t
        print('Round %d\t%.4f\t%s' % (rd, round_best_res, str(round_best_t)))

        # step 2:
        # propagate pseudo-label for other path
        for lb in lb_cand:
            for r in range(repeats):
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_train') as f:
                    trainLabel = pk.load(f)
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_test') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # write output probability
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train') as f:
                        trainPred = pk.load(f)
                        for i, k in enumerate(trainLabel.keys()):
                            typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test') as f:
                        testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            # Block 'unconfident' test points: only accept a
                            # prediction whose peak probability clears this
                            # view's threshold.
                            max_prob = np.max(testPred[i, :])
                            if max_prob > threshold[str(t)]:
                                typePred[k, :] = testPred[i, :]
                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                with open(
                        'data/local/cotrain/' + scope_name + '/lb' +
                        str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                        '_' + str(r).zfill(3), 'w') as f:
                    pk.dump(outPred, f)
    return best_res
Ejemplo n.º 5
0
def lp_meta_experiment(scope,
                       scope_name,
                       type_list,
                       threshold,
                       weight,
                       count,
                       label_num=5):
    """Alternating label propagation over per-meta-path graphs.

    For each round: (1) run a label-propagation classifier on each
    meta-path graph and cache its probabilistic predictions; (2) fuse the
    per-path predictions (weighted per path) into pseudo-labels that seed
    the next round.

    Parameters
    ----------
    scope : class labels to classify within.
    scope_name : str, dataset name; locates the dumped HIN, splits and the
        prediction cache directories.
    type_list : iterable of entity types; one meta-path graph per type.
    threshold : dict keyed by str(type); accepted for interface parity with
        ensemble_cotrain_experiment — not consulted here (all test
        predictions are fused unconditionally).
    weight : dict mapping str(type) -> weight of that path in the fusion.
    count : number of classes (classCount for SSLClassifier).
    label_num : labeled training examples per class (default 5).

    Returns
    -------
    Best mean accuracy observed across all rounds and meta-paths.
    """

    pred_path = 'data/local/lpmeta/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    split_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50

    # rounds for alternating optimization
    rounds = 2

    best_res = 0

    for rd in range(rounds):

        # step 1:
        # generate output of each meta-path
        for t in type_list:
            if not os.path.exists(pred_path + str(t)):
                os.makedirs(pred_path + str(t))
            graph, newIds = GraphGenerator.getMetaPathGraph(hin, tf_param, t)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}

            lb = label_num
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                # First round: run from the raw splits, cache probabilities.
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3))
            else:
                # Later rounds: seed with the fused predictions of round rd-1.
                inputPredPath = 'data/local/lpmeta/' + scope_name + '/lb' + str(
                    lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res

        # step 2:
        # propagate pseudo-label for other path
        for lb in lb_cand:
            for r in range(repeats):
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_train') as f:
                    trainLabel = pk.load(f)
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_test') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # Fuse per-path probabilities into one (n, c) matrix.
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train') as f:
                        trainPred = pk.load(f)
                        for i, k in enumerate(trainLabel.keys()):
                            typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test') as f:
                        testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            typePred[k, :] = testPred[i, :]

                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                with open(
                        'data/local/lpmeta/' + scope_name + '/lb' +
                        str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                        '_' + str(r).zfill(3), 'w') as f:
                    pk.dump(outPred, f)
    return best_res