Example #1
def run_knowsim():
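    # Run the KnowSim experiment on both 20NG scopes and both GCAT scopes and
    # store each mean accuracy in column 6 of the global `result` table.
    # (These snippets assume module-level imports and globals -- pk = pickle,
    # np = numpy, GraphGenerator, ng20_scope_names, result, etc. -- defined
    # elsewhere in the source repository.)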
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)
        print(scope_name + ' knowsim')
        res = knowsim_experiment(scope, scope_name, NG20TypeList, count,
                                 newLabels)
        result[i, 6] = res

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        type_list = GCATTypeList[i]
        newLabels = GraphGenerator.getNewLabels(hin)
        print(scope_name + ' knowsim')
        result[i + 2, 6] = knowsim_experiment(scope, scope_name, type_list,
                                              count, newLabels)
Example #2
def run_lp():
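    # Run label propagation with entity features on the 20NG and GCAT scopes,
    # storing each mean accuracy in column 4 of the global `result` table.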
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, 10, tf_param)
        print(scope_name + ' lp+entity')
        result[i, 4] = lp_experiment(scope, scope_name, count, graph,
                                     newLabels, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        newLabels = GraphGenerator.getNewLabels(hin)
        # only newIds is used from this call; the graph itself is rebuilt
        # from the precomputed X below
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, 10, tf_param)

        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        graph = GraphGenerator.generateCosineNeighborGraphfromX(X)
        print(scope_name + ' lp+entity')
        result[i + 2, 4] = lp_experiment(scope, scope_name, count, graph,
                                         newLabels, newIds)
Example #3
def generate_train_test_split():
    # generate random train/test splits for 2 datasets * 2 scopes
    repeat_times = 50

    lp_candidate = [5]

    # 20ng
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]
        experiment_path = 'data/local/split/' + scope_name + '/'
        if not os.path.exists(experiment_path):
            os.makedirs(experiment_path)
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, kNeighbors=10, tf_param=tf_param)
        new_label = GraphGenerator.getNewLabels(hin)
        for lp in lp_candidate:
            ssl = SSLClassifier(graph,
                                new_label,
                                scope,
                                lp_param,
                                repeatTimes=repeat_times,
                                trainNumbers=lp,
                                classCount=count)
            ssl.repeatedExperiment(savePathPrefix=experiment_path + 'lb' +
                                   str(lp).zfill(3) + '_')

    # gcat
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        experiment_path = 'data/local/split/' + scope_name + '/'
        if not os.path.exists(experiment_path):
            os.makedirs(experiment_path)
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
        graph, newIds = GraphGenerator.generateCosineNeighborGraph(
            hin, kNeighbors=10, tf_param=tf_param)
        new_label = GraphGenerator.getNewLabels(hin)
        for lp in lp_candidate:
            ssl = SSLClassifier(graph,
                                new_label,
                                scope,
                                lp_param,
                                repeatTimes=repeat_times,
                                trainNumbers=lp,
                                classCount=count)
            ssl.repeatedExperiment(savePathPrefix=experiment_path + 'lb' +
                                   str(lp).zfill(3) + '_')
Example #4
def knowsim_experiment(scope,
                       scope_name,
                       type_list,
                       count,
                       newLabels,
                       tau=1,
                       kNeighbors=10,
                       label_num=5):
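    # KnowSim: for each meta-path type, build a cosine k-NN similarity graph,
    # weight it by exp(-tau * Laplacian score), sum the weighted graphs, and
    # run label propagation over the combined similarity matrix.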
    split_path = 'data/local/split/' + scope_name + '/'
    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    repeats = 50
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
    n = X_word.shape[0]

    knowsim = sparse.lil_matrix((n, n))
    for t in type_list:
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)

        # make similarity graph
        cosX = cosine_similarity(X_typed)
        graph = sparse.lil_matrix((n, n))
        for i in range(n):
            for j in np.argpartition(cosX[i], -kNeighbors)[-kNeighbors:]:
                if j == i:
                    continue
                graph[i, j] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #
                graph[j, i] = cosX[i, j]  # np.exp(- (1 - cosX[i, j]) / 0.03) #

        # calculate laplacian scores
        row_sum = graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)

        # add this meta-path's weighted similarity to the KnowSim matrix
        knowsim = knowsim + np.exp(-tau * laplacian_score) * graph

    knowsim = knowsim.tocsr()
    print('running lp')
    lp_param = {'alpha': 0.98, 'normalization_factor': 5}

    ssl = SSLClassifier(knowsim,
                        newLabels,
                        scope,
                        lp_param,
                        repeatTimes=repeats,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
Example #5
def semihin_experiment(scope, scope_name, count, X, newIds, label_num=5):
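    # SemiHIN: build a bipartite document-feature graph from the TF matrix X
    # and classify documents with label propagation over the saved splits.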
    experiment_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    n = X.shape[0]
    e = X.shape[1]
    if not isinstance(X, np.ndarray):
        X = X.toarray()

    graph = np.zeros((n + e, n + e))
    graph[0:n, n:n + e] = X
    graph[n:n + e, 0:n] = X.transpose()
    graph = sparse.csc_matrix(graph)

    newLabel = GraphGenerator.getNewLabels(hin)
    lp_param = {'alpha': 0.98, 'normalization_factor': 5, 'method': 'variant'}

    ssl = SSLClassifier(graph,
                        newLabel,
                        scope,
                        lp_param,
                        repeatTimes=50,
                        trainNumbers=label_num,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=experiment_path + 'lb' +
                                          str(label_num).zfill(3) + '_',
                                          newIds=newIds)
    return ssl.get_mean()
Example #6
def generate_meta_graph(scope, scope_name, type_list, count):
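    # For each meta-graph type: reweight the TF matrix by its saved Laplacian
    # scores, build a bipartite document-feature graph, run label propagation,
    # and save per-split prediction probabilities for the ensemble methods.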
    split_path = 'data/local/split/' + scope_name + '/'
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}

    for t in type_list:
        # print(t)
        X, newIds, entitynewIds = GraphGenerator.getTFVectorX(hin, tf_param, t)
        n = X.shape[0]
        e = X.shape[1]
        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores', 'rb') as f:
            laplacian_score = pk.load(f)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        D = sparse.diags(laplacian_score)
        X = X * D
        X = X.toarray()
        graph = np.zeros((n + e, n + e))
        graph[0:n, n:n + e] = X
        graph[n:n + e, 0:n] = X.transpose()
        graph = sparse.csc_matrix(graph)

        newLabel = GraphGenerator.getNewLabels(hin)
        lp_param = {'alpha': 0.98, 'normalization_factor': 5}
        # 3-class classification
        lp_candidate = [5]
        for lp in lp_candidate:
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=50,
                                trainNumbers=lp,
                                classCount=count)
            if not os.path.exists(pred_path + str(t) + '/'):
                os.makedirs(pred_path + str(t) + '/')
            ssl.repeatedFixedExperimentwithNewIds(
                pathPrefix=split_path + 'lb' + str(lp).zfill(3) + '_',
                newIds=newIds,
                saveProb=True,
                savePathPrefix=pred_path + str(t) + '/' + 'lb' +
                str(lp).zfill(3))
Example #7
def run_semihin():
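    # Run SemiHIN with and without entity features on the 20NG and GCAT
    # scopes, storing each mean accuracy in the global `result` table.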
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        count = ng20_counts[i]

        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin')
        result[i, 7] = semihin_experiment(scope, scope_name, count, X, newIds)

        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin+entity')
        result[i, 8] = semihin_experiment(scope, scope_name, count, X, newIds)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        count = gcat_counts[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        newLabels = GraphGenerator.getNewLabels(hin)

        tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
        X, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        print(scope_name + ' semihin')
        result[i + 2, 7] = semihin_experiment(scope, scope_name, count, X,
                                              newIds)

        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        print(scope_name + ' semihin+entity')
        result[i + 2, 8] = semihin_experiment(scope, scope_name, count, X,
                                              newIds)
Example #8
def run_generate_laplacian_score():
    print('generate Laplacian scores for feature reweighting')
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in NG20TypeList:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(
                X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open(
                    'data/local/laplacian/' + scope_name + '/' + str(t) +
                    '_scores', 'wb') as f:
                pk.dump(laplacian_score, f)
    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        type_list = GCATTypeList[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_word, newIds, entityIds = GraphGenerator.getTFVectorX(hin, tf_param)
        for t in type_list:
            tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
            X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
                hin, tf_param, t)
            laplacian_score = generate_laplacian_score_vector(
                X_typed, X_word, 100)
            if not os.path.exists('data/local/laplacian/' + scope_name):
                os.makedirs('data/local/laplacian/' + scope_name)
            with open(
                    'data/local/laplacian/' + scope_name + '/' + str(t) +
                    '_scores', 'wb') as f:
                pk.dump(laplacian_score, f)
Example #9
def run_svm():
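    # Train linear SVMs on TF features, with and without entity features, for
    # the 20NG and GCAT scopes; accuracies go into the global `result` table.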
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        scope = ng20_scopes[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)

        print(scope_name + ' svm')
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 2] = svm_experiment(scope_name, X, y)

        print(scope_name + ' svm+entity')
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i, 3] = svm_experiment(scope_name, X, y)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        scope = gcat_scopes[i]
        with open('data/local/' + scope_name + '.dmp', 'rb') as f:
            hin = pk.load(f)

        print(scope_name + ' svm')
        tf_param = {'word': True, 'entity': False, 'we_weight': 0.1}
        X, doc_new_ids, entity_new_ids = GraphGenerator.getTFVectorX(
            hin, param=tf_param, entity_types=None)
        y = GraphGenerator.gety(hin)
        result[i + 2, 2] = svm_experiment(scope_name, X, y)

        print(scope_name + ' svm+entity')
        with open('data/local/laplacian/' + scope_name + '.x', 'rb') as f:
            X = pk.load(f)
        y = GraphGenerator.gety(hin)
        result[i + 2, 3] = svm_experiment(scope_name, X, y)
Example #10
def generate_data(V, E, T, datapath):
    print(f"V = {V}, E = {E}, T = {T}")
    graph = GraphGenerator.gen_adjacencies(V, E, T)
    DataGenerator.gen(graph, V, E, T, datapath)
Example #11
def ensemble_cotrain_experiment(scope,
                                scope_name,
                                type_list,
                                threshold,
                                weight,
                                count,
                                label_num=5):
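    # Ensemble co-training: in each round, run label propagation on every
    # meta-path's bipartite graph, then fuse the thresholded per-path
    # probabilities into a weighted pseudo-label matrix that seeds the next
    # round. Returns the best mean accuracy seen across rounds.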

    pred_path = 'data/local/cotrain/' + scope_name + '/'
    split_path = 'data/local/split/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50

    # rounds for alternating optimization
    rounds = 2
    best_res = 0
    X_s = {}

    tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
    X_word, newIds, entity_new_ids = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)

    for t in type_list:
        if not os.path.exists(pred_path + str(t) + '/'):
            os.makedirs(pred_path + str(t) + '/')

        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores', 'rb') as f:
            laplacian_score = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X_typed, newIds, entityIds = GraphGenerator.getTFVectorX(
            hin, tf_param, t)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        # laplacian_score = laplacian_score / np.sum(laplacian_score) * laplacian_score.shape[0]
        D = sparse.diags(laplacian_score)
        X_typed = X_typed * D
        X_s[str(t)] = X_typed

    for rd in range(rounds):
        round_best_res = 0
        round_best_t = ''

        # step 1:
        # generate output of each meta-path
        for t in type_list:

            X = X_s[str(t)].toarray()
            n = X.shape[0]
            e = X.shape[1]
            graph = np.zeros((n + e, n + e))
            graph[0:n, n:n + e] = X
            graph[n:n + e, 0:n] = X.transpose()
            graph = sparse.csc_matrix(graph)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {
                'alpha': 0.98,
                'normalization_factor': 5,
                'method': 'variant'
            }

            lb = label_num
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3))
            else:
                inputPredPath = 'data/local/cotrain/' + scope_name + '/lb' + str(
                    lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + 'lb' + str(lb).zfill(3) + '_' +
                    str(t),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res
                best_t = t
            if res > round_best_res:
                round_best_res = res
                round_best_t = t
        print('Round %d\t%.4f\t%s' % (rd, round_best_res, str(round_best_t)))

        # step 2:
        # propagate pseudo-labels to the other paths
        for lb in lb_cand:
            results = []
            for r in range(repeats):
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_train', 'rb') as f:
                    trainLabel = pk.load(f)
                with open('data/local/split/' + scope_name + '/lb' +
                          str(lb).zfill(3) + '_' + str(r).zfill(3) +
                          '_test', 'rb') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # write output probability
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train', 'rb') as f:
                        trainPred = pk.load(f)
                        for i, k in enumerate(trainLabel.keys()):
                            typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test', 'rb') as f:
                        testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            #typePred[k,:] = testPred[i,:]

                            # some potential improvement: set a threshold for random walk number to block
                            # 'unconfident' data points
                            max_prob = np.max(testPred[i, :])
                            if max_prob > threshold[str(t)]:
                                typePred[k, :] = testPred[i, :]
                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                with open(
                        'data/local/cotrain/' + scope_name + '/lb' +
                        str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                        '_' + str(r).zfill(3), 'wb') as f:
                    pk.dump(outPred, f)
    return best_res
Example #12
def ensemble_gal_experiment(scope, scope_name, type_list, threshold):
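    # Write gold/eval/label input files for the get-another-label (GAL) tool
    # from the saved per-meta-graph predictions, and append the matching
    # batch command for each split to a platform-specific script.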
    # adjust this path when running a different scope
    pred_path = 'data/local/metagraph/' + scope_name + '/'

    lb_cand = [5]
    repeats = 50

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    #X, newIds = GraphGenerator.getTFVectorX(hin, param={'word': True, 'entity': False, 'we_weight': 0.1})
    y = GraphGenerator.gety(hin)

    if sys.platform == 'win32':
        command_file = open('galm.bat', 'a')
    else:
        command_file = open('galm.sh', 'a')

    for lb in lb_cand:
        results = []
        for r in range(repeats):
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) +
                      '_train', 'rb') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) +
                      '_test', 'rb') as f:
                testLabel = pk.load(f)

            if not os.path.exists('data/local/gal/' + scope_name + '/'):
                os.makedirs('data/local/gal/' + scope_name + '/')
            label_file = open(
                'data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) +
                '_' + str(r).zfill(3) + '_label.txt', 'w')
            gold_file = open(
                'data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) +
                '_' + str(r).zfill(3) + '_gold.txt', 'w')
            eval_file = open(
                'data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) +
                '_' + str(r).zfill(3) + '_eval.txt', 'w')

            # write get-another-label gold file
            for k, v in trainLabel.items():
                gold_file.write(str(k) + '\t' + v + '\n')

            # write get-another-label eval file
            for k, v in testLabel.items():
                eval_file.write(str(k) + '\t' + v + '\n')

            # write get-another-label label file
            for t in type_list:
                with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_train', 'rb') as f:
                    trainPred = pk.load(f)
                    for i, k in enumerate(trainLabel.keys()):
                        v = scope[np.argmax(trainPred[i, :])]
                        label_file.write(
                            str(t) + '\t' + str(k) + '\t' + v + '\n')

                with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_test', 'rb') as f:
                    testPred = pk.load(f)
                    for i, k in enumerate(testLabel.keys()):
                        v = scope[np.argmax(testPred[i, :])]
                        max_prob = np.max(testPred[i, :])
                        if max_prob > threshold[str(t)]:
                            label_file.write(
                                str(t) + '\t' + str(k) + '\t' + v + '\n')

            label_file.close()
            gold_file.close()
            eval_file.close()

            # run get-another-label batch
            if sys.platform == 'win32':
                command = r'call galm/bin/get-another-label.bat ' + \
                    '--categories galm/settings/' + scope_name + '_categories.txt ' + \
                    '--cost galm/settings/' + scope_name + '_costs.txt ' + \
                    '--gold data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_gold.txt ' + \
                    '--input data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_label.txt ' + \
                    '--eval data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_eval.txt ' + \
                    '> data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + '_' + \
                    str(r).zfill(3) + '_result.txt'
            else:
                command = r'galm/bin/get-another-label.sh ' + \
                    '--categories /home/hejiang/results/gal/' + scope_name + '_categories.txt ' + \
                    '--cost /home/hejiang/results/gal/' + scope_name + '_costs.txt ' + \
                    '--gold data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_gold.txt ' + \
                    '--input data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_label.txt ' + \
                    '--eval data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + \
                    '_' + str(r).zfill(3) + '_eval.txt ' + \
                    '> data/local/gal/' + scope_name + '/lb' + str(lb).zfill(3) + '_' + \
                    str(r).zfill(3) + '_result.txt'

            command_file.write(command + '\r\n')

    command_file.close()
Example #13
def ensemble_svm_experiment(scope, scope_name, type_list, threshold):
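    # Stack thresholded 0/1 predictions from every meta-graph into a feature
    # matrix, train a linear SVM on it, and average accuracy over the splits.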
    # adjust this path when running a different scope
    experiment_path = 'data/local/metagraph/' + scope_name + '/'

    lb_cand = [5]
    repeats = 50

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)
    #X, newIds = GraphGenerator.getTFVectorX(hin, param={'word': True, 'entity': False, 'we_weight': 0.1})
    y = GraphGenerator.gety(hin)

    for lb in lb_cand:
        results = []
        for r in range(repeats):
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) +
                      '_train', 'rb') as f:
                trainLabel = pk.load(f)
            with open('data/local/split/' + scope_name + '/lb' +
                      str(lb).zfill(3) + '_' + str(r).zfill(3) +
                      '_test', 'rb') as f:
                testLabel = pk.load(f)

            yTrain = y[list(trainLabel.keys())]
            yTest = y[list(testLabel.keys())]

            numTrain = len(trainLabel)
            numTest = len(testLabel)
            XTrain = np.zeros((numTrain, 0))
            XTest = np.zeros((numTest, 0))

            for t in type_list:
                with open(experiment_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_train', 'rb') as f:
                    trainPred = pk.load(f)
                with open(experiment_path + str(t) + '/lb' + str(lb).zfill(3) +
                          '_' + str(r).zfill(3) + '_test', 'rb') as f:
                    testPred = pk.load(f)

                # threshold each meta-graph (3 = class count for these scopes)
                XTraint = np.zeros((numTrain, 3))
                XTestt = np.zeros((numTest, 3))
                for i, k in enumerate(trainLabel.items()):
                    v = np.argmax(trainPred[i, :])
                    max_prob = np.max(trainPred[i, :])
                    if max_prob > threshold[str(t)]:
                        # zero-one prediction
                        XTraint[i, v] = 1
                        # raw prediction
                        #XTraint[i, :] = trainPred[i, :]

                for i, k in enumerate(testLabel.items()):
                    v = np.argmax(testPred[i, :])
                    max_prob = np.max(testPred[i, :])
                    if max_prob > threshold[str(t)]:
                        # zero-one prediction
                        XTestt[i, v] = 1
                        # raw prediction
                        #XTestt[i, :] = testPred[i, :]

                XTrain = np.concatenate((XTrain, XTraint), axis=1)
                XTest = np.concatenate((XTest, XTestt), axis=1)

                # use raw input
                #XTrain = np.concatenate((XTrain,trainPred),axis=1)
                #XTest = np.concatenate((XTest,testPred),axis=1)

            # train
            clf = LinearSVC(C=0.1)
            clf.fit(XTrain, yTrain)

            # test
            pred = clf.predict(XTest)
            results.append(sum(pred == yTest) / float(yTest.shape[0]))
    return np.mean(results)
Example #14
def lp_meta_experiment(scope,
                       scope_name,
                       type_list,
                       threshold,
                       weight,
                       count,
                       label_num=5):
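    # Meta-path LP ensemble: in each round, run label propagation on every
    # meta-path graph, then fuse the per-path probabilities into a weighted
    # pseudo-label matrix that seeds the next round.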

    pred_path = 'data/local/lpmeta/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    split_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp', 'rb') as f:
        hin = pk.load(f)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    c = len(scope)
    lb_cand = [label_num]
    repeats = 50

    # rounds for alternating optimization
    rounds = 2

    best_res = 0

    for rd in range(rounds):

        # step 1:
        # generate output of each meta-path
        for t in type_list:
            if not os.path.exists(pred_path + str(t)):
                os.makedirs(pred_path + str(t))
            graph, newIds = GraphGenerator.getMetaPathGraph(hin, tf_param, t)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
            #    lp_param = {'alpha':0.98, 'normalization_factor':5}
            # 3-class classification

            lb = label_num
            ssl = SSLClassifier(graph,
                                newLabel,
                                scope,
                                lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3))
            else:
                inputPredPath = 'data/local/lpmeta/' + scope_name + '/lb' + str(
                    lb).zfill(3) + '_pred_rd_' + str(rd - 1).zfill(3)
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + 'lb' + str(lb).zfill(3) + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=pred_path + str(t) + '/lb' +
                    str(lb).zfill(3),
                    inputPredPath=inputPredPath)
            res = ssl.get_mean()
            if res > best_res:
                best_res = res

        # step 2:
        # propagate pseudo-labels to the other paths
        for lb in lb_cand:
            results = []
            for r in range(repeats):
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_train', 'rb') as f:
                    trainLabel = pk.load(f)
                with open(split_path + 'lb' + str(lb).zfill(3) + '_' +
                          str(r).zfill(3) + '_test', 'rb') as f:
                    testLabel = pk.load(f)

                numTrain = len(trainLabel)
                numTest = len(testLabel)
                n = numTrain + numTest

                # accumulate per-meta-path output probabilities
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_train', 'rb') as f:
                        trainPred = pk.load(f)
                        for i, k in enumerate(trainLabel.keys()):
                            typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/lb' + str(lb).zfill(3) +
                              '_' + str(r).zfill(3) + '_test', 'rb') as f:
                        testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            typePred[k, :] = testPred[i, :]

                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                with open(
                        'data/local/lpmeta/' + scope_name + '/lb' +
                        str(lb).zfill(3) + '_pred_rd_' + str(rd).zfill(3) +
                        '_' + str(r).zfill(3), 'wb') as f:
                    pk.dump(outPred, f)
    return best_res