Example #1
0
def run_svm():
    """Run the 'svm' and 'svm+entity' baselines on the 20NG and GCAT scopes.

    For the first two entries of ``ng20_scope_names`` and of
    ``gcat_scope_names`` the pickled object in
    ``data/local/<scope_name>.dmp`` is loaded and two scores returned by
    ``svm_experiment`` are stored in the ``result`` table: column 2 for the
    word-only features and column 3 for the 'svm+entity' run.  Rows 0-1 hold
    the 20NG scopes and rows 2-3 the GCAT scopes.

    For GCAT the 'svm+entity' features are not generated on the fly; they
    are read from ``data/local/laplacian/<scope_name>.x``.
    """

    def unpickle(path):
        # Read one pickled object from ``path``.
        with open(path) as handle:
            return pk.load(handle)

    def tf_features(hin, use_entity):
        # Build the TF feature matrix; the two id mappings that come back
        # with it are not needed by the SVM run.
        settings = {'word': True, 'entity': use_entity, 'we_weight': 0.1}
        features, _doc_ids, _entity_ids = GraphGenerator.getTFVectorX(
            hin, param=settings, entity_types=None)
        return features

    # 20NG -> rows 0 and 1 of ``result``
    for idx in range(2):
        scope_name = ng20_scope_names[idx]
        scope = ng20_scopes[idx]  # not used below
        hin = unpickle('data/local/' + scope_name + '.dmp')

        print(scope_name + ' svm')
        features = tf_features(hin, False)
        labels = GraphGenerator.gety(hin)
        result[idx, 2] = svm_experiment(scope_name, features, labels)

        print(scope_name + ' svm+entity')
        features = tf_features(hin, True)
        labels = GraphGenerator.gety(hin)
        result[idx, 3] = svm_experiment(scope_name, features, labels)

    # GCAT -> rows 2 and 3 of ``result``
    for idx in range(2):
        scope_name = gcat_scope_names[idx]
        scope = gcat_scopes[idx]  # not used below
        hin = unpickle('data/local/' + scope_name + '.dmp')

        print(scope_name + ' svm')
        features = tf_features(hin, False)
        labels = GraphGenerator.gety(hin)
        result[idx + 2, 2] = svm_experiment(scope_name, features, labels)

        print(scope_name + ' svm+entity')
        # Precomputed feature matrix instead of a getTFVectorX call.
        features = unpickle('data/local/laplacian/' + scope_name + '.x')
        labels = GraphGenerator.gety(hin)
        result[idx + 2, 3] = svm_experiment(scope_name, features, labels)
Example #2
0
def ensemble_gal_experiment(scope, scope_name, type_list, threshold):
    """Write get-another-label input files and queue one command per split.

    For every (label budget, repeat) pair three tab-separated text files are
    written under ``data/local/gal/<scope_name>/``:

    * ``<tag>_gold.txt``  - key and label of every entry in the train split.
    * ``<tag>_eval.txt``  - key and label of every entry in the test split.
    * ``<tag>_label.txt`` - one ``type, key, predicted label`` line per
      prediction.  Train-split predictions are always written; test-split
      predictions only when the row's maximum exceeds ``threshold[str(t)]``.

    The matching get-another-label command line (with its output redirected
    to ``<tag>_result.txt``) is appended to ``galm.bat`` on win32 and to
    ``galm.sh`` elsewhere.  The commands are only written, not executed.

    Args:
        scope: Indexable collection of label strings; the arg-max column of
            a prediction row is used as the index into it.
        scope_name: Name used to build every input and output path.
        type_list: Identifiers ``t`` of the prediction sets stored under
            ``data/local/metagraph/<scope_name>/<t>/``.
        threshold: Mapping from ``str(t)`` to the minimum score a test
            prediction needs in order to be written to the label file.

    Returns:
        None.
    """
    # this section should be changed between different scopes
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    split_dir = 'data/local/split/' + scope_name + '/'
    gal_dir = 'data/local/gal/' + scope_name + '/'

    lb_cand = [5]
    repeats = 50

    # NOTE(review): ``hin`` and ``y`` are not used by anything below.  The
    # load is kept so a missing or unreadable dump still fails up front;
    # confirm whether it can be dropped.
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)
    y = GraphGenerator.gety(hin)

    # Platform-dependent pieces of the command line never change inside the
    # loops, so they are resolved once.
    if sys.platform == 'win32':
        script_name = 'galm.bat'
        launcher = r'call galm/bin/get-another-label.bat '
        settings_prefix = 'galm/settings/'
    else:
        script_name = 'galm.sh'
        launcher = r'galm/bin/get-another-label.sh '
        settings_prefix = '/home/hejiang/results/gal/'

    if not os.path.exists(gal_dir):
        os.makedirs(gal_dir)

    with open(script_name, 'a') as command_file:
        for lb in lb_cand:
            for r in range(repeats):
                tag = 'lb' + str(lb).zfill(3) + '_' + str(r).zfill(3)

                with open(split_dir + tag + '_train') as f:
                    trainLabel = pk.load(f)
                with open(split_dir + tag + '_test') as f:
                    testLabel = pk.load(f)

                label_path = gal_dir + tag + '_label.txt'
                gold_path = gal_dir + tag + '_gold.txt'
                eval_path = gal_dir + tag + '_eval.txt'

                # write get-another-label gold file
                with open(gold_path, 'w') as gold_file:
                    for k, v in trainLabel.items():
                        gold_file.write(str(k) + '\t' + v + '\n')

                # write get-another-label eval file
                with open(eval_path, 'w') as eval_file:
                    for k, v in testLabel.items():
                        eval_file.write(str(k) + '\t' + v + '\n')

                # write get-another-label label file
                with open(label_path, 'w') as label_file:
                    for t in type_list:
                        with open(pred_path + str(t) + '/' + tag +
                                  '_train') as f:
                            trainPred = pk.load(f)
                        # Row i of the prediction matrix is paired with the
                        # i-th key of the split in iteration order.
                        for i, k in enumerate(trainLabel.keys()):
                            v = scope[np.argmax(trainPred[i, :])]
                            label_file.write(
                                str(t) + '\t' + str(k) + '\t' + v + '\n')

                        with open(pred_path + str(t) + '/' + tag +
                                  '_test') as f:
                            testPred = pk.load(f)
                        for i, k in enumerate(testLabel.keys()):
                            v = scope[np.argmax(testPred[i, :])]
                            confidence = np.max(testPred[i, :])
                            # Only sufficiently confident test predictions
                            # are handed to get-another-label.
                            if confidence > threshold[str(t)]:
                                label_file.write(
                                    str(t) + '\t' + str(k) + '\t' + v + '\n')

                # queue the get-another-label run for this split
                command = (
                    launcher +
                    '--categories ' + settings_prefix + scope_name +
                    '_categories.txt ' +
                    '--cost ' + settings_prefix + scope_name +
                    '_costs.txt ' +
                    '--gold ' + gold_path + ' ' +
                    '--input ' + label_path + ' ' +
                    '--eval ' + eval_path + ' ' +
                    '> ' + gal_dir + tag + '_result.txt')

                # The file is in text mode, so '\n' already becomes CRLF on
                # Windows.  An explicit '\r' would end up as part of the
                # redirect target when galm.sh is run by a POSIX shell.
                command_file.write(command + '\n')
Example #3
0
def _one_hot_votes(pred, num_rows, threshold, key):
    """Turn the first ``num_rows`` prediction rows into thresholded 0/1 votes.

    The result has shape ``(num_rows, 3)``.  Row ``i`` gets a 1 in the
    arg-max column of ``pred[i, :]`` when that row's maximum is greater than
    ``threshold[key]``; otherwise the row is left all zero.
    """
    votes = np.zeros((num_rows, 3))
    for row in range(num_rows):
        scores = pred[row, :]
        winner = np.argmax(scores)
        if np.max(scores) > threshold[key]:
            # zero-one prediction (the raw scores are not used as features)
            votes[row, winner] = 1
    return votes


def ensemble_svm_experiment(scope, scope_name, type_list, threshold):
    """Train a LinearSVC on stacked, thresholded per-type predictions.

    For each of the 50 repeats of the label budget in ``lb_cand`` the
    train/test splits and the per-type prediction matrices are unpickled,
    every prediction matrix is reduced to a 3-column 0/1 vote block, the
    blocks of all types are concatenated column-wise, and a
    ``LinearSVC(C=0.1)`` is fitted on the train block and scored on the
    test block.

    Args:
        scope: Not used by this function.
        scope_name: Name used to build every input path.
        type_list: Identifiers ``t`` of the prediction sets stored under
            ``data/local/metagraph/<scope_name>/<t>/``.
        threshold: Mapping from ``str(t)`` to the minimum score a row needs
            for its arg-max column to be set.

    Returns:
        The mean test accuracy over all repeats of the first entry of
        ``lb_cand``.
    """
    # this section should be changed between different scopes
    experiment_path = 'data/local/metagraph/' + scope_name + '/'
    split_dir = 'data/local/split/' + scope_name + '/'

    lb_cand = [5]
    repeats = 50

    with open('data/local/' + scope_name + '.dmp') as dump:
        hin = pk.load(dump)
    y = GraphGenerator.gety(hin)

    for lb in lb_cand:
        accuracies = []
        for r in range(repeats):
            tag = 'lb' + str(lb).zfill(3) + '_' + str(r).zfill(3)

            with open(split_dir + tag + '_train') as handle:
                trainLabel = pk.load(handle)
            with open(split_dir + tag + '_test') as handle:
                testLabel = pk.load(handle)

            # The split keys are used directly as indices into ``y``.
            yTrain = y[trainLabel.keys()]
            yTest = y[testLabel.keys()]

            numTrain = len(trainLabel)
            numTest = len(testLabel)

            # Start from zero-width blocks so the row counts are fixed even
            # when ``type_list`` is empty.
            train_blocks = [np.zeros((numTrain, 0))]
            test_blocks = [np.zeros((numTest, 0))]

            for t in type_list:
                type_prefix = experiment_path + str(t) + '/' + tag
                with open(type_prefix + '_train') as handle:
                    trainPred = pk.load(handle)
                with open(type_prefix + '_test') as handle:
                    testPred = pk.load(handle)

                # threshold each meta-graph
                train_blocks.append(
                    _one_hot_votes(trainPred, numTrain, threshold, str(t)))
                test_blocks.append(
                    _one_hot_votes(testPred, numTest, threshold, str(t)))

            XTrain = np.concatenate(train_blocks, axis=1)
            XTest = np.concatenate(test_blocks, axis=1)

            # train
            clf = LinearSVC(C=0.1)
            clf.fit(XTrain, yTrain)

            # test
            pred = clf.predict(XTest)
            hits = sum(pred == yTest)
            accuracies.append(hits / float(yTest.shape[0]))
        # Returns from inside the ``lb`` loop, i.e. after the first (and
        # currently only) label budget.
        return np.mean(accuracies)