def _load_hin_dump(scope_name):
    """Return the object unpickled from ``data/local/<scope_name>.dmp``."""
    # NOTE(review): unpickling runs arbitrary code from the file; only use
    # this on dumps produced by this project.
    with open('data/local/' + scope_name + '.dmp') as f:
        return pk.load(f)


def _tf_svm_result(scope_name, hin, use_entity):
    """Vectorise ``hin`` with TF features and return ``svm_experiment``'s result.

    ``use_entity`` is the value of the ``'entity'`` flag in the parameter
    dict handed to ``GraphGenerator.getTFVectorX``; the ``'word'`` flag and
    ``'we_weight'`` are fixed at ``True`` and ``0.1``.
    """
    tf_param = {'word': True, 'entity': use_entity, 'we_weight': 0.1}
    # getTFVectorX returns three values; only the feature matrix is needed.
    X, _, _ = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)
    y = GraphGenerator.gety(hin)
    return svm_experiment(scope_name, X, y)


def run_svm():
    """Run the 'svm' and 'svm+entity' experiments and store them in ``result``.

    The first two entries of ``ng20_scope_names`` fill rows 0-1 of the
    module-level ``result`` and the first two entries of
    ``gcat_scope_names`` fill rows 2-3.  Column 2 receives the 'svm' run
    and column 3 the 'svm+entity' run.

    For 20NG both runs use TF features built from the dump.  For GCAT the
    'svm+entity' run instead loads a feature matrix from
    ``data/local/laplacian/<scope_name>.x``.

    Progress lines are printed with single-argument ``print(...)``, which
    produces the same output under Python 2's print statement.
    """
    # 20NG
    for i in range(2):
        scope_name = ng20_scope_names[i]
        hin = _load_hin_dump(scope_name)

        print(scope_name + ' svm')
        result[i, 2] = _tf_svm_result(scope_name, hin, False)

        print(scope_name + ' svm+entity')
        result[i, 3] = _tf_svm_result(scope_name, hin, True)

    # GCAT
    for i in range(2):
        scope_name = gcat_scope_names[i]
        hin = _load_hin_dump(scope_name)

        print(scope_name + ' svm')
        result[i + 2, 2] = _tf_svm_result(scope_name, hin, False)

        print(scope_name + ' svm+entity')
        # Features for this run come from a stored file, not from
        # getTFVectorX.
        with open('data/local/laplacian/' + scope_name + '.x') as f:
            X = pk.load(f)
        y = GraphGenerator.gety(hin)
        result[i + 2, 3] = svm_experiment(scope_name, X, y)
def _gal_unpickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE(review): the file is opened with the default (text) mode, as
    # before.  Binary pickles need 'rb' on Windows and on Python 3 -- confirm
    # how the dumps were written before changing the mode.
    with open(path) as f:
        return pk.load(f)


def _gal_command(scope_name, tag):
    """Return the get-another-label command line for the split named ``tag``.

    On win32 the batch launcher and the ``galm/settings/`` files are used;
    on every other platform the shell launcher and the settings under
    ``/home/hejiang/results/gal/``.  Output is redirected to
    ``data/local/gal/<scope_name>/<tag>_result.txt``.
    """
    if sys.platform == 'win32':
        launcher = r'call galm/bin/get-another-label.bat '
        settings = 'galm/settings/' + scope_name
    else:
        launcher = r'galm/bin/get-another-label.sh '
        settings = '/home/hejiang/results/gal/' + scope_name
    run_prefix = 'data/local/gal/' + scope_name + '/' + tag
    return (launcher +
            '--categories ' + settings + '_categories.txt ' +
            '--cost ' + settings + '_costs.txt ' +
            '--gold ' + run_prefix + '_gold.txt ' +
            '--input ' + run_prefix + '_label.txt ' +
            '--eval ' + run_prefix + '_eval.txt ' +
            '> ' + run_prefix + '_result.txt')


def ensemble_gal_experiment(scope, scope_name, type_list, threshold):
    """Write get-another-label input files and queue one command per split.

    For every labelled-size candidate and repeat this reads the train/test
    label dicts from ``data/local/split/<scope_name>/``, writes the
    ``_gold.txt``, ``_eval.txt`` and ``_label.txt`` files under
    ``data/local/gal/<scope_name>/`` and appends one command line to
    ``galm.bat`` (win32) or ``galm.sh`` (elsewhere).  The tool itself is
    not started here.

    Args:
        scope: Sequence indexed by the argmax of a prediction row; the
            element found there is written as the predicted label string.
        scope_name: Name used to build every input and output path.
        type_list: Identifiers whose ``str()`` names a prediction
            sub-directory under ``data/local/metagraph/<scope_name>/``.
        threshold: Mapping from ``str(t)`` to the score the top test
            prediction must exceed to be written to the label file.
            Train predictions are always written.

    Returns:
        None.
    """
    # this section should be changed between different scopes
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    lb_cand = [5]
    repeats = 50

    # NOTE(review): neither hin nor y is used below.  The load is kept
    # because gety's side effects are not visible from here -- drop both
    # lines if it is confirmed to be pure.
    hin = _gal_unpickle('data/local/' + scope_name + '.dmp')
    y = GraphGenerator.gety(hin)

    split_dir = 'data/local/split/' + scope_name + '/'
    gal_dir = 'data/local/gal/' + scope_name + '/'
    if not os.path.exists(gal_dir):
        os.makedirs(gal_dir)

    script_name = 'galm.bat' if sys.platform == 'win32' else 'galm.sh'
    # Every handle is managed by `with`, so the data is flushed and closed
    # before the queued script can be run.
    with open(script_name, 'a') as command_file:
        for lb in lb_cand:
            for r in range(repeats):
                tag = 'lb' + str(lb).zfill(3) + '_' + str(r).zfill(3)
                trainLabel = _gal_unpickle(split_dir + tag + '_train')
                testLabel = _gal_unpickle(split_dir + tag + '_test')

                # write get-another-label gold file
                with open(gal_dir + tag + '_gold.txt', 'w') as gold_file:
                    for k, v in trainLabel.items():
                        gold_file.write(str(k) + '\t' + v + '\n')

                # write get-another-label eval file
                with open(gal_dir + tag + '_eval.txt', 'w') as eval_file:
                    for k, v in testLabel.items():
                        eval_file.write(str(k) + '\t' + v + '\n')

                # write get-another-label label file
                with open(gal_dir + tag + '_label.txt', 'w') as label_file:
                    for t in type_list:
                        type_id = str(t)

                        trainPred = _gal_unpickle(
                            pred_path + type_id + '/' + tag + '_train')
                        # Row i of the prediction matrix is paired with the
                        # i-th key of the label dict.
                        for i, k in enumerate(trainLabel.keys()):
                            v = scope[np.argmax(trainPred[i, :])]
                            label_file.write(
                                type_id + '\t' + str(k) + '\t' + v + '\n')

                        testPred = _gal_unpickle(
                            pred_path + type_id + '/' + tag + '_test')
                        cutoff = threshold[type_id]
                        for i, k in enumerate(testLabel.keys()):
                            v = scope[np.argmax(testPred[i, :])]
                            top_score = np.max(testPred[i, :])
                            if top_score > cutoff:
                                label_file.write(
                                    type_id + '\t' + str(k) + '\t' + v + '\n')

                # Queue the get-another-label run.  The file is in text
                # mode, so '\n' already becomes the platform line ending.
                # The former literal '\r\n' left a stray CR on the redirect
                # target in galm.sh.
                command_file.write(_gal_command(scope_name, tag) + '\n')
def _svm_unpickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE(review): the file is opened with the default (text) mode, as
    # before.  Binary pickles need 'rb' on Windows and on Python 3 -- confirm
    # how the dumps were written before changing the mode.
    with open(path) as f:
        return pk.load(f)


def _thresholded_one_hot(pred, num_rows, cutoff):
    """Return a zero-one encoding of the confident rows of ``pred``.

    The first ``num_rows`` rows of ``pred`` are visited.  A row whose
    largest score exceeds ``cutoff`` gets a 1 in the column of that score;
    every other row stays all-zero.  The result has as many columns as
    ``pred``, so the number of classes is not hard-coded.
    """
    encoded = np.zeros((num_rows, pred.shape[1]))
    for i in range(num_rows):
        scores = pred[i, :]
        if np.max(scores) > cutoff:
            encoded[i, np.argmax(scores)] = 1
    return encoded


def ensemble_svm_experiment(scope, scope_name, type_list, threshold):
    """Train a linear SVM on stacked, thresholded per-type predictions.

    For every repeat the train/test label dicts are read from
    ``data/local/split/<scope_name>/``.  Each type in ``type_list``
    contributes one zero-one block built from its stored train/test
    predictions, and the blocks are concatenated column-wise.  A
    ``LinearSVC(C=0.1)`` is fitted on the train block and scored on the
    test block.

    Args:
        scope: Not used by this function; kept for signature parity with
            the other ensemble experiments.
        scope_name: Name used to build every input path.
        type_list: Identifiers whose ``str()`` names a prediction
            sub-directory under ``data/local/metagraph/<scope_name>/``.
        threshold: Mapping from ``str(t)`` to the score the top prediction
            of a row must exceed to be encoded.

    Returns:
        Mean test accuracy over the repeats.  ``lb_cand`` currently holds a
        single entry; with several, the value for the last one is returned.
    """
    # this section should be changed between different scopes
    experiment_path = 'data/local/metagraph/' + scope_name + '/'
    lb_cand = [5]
    repeats = 50

    hin = _svm_unpickle('data/local/' + scope_name + '.dmp')
    y = GraphGenerator.gety(hin)

    split_dir = 'data/local/split/' + scope_name + '/'

    for lb in lb_cand:
        results = []
        for r in range(repeats):
            tag = 'lb' + str(lb).zfill(3) + '_' + str(r).zfill(3)
            trainLabel = _svm_unpickle(split_dir + tag + '_train')
            testLabel = _svm_unpickle(split_dir + tag + '_test')

            # list(...) yields the same key list on Python 2 and keeps the
            # index valid on Python 3, where keys() is a view.
            yTrain = y[list(trainLabel.keys())]
            yTest = y[list(testLabel.keys())]

            numTrain = len(trainLabel)
            numTest = len(testLabel)

            # Seed with zero-width blocks so an empty type_list still gives
            # (n, 0) matrices, as before.
            train_blocks = [np.zeros((numTrain, 0))]
            test_blocks = [np.zeros((numTest, 0))]
            for t in type_list:
                type_id = str(t)
                trainPred = _svm_unpickle(
                    experiment_path + type_id + '/' + tag + '_train')
                testPred = _svm_unpickle(
                    experiment_path + type_id + '/' + tag + '_test')

                # threshold each meta-graph
                cutoff = threshold[type_id]
                train_blocks.append(
                    _thresholded_one_hot(trainPred, numTrain, cutoff))
                test_blocks.append(
                    _thresholded_one_hot(testPred, numTest, cutoff))

            XTrain = np.concatenate(train_blocks, axis=1)
            XTest = np.concatenate(test_blocks, axis=1)

            # train
            clf = LinearSVC(C=0.1)
            clf.fit(XTrain, yTrain)

            # test
            pred = clf.predict(XTest)
            results.append(sum(pred == yTest) / float(yTest.shape[0]))

    return np.mean(results)