def semihin_experiment(scope, scope_name, count, X, newIds, label_num=5):
    """Run the fixed-split label-propagation experiment on a bipartite graph.

    ``X`` (n rows, e columns) is placed in the off-diagonal blocks of a
    symmetric (n + e) x (n + e) adjacency matrix, which is then handed to
    ``SSLClassifier`` together with the labels read from the pickled HIN
    ``data/local/<scope_name>.dmp``.

    :param scope: passed through to ``SSLClassifier``.
    :param scope_name: dataset name used to build the dump and split paths.
    :param count: forwarded as ``classCount``.
    :param X: ``np.ndarray`` or any object exposing ``toarray()``.
    :param newIds: forwarded to ``repeatedFixedExperimentwithNewIds``.
    :param label_num: forwarded as ``trainNumbers``; also selects the
        ``lbNNN_`` split-file prefix.
    :return: ``ssl.get_mean()``.
    """
    with open('data/local/' + scope_name + '.dmp') as dump_file:
        hin = pk.load(dump_file)

    num_rows = X.shape[0]
    num_cols = X.shape[1]
    if type(X) is not np.ndarray:
        X = X.toarray()

    # Symmetric block matrix [[0, X], [X^T, 0]], built densely then sparsified.
    size = num_rows + num_cols
    adjacency = np.zeros((size, size))
    adjacency[:num_rows, num_rows:size] = X
    adjacency[num_rows:size, :num_rows] = X.transpose()
    graph = sparse.csc_matrix(adjacency)

    labels = GraphGenerator.getNewLabels(hin)
    lp_param = {'alpha': 0.98, 'normalization_factor': 5, 'method': 'variant'}
    ssl = SSLClassifier(graph, labels, scope, lp_param,
                        repeatTimes=50,
                        trainNumbers=label_num,
                        classCount=count)

    split_prefix = ('data/local/split/' + scope_name + '/' + 'lb'
                    + str(label_num).zfill(3) + '_')
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_prefix,
                                          newIds=newIds)
    return ssl.get_mean()
def knowsim_experiment(scope, scope_name, type_list, count, newLabels,
                       tau=1, kNeighbors=10, label_num=5):
    """Run label propagation on a similarity graph summed over ``type_list``.

    For every entry of ``type_list`` a k-nearest-neighbour cosine graph is
    built from the typed TF vectors, scaled by
    ``exp(-tau * laplacian_score)`` and added to the accumulated ``knowsim``
    matrix. The sum is then classified with ``SSLClassifier`` on the fixed
    splits under ``data/local/split/<scope_name>/``.

    :param scope: passed through to ``SSLClassifier``.
    :param scope_name: dataset name used to build the dump and split paths.
    :param type_list: values passed one at a time to
        ``GraphGenerator.getTFVectorX`` as its third argument.
    :param count: forwarded as ``classCount``.
    :param newLabels: labels handed to ``SSLClassifier``.
    :param tau: multiplier inside the exponential weighting.
    :param kNeighbors: number of top cosine entries kept per row (the row's
        own index is skipped when it is among them).
    :param label_num: forwarded as ``trainNumbers``; also selects the
        ``lbNNN_`` split-file prefix.
    :return: ``ssl.get_mean()``.
    """
    with open('data/local/' + scope_name + '.dmp') as dump_file:
        hin = pk.load(dump_file)

    repeats = 50

    word_param = {'word': True, 'entity': False, 'we_weight': 0.1}
    X_word, newIds, _ = GraphGenerator.getTFVectorX(hin, word_param)
    n = X_word.shape[0]

    knowsim = sparse.lil_matrix((n, n))
    for entity_type in type_list:
        typed_param = {'word': True, 'entity': True, 'we_weight': 0.1}
        X_typed, newIds, _ = GraphGenerator.getTFVectorX(
            hin, typed_param, entity_type)

        # Directed kNN graph: row i keeps its kNeighbors largest cosines.
        cosX = cosine_similarity(X_typed)
        neighbor_graph = sparse.lil_matrix((n, n))
        for row in range(n):
            nearest = np.argpartition(cosX[row], -kNeighbors)[-kNeighbors:]
            for col in nearest:
                if col != row:
                    neighbor_graph[row, col] = cosX[row, col]

        # Weight this graph by its Laplacian score before accumulating.
        row_sum = neighbor_graph.sum(axis=1)
        laplacian_score = generate_laplacian_score(row_sum, X_word, kNeighbors)
        knowsim = knowsim + np.exp(-tau * laplacian_score) * neighbor_graph

    knowsim = knowsim.tocsr()
    print('running lp')

    lp_param = {'alpha': 0.98, 'normalization_factor': 5}
    ssl = SSLClassifier(knowsim, newLabels, scope, lp_param,
                        repeatTimes=repeats,
                        trainNumbers=label_num,
                        classCount=count)
    split_prefix = ('data/local/split/' + scope_name + '/' + 'lb'
                    + str(label_num).zfill(3) + '_')
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_prefix,
                                          newIds=newIds)
    return ssl.get_mean()
def lp_experiment(scope, scope_name, count, graph, labels, newIds):
    """Run the fixed-split label-propagation experiment on a ready-made graph.

    Uses 5 training labels per run and 50 repeats, reading the splits from
    ``data/local/split/<scope_name>/lb005_*``.

    :param scope: passed through to ``SSLClassifier``.
    :param scope_name: dataset name used to build the split path.
    :param count: forwarded as ``classCount``.
    :param graph: graph handed to ``SSLClassifier``.
    :param labels: labels handed to ``SSLClassifier``.
    :param newIds: forwarded to ``repeatedFixedExperimentwithNewIds``.
    :return: ``ssl.get_mean()``.
    """
    train_count = 5
    settings = {'alpha': 0.98, 'normalization_factor': 5}
    split_prefix = ('data/local/split/' + scope_name + '/' + 'lb'
                    + str(train_count).zfill(3) + '_')

    ssl = SSLClassifier(graph, labels, scope, settings,
                        repeatTimes=50,
                        trainNumbers=train_count,
                        classCount=count)
    ssl.repeatedFixedExperimentwithNewIds(pathPrefix=split_prefix,
                                          newIds=newIds)
    return ssl.get_mean()
def generate_meta_graph(scope, scope_name, type_list, count):
    """Run label propagation per entry of ``type_list`` and save probabilities.

    For each type the TF matrix is column-scaled by
    ``20 * exp(-0.01 * laplacian_score)`` (scores are unpickled from
    ``data/local/laplacian/<scope_name>/<type>_scores``), turned into a
    symmetric bipartite adjacency matrix, and classified on the fixed splits.
    Predictions are written under ``data/local/metagraph/<scope_name>/<type>/``.

    :param scope: passed through to ``SSLClassifier``.
    :param scope_name: dataset name used to build every path.
    :param type_list: values passed one at a time to
        ``GraphGenerator.getTFVectorX`` as its third argument.
    :param count: forwarded as ``classCount``.
    :return: None.
    """
    split_path = 'data/local/split/' + scope_name + '/'
    pred_path = 'data/local/metagraph/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)

    with open('data/local/' + scope_name + '.dmp') as dump_file:
        hin = pk.load(dump_file)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    for entity_type in type_list:
        type_name = str(entity_type)
        X, newIds, _ = GraphGenerator.getTFVectorX(hin, tf_param, entity_type)
        num_rows = X.shape[0]
        num_cols = X.shape[1]

        score_file = ('data/local/laplacian/' + scope_name + '/' + type_name
                      + '_scores')
        with open(score_file) as scores_in:
            laplacian_score = pk.load(scores_in)

        # Re-weight each column by a decaying function of its score.
        column_weights = 20 * np.exp(-laplacian_score * 0.01)
        X = (X * sparse.diags(column_weights)).toarray()

        # Symmetric block matrix [[0, X], [X^T, 0]].
        size = num_rows + num_cols
        adjacency = np.zeros((size, size))
        adjacency[:num_rows, num_rows:size] = X
        adjacency[num_rows:size, :num_rows] = X.transpose()
        graph = sparse.csc_matrix(adjacency)

        newLabel = GraphGenerator.getNewLabels(hin)
        lp_param = {'alpha': 0.98, 'normalization_factor': 5}

        type_dir = pred_path + type_name + '/'
        for lp in [5]:
            tag = 'lb' + str(lp).zfill(3)
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=50,
                                trainNumbers=lp,
                                classCount=count)
            if not os.path.exists(type_dir):
                os.makedirs(type_dir)
            ssl.repeatedFixedExperimentwithNewIds(
                pathPrefix=split_path + tag + '_',
                newIds=newIds,
                saveProb=True,
                savePathPrefix=type_dir + tag)
def generate_train_test_split():
    """Generate and save random train/test splits for both data sets.

    Processes the first two scopes of the 20ng configuration and then the
    first two scopes of the gcat configuration (module-level
    ``ng20_*`` / ``gcat_*`` sequences). For each scope a cosine
    nearest-neighbour graph is built from the pickled HIN and
    ``SSLClassifier.repeatedExperiment`` is run with 5 training labels and
    50 repeats, saving under ``data/local/split/<scope_name>/lb005_``.

    :return: None.
    """
    repeat_times = 50
    lp_candidate = [5]

    def _split_dataset(scope_names, scopes, counts):
        # Same procedure for either data set; only the scope tables differ.
        for idx in range(2):
            scope_name = scope_names[idx]
            scope = scopes[idx]
            count = counts[idx]

            split_dir = 'data/local/split/' + scope_name
            if not os.path.exists(split_dir):
                os.makedirs(split_dir)
            experiment_path = split_dir + '/'

            with open('data/local/' + scope_name + '.dmp') as dump_file:
                hin = pk.load(dump_file)

            tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}
            graph, _new_ids = GraphGenerator.generateCosineNeighborGraph(
                hin, kNeighbors=10, tf_param=tf_param)
            new_label = GraphGenerator.getNewLabels(hin)

            for lp in lp_candidate:
                ssl = SSLClassifier(graph, new_label, scope, lp_param,
                                    repeatTimes=repeat_times,
                                    trainNumbers=lp,
                                    classCount=count)
                ssl.repeatedExperiment(
                    savePathPrefix=experiment_path + 'lb'
                    + str(lp).zfill(3) + '_')

    # 20ng
    _split_dataset(ng20_scope_names, ng20_scopes, ng20_counts)
    # gcat
    _split_dataset(gcat_scope_names, gcat_scopes, gcat_counts)
def ensemble_cotrain_experiment(scope, scope_name, type_list, threshold,
                                weight, count, label_num=5):
    """Alternate per-type label propagation with ensemble aggregation.

    Each of the two rounds has two steps:

    1. For every entry of ``type_list`` a bipartite graph is built from the
       Laplacian-weighted TF matrix and classified with ``SSLClassifier``.
       Round 0 uses the fixed splits only; later rounds also feed in the
       aggregated predictions written by the previous round. Per-type
       probabilities are saved under
       ``data/local/cotrain/<scope_name>/<type>/lbNNN_RRR_{train,test}``.
    2. For every repeat, the per-type probabilities are scattered back to
       row positions given by the split's label keys, test rows whose
       maximum probability does not exceed ``threshold[str(t)]`` are left at
       zero, and the weighted sum over types is pickled to
       ``data/local/cotrain/<scope_name>/lbNNN_pred_rd_DDD_RRR``.

    :param scope: passed to ``SSLClassifier``; ``len(scope)`` is the number
        of probability columns.
    :param scope_name: dataset name used to build every path.
    :param type_list: values passed one at a time to
        ``GraphGenerator.getTFVectorX`` as its third argument.
    :param threshold: mapping ``str(t)`` -> minimum top probability for a
        test row to contribute.
    :param weight: mapping ``str(t)`` -> multiplier for that type's
        probabilities in the aggregate.
    :param count: forwarded as ``classCount``.
    :param label_num: forwarded as ``trainNumbers``; also selects the
        ``lbNNN`` file prefix.
    :return: the largest ``ssl.get_mean()`` seen over all rounds and types
        (0 if none exceeded it).
    """
    pred_path = 'data/local/cotrain/' + scope_name + '/'
    split_path = 'data/local/split/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    with open('data/local/' + scope_name + '.dmp') as f:
        hin = pk.load(f)

    c = len(scope)
    lb_cand = [label_num]
    repeats = 50
    # rounds for alternating optimization
    rounds = 2
    best_res = 0

    # NOTE(review): only newIds from this word-only call can ever be read,
    # and it is overwritten below whenever type_list is non-empty. The call
    # is kept in case getTFVectorX has side effects on hin -- confirm.
    tf_param = {'word': True, 'entity': False, 'we_weight': 0.112}
    _, newIds, _ = GraphGenerator.getTFVectorX(
        hin, param=tf_param, entity_types=None)

    # Pre-compute the Laplacian-weighted feature matrix for every type.
    X_s = {}
    for t in type_list:
        type_dir = pred_path + str(t) + '/'
        if not os.path.exists(type_dir):
            os.makedirs(type_dir)
        with open('data/local/laplacian/' + scope_name + '/' + str(t) +
                  '_scores') as f:
            laplacian_score = pk.load(f)
        tf_param = {'word': True, 'entity': True, 'we_weight': 0.112}
        X_typed, newIds, _ = GraphGenerator.getTFVectorX(hin, tf_param, t)
        laplacian_score = 20 * np.exp(-laplacian_score * 0.01)
        X_s[str(t)] = X_typed * sparse.diags(laplacian_score)

    for rd in range(rounds):
        round_best_res = 0
        round_best_t = ''

        # step 1:
        # generate output of each meta-path
        for t in type_list:
            X = X_s[str(t)].toarray()
            n = X.shape[0]
            e = X.shape[1]
            # Symmetric block matrix [[0, X], [X^T, 0]].
            graph = np.zeros((n + e, n + e))
            graph[0:n, n:n + e] = X
            graph[n:n + e, 0:n] = X.transpose()
            graph = sparse.csc_matrix(graph)

            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {
                'alpha': 0.98,
                'normalization_factor': 5,
                'method': 'variant'
            }

            lb = label_num
            lb_tag = 'lb' + str(lb).zfill(3)
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            # Step 2 reads per-type predictions from <pred_path>/<t>/lbNNN_*,
            # so every round must save there. Rounds after the first used to
            # save to <pred_path>/lbNNN_<t>, which left step 2 re-reading the
            # stale round-0 predictions.
            save_prefix = pred_path + str(t) + '/' + lb_tag
            if rd == 0:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + lb_tag + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=save_prefix)
            else:
                inputPredPath = (pred_path + lb_tag + '_pred_rd_'
                                 + str(rd - 1).zfill(3))
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + lb_tag + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=save_prefix,
                    inputPredPath=inputPredPath)

            res = ssl.get_mean()
            if res > best_res:
                best_res = res
            if res > round_best_res:
                round_best_res = res
                round_best_t = t

        print('Round %d\t%.4f\t%s' % (rd, round_best_res, str(round_best_t)))

        # step 2:
        # propagate pseudo-label for other path
        for lb in lb_cand:
            lb_tag = 'lb' + str(lb).zfill(3)
            for r in range(repeats):
                run_tag = lb_tag + '_' + str(r).zfill(3)
                with open(split_path + run_tag + '_train') as f:
                    trainLabel = pk.load(f)
                with open(split_path + run_tag + '_test') as f:
                    testLabel = pk.load(f)

                n = len(trainLabel) + len(testLabel)

                # write output probability
                outPred = np.zeros((n, c))
                for t in type_list:
                    typePred = np.zeros((n, c))
                    with open(pred_path + str(t) + '/' + run_tag +
                              '_train') as f:
                        trainPred = pk.load(f)
                    # Row i of the saved predictions belongs to the i-th key
                    # of the split dict.
                    for i, k in enumerate(trainLabel.keys()):
                        typePred[k, :] = trainPred[i, :]

                    with open(pred_path + str(t) + '/' + run_tag +
                              '_test') as f:
                        testPred = pk.load(f)
                    for i, k in enumerate(testLabel.keys()):
                        # Block 'unconfident' test points: keep a row only if
                        # its top probability exceeds the type's threshold.
                        top_prob = np.max(testPred[i, :])
                        if top_prob > threshold[str(t)]:
                            typePred[k, :] = testPred[i, :]

                    # add meta-path probability to global probability
                    outPred += typePred * weight[str(t)]

                # NOTE(review): text mode 'w' matches the original; binary
                # mode would be required for pickle on Python 3.
                with open(pred_path + lb_tag + '_pred_rd_' +
                          str(rd).zfill(3) + '_' + str(r).zfill(3),
                          'w') as f:
                    pk.dump(outPred, f)
    return best_res
def lp_meta_experiment(scope, scope_name, type_list, threshold, weight, count,
                       label_num=5):
    """Alternate per-type meta-path label propagation with aggregation.

    Runs two rounds. In each round every entry of ``type_list`` gets a graph
    from ``GraphGenerator.getMetaPathGraph`` and is classified with
    ``SSLClassifier`` (round 0 on the fixed splits only, later rounds also
    with the previous round's aggregated predictions as input). Afterwards,
    for every repeat, the per-type probabilities saved under
    ``data/local/lpmeta/<scope_name>/<type>/`` are scattered to the row
    positions given by the split's label keys, multiplied by
    ``weight[str(t)]``, summed, and pickled to
    ``data/local/lpmeta/<scope_name>/lbNNN_pred_rd_DDD_RRR``.

    :param scope: passed to ``SSLClassifier``; ``len(scope)`` is the number
        of probability columns.
    :param scope_name: dataset name used to build every path.
    :param type_list: values passed one at a time to
        ``GraphGenerator.getMetaPathGraph``.
    :param threshold: accepted for signature parity; not read here.
    :param weight: mapping ``str(t)`` -> multiplier for that type's
        probabilities in the aggregate.
    :param count: forwarded as ``classCount``.
    :param label_num: forwarded as ``trainNumbers``; also selects the
        ``lbNNN`` file prefix.
    :return: the largest ``ssl.get_mean()`` seen over all rounds and types
        (0 if none exceeded it).
    """
    pred_path = 'data/local/lpmeta/' + scope_name + '/'
    if not os.path.exists(pred_path):
        os.makedirs(pred_path)
    split_path = 'data/local/split/' + scope_name + '/'

    with open('data/local/' + scope_name + '.dmp') as dump_file:
        hin = pk.load(dump_file)

    tf_param = {'word': True, 'entity': True, 'we_weight': 0.1}
    num_classes = len(scope)
    repeats = 50
    rounds = 2  # rounds for alternating optimization
    best_res = 0

    for rd in range(rounds):
        # step 1: generate output of each meta-path
        for t in type_list:
            type_dir = pred_path + str(t)
            if not os.path.exists(type_dir):
                os.makedirs(type_dir)

            graph, newIds = GraphGenerator.getMetaPathGraph(hin, tf_param, t)
            newLabel = GraphGenerator.getNewLabels(hin)
            lp_param = {'alpha': 0.99, 'normalization_factor': 0.01}

            lb = label_num
            tag = 'lb' + str(lb).zfill(3)
            ssl = SSLClassifier(graph, newLabel, scope, lp_param,
                                repeatTimes=repeats,
                                trainNumbers=lb,
                                classCount=count)
            if rd != 0:
                # Later rounds take the previous round's aggregate as input.
                inputPredPath = ('data/local/lpmeta/' + scope_name + '/' + tag
                                 + '_pred_rd_' + str(rd - 1).zfill(3))
                ssl.repeatedFixedExpeimentwithInput(
                    pathPrefix=split_path + tag + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=type_dir + '/' + tag,
                    inputPredPath=inputPredPath)
            else:
                ssl.repeatedFixedExperimentwithNewIds(
                    pathPrefix=split_path + tag + '_',
                    newIds=newIds,
                    saveProb=True,
                    savePathPrefix=type_dir + '/' + tag)

            res = ssl.get_mean()
            if res > best_res:
                best_res = res

        # step 2: propagate pseudo-label for other path
        for lb in [label_num]:
            tag = 'lb' + str(lb).zfill(3)
            for r in range(repeats):
                run_tag = tag + '_' + str(r).zfill(3)
                with open(split_path + run_tag + '_train') as train_in:
                    trainLabel = pk.load(train_in)
                with open(split_path + run_tag + '_test') as test_in:
                    testLabel = pk.load(test_in)

                total = len(trainLabel) + len(testLabel)

                # Weighted sum of every type's probabilities for this repeat.
                outPred = np.zeros((total, num_classes))
                for t in type_list:
                    typePred = np.zeros((total, num_classes))
                    type_prefix = pred_path + str(t) + '/' + run_tag

                    with open(type_prefix + '_train') as pred_in:
                        trainPred = pk.load(pred_in)
                    # Row i of the saved predictions belongs to the i-th key
                    # of the split dict.
                    for row, key in enumerate(trainLabel.keys()):
                        typePred[key, :] = trainPred[row, :]

                    with open(type_prefix + '_test') as pred_in:
                        testPred = pk.load(pred_in)
                    for row, key in enumerate(testLabel.keys()):
                        typePred[key, :] = testPred[row, :]

                    outPred += typePred * weight[str(t)]

                out_file = ('data/local/lpmeta/' + scope_name + '/' + tag
                            + '_pred_rd_' + str(rd).zfill(3) + '_'
                            + str(r).zfill(3))
                with open(out_file, 'w') as pred_out:
                    pk.dump(outPred, pred_out)
    return best_res