def crossData(data_list, alpha=0.0, rank_weight=False, stop_criterion_mis_rate=None,
              stop_criterion_min_node=1, stop_criterion_gain=0.0, prune_criteria=0):
    """Train a decision tree on each dataset and test it on every other dataset."""
    # rank_weight and prune_criteria are accepted for interface parity but unused here
    results = {}
    for data_train in data_list:
        results[data_train] = {}
        for data_test in data_list:
            if data_test == data_train:
                continue
            x_train, y_tr = LogR.dataClean(data_train)
            y_train = label2Rank(y_tr.tolist())
            x_test, y_te = LogR.dataClean(data_test)
            y_test = label2Rank(y_te.tolist())
            tree = DecisionTree().buildtree(
                x_train, y_train, weights=None,
                stop_criterion_mis_rate=stop_criterion_mis_rate,
                stop_criterion_min_node=stop_criterion_min_node,
                stop_criterion_gain=stop_criterion_gain)
            y_pred = tree.predict(x_test, alpha)
            results[data_train][data_test] = LogR.perfMeasure(y_pred, y_test, rankopt=True)
    return results
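
# label2Rank is assumed from the surrounding project and not shown here.
# A minimal sketch of what it likely does -- turn per-class score vectors
# into rank vectors (1 = highest score); the real helper may break ties
# differently. label2Rank_sketch is an illustrative name, not project code.
import numpy as np

def label2Rank_sketch(labels):
    scores = np.asarray(labels, dtype=float)
    # argsort of the descending argsort gives each entry's rank position
    return np.argsort(np.argsort(-scores, axis=1), axis=1) + 1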
results["perf"].append(LogR.perfMeasure(y_pred, y_test, rankopt=True)) if nocross: break for key in results.keys(): item = np.array(results[key]) mean = np.nanmean(item, axis=0) std = np.nanstd(item, axis=0) results[key] = [mean, std] return results if __name__ == "__main__": x,y = LogR.dataClean("data/posts_Feature_Emotion.txt") y = label2Rank(y) # x,y = dataSimulated(100,3,5) # for j in range(1,6): # stop_criterion_mis_rate = 0.22 - 0.04*j # for m in range(10): # ITER_MAX = 10 + m*10 result = crossValidate(x,y, nocross = False, iter_max=ITER_MAX, cost = cost) print result with open("result_boost.txt","a") as f: f.write("Nsamp: %d\n" % x.shape[0]) f.write("iter_max "+str(ITER_MAX)+"\n") f.write("stop misclassification rate %f\n" %stop_criterion_mis_rate) f.write("cost: AdaC2.M1\n") f.write("cost_level %s" % str(COST_LEVEL)) f.write(str(result)+"\n")
    return recalls


def multitest(x, y, Ntest=10):
    results = {"perf": []}
    for itest in range(Ntest):
        x_train, y_train, x_test, y_test = anomalyDataPrep(x, y)
        results["perf"].append(traintest(x_train, y_train, x_test, y_test))
    for key in results.keys():
        item = np.array(results[key])
        mean = np.nanmean(item, axis=0)
        std = np.nanstd(item, axis=0)
        results[key] = [mean, std]
    return results


if __name__ == "__main__":
    K_SMPrank = int(sys.argv[2])
    news = sys.argv[1]
    result_file = "results/anomaly_SMP" + news + ".txt"
    x, y = dataClean("data/" + news + "_Feature_linkemotion.txt")
    y = label2Rank(y)
    results = multitest(x, y)
    print results
    with open(result_file, "a") as f:
        f.write("news: %s\n" % news)
        f.write("K_SMPrank: %d\n" % K_SMPrank)
        f.write(str(results) + "\n")
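
# Usage, inferred from the sys.argv handling above (the script name is an
# assumption): the first argument selects the news dataset, the second the
# SMPrank K parameter, e.g.
#   python anomaly_smp.py nytimes 10
# which appends mean/std perf vectors to results/anomaly_SMPnytimes.txt.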
if __name__ == "__main__":
    ### test ###
    # x,y = dataSimulated(8, 3, 6)
    # print x
    # print y
    # Nsamp = x.shape[0]
    # weight = 1.0/Nsamp
    # weights = np.array([weight for i in range(Nsamp)], dtype = np.float32)
    # print weights
    # tree = DecisionTree().buildtree(x,y,weights, stop_criterion_mis_rate=0.4)
    # tree.printtree()
    # for i in range(x.shape[0]):
    #     y_pred = tree.predict(x[i])
    #     print y_pred, y[i]

    # timing experiment: decision-tree cross-validation on growing sample sizes
    x_total, y_total = LogR.dataClean("data/nytimes_Feature_linkemotion.txt")
    y_total = label2Rank(y_total)
    sizes = [100 * i for i in range(1, 41)]
    np.random.seed(2017)
    for size in sizes:
        samples = np.arange(y_total.shape[0])
        np.random.shuffle(samples)
        x, y = x_total[samples[:size]], y_total[samples[:size]]
        start = datetime.now()
        result = crossValidate(x, y, cv=2, stop_criterion_mis_rate=0.0, rank_weight=False)
        duration = datetime.now() - start
        with open("time_dt", "a") as f:
            # the original write body is truncated here; logging the sample
            # size and wall-clock seconds is an assumption consistent with
            # the timing loop above
            f.write("size %d: %f s\n" % (size, duration.total_seconds()))
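
# crossValidate is defined elsewhere in this file; a hedged sketch of the
# cv-fold loop it presumably runs (DecisionTree and LogR.perfMeasure follow
# the code above, the fold logic itself is an assumption):
def crossValidate_sketch(x, y, cv=5, stop_criterion_mis_rate=0.0, rank_weight=False):
    folds = np.array_split(np.random.permutation(x.shape[0]), cv)
    perf = []
    for i in range(cv):
        test_idx = folds[i]
        train_idx = np.concatenate([folds[j] for j in range(cv) if j != i])
        tree = DecisionTree().buildtree(
            x[train_idx], y[train_idx], weights=None,
            stop_criterion_mis_rate=stop_criterion_mis_rate)
        perf.append(LogR.perfMeasure(tree.predict(x[test_idx]), y[test_idx], rankopt=True))
    item = np.array(perf)
    return [np.nanmean(item, axis=0), np.nanstd(item, axis=0)]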
    # (continuation of anomalyDataPrep: the non-anomalous train/test split)
    else:
        Nsamp_train = Nsamp - test_size
        Nsamp_test = Nsamp - Nsamp_train
        x_train = x[samples[:Nsamp_train]]
        y_train = y[samples[:Nsamp_train]]
        x_test = x[samples[Nsamp_train:]]
        y_test = y[samples[Nsamp_train:]]
    return x_train, y_train, x_test, y_test


def distance(rank_true, rank_candidate):
    """Negative Kendall's tau as a distance measure: larger means farther apart."""
    return -KendalltauSingle(rank_true, rank_candidate)


if __name__ == "__main__":
    x, y = dataClean("data/nytimes_Feature_linkemotion.txt")
    y = label2Rank(y)
    x_train, y_train, x_test, y_test = anomalyDataPrep(x[:40, :], y[:40, :])
    print "-------- training data --------"
    print x_train
    print y_train
    print "-------- non-perturbed data --------"
    print x_test[0]
    print y_test[0]
    print "-------- perturbed data --------"
    print x_test[1]
    print y_test[1]
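
# KendalltauSingle is imported from elsewhere in the project; a minimal
# sketch of the tau it presumably computes for two rank vectors (no tie
# correction -- the real helper may handle ties differently):
def KendalltauSingle_sketch(rank_a, rank_b):
    n = len(rank_a)
    s = 0
    for i in range(n):
        for j in range(i + 1, n):
            # +1 for a concordant pair, -1 for a discordant pair
            s += np.sign((rank_a[i] - rank_a[j]) * (rank_b[i] - rank_b[j]))
    return 2.0 * s / (n * (n - 1))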
    y_pred = []
    for i in range(THREADS):
        y_pred = y_pred + y_preds[i]
    return np.array(y_pred)


def singlethreadPredict(x_test, y_pred, KNNobject):
    results = KNNobject.predict(x_test)
    for result in results.tolist():
        y_pred.append(result)


if __name__ == "__main__":
    datafile = "data/posts_Feature_Emotion.txt"
    Ks = [10, 40, 80]
    for K in Ks:
        print K, "start at ", datetime.now()
        x, y = LogR.dataClean(datafile)
        y = np.array(map(LogR.rankOrder, y.tolist()))
        # x, y = readSushiData()
        # x,y = x[:1000, :], y[:1000, :]
        start = datetime.now()
        result = crossValidate(x, y, K=K)
        duration = datetime.now() - start
        print duration.total_seconds()
        print result
        with open("results/result_KNNMallows.txt", "a") as f:
            f.write("K = %d\n" % K)
            f.write("data = %s\n" % datafile)
            f.write("time = %f\n" % duration.total_seconds())
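
# Hedged sketch of the dispatch that plausibly precedes the merge at the top
# of this fragment: split x_test into THREADS chunks, hand each chunk to
# singlethreadPredict on its own thread with a per-thread output list, then
# merge in chunk order (multithreadPredict_sketch and its internals are
# assumptions, not the project's actual code):
import threading

def multithreadPredict_sketch(x_test, KNNobject, THREADS=4):
    chunks = np.array_split(x_test, THREADS)
    y_preds = [[] for _ in range(THREADS)]
    threads = [threading.Thread(target=singlethreadPredict,
                                args=(chunks[i], y_preds[i], KNNobject))
               for i in range(THREADS)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    y_pred = []
    for i in range(THREADS):
        y_pred = y_pred + y_preds[i]  # preserve chunk order
    return np.array(y_pred)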