def supernatural_rs():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='DataSetsEraldo/dataSetSupernatural.txt',
        task='supernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()

    ndy = np.array(y)
    print('positive examples, whole dataset')
    print(ndy.sum())
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (100, 0)
    lrs = (1e-5, 1e-1)
    myTuner.random_search_cv(execs=6, epoch_limits=epochs, lr_limits=lrs,
                             cv=10, folds=f, freeze_epochs=True,
                             freeze_lr=False)
    print("RS finished!\n")
def test_kfold():
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    cnn_config = TCNNConfig()
    cnn_config.num_epochs = 10
    file_config = FilesConfig(vocab_file='twitterhashtags.vocab',
                              dataset_file='DataSetsEraldo/dataSetBahia.txt')

    for i in range(5):
        for cf in f:
            model0 = TextCNN(cnn_config)
            print(c.train_distribution())
            c.prepare_sample(x, y, size=300)
            c.sub_sampling(size=300)
            print(c.x_train.shape)
            t = Trainer(corpus=cf, model=model0, config=cnn_config,
                        file_config=file_config, verbose=True)
            train_acc, train_loss, val_acc, val_loss, best_epoch = t.train()
def cv_(param):
    # Map each tunable hyperparameter present in params_df onto the
    # corresponding attribute of the estimator; parameters absent from
    # the grid keep their current value.
    params_names = list(params_df.columns.values)
    for name in ('kernel', 'ld', 'sigma', 'k', 'd', 'e', 'beta'):
        try:
            setattr(self, name, param[params_names.index(name)])
        except ValueError:
            pass

    # Reset indices so positional splits line up when computing accuracy.
    X_reset = X.reset_index(drop=True)
    y_reset = y.reset_index(drop=True)

    kf = KFold(n_splits=cv, shuffle=True, random_state=42)
    score = []
    for train_index, val_index in kf.split(X_reset):
        X_train, X_val = X_reset.iloc[train_index], X_reset.iloc[val_index]
        y_train, y_val = y_reset.iloc[train_index], y_reset.iloc[val_index]
        self.fit(X_train, y_train)
        y_pred = self.predict(X_val)
        score.append(np.mean(y_pred == y_val.reset_index(drop=True)))
    return np.mean(score), np.var(score)
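# A minimal sketch of how cv_ might be driven from a hyperparameter grid
# (hedged: the grid values below and the assumption that cv_ closes over
# this params_df are illustrative, not the surrounding project's code).
import itertools

import numpy as np
import pandas as pd

grid = {'kernel': ['rbf', 'poly'], 'sigma': [0.1, 1.0], 'ld': [1e-3, 1e-1]}
params_df = pd.DataFrame(list(itertools.product(*grid.values())),
                         columns=list(grid.keys()))
# cv_ returns (mean accuracy, variance) for each parameter tuple.
results = [cv_(tuple(row)) for row in params_df.itertuples(index=False)]
best = params_df.iloc[int(np.argmax([m for m, _ in results]))]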
def TIcE(data, features, labeled_info, nfolds=5, M=500, bepp=5,
         evaluation=EvaluatePaper):
    '''
    Tree Induction for c Estimation (TIcE)

    Args:
        data (list): list of observation points (dictionaries with
            [feature]:value).
        features (list): list of which features should be considered.
        labeled_info (str): name of the feature that indicates whether an
            observation point is labeled or not (1 or 0, respectively).
        nfolds (int): number of folds to average the final prediction.
        M (int): hard limit for branching.
        bepp (number): parameter from TIcE's paper.
        evaluation (callable): evaluation measure of branch quality.

    Returns:
        (pred_c, pred_alpha): predicted c and predicted p (p is the
            proportion of positive observation points within the UNLABELED
            portion of the data).
    '''
    pred_c = 0.5
    for _ in range(2):
        clist = []
        for est_data, tree_data in KFold(nfolds, data):
            delta = max(0.025, 1 / (1 + 0.004 * T(est_data)))
            cbest = L(est_data, labeled_info) / T(est_data)
            pq = [(-evaluation(tree_data, delta, pred_c, labeled_info),
                   random(), tree_data, est_data, features[:])]
            limit = max(0, min(1000, math.floor(
                0.5 + 0.1 * min(T(est_data), T(tree_data)))))
            m = 0
            while m < M and len(pq) > 0:
                _, _, St, Se, feat = heapq.heappop(pq)
                m += 1
                if T(St) < limit or T(Se) < limit:
                    continue
                nev = evaluation(Se, delta, pred_c, labeled_info)
                cbest = max(cbest, nev)
                nfeat = []
                possible_feats = []
                for f in feat:
                    med = median([x[f] for x in St])
                    left_St = [x for x in St if x[f] <= med]
                    left_Se = [x for x in Se if x[f] <= med]
                    right_St = [x for x in St if x[f] > med]
                    right_Se = [x for x in Se if x[f] > med]
                    if T(left_St) == 0 or T(right_St) == 0:
                        continue
                    nfeat.append(f)
                    crit = max(L(left_St, labeled_info) / (bepp + T(left_St)),
                               L(right_St, labeled_info) / (bepp + T(right_St)))
                    node = (crit, random(), f,
                            left_St, left_Se, right_St, right_Se)
                    possible_feats.append(node)
                if len(possible_feats) > 0:
                    _, _, f, left_St, left_Se, right_St, right_Se = \
                        max(possible_feats)
                    if T(left_St) > limit and T(left_Se) > limit:
                        heapq.heappush(pq, (
                            -evaluation(left_St, delta, pred_c, labeled_info),
                            random(), left_St, left_Se, nfeat))
                    if T(right_St) > limit and T(right_Se) > limit:
                        heapq.heappush(pq, (
                            -evaluation(right_St, delta, pred_c, labeled_info),
                            random(), right_St, right_Se, nfeat))
            clist.append(cbest)
        pred_c = mean(clist)
    l = L(data, labeled_info)
    pred_alpha = max(0, min(1, (l / pred_c - l) / (len(data) - l)))
    return pred_c, pred_alpha
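# Why TIcE's last two lines work: with l labeled points among n total and
# c = P(labeled | positive), the expected total number of positives is
# l / c. Removing the l known labeled ones and dividing by the n - l
# unlabeled points gives the prior within the unlabeled portion:
#
#     alpha = (l / c - l) / (n - l)
#
# e.g. l = 100, c = 0.5, n = 1000  =>  alpha = (200 - 100) / 900 ~ 0.111.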
def ENKF(data, features, labeled_info, k=5, gamma="auto"):
    '''
    Elkan's Algorithm (EN) with k-fold cross validation (not original
    Elkan). Returned c is incorrect.
    '''
    labeled = [x for x in data if x[labeled_info] == 1]
    unlabeled = [x for x in data if x[labeled_info] != 1]
    all_probs = []
    all_alphas = []
    for tr, te in KFold(k, labeled):
        data_tr = unlabeled + tr
        shuffle(data_tr)
        test_data = pd.DataFrame(te)[features]
        training_data = pd.DataFrame(data_tr)
        svc = SVC(probability=True, gamma=gamma)
        svc.fit(training_data[features], training_data[labeled_info])
        right_index = 0 if svc.classes_[0] == 1 else 1
        svc_probs = svc.predict_proba(test_data)
        c_probs = [x[right_index] for x in svc_probs]
        all_probs += c_probs
        pred_c = np.mean(c_probs)
        l = len(tr)
        all_alphas += [max(0, min(1, (l / pred_c - l) / (len(data_tr) - l)))]
    pred_c = np.mean(all_probs)
    pred_alpha = np.mean(all_alphas)
    return pred_c, pred_alpha
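# A minimal, self-contained sketch of the Elkan & Noto "e1" estimator that
# ENKF approximates: c is the mean predicted probability of being labeled,
# averaged over held-out labeled positives. The synthetic data and every
# name below are illustrative assumptions, not the surrounding module.
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

rng = np.random.default_rng(0)
X_pos = rng.normal(1.0, 1.0, size=(200, 2))   # positives
X_neg = rng.normal(-1.0, 1.0, size=(400, 2))  # negatives
X = np.vstack([X_pos, X_neg])
s = np.zeros(len(X))                           # s = 1 iff the point is labeled
s[rng.choice(200, 100, replace=False)] = 1     # label half of the positives

c_folds = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
for tr, te in skf.split(X, s):
    clf = SVC(probability=True, gamma="auto").fit(X[tr], s[tr])
    held_out_pos = te[s[te] == 1]              # held-out labeled positives
    c_folds.append(clf.predict_proba(X[held_out_pos])[:, 1].mean())
print("estimated c:", np.mean(c_folds))        # true c here is 0.5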
def train_cv():
    ultimos_r = []
    dt = []
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='DataSetsEraldo/dataSetSupernatural.txt')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    for cv in f:
        t = Trainer(corpus=cv, file_config=file_config, verbose=True)
        ultimos_r.append(t.train(dt))

    print(ultimos_r)
    print(':)')
def test_get_next(self):
    data = [1, 2, 3, 4, 5, 6]
    classes = ['a', 'b', 'c', 'd', 'e', 'f']
    kfold = KFold(3, data, classes)
    d1, c1 = kfold.get_next()
    d2, c2 = kfold.get_next()
    self.assertEqual(1, d1[0])
    self.assertEqual(2, d1[1])
    self.assertEqual(3, d1[2])
    self.assertEqual('a', c1[0])
    self.assertEqual('b', c1[1])
    self.assertEqual('c', c1[2])
    self.assertEqual(4, d2[0])
    self.assertEqual(5, d2[1])
    self.assertEqual(6, d2[2])
    self.assertEqual('d', c2[0])
    self.assertEqual('e', c2[1])
    self.assertEqual('f', c2[2])
def pre_rs_supernatural():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt',
        task='supernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (100, 6)
    lrs = (1e-5, 1e-2)
    myTuner.random_search_cv(execs=1, epoch_limits=epochs, lr_limits=lrs,
                             cv=1, folds=f, freeze_lr=True,
                             freeze_epochs=True)
    print("PRS finished!\n")
def test_kfold_rs():
    cnn_config = TCNNConfig()
    cnn_config.num_epochs = 4
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    myTuner = Tuner(c, file_config)
    epochs = (10, 30)
    lrs = (0.0001, 0.01)
    myTuner.random_search_cv(execs=5, epoch_limits=epochs, lr_limits=lrs,
                             cv=4, folds=f, freeze_lr=True)
    print("RS finished!\n")
def test_get_next(self):
    data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    classes = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']
    counter = 0
    kfold = KFold(2, data, classes)
    while kfold.has_next():
        train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
        self.assertEqual(8, len(train_d1))
        self.assertEqual(8, len(train_c1))
        self.assertEqual(2, len(test_d1))
        self.assertEqual(2, len(test_c1))
        counter += 1
    self.assertEqual(5, counter)

    kfold = KFold(2, data, classes)
    train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
    self.assertEqual(True, lists_are_equal(train_d1, [3, 4, 5, 6, 7, 8, 9, 10]))
    self.assertEqual(True, lists_are_equal(train_c1, ['c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']))
    self.assertEqual(True, lists_are_equal(test_d1, [1, 2]))
    self.assertEqual(True, lists_are_equal(test_c1, ['a', 'b']))

    train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
    self.assertEqual(True, lists_are_equal(train_d1, [1, 2, 5, 6, 7, 8, 9, 10]))
    self.assertEqual(True, lists_are_equal(train_c1, ['a', 'b', 'e', 'f', 'g', 'h', 'i', 'j']))
    self.assertEqual(True, lists_are_equal(test_d1, [3, 4]))
    self.assertEqual(True, lists_are_equal(test_c1, ['c', 'd']))

    # Skip ahead to the last fold.
    train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
    train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
    train_d1, train_c1, test_d1, test_c1 = kfold.get_next()
    self.assertEqual(True, lists_are_equal(train_d1, [1, 2, 3, 4, 5, 6, 7, 8]))
    self.assertEqual(True, lists_are_equal(train_c1, ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h']))
    self.assertEqual(True, lists_are_equal(test_d1, [9, 10]))
    self.assertEqual(True, lists_are_equal(test_c1, ['i', 'j']))
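# A minimal sketch of the KFold interface this test exercises, reconstructed
# from its assertions (an assumption, not the project's implementation; the
# earlier two-tuple test above targets a different KFold class):
# KFold(fold_size, data, classes) walks the data in contiguous chunks of
# fold_size, yielding (train_d, train_c, test_d, test_c) on each get_next().
class KFold:
    def __init__(self, fold_size, data, classes):
        self.fold_size = fold_size
        self.data = data
        self.classes = classes
        self.i = 0  # start index of the current test chunk

    def has_next(self):
        return self.i < len(self.data)

    def get_next(self):
        j = self.i + self.fold_size
        test_d, test_c = self.data[self.i:j], self.classes[self.i:j]
        train_d = self.data[:self.i] + self.data[j:]
        train_c = self.classes[:self.i] + self.classes[j:]
        self.i = j
        return train_d, train_c, test_d, test_c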
def BFT(data, features, labeled_info, scorer, nfolds=10):
    '''
    Best Fixed Threshold

    Args:
        data (list): list of observation points (dictionaries with
            [feature]:value).
        features (list): list of which features should be considered.
        labeled_info (str): name of the feature that indicates whether an
            observation point is labeled or not (1 or 0, respectively).
        scorer (callable): One-class scorer to be used. See OCScorers.py
            for more information.
        nfolds (int): number of folds for cross validation to generate
            training scores.

    Returns:
        pred_alphas: 101 predicted p's, one for each percentile of training
            positive scores (p is the proportion of positive observation
            points within the UNLABELED portion of the data).
    '''
    labeled = [x for x in data if x[labeled_info] == 1]
    p_scores = []
    for tr, te in KFold(nfolds, labeled):
        tr_df = pd.DataFrame(tr)[features]
        te_df = pd.DataFrame(te)[features]
        p_scores += scorer(tr_df, te_df)
    p_scores.sort()

    labeled_df = pd.DataFrame(labeled)[features]
    unlabeled = [x for x in data if x[labeled_info] == 0]
    unlabeled_df = pd.DataFrame(unlabeled)[features]
    t_scores = scorer(labeled_df, unlabeled_df)
    t_scores.sort()

    percentiles = np.arange(0, 101, 1)
    thresholds = np.percentile(p_scores, percentiles)
    n = len(t_scores)
    alphas = []
    for thr in thresholds:  # was `zip(thresholds)`, which yields 1-tuples
        n_neg = float(np.searchsorted(t_scores, thr, side="right"))
        n_pos = n - n_neg
        alphas.append(n_pos / n)
    return np.array(alphas)
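# A hedged usage sketch for BFT. The scorer below is an illustrative
# assumption about the OCScorers.py contract (fit on the first frame of
# labeled positives, return a list of scores for the second frame); it is
# not the project's actual scorer, and 'labeled_flag' is a placeholder.
from sklearn.svm import OneClassSVM

def ocsvm_scorer(train_df, test_df):
    model = OneClassSVM(gamma="auto").fit(train_df)
    return list(model.decision_function(test_df))

# data is a list of dicts holding the feature columns plus a 0/1 label flag.
alphas = BFT(data, features, 'labeled_flag', ocsvm_scorer, nfolds=10)
print(alphas[95])  # alpha estimate at the 95th-percentile threshold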
def ODIn(data, features, labeled_info, scorer, nfolds=10):
    '''
    One Distribution Inside (ODIn)

    Args:
        data (list): list of observation points (dictionaries with
            [feature]:value).
        features (list): list of which features should be considered.
        labeled_info (str): name of the feature that indicates whether an
            observation point is labeled or not (1 or 0, respectively).
        scorer (callable): One-class scorer to be used. See OCScorers.py
            for more information.
        nfolds (int): number of folds for cross validation to generate
            training scores.

    Returns:
        (pred_c, pred_alpha): predicted c and predicted p (p is the
            proportion of positive observation points within the UNLABELED
            portion of the data).
    '''
    labeled = [x for x in data if x[labeled_info] == 1]
    p_scores = []
    for tr, te in KFold(nfolds, labeled):
        tr_df = pd.DataFrame(tr)[features]
        te_df = pd.DataFrame(te)[features]
        p_scores += scorer(tr_df, te_df)

    labeled_df = pd.DataFrame(labeled)[features]
    unlabeled = [x for x in data if x[labeled_info] == 0]
    unlabeled_df = pd.DataFrame(unlabeled)[features]
    t_scores = scorer(labeled_df, unlabeled_df)

    percentiles = np.arange(0, 101, 10)
    thresholds = np.percentile(p_scores, percentiles)
    overflow_limit = EstimateOverflowLimit(p_scores, thresholds)
    p_histogram = CreateHistogram(p_scores, thresholds)
    t_histogram = CreateHistogram(t_scores, thresholds)
    p = FindP(p_histogram, t_histogram, overflow_limit)
    c = max(0, min(1, len(labeled) / (p * len(unlabeled) + len(labeled))))
    return c, p
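# ODIn's closing line inverts the same identity TIcE uses, going from the
# estimated prior p back to the label frequency c: the expected number of
# positives is |L| + p * |U|, of which the |L| labeled ones are known, so
#
#     c = |L| / (|L| + p * |U|),
#
# clipped to [0, 1].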
def supernatural_lltrain():
    file_config = FilesConfig(
        vocab_file='twitter_hashtag/twitterhashtags.vocab',
        dataset_file='twitter_hashtag/out.txt',
        task='10llsupernatural')
    c = CorpusTE(train_file='DataSetsEraldo/dataSetSupernatural.txt',
                 vocab_file='twitter_hashtag/twitterhashtags.vocab')
    x, y = c.prepare()
    print(c.size)

    f = KFold(c, 3, rand=1)
    f.prepare_fold(x, y)

    cnn_config_s = TCNNConfig()
    cnn_config_s.num_classes = 2
    args = [
        cnn_config_s,
        '../experiments/1kthashtag.2019-10-21/checkpoints/model21102019-211333epc200lr0.0001.emb',
        '../experiments/1kthashtag.2019-10-21/checkpoints/model21102019-211333epc200lr0.0001.convs'
    ]

    # f is rebuilt as a RandomSplit; the KFold above is discarded.
    f = RandomSplit(corpus=c, n=10, sub=350)
    f.x = x
    f.y = y

    t = Tuner(c, file_config, callback=model_load, args=args, rand=False)
    epochs = (5, 6)
    lrs = (1e-5, 1e-2)
    t.random_search_rsplit(execs=4, rsplits=f, epoch_limits=epochs,
                           lr_limits=lrs, freeze_epochs=True,
                           freeze_lr=False, r=10)
    print("RS finished!\n")
def RunExperiment(exp_name, scorer_name, niterations=5, max_sample_size=2000,
                  max_labeled_size=500, nfolds=5):
    exp = Exp[exp_name]
    dataset_filename = exp["input"]
    output_filename = exp["output"]
    class_feature = exp["class_feature"]
    positive_label = exp["positive_label"]
    scorer = scorers_by_name[scorer_name]
    features = None
    negative_labels = None

    data_df = pd.read_csv(dataset_filename, index_col=False)
    if callable(exp["negative_labels"]):
        negative_labels = set(filter(exp["negative_labels"],
                                     set(data_df[class_feature])))
    elif isinstance(exp["negative_labels"], list):
        negative_labels = set(exp["negative_labels"])
    else:
        negative_labels = set([x for x in set(data_df[class_feature])
                               if x != positive_label])
    all_labels = set(list(negative_labels) + [positive_label])
    data_df = pd.DataFrame(data_df.loc[data_df[class_feature].map(
        lambda x: x in all_labels)])

    if callable(exp["features"]):
        features = list(filter(exp["features"], list(data_df)))
    elif isinstance(exp["features"], list):
        features = exp["features"]
    else:
        features = [x for x in list(data_df) if x != class_feature]

    # Dummy column name, unlikely to clash with a real feature.
    labeled_info = 'dfjiweojgf'
    data = data_df.to_dict('records')  # was 'registers', an invalid orient

    for_table = []
    for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
        abs_errors = np.zeros(101)
        abs_errors_2 = np.zeros(101)
        errors = np.zeros(101)
        errors_2 = np.zeros(101)
        ms_per_example = []
        for it in trange(niterations):
            shuffle(data)
            eprint('Iteration #%d' % (it + 1))
            for fold_i, (unlabeled, all_labeled) in zip(
                    trange(nfolds, desc="kfold"), KFold(nfolds, data)):
                eprint('    Fold #%d' % (fold_i + 1))
                for x in unlabeled:
                    x[labeled_info] = 0
                for x in all_labeled:
                    x[labeled_info] = 1
                labeled = [x for x in all_labeled
                           if x[class_feature] == positive_label]
                positives = [x for x in unlabeled
                             if x[class_feature] == positive_label]
                negatives = [x for x in unlabeled
                             if x[class_feature] != positive_label]
                shuffle(labeled)
                shuffle(positives)
                shuffle(negatives)
                sample_size = min(len(positives), len(negatives),
                                  max_sample_size)
                npos = math.floor(alpha * sample_size)
                nneg = sample_size - npos
                nlab = min(len(labeled), max_labeled_size)
                sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
                shuffle(sample)
                actual_c = len(labeled) / (len(labeled) + npos)
                actual_alpha = npos / sample_size
                eprint('    Actual c: %6.2f | Actual alpha: %6.2f'
                       % (actual_c, actual_alpha))
                tm_start = timer()
                pred_alpha = BFT(sample, features, labeled_info, scorer)
                tm_end = timer()
                ms_per_example.append((tm_end - tm_start) * 1000 / len(sample))
                abs_errors += np.abs(actual_alpha - pred_alpha) \
                    / (niterations * nfolds)
                abs_errors_2 += np.abs(actual_alpha - pred_alpha) ** 2.0 \
                    / (niterations * nfolds)
                errors += (actual_alpha - pred_alpha) / (niterations * nfolds)
                errors_2 += (actual_alpha - pred_alpha) ** 2.0 \
                    / (niterations * nfolds)
        std_abs_errors = 100 * np.sqrt(abs_errors_2 - abs_errors ** 2)
        std_errors = 100 * np.sqrt(errors_2 - errors ** 2)
        abs_errors *= 100
        errors *= 100
        row = [100 * alpha, np.mean(ms_per_example), np.std(ms_per_example)] \
            + list(abs_errors) + list(std_abs_errors) \
            + list(errors) + list(std_errors)
        for_table.append(tuple(row))

    h1 = ','.join(['abs_error_th%03d' % x for x in range(101)])
    h2 = ','.join(['abs_error_th%03d_std' % x for x in range(101)])
    h3 = ','.join(['error_th%03d' % x for x in range(101)])
    h4 = ','.join(['error_th%03d_std' % x for x in range(101)])
    header_csv = 'alpha,time,time_std,%s,%s,%s,%s' % (h1, h2, h3, h4)
    mask_csv = ','.join(['%.2f'] * len(for_table[0]))
    eprint()
    with open(output_filename % ('', 'bft_raw_%s' % scorer_name),
              mode="w") as out:
        print(header_csv, file=out)
        for row in for_table:
            print(mask_csv % row, file=out)
def PE(data, features, labeled_info, nfolds=5):
    '''
    PE

    Args:
        data (list): list of observation points (dictionaries with
            [feature]:value).
        features (list): list of which features should be considered.
        labeled_info (str): name of the feature that indicates whether an
            observation point is labeled or not (1 or 0, respectively).
        nfolds (int): number of folds for cross validation.

    Returns:
        (pred_c, pred_alpha): predicted c and predicted p (p is the
            proportion of positive observation points within the UNLABELED
            portion of the data).
    '''
    labeled = [x for x in data if x[labeled_info] == 1]
    unlabeled = [x for x in data if x[labeled_info] == 0]
    X = pd.DataFrame(data)[features].values
    xp = pd.DataFrame(labeled)[features].values
    xm = pd.DataFrame(unlabeled)[features].values
    n1, n2 = len(labeled), len(unlabeled)
    mm = np.matmul
    sqnorm = lambda x: x.dot(x)
    add = lambda x, y: x + y

    # med_dist = np.median([norm(x - y) for x in X for y in X])
    med_dist = np.sqrt(np.median([sqnorm(x - y) for x in X for y in X]))
    sigma_list = np.linspace(1 / 5, 5, 10) * med_dist
    lambda_list = np.logspace(-3, 1, 9)

    # Pool CV scores across folds; initializing this inside the fold loop
    # would discard every fold but the last.
    cv_scores = []
    for (xp_tr, xp_te), (xm_tr, xm_te) in zip(KFold(nfolds, xp),
                                              KFold(nfolds, xm)):
        xp_tr, xp_te = np.array(xp_tr), np.array(xp_te)
        xm_tr, xm_te = np.array(xm_tr), np.array(xm_te)
        n1_tr, n1_te = len(xp_tr), len(xp_te)
        n2_tr, n2_te = len(xm_tr), len(xm_te)
        p1_tr = n1_tr / (n1_tr + n2_tr)
        p1_te = n1_te / (n1_te + n2_te)
        for sigma in sigma_list:
            phi = lambda x: np.array(
                [[math.exp(-sqnorm(x - xi) / (2 * sigma**2))
                  for xi in xp_tr]]).transpose()
            b = len(xp_tr)
            Phi1_tr = np.array([phi(x) for x in xp_tr])
            Phi1_te = np.array([phi(x) for x in xp_te])
            Phi2_tr = np.array([phi(x) for x in xm_tr])
            Phi2_te = np.array([phi(x) for x in xm_te])
            h_tr = np.mean(Phi1_tr, 0)
            h_te = np.mean(Phi1_te, 0)
            H_tr = p1_tr * reduce(add, (mm(x, x.transpose())
                                        for x in Phi1_tr)) / n1_tr \
                + (1 - p1_tr) * reduce(add, (mm(x, x.transpose())
                                             for x in Phi2_tr)) / n2_tr
            H_te = p1_te * reduce(add, (mm(x, x.transpose())
                                        for x in Phi1_te)) / n1_te \
                + (1 - p1_te) * reduce(add, (mm(x, x.transpose())
                                             for x in Phi2_te)) / n2_te
            for lamb in lambda_list:
                alpha = np.linalg.solve(H_tr + lamb * np.identity(b), h_tr)
                alpha_t = alpha.transpose()
                score = float(0.5 * mm(mm(alpha_t, H_te), alpha)
                              - mm(alpha_t, h_te))
                cv_scores.append((score, lamb, sigma))

    _, lamb, sigma = min(cv_scores)
    phi = lambda x: np.array(
        [[math.exp(-sqnorm(x - xi) / (2 * sigma**2))
          for xi in xp]]).transpose()
    b = len(xp)
    Phi1 = np.array([phi(x) for x in xp])
    Phi2 = np.array([phi(x) for x in xm])
    p1 = n1 / (n1 + n2)
    h = np.mean(Phi1, 0)
    H = p1 * reduce(add, (mm(x, x.transpose()) for x in Phi1)) / n1 \
        + (1 - p1) * reduce(add, (mm(x, x.transpose()) for x in Phi2)) / n2
    alpha = np.linalg.solve(H + lamb * np.identity(b), h)
    alpha_t = alpha.transpose()
    prior = 1 / float((2 * mm(alpha_t, h) - mm(mm(alpha_t, H), alpha)))
    prior = min(1, max(prior, n1 / (n1 + n2)))
    c = max(0, min(1, len(labeled) / (prior * len(unlabeled) + len(labeled))))
    return c, prior
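# A hedged reading of PE's inner solver (beta here renames the code's
# `alpha` weight vector to avoid clashing with the class prior):
#
#     beta  = (H + lambda * I)^(-1) h               # ridge-regularized fit
#     score = 0.5 * beta' H_te beta - beta' h_te    # held-out objective
#     prior = 1 / (2 * beta' h - beta' H beta)      # plug-in prior estimate
#
# where H mixes second moments of the Gaussian-kernel features over labeled
# and unlabeled points, and h averages those features over labeled points;
# each (sigma, lambda) pair is scored on the held-out fold and the best is
# refit on all data.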
def RunExperiment(exp_name, method_name, niterations=5, max_sample_size=2000,
                  max_labeled_size=500):
    '''Loads the experiment exp_name and applies method_name to it.
    Creates a CSV file (according to exp_name) with the results.

    Args:
        exp_name (str): the name of the experiment (see explist.py for more
            information).
        method_name (str): the name of the method (the method includes
            algorithm + parameters; see methodlist.py for more details).
        niterations (int): number of runs to be averaged.
        max_sample_size (int): maximum size for the UNLABELED part of the
            sample (from the PU PE perspective) or maximum test size (from
            the OCQ perspective).
        max_labeled_size (int): maximum size for the LABELED part of the
            sample (from the PU PE perspective) or maximum training size
            (from the OCQ perspective).
    '''
    exp = Exp[exp_name]
    dataset_filename = exp["input"]
    output_filename = exp["output"]
    class_feature = exp["class_feature"]
    positive_label = exp["positive_label"]
    method = methods[method_name]["func"]
    method_kargs = methods[method_name]["kargs"]
    features = None
    negative_labels = None

    data_df = pd.read_csv(dataset_filename, index_col=False)
    if callable(exp["negative_labels"]):
        negative_labels = set(
            filter(exp["negative_labels"], set(data_df[class_feature])))
    elif isinstance(exp["negative_labels"], list):
        negative_labels = set(exp["negative_labels"])
    else:
        negative_labels = set(
            [x for x in set(data_df[class_feature]) if x != positive_label])
    all_labels = set(list(negative_labels) + [positive_label])
    data_df = pd.DataFrame(
        data_df.loc[data_df[class_feature].map(lambda x: x in all_labels)])

    if callable(exp["features"]):
        features = list(filter(exp["features"], list(data_df)))
    elif isinstance(exp["features"], list):
        features = exp["features"]
    else:
        features = [x for x in list(data_df) if x != class_feature]

    # Dummy column name, unlikely to clash with a real feature.
    labeled_info = 'dfjiweojgf'
    data = data_df.to_dict('records')  # was 'registers', an invalid orient

    for_table = []
    for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
        abs_errors = []
        errors = []
        ms_per_example = []
        for it in trange(niterations):
            shuffle(data)
            eprint('Iteration #%d' % (it + 1))
            for fold_i, (unlabeled, all_labeled) in zip(
                    trange(5, desc="kfold"), KFold(5, data)):
                eprint('    Fold #%d' % (fold_i + 1))
                for x in unlabeled:
                    x[labeled_info] = 0
                for x in all_labeled:
                    x[labeled_info] = 1
                labeled = [x for x in all_labeled
                           if x[class_feature] == positive_label]
                positives = [x for x in unlabeled
                             if x[class_feature] == positive_label]
                negatives = [x for x in unlabeled
                             if x[class_feature] != positive_label]
                shuffle(labeled)
                shuffle(positives)
                shuffle(negatives)
                sample_size = min(len(positives), len(negatives),
                                  max_sample_size)
                npos = math.floor(alpha * sample_size)
                nneg = sample_size - npos
                nlab = min(len(labeled), max_labeled_size)
                sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
                shuffle(sample)
                actual_c = len(labeled) / (len(labeled) + npos)
                actual_alpha = npos / sample_size
                eprint('    #L %d #U %d' % (len(labeled), npos + nneg))
                eprint('    Actual c: %6.2f | Actual alpha: %6.2f'
                       % (actual_c, actual_alpha))
                tm_start = timer()
                pred_c, pred_alpha = method(sample, features, labeled_info,
                                            **method_kargs)
                tm_end = timer()
                eprint('    Predicted c: %6.2f | Predicted alpha: %6.2f'
                       % (pred_c, pred_alpha))
                ms_per_example.append((tm_end - tm_start) * 1000 / len(sample))
                abs_errors.append(abs(actual_alpha - pred_alpha))
                errors.append(actual_alpha - pred_alpha)
        for_table.append(
            (100 * alpha, 100 * np.mean(abs_errors), 100 * np.std(abs_errors),
             100 * np.mean(errors), 100 * np.std(errors),
             np.mean(ms_per_example), np.std(ms_per_example)))

    header_csv = ('alpha,mean_abs_error,mean_abs_error_std,'
                  'mean_error,mean_error_std,time,time_std')
    mask_csv = ','.join(['%.2f'] * 7)
    mask_show = ' '.join(['%7.2f'] * 7)
    eprint()
    with open(output_filename % ('', method_name), mode="w") as out:
        print(header_csv, file=out)
        for row in for_table:
            eprint(mask_show % row)
            print(mask_csv % row, file=out)
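# The harnesses above rely on an `eprint` helper for stderr logging; a
# common definition (an assumption, not shown in this file) is:
import sys

def eprint(*args, **kwargs):
    print(*args, file=sys.stderr, **kwargs)

# Hypothetical invocation; the real experiment and method keys live in
# explist.py and methodlist.py:
# RunExperiment('some_experiment', 'some_method', niterations=5)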
print "count : %d" % count # return data return data data = Encoding(data, general_matrix) test_data = Encoding(test_data, general_matrix) p = pca(n_components=2) pca_cal(standardize_dataset(data), labels.T[0].tolist(), data, title = "PCA with z-score normalization on training set") pca_cal(standardize_dataset(test_data), test_labels, test_data, title = "PCA with z-score normalization on test set") data[:, 4:] = standardize_dataset(data[:, 4:]) test_data[:, 4:] = standardize_dataset(test_data[:, 4:]) # Seperate dataset to test and train set kf = KFold(n=len(data), n_folds=10, shuffle=True) train, test = kf.get_indices() s = Score() total_cv_error = [] total_test_error = [] confusion_matx = [] f1score = [] for k in range(1, 100, 5): cv_error = [] test_error = [] nn = Pipeline([ ('feature_selection', SelectFromModel(LinearSVC(penalty="l2"))), ('classification', KNeighborsClassifier(n_neighbors=k, metric='manhattan')) ]) for i in range(10):
def RunExperiment(exp_name, method_name, niterations=1, max_sample_size=2000,
                  max_labeled_size=500):
    exp = Exp[exp_name]
    dataset_filename = exp["input"]
    output_filename = exp["output"]
    class_feature = exp["class_feature"]
    positive_label = exp["positive_label"]
    method = methods[method_name]["func"]
    method_kargs = methods[method_name]["kargs"]
    features = None
    negative_labels = None

    data_df = pd.read_csv(dataset_filename, index_col=False)
    if callable(exp["negative_labels"]):
        negative_labels = set(
            filter(exp["negative_labels"], set(data_df[class_feature])))
    elif isinstance(exp["negative_labels"], list):
        negative_labels = set(exp["negative_labels"])
    else:
        negative_labels = set(
            [x for x in set(data_df[class_feature]) if x != positive_label])
    all_labels = set(list(negative_labels) + [positive_label])
    data_df = pd.DataFrame(
        data_df.loc[data_df[class_feature].map(lambda x: x in all_labels)])

    if callable(exp["features"]):
        features = list(filter(exp["features"], list(data_df)))
    elif isinstance(exp["features"], list):
        features = exp["features"]
    else:
        features = [x for x in list(data_df) if x != class_feature]

    # Dummy column name, unlikely to clash with a real feature.
    labeled_info = 'dfjiweojgf'
    data = data_df.to_dict('records')  # was 'registers', an invalid orient

    for_table = []
    for alpha in tqdm(list(np.linspace(0, 1, 11)), desc="alpha"):
        abs_errors = []
        errors = []
        ms_per_example = []
        for it in trange(niterations):
            shuffle(data)
            eprint('Iteration #%d' % (it + 1))
            for fold_i, (unlabeled, all_labeled) in zip(range(5),
                                                        KFold(5, data)):
                eprint('    Fold #%d' % (fold_i + 1))
                for x in unlabeled:
                    x[labeled_info] = 0
                for x in all_labeled:
                    x[labeled_info] = 1
                labeled = [x for x in all_labeled
                           if x[class_feature] == positive_label]
                positives = [x for x in unlabeled
                             if x[class_feature] == positive_label]
                negatives = [x for x in unlabeled
                             if x[class_feature] != positive_label]
                shuffle(labeled)
                shuffle(positives)
                shuffle(negatives)
                sample_size = min(len(positives), len(negatives),
                                  max_sample_size)
                npos = math.floor(alpha * sample_size)
                nneg = sample_size - npos
                nlab = min(len(labeled), max_labeled_size)
                sample = positives[:npos] + negatives[:nneg] + labeled[:nlab]
                shuffle(sample)
                actual_c = len(labeled) / (len(labeled) + npos)
                actual_alpha = npos / sample_size
                eprint('    #L %d #U %d' % (len(labeled), npos + nneg))
                eprint('    Actual c: %6.2f | Actual alpha: %6.2f'
                       % (actual_c, actual_alpha))
                tm_start = timer()
                pred_c, pred_alpha = method(sample, features, labeled_info,
                                            **method_kargs)
                tm_end = timer()
                eprint('    Predicted c: %6.2f | Predicted alpha: %6.2f'
                       % (pred_c, pred_alpha))
                ms_per_example.append((tm_end - tm_start))
                abs_errors.append(abs(actual_alpha - pred_alpha))
                errors.append(actual_alpha - pred_alpha)
                break  # timing variant: only the first fold is measured

        for_table.append(
            (100 * alpha, 100 * np.mean(abs_errors), 100 * np.std(abs_errors),
             100 * np.mean(errors), 100 * np.std(errors),
             np.mean(ms_per_example), np.std(ms_per_example)))

    header_csv = ('alpha,abs_mean_error,abs_mean_error_std,'
                  'mean_error,mean_error_std,time,time_std')
    mask_csv = ','.join(['%.2f'] * 7)
    mask_show = ' '.join(['%7.2f'] * 7)
    eprint()
    with open(output_filename % ('_time', method_name), mode="w") as out:
        print(header_csv, file=out)
        for row in for_table:
            eprint(mask_show % row)
            print(mask_csv % row, file=out)
new_data = []
new_classes = []
for index in srt:
    new_data.append(data[index])
    new_classes.append(classes[index])

# setup the network
for node in nodes:
    BN.setup_node(node, nodes)

# initial confusion matrix
confusion = {'0': {'0': 0, '1': 0}, '1': {'0': 0, '1': 0}}

# do k-fold validation
total_count = 0
kfold = KFold(100, new_data, new_classes)
while kfold.has_next():
    dat, cls = kfold.get_next()
    correct_count = 0
    for i in range(0, len(dat)):
        row = dat[i]
        guess = BNClassifier.classify(row, nodes, dat, cls, ['0', '1'])
        if guess == cls[i]:
            correct_count += 1
        confusion[cls[i]][guess] += 1
    total_count += correct_count
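# A typical wrap-up for the loop above (an assumption; the fragment ends
# before any reporting): overall accuracy from the running counters.
accuracy = total_count / len(new_data)
print("accuracy: %.4f" % accuracy)
print("confusion matrix:", confusion)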