def main():
    """Train a CNN captcha/OCR model end-to-end: derive the character set
    from the training directory, build the network, load train/val data,
    and run training, checkpointing under a date-stamped directory."""
    # img_width, img_height = 48, 48
    img_width, img_height = 200, 60
    img_channels = 1  # single-channel (grayscale) input
    # batch_size = 1024
    batch_size = 32
    nb_epoch = 1000
    post_correction = False  # NOTE(review): unused within this function
    # model is saved corresponding to the datetime (YYYY-MM-DD)
    save_dir = 'save_model/' + str(datetime.now()).split('.')[0].split()[0] + '/'
    train_data_dir = 'train_data/ip_train/'
    # train_data_dir = 'train_data/single_1000000/'
    val_data_dir = 'train_data/ip_val/'
    test_data_dir = 'test_data//'
    weights_file_path = 'save_model/2016-10-27/weights.11-1.58.hdf5'
    # charset / label metadata are derived from the training data itself
    char_set, char2idx = get_char_set(train_data_dir)
    nb_classes = len(char_set)
    max_nb_char = get_maxnb_char(train_data_dir)
    label_set = get_label_set(train_data_dir)
    # val 'char_set:', char_set
    print 'nb_classes:', nb_classes
    print 'max_nb_char:', max_nb_char
    print 'size_label_set:', len(label_set)
    model = build_shallow(img_channels, img_width, img_height, max_nb_char, nb_classes)  # build CNN architecture
    # model.load_weights(weights_file_path)  # load trained model
    val_data = load_data(val_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    # val_data = None
    train_data = load_data(train_data_dir, max_nb_char, img_width, img_height, img_channels, char_set, char2idx)
    train(model, batch_size, nb_epoch, save_dir, train_data, val_data, char_set)
def main():
    """Outer cross-validation loop for an RBF-SVM: for each fold, tune
    (C, gamma) on the training split, then score accuracy / precision /
    recall on the held-out split and report fold-averaged metrics."""
    global k_out
    k_out = 0  # fold counter; global so choose_c_gamma can observe it
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    scaler = preprocessing.StandardScaler()
    acc, prec, recall = [], [], []
    for train, test in kf:
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        # inner search for the best (C, gamma) on this fold's training data
        c_star, gamma_star = choose_c_gamma(x_train, y_train)
        print '=========c*:{} g*:{}'.format(c_star, gamma_star)
        # fit the scaler on the training split only, to avoid test leakage
        scaler.fit(x_train)
        clf = svm.SVC(C=c_star, gamma=gamma_star)
        clf.fit(scaler.transform(x_train), y_train)
        y_pred = clf.predict(scaler.transform(x_test))
        acc.append(accuracy_score(y_test, y_pred))
        prec.append(precision_score(y_test, y_pred))
        recall.append(recall_score(y_test, y_pred))
        print acc
        k_out += 1
    a = np.mean(acc)
    p = np.mean(prec)
    r = np.mean(recall)
    f = 2 * p * r / (p + r)  # F1 computed from the averaged precision/recall
    print 'precision: {}'.format(p)
    print "recall: {}".format(r)
    print "f1: {}".format(f)
    print "accuracy: {}".format(a)
def main(): window_size = 100 threshold = calc_threshold(exp_moving_average, window_size) print threshold filename = sys.argv[1] data_in = load_data(filename) # Uncomment for more realistic first values. First window_size/4 values # should not be taken into account in the output data and plots. # data_in[:0] = [sum(data_in[:(window_size/4)])/(window_size/4)] filtered_ma = average_diff(data_in, moving_average, window_size) filtered_ema = average_diff(data_in, exp_moving_average, window_size) plot([0] * len(data_in), filtered_ma, filtered_ema, [threshold] * len(data_in), [-threshold] * len(data_in), ) mean_ma = mean_value_detector(filtered_ma, threshold) mean_ema = mean_value_detector(filtered_ema, threshold) plot(mean_ema) plot(mean_ma) write_data(mean_ema, filename + ".out")
def bagging():
    """Evaluate a logistic-regression pipeline (badword counts + char/word
    TF-IDF, chi2 percentile feature selection) with repeated shuffle-split
    CV, printing the AUC of each split and the overall mean/std."""
    from sklearn.feature_selection import SelectPercentile, chi2
    comments, dates, labels = load_data()
    select = SelectPercentile(score_func=chi2, percentile=4)
    clf = LogisticRegression(tol=1e-8, penalty='l2', C=7)
    #clf = BaggingClassifier(logr, n_estimators=50)
    countvect_char = TfidfVectorizer(ngram_range=(1, 5), analyzer="char", binary=False)
    countvect_word = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False)
    badwords = BadWordCounter()
    # stack hand-crafted badword features with char and word n-gram tf-idf
    ft = FeatureStacker([("badwords", badwords), ("chars", countvect_char), ("words", countvect_word)])
    #ft = TextFeatureTransformer()
    pipeline = Pipeline([('vect', ft), ('select', select), ('logr', clf)])
    cv = ShuffleSplit(len(comments), n_iterations=20, test_size=0.2, indices=True)
    scores = []
    for train, test in cv:
        X_train, y_train = comments[train], labels[train]
        X_test, y_test = comments[test], labels[test]
        pipeline.fit(X_train, y_train)
        probs = pipeline.predict_proba(X_test)
        # AUC on the positive-class probability column
        scores.append(auc_score(y_test, probs[:, 1]))
        print("score: %f" % scores[-1])
    print(np.mean(scores), np.std(scores))
def test_stacker():
    """Grid-search a FeatureStacker pipeline in which each feature family
    (badwords, char n-grams, word n-grams) is followed by its own chi2
    percentile selection step, tuning C and the word-branch percentile
    by AUC with 5-fold CV."""
    comments, dates, labels = load_data()
    clf = LogisticRegression(tol=1e-8, C=0.01, penalty='l2')
    countvect_char = TfidfVectorizer(ngram_range=(1, 5), analyzer="char", binary=False)
    countvect_word = TfidfVectorizer(ngram_range=(1, 3), analyzer="word", binary=False)
    badwords = BadWordCounter()
    select = SelectPercentile(score_func=chi2)
    # NOTE(review): the same `select` instance is reused in all three
    # sub-pipelines, so a single percentile setting is shared between them.
    char_select = Pipeline([('char_count', countvect_char), ('select', select)])
    words_select = Pipeline([('word_count', countvect_word), ('select', select)])
    badwords_select = Pipeline([('badwords', badwords), ('select', select)])
    stack = FeatureStacker([("badwords", badwords_select), ("chars", char_select), ("words", words_select)])
    #stack.fit(comments)
    #features = stack.transform(comments)
    #print("training and transforming for linear model")
    print("training grid search")
    pipeline = Pipeline([("features", stack), ("clf", clf)])
    param_grid = dict(clf__C=[0.31, 0.42, 0.54], features__words__select__percentile=[5, 7])
    grid = GridSearchCV(pipeline, cv=5, param_grid=param_grid, verbose=4, n_jobs=1, score_func=auc_score)
    grid.fit(comments, labels)
    tracer()  # drop into the debugger for interactive inspection
def plot_conformity(name, log_dir, ax=None, legend=True):
    """Plot measured vs predicted quenched-fraction conformity profiles
    for red/blue/all centrals as a function of radius.

    name, log_dir locate the saved data via util.load_data; ax defaults
    to the current axes; legend toggles the legend.  Returns the styled
    axes from style_plots.
    """
    if ax is None:
        ax = plt.gca()
    r, actual, pred, a_err, p_err = util.load_data(name, log_dir)
    # NOTE(review): Axes.errorbar expects (x, y, yerr); here the 2nd and
    # 3rd arguments are value-err and value+err, i.e. the band edges are
    # passed where y and yerr are expected — confirm this is intentional.
    ax.errorbar(r, actual[0] - a_err[0], actual[0] + a_err[0], color=red_col, label='Red centrals')
    ax.errorbar(r, actual[1] - a_err[1], actual[1] + a_err[1], color=blue_col, label='Blue centrals')
    ax.errorbar(r, actual[2] - a_err[2], actual[2] + a_err[2], color='k', label='All centrals')
    # predictions are drawn fainter, dashed, and without legend entries
    ax.errorbar(r, pred[0] - p_err[0], pred[0] + p_err[0], color=red_col, linestyle='--', alpha=0.3)
    ax.errorbar(r, pred[1] - p_err[1], pred[1] + p_err[1], color=blue_col, linestyle='--', alpha=0.3)
    ax.errorbar(r, pred[2] - p_err[2], pred[2] + p_err[2], color='k', linestyle='--', alpha=0.3)
    ax.set_xscale('log')
    ax.set_xlabel('r [Mpc/h]')
    ax.set_ylabel('Quenched Fraction')
    ax.set_ylim(0.0, 1.1)
    ax.set_xlim(0.1, 20)
    if legend:
        ax.legend(loc='best')
    return style_plots(ax)
def main():
    """Nested cross-validation for a pruned decision tree: an inner CV
    (pruning_cross_validation) picks the leaf-count limit on each outer
    training split, and the outer loop estimates generalization metrics
    with that setting."""
    global k_out
    k_out = 0  # outer-fold counter, exposed as a global
    x, y = load_data(k=2)
    kf_out = cross_validation.KFold(len(x), n_fold)
    a_score, p_score, r_score = [], [], []
    for train_out, test_out in kf_out:
        x_train_out, x_test_out, y_train_out, y_test_out = x[train_out], x[test_out], y[train_out], y[test_out]
        # inner CV on the outer-training split chooses the pruning level
        kf = cross_validation.KFold(len(x_train_out), n_fold)
        m_opt = pruning_cross_validation(x_train_out, y_train_out, kf)
        # +1 offset: max_leaf_nodes must be at least 2
        clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=m_opt + 1)
        print '=========m_opt:{}'.format(m_opt)
        clf.fit(x_train_out, y_train_out)
        y_pred = clf.predict(x_test_out)
        a_score.append(accuracy_score(y_test_out, y_pred))
        p_score.append(precision_score(y_test_out, y_pred))
        r_score.append(recall_score(y_test_out, y_pred))
        k_out += 1
    a = np.mean(a_score)
    p = np.mean(p_score)
    r = np.mean(r_score)
    f = 2 * p * r / (p + r)  # F1 from the fold-averaged precision/recall
    print 'precision: {}'.format(p)
    print "recall: {}".format(r)
    print "f1: {}".format(f)
    print "accuracy: {}".format(a)
def main(stat, stat_name):
    """Assemble a (catalog x environment-proxy) table of the mean of
    *stat* from each catalog's saved statistics pickle and plot it as a
    heatmap labeled *stat_name*."""
    cats = util.load_all_cats()
    all_r_values = []
    names = cats.keys()
    # fixed presentation order (reversed), overriding the dict order above
    names = ['HW', 'Becker', 'Lu', 'Henriques', 'Illustris', 'EAGLE', 'MB-II'][::-1]
    proxies = ['s1','s2','s5','s10','d1','d2','d5','d10', 'rhill', 'rhillmass']
    # LaTeX-formatted proxy names used as heatmap column labels
    proxies_formatted = [ '$\Sigma_1$', '$\Sigma_2$', '$\Sigma_5$', '$\Sigma_{10}$', '$D_1$', '$D_2$', '$D_5$', '$D_{10}$', 'R$_\mathrm{hill}$', 'R$_\mathrm{hill-mass}$' ]
    for name in names:
        cat = cats[name]
        stat_dict = util.load_data('statistics.pckl', cat['dir'])
        r_values = []
        for p in proxies:
            try:
                print 'std of ', stat,' for ', p, '=', np.std(stat_dict[stat][p])
                r_values.append(np.mean(stat_dict[stat][p]))
            # NOTE(review): bare except — also silences errors other than
            # the presumably-intended missing-proxy KeyError
            except:
                print 'no statistics found for', p
                r_values.append(0)  # missing proxies plotted as 0
        all_r_values.append(r_values)
    df = pd.DataFrame(columns=proxies_formatted, index=names)
    for name, r_values in zip(names, all_r_values):
        df.loc[name] = pd.Series({p: v for p, v in zip(proxies_formatted, r_values)})
    #plt.imshow(all_r_values)
    #plt.show()
    df = df[df.columns].astype(float)
    #sns.heatmap(df, vmin=0,vmax=0.71, cmap='Blues', annot=True, fmt='.2f')
    #plots.style_plots()
    #plt.show()
    print df.values
    plot_heatmap(df, proxies_formatted, names, stat_name)
def main(): x, y = load_data(k=2) kf = cross_validation.KFold(len(x), n_fold) a, p, r, f = classify(x, y, kf, n_estimator=50) print "precision: {}".format(p) print "recall: {}".format(r) print "f1: {}".format(f) print "accuracy: {}".format(a)
def loadText(self):
    """Populate the login form from the saved credentials; tick the
    remember-password box when a password was stored, and focus the
    password field when a login is already known."""
    login, password, dbname = load_data()
    ui = self.ui
    ui.loginEdit.setText(login)
    ui.passwordEdit.setText(password)
    ui.dbEdit.setText(dbname)
    # checkbox reflects whether a (non-empty) password was persisted
    ui.rememberPassword.setChecked(bool(password))
    if login:
        ui.passwordEdit.setFocus()
def get_visitorid():
    """Return the persisted visitor id, generating and persisting a fresh
    random one on first use (util.load_data returns False when absent)."""
    stored = util.load_data(addon, VISITOR_FILE)
    if stored is not False:
        return stored
    from random import randint
    fresh_id = str(randint(0, 0x7fffffff))
    util.save_data(addon, VISITOR_FILE, fresh_id)
    return fresh_id
def __init__(self, problem_path):
    """Load the problem stored at *problem_path* and cache its matrices,
    targets, and derived starting point on the instance."""
    A, b, N, block_sizes, x_true, nz, f = util.load_data(problem_path)
    self._A = A
    self._b = b
    self._N = N
    self._f = f
    self._x_true = x_true
    # derived quantities computed from the block structure
    self._U = util.U(block_sizes)
    self._x0 = util.block_sizes_to_x0(block_sizes)
def main():
    """Sweep the decision-tree leaf-node limit, cross-validating every
    setting, dump the metric curves to disk, and plot accuracy vs number
    of leaves with the best choice highlighted."""
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    # upper bound on the sweep: at most 2499 and at most train-size - 1
    max_m = min(2500 - 1, int(len(x) * (n_fold - 1) / n_fold) - 1)
    acc_score = [[] for i in xrange(max_m)]
    p_score = [[] for i in xrange(max_m)]
    r_score = [[] for i in xrange(max_m)]
    for train, test in kf:
        print len(train)
        x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
        m = 1
        while 1:
            print "iter: {}".format(m)
            # +1 offset: max_leaf_nodes must be at least 2
            clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=m + 1)
            clf.fit(x_train, y_train)
            y_pred = clf.predict(x_test)
            acc = accuracy_score(y_test, y_pred)
            acc_score[m - 1].append(acc)
            p_score[m - 1].append(precision_score(y_test, y_pred))
            r_score[m - 1].append(recall_score(y_test, y_pred))
            print 'accuracy: {}'.format(acc)
            m += 1
            if m > max_m:
                break
        # break
    # collapse each per-setting list to its fold mean, tracking the best
    max_val, max_id = -1, -1
    for i in xrange(len(acc_score)):
        acc = np.mean(acc_score[i])
        if acc > max_val:
            max_val = acc
            max_id = i
        acc_score[i] = acc
        p_score[i] = np.mean(p_score[i])
        r_score[i] = np.mean(r_score[i])
    print acc_score[:10]
    with open('res/effect_of_leaves', 'w') as out:
        out.write(str(acc_score) + '\n')
        out.write(str(p_score) + '\n')
        out.write(str(r_score) + '\n')
    print 'splits:{}'.format(max_id + 1)
    print 'accuracy:{}'.format(max_val)
    print 'p:{} r:{}'.format(p_score[max_id], r_score[max_id])
    plt.clf()
    # x axis is the leaf count: index i corresponds to m+1 = i+2 leaves
    m_idx = np.arange(2, len(acc_score) + 2)
    max_leaf = max_id + 2
    plt.plot(m_idx, acc_score, label='cross_validation')
    plt.plot(max_leaf, max_val, linestyle='none', marker='o', markeredgecolor='r', markeredgewidth=1, markersize=12, markerfacecolor='none', label='best choice')
    plt.plot((max_leaf, max_leaf), (0, max_val), 'k--')
    plt.ylim(ymin=0.88, ymax=0.96)
    plt.xlabel("Number of leaf nodes")
    plt.ylabel("Cross validation score")
    plt.legend(numpoints=1, loc=4)
    plt.savefig('figs/effect_of_leaves.png')
def ge_cmd_predict():
    """CLI entry point: load the data and a trained model, run
    GE_predict, and write the predictions to the requested output path."""
    cli = parse_arg_predict()
    # prepare input to GE_predict
    dataset = util.load_data(cli.data)
    trained = util.load_model(cli.model)
    predictions = GE_predict(dataset, trained)
    util.write_prediction(predictions, cli.output)
def setup_ts(self):
    """Prepare the binned photometric time series for this AOR: select an
    aperture radius, bin time/pixels/flux/uncertainty, and initialize the
    PLD coefficient vector."""
    cube, self.time, flux, radii, unc = load_data(self.setup['data_dir'], self.aor)
    pixels = get_pix(cube, geom=self.geom)
    self.t = binned(self.time, binsize=self.bs)
    self.pix = binned(pixels, binsize=self.bs)
    # pick the aperture radius index from the flux series
    i = self.select_radius(flux)
    print("using radius: {}".format(radii[i]))
    self.radius = radii[i]
    self.f = binned(flux[i], binsize=self.bs)
    # per-bin uncertainty scaled down by sqrt(bin size) — assumes the
    # per-sample errors are uncorrelated (standard error of the mean)
    self.unc = binned(unc[i], binsize=self.bs) / np.sqrt(self.bs)
    # one PLD coefficient per pixel, plus two extra terms
    self.pld = [0] * pixels.shape[1] + [0] * 2
def main():
    """Load the course dataset, impute missing values with per-course
    means, merge successful and probation students into one class, then
    run the binning exploration and its plots."""
    dataset = util.load_data()
    # Impute missing values with the average for that course.
    dataset.fill_missing_with_feature_means()
    # Treat successful and probation students as a single 's' group;
    # comment out to distinguish all 3 groups (s, p, f).
    dataset.combine_labels(["s", "p"], "s")
    binning_exploration(dataset)
    plot_tests(dataset)
def main():
    """Load and clean the course dataset, merge successful/probation
    labels, then run the principal-component analyses."""
    dataset = util.load_data()
    # Impute missing values with the average for that course.
    dataset.fill_missing_with_feature_means()
    # Treat successful and probation students as a single 's' group;
    # comment out to distinguish all 3 groups (s, p, f).
    dataset.combine_labels(["s", "p"], "s")
    examine_principal_components(dataset)
    pca_find_important_features(dataset)
def obfuscate_keystrokes(name, strategy, param):
    """Mask keystroke timings in dataset *name* and save the result.

    The keystroke table is converted to an event stream per (user,
    session) group, perturbed with the chosen masking strategy, converted
    back to keystrokes, and saved tagged with the masking settings.

    Args:
        name: dataset name understood by load_data/save_data.
        strategy: 'delay' or 'interval'.
        param: strategy parameter forwarded to delay_mix/interval_mix.

    Raises:
        ValueError: if *strategy* is not recognized.  (ValueError is a
        subclass of Exception, so existing broad handlers still catch it.)
    """
    # Validate up front so a bad strategy fails before the expensive
    # groupby/apply passes run (the original raised only afterwards).
    if strategy == 'delay':
        mix = lambda x: delay_mix(x, param)
    elif strategy == 'interval':
        mix = lambda x: interval_mix(x, param)
    else:
        raise ValueError('Unknown masking strategy')

    df = load_data(name)
    # Each apply adds two index levels, which are dropped again to keep
    # the original (user, session) index shape.
    df = df.groupby(level=[0, 1]).apply(keystrokes2events).reset_index(level=[2, 3], drop=True)
    df = df.groupby(level=[0, 1]).apply(mix)
    df = df.groupby(level=[0, 1]).apply(events2keystrokes).reset_index(level=[2, 3], drop=True)
    save_data(df, name, masking=(strategy, param))
def main():
    """Run the adaptive-window / filtered-derivative detector pipeline on
    the trace named on the command line and plot each stage."""
    detect_window = 150
    thr = 3000
    in_path = sys.argv[1]
    trace = load_data(in_path)
    # second arg - maximum size of the window of interest
    # third arg - some threshold
    smoothed = adaptive_window_avg(trace, 100, 10)
    rectified = data_abs(smoothed)
    detector_out = filtered_derivative_detector(rectified, detect_window, 0, 0)
    thr_line = [thr] * len(detector_out)
    plot(trace)
    plot(smoothed)
    plot(detector_out, thr_line)
def main(): # The original data set. data = util.load_data() # Fill in missing values with the average for that course. data.fill_missing_with_feature_means() cluster_3_groups(data.copy()) cluster_pass_fail(data.copy()) cluster_success_struggle(data.copy()) util.print_line_break() print "Now with PCA:" cluster_3_groups_with_pca(data.copy()) cluster_pass_fail_with_pca(data.copy()) cluster_success_struggle_with_pca(data.copy())
def ge_cmd_learn():
    """CLI entry point: assemble GE training inputs, run GE_learn, and
    persist the resulting model."""
    cli = parse_arg_learn()
    # prepare input to GE_learn
    training = GE_data()
    training.dat = util.load_data(cli.data)
    training.labeled_features = util.load_labeled_features(cli.labeled_features)
    blank_model = GE_model()
    param = GE_param()
    if cli.l2:
        param.l2_regularization = cli.l2
    # print data
    learned = GE_learn(training, blank_model, param)
    util.save_model(learned, cli.model)
def main():
    """Split the cleaned dataset 50/50 (stratified by label) and report
    the accuracy of k-NN, naive Bayes, and decision-tree classifiers."""
    dataset = util.load_data()
    # Impute missing values with the average for that course.
    dataset.fill_missing_with_feature_means()
    # Treat successful and probation students as a single 's' group;
    # comment out to distinguish all 3 groups (s, p, f).
    dataset.combine_labels(["s", "p"], "s")
    # 50-50 train/test split, keeping label proportions.
    training, testing = dataset.split(0.5, using_labels=True)
    # Measure the accuracy each classifier family can achieve.
    for run_tests in (knn_accuracy_tests,
                      naive_bayes_accuracy_tests,
                      decision_tree_accuracy_tests):
        run_tests(training, testing)
def main():
    """Compare classifiers on raw vs PCA-reduced features, keeping enough
    components to retain 95% of the variance."""
    raw = util.load_data()
    # Impute missing values with the average for that course.
    raw.fill_missing_with_feature_means()
    # Treat successful and probation students as a single 's' group;
    # comment out to distinguish all 3 groups (s, p, f).
    raw.combine_labels(["s", "p"], "s")
    n_comp = recommend_num_components(raw, min_pct_variance=0.95)
    reduced = pca(raw, n_comp)
    train_raw, test_raw = raw.split(0.5, using_labels=True)
    train_pca, test_pca = reduced.split(0.5, using_labels=True)
    compare_knn(train_raw, test_raw, train_pca, test_pca)
    compare_naive_bayes(train_raw, test_raw, train_pca, test_pca)
def __init__(self, *args, **kwargs):
    """Build the main window: status bar, File menu with About/Exit, and
    a notebook holding the Convert/Format/Modify panels."""
    wx.Frame.__init__(self, *args, **kwargs)
    # merge persisted settings into the shared options dict
    shared.options.update(load_data())
    #menu setup
    self.CreateStatusBar()  # A Statusbar in the bottom of the window
    # Setting up the menu.
    filemenu = wx.Menu()
    menuAbout = filemenu.Append(wx.ID_ABOUT, '&About', ' Information about this program')
    menuExit = filemenu.Append(wx.ID_EXIT, 'E&xit', ' Terminate the program')
    # Creating the menubar.
    menuBar = wx.MenuBar()
    menuBar.Append(filemenu, '&File')  # Adding the 'filemenu' to the MenuBar
    self.SetMenuBar(menuBar)  # Adding the MenuBar to the Frame content.
    # Events.
    self.Bind(wx.EVT_MENU, self.OnExit, menuExit)
    self.Bind(wx.EVT_MENU, self.OnAbout, menuAbout)
    # Here we create a panel and a notebook on the panel
    panel = wx.Panel(self)
    notebook = wx.Notebook(panel)
    # create the page windows as children of the notebook
    filepage = FilePanel(notebook)
    formatpage = FormatPanel(notebook)
    modifypage = ModifyPanel(notebook)
    # add the pages to the notebook with the label to show on the tab
    notebook.AddPage(filepage, 'Convert')
    notebook.AddPage(formatpage, 'Format')
    notebook.AddPage(modifypage, 'Modify')
    # finally, put the notebook in a sizer for the panel to manage
    # the layout
    sizer = wx.BoxSizer()
    sizer.Add(notebook, 1, wx.EXPAND)
    panel.SetSizer(sizer)
    self.SetSize(self.GetSize() + (0, 35))  # Expand to fit the PngPanel
    self.Show()
def preprocess_villani(in_file, out_file, long_fixed_out_file):
    """Preprocess the raw Villani dataset and extend the long-fixed dataset.

    Reads the raw CSV, binarizes age into <30 / >=30, drops missing rows
    and repeated keys, then: (a) appends the Villani fixed-text sessions
    to the existing 'long_fixed' dataset and writes it to
    *long_fixed_out_file*; (b) writes the free-text sessions to *out_file*.
    """
    df = pd.read_csv(in_file, index_col=[0, 1])
    # Make age a binary target, <30 and >=30
    df['age'] = df['agegroup'].map({
        'under20': '<30',
        '20-29': '<30',
        '30-39': '>=30',
        '40-49': '>=30',
        '50-59': '>=30',
        'over60': '>=30'}
    )
    # Ignore missing data
    df = df.dropna()
    df = remove_repeated_keys(df)
    # combine the villani fixed text with citefa dataset fixed text
    long_fixed = load_data('long_fixed')
    # session-size mean/std of the existing long-fixed data drive how the
    # Villani samples are segmented into sessions
    slf = long_fixed.groupby(level=[0, 1]).size()
    villani_fixed = df[df['inputtype'] == 'fixed']
    villani_fixed = villani_fixed.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_fixed = villani_fixed.reset_index(level=[0, 1], drop=True)
    villani_fixed = reduce_dataset(villani_fixed, min_samples=10, max_samples=10)
    long_fixed = pd.concat([long_fixed, villani_fixed])
    long_fixed = long_fixed[COLS]
    long_fixed.to_csv(long_fixed_out_file)
    # Free-text input only
    villani_free = df[df['inputtype'] == 'free']
    villani_free = villani_free.groupby(level=[0, 1]).apply(lambda x: make_sessions(x, slf.mean(), slf.std()))
    villani_free = villani_free.reset_index(level=[0, 1], drop=True)
    villani_free = reduce_dataset(villani_free, min_samples=10, max_samples=10)
    villani_free = villani_free[COLS]
    villani_free.to_csv(out_file)
    return
def main():
    """Compare k-NN accuracy on the raw features against accuracy on
    PCA-reduced data for every possible number of components, printing a
    PCs/accuracy table."""
    # The original data set.
    data = util.load_data()
    # Fill in missing values with the average for that course.
    data.fill_missing_with_feature_means()
    # Count successful and probation students as one group (s)
    # Comment this out to try and distinguish all 3 groups (s, p, f)
    data.combine_labels(["s", "p"], "s")
    util.print_line_break()
    print "Without PCA: %.5f" % get_knn_accuracy(data)
    util.print_line_break()
    print "With PCA:"
    print "\t".join(["PCs", "Accuracy"])
    # sweep every component count short of the full feature dimension
    for num_components in range(1, data.num_features()):
        accuracy = get_knn_accuracy(pca(data, num_components))
        print "%d\t%.5f" % (num_components, accuracy)
def describe(name):
    """ Describe the dataset """
    # Print summary statistics for dataset *name*: user/session counts,
    # sample-size stats, mean press-press interval, mean key-hold
    # duration, and the per-target session distribution.
    df = load_data(name)
    s = df.groupby(level=[0, 1]).size()
    print('Dataset :', name)
    print('Users :', len(s.groupby(level=0)))
    # NOTE(review): the next line is corrupted — the '******' run is not
    # valid Python and looks like an automated scrub replaced part of the
    # statement; the original presumably printed sessions-per-user before
    # the sample-size line.  Restore from version control.
    print('Sessions/user :'******'Sample size :', s.mean(), '+/-', s.std())
    print('Mean pp interval (ms) :', df.groupby(level=[0, 1]).apply(lambda x: x['timepress'].diff().dropna().mean()).mean())
    print('Mean duration (ms) :', df.groupby(level=[0, 1]).apply(lambda x: (x['timerelease'] - x['timepress']).mean()).mean())
    # distribution of sessions over each non-primary target variable
    for target in TARGETS[1:]:
        s = df.reset_index().groupby([target, 'session']).size().groupby(level=0).size()
        print(target)
        print(s / s.sum())
    return
def main():
    """Random-forest experiments, gated by module-level flags: with
    `performance`, report CV metrics per split criterion; with
    `relation`, sweep the number of trees and log the metric curve."""
    x, y = load_data(k=2)
    kf = cross_validation.KFold(len(x), n_fold)
    if performance:
        for criterion in criteria:
            print 'criterion: {}'.format(criterion)
            a, p, r, f = classify(x, y, kf, criterion=criterion, n_estimator=500)
            print 'precision: {}'.format(p)
            print "recall: {}".format(r)
            print "f1: {}".format(f)
            print "accuracy: {}".format(a)
    if relation:
        res = []
        # forest sizes 10, 20, ..., 500
        for k in xrange(1, 50 + 1):
            print 'num of trees:{}'.format(k * 10)
            a, p, r, f = classify(x, y, kf, criterion='entropy', n_estimator=k * 10)
            print a, p, r, f
            res.append((a, p, r, f))
        # persist the (accuracy, precision, recall, f1) curve as CSV rows
        with open('res/rf_trees', 'w') as out:
            for v in res:
                out.write('{},{},{},{}\n'.format(v[0], v[1], v[2], v[3]))
def main():
    """Decision-tree experiments, gated by module-level flags: with
    `evaluation`, cross-validate gini/entropy criteria (metrics and node
    counts); with `plot`, export a fitted tree to PDF via graphviz."""
    x, y = load_data(k=2)
    if evaluation:
        kf = cross_validation.KFold(len(x), n_fold)
        for criterion in criteria:
            print 'criterion: {}'.format(criterion)
            acc, prec, recall, node_cnt = [], [], [], []
            clf = DecisionTreeClassifier(criterion=criterion)
            for train, test in kf:
                x_train, x_test, y_train, y_test = x[train], x[test], y[train], y[test]
                clf.fit(x_train, y_train)
                # record the fitted tree size alongside the metrics
                node_cnt.append(clf.tree_.node_count)
                y_pred = clf.predict(x_test)
                acc.append(accuracy_score(y_test, y_pred))
                prec.append(precision_score(y_test, y_pred))
                recall.append(recall_score(y_test, y_pred))
            a = np.mean(acc)
            p = np.mean(prec)
            r = np.mean(recall)
            f = 2 * p * r / (p + r)  # F1 from fold-averaged precision/recall
            print 'precision: {}'.format(p)
            print "recall: {}".format(r)
            print "f1: {}".format(f)
            print "accuracy: {}".format(a)
            print "nodes: {}".format(np.mean(node_cnt))
    if plot:
        from sklearn.externals.six import StringIO
        from sklearn import tree
        import pydot
        # fit one pruned tree on the full data and render it
        clf = DecisionTreeClassifier(criterion='entropy', max_leaf_nodes=41)
        clf.fit(x, y)
        print clf.tree_.max_depth
        print clf.tree_.node_count
        dot_data = StringIO()
        tree.export_graphviz(clf, out_file=dot_data)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        graph.write_pdf("figs/test.pdf")
def grid_search():
    """Grid-search the logistic-regression C by AUC with shuffle-split
    CV, plot the per-parameter score curves, then write test-set
    probabilities from the best estimator."""
    comments, labels = load_data()
    param_grid = dict(logr__C=np.arange(1, 20, 5))
    clf = build_nltk_model()
    cv = ShuffleSplit(len(comments), n_iterations=10, test_size=0.2)
    grid = GridSearchCV(clf, cv=cv, param_grid=param_grid, verbose=4, n_jobs=12, score_func=auc_score)
    grid.fit(comments, labels)
    print(grid.best_score_)
    print(grid.best_params_)
    tracer()  # drop into the debugger for interactive inspection
    # one errorbar plot per searched parameter
    cv_scores = grid.scores_
    for param in cv_scores.params:
        means, errors = cv_scores.accumulated(param, 'max')
        plt.errorbar(cv_scores.values[param], means, yerr=errors)
        plt.xlabel(param)
        plt.ylim((0.85, 0.93))
        plt.savefig("grid_plot_%s.png" % param)
        plt.close()
    # score the held-out test set with the winning model
    comments_test, dates_test = load_test()
    prob_pred = grid.best_estimator_.predict_proba(comments_test)
    write_test(prob_pred[:, 1])
# Gender classification of Enron mailbox authors with naive Bayes.
# BUG FIX: `np` was used below but numpy was never imported in this block.
import numpy as np

from sklearn import metrics
from sklearn.naive_bayes import GaussianNB, BernoulliNB
from sklearn.cross_validation import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

from util import load_data

################################################################################
# Classification data
target_names = ['Female', 'Male']
# mailbox directory -> gender label (1 = male, 0 = female)
dir_label = [['badeer-r', 1], ['benson-r', 1], ['blair-l', 0], ['cash-m', 0], ['corman-s', 1], ['hain-m', 1]]
dataset = load_data(dir_label)
X = np.array(dataset[0])
y = dataset[1]

# Sci-Kit Learn Naive Baye's Classifiers
# Train/Test split model
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=50)

# Gaussian
gauss = GaussianNB().fit(X_train, y_train)
y_pred_gauss = gauss.predict(X_test)
acc = metrics.accuracy_score(y_test, y_pred_gauss)
def CV(args):
    '''k-fold Cross-Validation for the two-input sequence model.

    :param args: dict of model arguments — 'ms' max sequence length,
        'do' dropout, 'bs' batch size, 'hs' hidden size, 'nf' number of
        folds, 'ep' epochs, 'ds' dataset id, 'vb' verbosity.

    Side effects: writes <ds>.interbins (fold membership ids) and
    <ds>.interout_train / <ds>.interout_test (intermediate-layer outputs
    with labels appended), and prints fold-averaged accuracy/loss.
    '''
    # loading model parameters
    MAX_SEQUENCE_LENGTH = args['ms']
    embeddings_index = util.load_embedding('glove.6B.100d.txt')
    EMBEDDING_DIM = 100  # matches the 100d GloVe vectors loaded above
    drops = args['do']
    batch = args['bs']
    hidden = args['hs']
    n_folds = args['nf']
    epochNo = args['ep']
    ds_id = args['ds']
    verbose = args['vb']
    # loading data
    data, labels, word_idx, id_all = util.load_data(dataset_id=ds_id, isonefile=False, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=None)
    # saving bins info: truncate the three output files before the loop
    i = 0
    f_bin = open(ds_id+'.interbins', 'w')
    f_bin.close()
    f_train = open(ds_id+'.interout_train', 'wt')
    f_train.close()
    f_test = open(ds_id+'.interout_test', 'wt')
    f_test.close()
    id_all = np.array(id_all)
    # Cross-Validation
    avg_acc_train, avg_acc_test, avg_error_train, avg_error_test = 0, 0, 0, 0
    for train_index, test_index in skf.split(np.zeros(len(labels)), labels):
        # append this fold's train/test sample ids
        f_bin = open(ds_id+'.interbins', 'a')
        np.savetxt(f_bin, [id_all[train_index]], fmt='%s')
        np.savetxt(f_bin, [id_all[test_index]], fmt='%s')
        f_bin.close()
        print("size of train index ", len(train_index))
        print("size of test index ", len(test_index))
        print("Running Fold %d/%d " % (i+1, n_folds))
        my_model = None  # Clearing the NN.
        my_model, inter_model = model.model(drop=drops, hidden_units=hidden, word_index=word_idx, embedding_index=embeddings_index, EMBEDDING_DIM=EMBEDDING_DIM, MAX_SEQUENCE_LENGTH=MAX_SEQUENCE_LENGTH)
        # the model takes a pair of inputs (left/right sequences)
        [data_l, data_r] = data
        my_data_train = [data_l[train_index], data_r[train_index]]
        my_data_test = [data_l[test_index], data_r[test_index]]
        labels = np.asarray(labels)
        [loss_train, acc_train], [loss_test, acc_test], [inter_out_train, inter_out_test] = train_and_evaluate_model\
            (my_model, my_data_train, labels[train_index], my_data_test, labels[test_index], epochNo, inter_model, batch, verbose)
        # append the labels as an extra column next to the intermediate outputs
        a_train = labels[train_index].reshape(labels[train_index].shape[0], -1)
        a_test = labels[test_index].reshape(labels[test_index].shape[0], -1)
        print(inter_out_train.shape, a_train.shape)
        inter_out_train = np.concatenate((inter_out_train, a_train), axis=1)
        inter_out_test = np.concatenate((inter_out_test, a_test), axis=1)
        # updating bins info
        f_train = open(ds_id+'.interout_train', 'at')
        np.savetxt(f_train, inter_out_train)
        f_train.close()
        f_test = open(ds_id+'.interout_test', 'at')
        np.savetxt(f_test, inter_out_test)
        f_test.close()
        # results
        avg_acc_train += acc_train
        avg_acc_test += acc_test
        avg_error_train += loss_train
        avg_error_test += loss_test
        i += 1
    print("avg acc train , test :", avg_acc_train/n_folds, avg_acc_test/n_folds)
    print("avg error train , test :", avg_error_train/n_folds, avg_error_test/n_folds)
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn
from util import load_data
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import TruncatedSVD

# Explained-variance exploration: project the training features onto the
# top 50 truncated-SVD components and plot individual vs cumulative
# explained-variance ratios.
DATA_PATH = '../data/train.csv'
X, y = load_data(DATA_PATH)
df = pd.read_csv(DATA_PATH)  # NOTE(review): df is unused in this snippet
svd = TruncatedSVD(n_components=50, n_iter=10)
X_selected = svd.fit_transform(X)
var_exp = svd.explained_variance_ratio_
cum_var_exp = np.cumsum(var_exp)
with plt.style.context('seaborn-whitegrid'):
    plt.bar(range(50), var_exp, alpha=0.5, align='center', label='individual explained variance')
    plt.step(range(50), cum_var_exp, where='mid', label='cumulative explained variance')
    plt.ylabel('Explained variance ratio')
import util

# Placeholder training script: reports what would be trained and prints a
# hard-coded accuracy.
depth = 1
data, meta = util.load_data()
message = "Training random forest model on %s (%i examples) with depth %i" % (meta["type"], len(data), depth)
print(message)
print("acc: 0.9")
''' (linear) ridge regression algorithm for classification (i.e. use 0/1 error for evaluation) For Q9, Q10 ''' import numpy as np import util from sklearn.linear_model import RidgeClassifier from sklearn.metrics import zero_one_loss import matplotlib.pylab as plt # Load data and parsing data = util.load_data("hw2_lssvm_all.dat.txt") X, y = util.preprocessing(data) # add x0 = 1 X = np.insert(X, 0, 1, axis=1) print(X) # test parameter λ = {0.05, 0.5, 5, 50, 500} lambbda = [0.05, 0.5, 5, 50, 500] # fit linear ridge regression E_in = np.zeros(5) E_out = np.zeros(5) for it, lb in enumerate(lambbda): print(">>>>> λ = {} >>>".format(lb)) clf = RidgeClassifier( alpha=lb) #alpha: Regularization strength # tol: precision #solver
def part_two():
    """Solve day 9 part two: find the encryption weakness for the invalid
    number located in part one, and print it."""
    numbers = load_data('Data/day09.txt', data_type=int)
    cipher = Xmas(preamble_size=25, data=numbers)
    invalid = cipher.find_invalid_number()
    result = cipher.find_vulnerability(data=numbers, target=invalid)
    print(f"Part two returns: {result}")
statistics_dir, cmd_args.data + "_" + cmd_args.gm + "_" + str(cmd_args.learning_rate) + "_" + str(cmd_args.sortpooling_k) + "_" + str(cmd_args.out_dim) + "_" + str(cmd_args.hidden)) if os.path.exists(save_dir): shutil.rmtree(save_dir) else: os.makedirs(save_dir) model_dir = os.path.join(save_dir, "models") if not os.path.exists(model_dir): os.makedirs(model_dir) results_dir = os.path.join(save_dir, "results") if not os.path.exists(results_dir): os.makedirs(results_dir) shuffle_dir = os.path.join(cur_dir, "shuffle_idx") graphs = load_data() if cmd_args.sortpooling_k <= 1: num_nodes_list = sorted([g.num_nodes for g in graphs]) cmd_args.sortpooling_k = num_nodes_list[ int(math.ceil(cmd_args.sortpooling_k * len(num_nodes_list))) - 1] print('k used in SortPooling is: ' + str(cmd_args.sortpooling_k)) skf = StratifiedKFold(n_splits=10) for shuffle_idx in range(1, 11): parameters_save = [] random_idx = [ int(idx) for idx in ud.load_list_from_file(shuffle_dir + '/' + cmd_args.data + "_" + str(shuffle_idx))
# coding: utf-8
import torch
import matplotlib.pyplot as plt
import numpy as np
from model import CNN
from util import load_data

# CIFAR-10 class names, in label-index order.
classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

if __name__ == "__main__":
    train_loader, test_loader = load_data()
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    net = CNN()
    # map_location lets GPU-saved weights load on a CPU-only machine
    net.load_state_dict(torch.load('model_data/model1.pth', map_location=device))
    net.to(device)  # BUG FIX: `device` was computed but never used
    net.eval()      # BUG FIX: put dropout/batch-norm layers in eval mode

    correct = 0
    total = 0
    # BUG FIX: disable autograd during inference — the original built
    # gradient graphs for every test batch for no reason.
    with torch.no_grad():
        for data in test_loader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    print('Accuracy of the network on the 10000 test images: %d %%' % (100 * correct / total))
def part_one():
    """Solve day 9 part one: print the first number that is not a sum of
    two of the preceding 25."""
    numbers = load_data('Data/day09.txt', data_type=int)
    cipher = Xmas(preamble_size=25, data=numbers)
    result = cipher.find_invalid_number()
    print(f"Part one returns: {result}")
'consensus_distance': consensus_distance, 'Sbins': Sbins, 'Sbinc': Sbinc} # get the allele frequency histograms for mutations away and towards consensus if params.type == 'nuc': (data['to_histogram'][subtype], data['away_histogram'][subtype]) = get_toaway_histograms(subtype, Sc=10, refname=params.reference) else: (data['to_histogram'][subtype], data['away_histogram'][subtype]) = get_toaway_histograms_aminoacids(subtype, Sc=10, refname=params.reference) data['time_bins'] = time_bins data['af_bins'] = af_bins store_data(data, fn_data) else: print "Loading data from file" data = load_data(fn_data) fig_filename = foldername+'to_away' if params.reference != 'HXB2': fig_filename = fig_filename + '_'+params.reference if params.type == 'aa': fig_filename = fig_filename + '_aa' plot_to_away(data, fig_filename=fig_filename, sequence_type=params.type)
from util import plt, np, load_data, grad_check_sparse, time_elapse
from softmax import softmax_loss_vectorized
from linear_classifier import Softmax

# Sanity-check the vectorized softmax loss/gradient on CIFAR-10, then
# train a Softmax linear classifier with SGD.
cifar_dir = '../cifar-10-batches-py'
X_train, y_train, X_val, y_val, X_test, y_test, X_dev, y_dev = load_data(cifar_dir, num_test=500)

# initialize W with small random values
# (3073 features — presumably 32*32*3 pixels plus a bias column; confirm)
W = np.random.randn(3073, 10) * 0.0001

# test loss: with a tiny random W it should be near -log(0.1)
loss, grad = softmax_loss_vectorized(W, X_dev, y_dev, 0.0)
#print('loss: %f' % loss)
#print('sanity check: %f' % (-np.log(0.1)))

# test gradient without regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 0.0)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

# test gradient with regularization
#def f(w): return softmax_loss_vectorized(W, X_dev, y_dev, 1e2)[0]
#grad_numerical = grad_check_sparse(f, W, grad, 10)

softmax = Softmax()
loss_history = softmax.train(X_train, y_train, learning_rate=1e-7, reg=5e4, num_iters=1500, verbose=True)
import numpy as np
import util
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score
import pickle

# Input CSV and output model file paths.
DATA_PATH = 'data/data_total.csv'
KNN_MODEL_FILE = 'knn_new.model'
SVC_MODEL_FILE = 'svc_new.model'

if __name__ == '__main__':
    # Train a linear SVM and a 7-NN classifier on the same features, persist
    # both (bundled with the label mapping and selected feature list), then
    # reload and predict on the held-out test split.
    test_data, test_labels, train_data, train_labels, label_to_xyz, selected_features = util.load_data(
        DATA_PATH)
    print(train_data.shape)
    clf = svm.LinearSVC()
    knn = KNeighborsClassifier(n_neighbors=7)
    clf.fit(train_data, train_labels)
    knn.fit(train_data, train_labels)
    # Each pickle stores [model, label_to_xyz, selected_features] so the
    # consumer can map predictions back to coordinates with the same features.
    # NOTE(review): open(...) handles here are never closed explicitly —
    # consider `with open(...)`; relies on interpreter cleanup as written.
    pickle.dump([clf, label_to_xyz, selected_features], open(SVC_MODEL_FILE, 'wb'))
    # Immediate reload acts as a round-trip sanity check of the saved model.
    [clf, label_to_xyz, selected_features] = pickle.load(open(SVC_MODEL_FILE, 'rb'))
    pickle.dump([knn, label_to_xyz, selected_features], open(KNN_MODEL_FILE, 'wb'))
    [knn, label_to_xyz, selected_features] = pickle.load(open(KNN_MODEL_FILE, 'rb'))
    pred_labels = clf.predict(test_data)
help='initial gru bias for r & z. higher => more like SimpleRnn') opts = parser.parse_args() print >> sys.stderr, opts NUM_LABELS = 3 def log(s): print >> sys.stderr, util.dts(), s # slurp training data, including converting of tokens -> ids vocab = Vocab() train_x, train_y, train_stats = util.load_data(opts.train_set, vocab, update_vocab=True, max_egs=int( opts.num_from_train)) log("train_stats %s %s" % (len(train_x), train_stats)) dev_x, dev_y, dev_stats = util.load_data(opts.dev_set, vocab, update_vocab=False, max_egs=int(opts.num_from_dev)) log("dev_stats %s %s" % (len(dev_x), dev_stats)) # input/output example vars s1_idxs = T.ivector('s1') # sequence for sentence one s2_idxs = T.ivector('s2') # sequence for sentence two actual_y = T.ivector('y') # single for sentence pair label; 0, 1 or 2 # keep track of different "layers" that handle their own gradients.
def run_train(args):
    """Run parallel K-fold cross-validation over each input file.

    For every file in ``args.input_file``: load features/labels, optionally
    build group ids, impute missing values (mean), standardize, infer the
    task type, run (Group)KFold CV in a process pool, and aggregate metrics.

    Returns:
        (all_result, model_result): per-file CV summaries and the trained
        per-fold models.
    """
    all_result = {}
    model_result = []
    for filename in args.input_file:
        print("=================================")
        print("== Loading data ... ")
        print("=================================")
        option = {}
        if args.group is not None:
            option["group"] = args.group
        # x: features, y: answers, opt: per-file metadata, h: header names,
        # index: row index — all produced by the project loader.
        x, y, opt, h, index = load_data(
            filename,
            ans_col=args.answer,
            ignore_col=args.ignore,
            header=args.header,
            cat_col=args.categorical,
            option=option,
        )
        g = None
        if args.group is not None or "group" in opt:
            if "group_type" in opt:
                if opt["group_type"] != "int":
                    # Non-integer group labels: remap each distinct label to
                    # a dense integer id for GroupKFold.
                    print("group remapping")
                    g = []
                    mapping_g = {}
                    for g_name in opt["group"]:
                        if g_name not in mapping_g:
                            mapping_g[g_name] = len(mapping_g)
                        g.append(mapping_g[g_name])
                    g = np.array(g, dtype=np.int32)
                else:
                    g = np.array(opt["group"], dtype=np.int32)
        if args.data_sample is not None:
            x, y, g = resample(x, y, g, n_samples=args.data_sample)
        ## Impute missing values with the column mean
        m = np.nanmean(x, axis=0)
        if h is not None:
            # Drop header entries for all-NaN columns (their mean is NaN).
            h = np.array(h)[~np.isnan(m)]
        imr = SimpleImputer(missing_values=np.nan, strategy="mean")
        x = imr.fit_transform(x)
        print("x:", x.shape)
        print("y:", y.shape)
        ## Standardize features (zero mean, unit variance)
        sc = StandardScaler()
        x = sc.fit_transform(x)
        print("x:", x.shape)
        print("y:", y.shape)
        if g is not None:
            print("g:", g.shape)
            print("grouping enabled:", g.shape)
        ## Decide binary vs. multiclass from the data when task is "auto"
        if args.task == "auto":
            if len(np.unique(y)) == 2:
                args.task = "binary"
            else:
                args.task = "multiclass"
        if args.task != "regression":
            y = y.astype(dtype=np.int64)
        ##
        ## Run cross-validation folds in parallel
        ##
        print("=================================")
        print("== Starting cross-validation ... ")
        print("=================================")
        if g is not None:
            # Grouped CV keeps samples with the same group id in one fold.
            kf = sklearn.model_selection.GroupKFold(n_splits=args.splits)
            pool = Pool(processes=args.splits)
            results = pool.map(train_cv_one_fold,
                               [(x, y, h, s, g, args) for s in kf.split(x, y, g)])
        else:
            kf = sklearn.model_selection.KFold(n_splits=args.splits, shuffle=True)
            pool = Pool(processes=args.splits)
            results = pool.map(train_cv_one_fold,
                               [(x, y, h, s, args) for s in kf.split(x)])
        ##
        ## Aggregate CV results:
        ## compute mean / standard deviation of every score
        ##
        cv_result = {"cv": [r[0] for r in results]}
        model_result.append([r[1] for r in results])
        print("=================================")
        print("== Evaluation ... ")
        print("=================================")
        if args.task == "regression":
            score_names = ["r2", "mse"]
        else:
            score_names = ["accuracy", "f1", "precision", "recall", "auc"]
        for score_name in score_names:
            scores = [r[0][score_name] for r in results]
            # nanmean/nanstd: a fold may lack a score (e.g. undefined AUC).
            test_mean = np.nanmean(np.asarray(scores))
            test_std = np.nanstd(np.asarray(scores))
            print("Mean %10s on test set: %3f (standard deviation: %3s)" % (score_name, test_mean, test_std))
            cv_result[score_name + "_mean"] = test_mean
            cv_result[score_name + "_std"] = test_std
        ##
        ## Overall evaluation across all folds pooled together
        ##
        test_y = []
        pred_y = []
        for result in cv_result["cv"]:
            test_y.extend(result["test_y"])
            pred_y.extend(result["pred_y"])
        if args.task != "regression":
            conf = sklearn.metrics.confusion_matrix(test_y, pred_y)
            cv_result["confusion"] = conf
        cv_result["task"] = args.task
        cv_result["index"] = index
        ##
        ## Store this file's results in the returned dictionary
        ##
        all_result[filename] = cv_result
    return all_result, model_result
vlss_early_model = loss_value_val vacc_max = np.max((acc_val, vacc_max)) vloss_min = np.min((loss_value_val, vloss_min)) curr_step = 0 else: curr_step += 1 if curr_step == args.patience: print('Early stop! Min loss: ', vloss_min, ', Max accuracy: ', vacc_max) print('Early stop model validation loss: ', vlss_early_model, ', accuracy: ', vacc_early_model) break test_feed_dict = {} test_feed_dict.update({placeholders['labels']: y_test}) test_feed_dict.update({placeholders['features']: features}) test_feed_dict.update({placeholders['dropout']: 0.0}) test_feed_dict.update({placeholders['masks']: test_mask}) loss_value_test, acc_test = sess.run([loss, accuracy], feed_dict=test_feed_dict) print('Test loss:', loss_value_test, '; Test accuracy:', acc_test) sess.close() if __name__ == '__main__': time_stamp = strftime('%Y_%m_%d_%H_%M_%S', localtime()) print("The time of running the codes: ", time_stamp) args = parse_args() data = load_data(args.dataset) train(args, data)
def tensor_from_sentence(lang, sentence):
    """Convert a sentence to a (seq_len, 1) LongTensor of token ids + EOS.

    Relies on module globals: `indexes_from_sentence`, `Lang.EOS_token`,
    and `device`.
    """
    indexes = indexes_from_sentence(lang, sentence)
    indexes.append(Lang.EOS_token)
    # view(-1, 1): column vector, one token id per row, as the RNN expects.
    return torch.tensor(indexes, dtype=torch.long, device=device).view(-1, 1)


def tensors_from_pair(input_lang, output_lang, pair):
    """Return (input_tensor, target_tensor) for one (source, target) pair."""
    input_tensor = tensor_from_sentence(input_lang, pair[0])
    target_tensor = tensor_from_sentence(output_lang, pair[1])
    return (input_tensor, target_tensor)


if __name__ == "__main__":
    # Build an encoder/decoder seq2seq pair and pre-sample the training data.
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    input_lang, output_lang, pairs = load_data()

    hidden_size = 256
    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    decoder = DecoderRNN(hidden_size, output_lang.n_words).to(device)

    # NLLLoss pairs with log-softmax outputs from the decoder.
    criterion = nn.NLLLoss()
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=0.01)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=0.01)

    # Sample 75000 training pairs (with replacement) up front.
    training_pairs = [
        tensors_from_pair(input_lang, output_lang, random.choice(pairs))
        for i in range(75000)
    ]  # epoch 75000
else: parsed_args.languages = args['languages'] args['train'] = False args['path'] = parsed_args.path args['source'] = parsed_args.source args['target'] = parsed_args.target args['test'] = parsed_args.test args['store_test'] = parsed_args.store_test args['t'] = parsed_args.t for language in args['languages']: wordemb_path = args['wordemb_path']+'%s.pkl' % language wvec, vocab = load_word_vectors(language, wordemb_path) if parsed_args.train: train_path = args['data_path']+'/train/%s.json' % language dev_path = args['data_path']+'/dev/%s.json' % language x_ids, y_ids, cur_labels = load_data(path=train_path) xv_ids, yv_ids, cur_labels = load_data( path=dev_path) print "\tX_train (80%)"+": %d" % len(x_ids) print "\tX_val (10%)"+": %d" % len(xv_ids) X_ids.append(np.array(x_ids));Y_ids.append(np.array(y_ids)) XV_ids.append(np.array(xv_ids));YV_ids.append(np.array(yv_ids)) elif parsed_args.test or parsed_args.store_test: test_path = args['data_path']+'/test/%s.json' % language xt_ids, yt_ids, cur_labels = load_data( path=test_path) print "\tX_test (10%)"+": %d" % len(xt_ids) if parsed_args.store_test: max_num = parsed_args.max_num XT_ids.append(np.array(xt_ids)[:max_num]);YT_ids.append(np.array(yt_ids)[:max_num]) else: XT_ids.append(np.array(xt_ids));YT_ids.append(np.array(yt_ids)) print "\t|V|: %d, |Y|: %d" % (len(vocab[language]),len(cur_labels))
def main():
    """Entry point (Python 2): train / evaluate / predict a mesh upscaling model.

    Modes are selected by boolean flags: -train, -eval, -pred; -w loads a
    predefined weight matrix into layer 1 of the model.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-train', action='store_true', default=False, help="train flag")
    parser.add_argument('-eval', action='store_true', default=False, help="evaluate flag")
    parser.add_argument('-pred', action='store_true', default=False, help="predict flag")
    parser.add_argument('-w', action='store_true', default=False, help="load weights flag")
    parser.add_argument('-c', help="training: coarse dir")
    parser.add_argument('-f', help="training: fine scale with track dir")
    parser.add_argument('-tc', help="test dataset: coarse dir")
    parser.add_argument('-tf', help="test dataset: fine scale with track dir")
    parser.add_argument('-x', help="predict input dataset dir")
    parser.add_argument('-o', help="predict output dir")
    parser.add_argument('-l', help="learning rate")
    parser.add_argument('-e', help="epochs")
    parser.add_argument("-resume", help="bool flag, False by default")
    parser.add_argument("-modelh5", help="load exist model")
    parser.add_argument("-modelweighth5", help="load model weights")
    args = parser.parse_args()

    # Require a minimum number of CLI arguments before doing any work.
    if len(sys.argv) < 4:
        print "Usage: --train=True -l=learning_rate -e=epochs -c=... -f=... --eval=False --pred=True option* --> use --help"
        return 0

    coarseDir = None
    fineDir = None
    test_coarseDir = None
    test_fineDir = None
    pred_dir = None
    out_dir = None
    if args.train:
        learning_rate = float(args.l)
        epochs = int(args.e)
        coarseDir = args.c
        fineDir = args.f
        print "training dataset: "
        print ">>> " + str(coarseDir) + " >>> " + str(fineDir)
    if args.eval:
        test_coarseDir = args.tc
        test_fineDir = args.tf
        print "evaluate dataset: "
        print ">>> " + str(test_coarseDir) + " >>> " + str(test_fineDir)
    if args.pred:
        pred_dir = args.x
        out_dir = args.o
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)
        print "predict: "
        print ">>> " + str(pred_dir) + " >>> " + str(out_dir)

    # Use the first non-hidden subdirectory's first frame to size the model.
    sample_data = []
    if coarseDir:
        sdir = coarseDir
    elif test_coarseDir:
        sdir = test_coarseDir
    elif pred_dir:
        sdir = pred_dir
    file_name = sdir + [f for f in os.listdir(sdir) if not f.startswith('.')][0] + "/00001_00.obj"
    # file_name = pred_dir + "test.obj"
    dim = obj2tri(file_name, sample_data)  # [tri_dim, vert_dim]
    v_dim = dim[1]
    mtx, mtx_1 = face2mtx(file_name, dim)
    # create model
    model = setmodel(dim, mtx, mtx_1)

    ##load predefined weights
    load_weights = args.w
    if load_weights:
        # First a symmetric alpha/beta template is built, then immediately
        # overwritten by a hand-tuned 9x9 matrix (the template is dead code
        # kept for reference).
        alpha = 1.0
        beta = 0.5
        a1 = [alpha, 0.0, 0.0, beta, 0.0, 0.0, beta, 0.0, 0.0]
        a2 = [0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0, beta, 0.0]
        a3 = [0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0, beta]
        a4 = [beta, 0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0, 0.0]
        a5 = [0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0, beta, 0.0]
        a6 = [0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0, beta]
        a7 = [beta, 0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0, 0.0]
        a8 = [0.0, beta, 0.0, 0.0, beta, 0.0, 0.0, alpha, 0.0]
        a9 = [0.0, 0.0, beta, 0.0, 0.0, beta, 0.0, 0.0, alpha]
        w = np.array([[a1, a2, a3, a4, a5, a6, a7, a8, a9]])  # has to be 1x(9x9) dim
        w = np.array([[[
            -0.0358, -0.0896, -0.0222, 0.0345, -0.0198, -0.0242, -0.0577,
            0.0466, -0.044
        ], [
            0.0369, 0.0963, -0.0193, 0.0888, -0.0208, -0.0687, -0.0288,
            -0.0076, 0.0463
        ], [
            -0.0098, 0.0295, -0.0726, 0.0491, 0.0215, -0.0231, 0.0533,
            0.0355, 0.0101
        ], [
            0.0993, 0.0233, -0.034, -0.0268, 0.014, 0.0581, -0.0794,
            -0.0376, 0.0361
        ], [
            0.047, 0.0036, -0.0083, -0.0519, -0.0065, -0.0106, 0.032,
            -0.013, -0.016
        ], [
            -0.0321, -0.0622, 0.0714, -0.0885, -0.0279, -0.0009, 0.0293,
            -0.0219, -0.0361
        ], [
            -0.0441, 0.0593, 0.0486, 0.0189, -0.0226, 0.0179, 0.0712,
            0.0213, -0.0723
        ], [
            -0.0729, -0.0937, 0.036, -0.0693, 0.0113, 0.0663, 0.0165,
            0.0255, -0.012
        ], [
            0.0262, -0.0108, -0.0177, -0.0069, 0.0036, 0.0014, -0.0144,
            0.0373, -0.0357
        ]]], dtype=np.float32)
        print ">>> predefined weights: "
        print w
        model.layers[1].set_weights(w)

    if args.train:
        x_train = np.empty(0)
        y_train = np.empty(0)
        # NOTE(review): x_test/y_test are initialized here inside the train
        # branch but read in the eval branch below — running -eval without
        # -train would raise NameError; confirm intended usage.
        x_test = np.empty(0)
        y_test = np.empty(0)
        print ">>>>>>> loading data..."
        for dirName, subdirList, fileList in os.walk(coarseDir):
            total = len(subdirList)
            count = 0
            for subdir in subdirList:
                # print('Found directory: %s' % subdir)
                if count % 5 == 0:
                    # Coarse progress indicator, one line per 5 directories.
                    print str(float(count) / total * 100) + '%'
                count = count + 1
                x, y = load_data(coarseDir + subdir, fineDir + subdir)
                if x_train.size == 0:
                    x_train = x
                    y_train = y
                else:
                    x_train = np.vstack((x_train, x))
                    y_train = np.vstack((y_train, y))
        if x_train.size == 0:
            print "Error: no input training data."
            return 0
        train(model, x_train, y_train, learning_rate, epochs)

    if args.eval:
        print 'load test data to evaluate...'
        for dirName, subdirList, fileList in os.walk(test_coarseDir):
            for subdir in subdirList:
                print('Found directory: %s' % subdir)
                x, y = load_data(test_coarseDir + subdir, test_fineDir + subdir)
                if x_test.size == 0:
                    x_test = x
                    y_test = y
                else:
                    x_test = np.vstack((x_test, x))
                    y_test = np.vstack((y_test, y))
        if x_test.size == 0:
            print "Error: Need test dataset."
            return 0
        eval(model, x_test, y_test)
        # Dump the learned layer-1 weights for inspection.
        print ">>> weights: >>>> "
        weights = model.layers[1].get_weights()
        w1 = np.array(weights).astype(np.float32)
        np.set_printoptions(suppress=True)
        np.set_printoptions(precision=4)
        print weights

    ## predict and save output to obj
    if args.pred:
        for dirName, subdirList, fileList in os.walk(pred_dir):
            for subdir in subdirList:
                newpath = out_dir + subdir
                print newpath
                if not os.path.exists(newpath):
                    os.makedirs(newpath)
                obj_in = pred_dir + subdir + '/00001_00.obj'
                batch_coarse = []
                # Load frames 00001..00100 of this sequence as one batch.
                for dirpath, dirnames, filenames in os.walk(pred_dir + subdir):
                    for x in xrange(1, 101):
                        file_name = str(x).zfill(5) + '_00.obj'
                        obj2tri(pred_dir + subdir + '/' + file_name, batch_coarse)
                x = np.array(batch_coarse)
                # print "predict input: \n >>>> "
                # print x.shape
                pred(model, x, v_dim, obj_in, out_dir + subdir + '/')
    # ============= test ==============
    # obj_in = pred_dir + "test.obj"
    # batch_coarse = []
    # obj2tri(obj_in, batch_coarse)
    # x = np.array(batch_coarse)
    # pred(model, x, v_dim, obj_in, out_dir)
    # ============= test ==============
    save(model)
import util from sklearn.metrics import accuracy_score from sklearn.model_selection import train_test_split from baseline import BaselinePredictor from sklearn.svm import SVC data = util.load_data() preprocessed_data = util.preprocess_data(data) X, Y = util.splitFeaturesAndLabel(preprocessed_data, 'Empathy') X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=42) baseline_predictor = BaselinePredictor() baseline_preds = util.trainAndPredict(X_train, Y_train, baseline_predictor, X_test) print("Baseline accuracy and classification report") util.printAccuracyAndClassficationReport(baseline_preds, Y_test, classes=['1', '2', '3', '4', '5']) X_train, X_test = util.getBestFeatures(X_train, Y_train, X_test) model = SVC(kernel='rbf') params = { 'C': [i for i in range(1, 11)],
def main():
    """Fine-tune a DrQA reader with interleaved entropy regularization.

    Loads the regular SQuAD-style data plus pickled "reduced" examples,
    evaluates the checkpoint, then alternates regular updates with entropy
    updates, checkpointing every epoch and tracking the best dev F1.
    """
    from args import args
    # parser = argparse.ArgumentParser()
    # parser.add_argument('--model', required=True)
    # parser.add_argument('--train', required=True)
    # parser.add_argument('--dev', required=True)
    # args.load_model_dir = parser.parse_args().model
    # args.ent_train_dir = parser.parse_args().train
    # args.ent_dev_dir = parser.parse_args().dev

    # Hard-coded experiment paths (override the commented CLI above).
    args.load_model_dir = '/scratch0/shifeng/rawr/drqa/original.pt'
    args.ent_train_dir = 'results/20180217T172242.135276/train.pkl'
    args.ent_dev_dir = 'pkls/original.rawr.dev.pkl'
    args.other_train_dir = 'results/targeted_train_all.pkl'
    out_dir = prepare_output_dir(args, '/scratch0/shifeng/rawr/drqa/')

    # Log DEBUG to file, INFO to stdout.
    log = logging.getLogger(__name__)
    log.setLevel(logging.DEBUG)
    fh = logging.FileHandler(os.path.join(out_dir, 'output.log'))
    fh.setLevel(logging.DEBUG)
    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(logging.INFO)
    formatter = logging.Formatter(fmt='%(asctime)s %(message)s',
                                  datefmt='%m/%d/%Y %I:%M:%S')
    fh.setFormatter(formatter)
    ch.setFormatter(formatter)
    log.addHandler(fh)
    log.addHandler(ch)
    log.info('===== {} ====='.format(out_dir))

    # Persist the run configuration next to its outputs.
    with open(os.path.join(out_dir, 'args.pkl'), 'wb') as f:
        pickle.dump(args, f)

    random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    log.info('loading regular data from {}'.format(args.data_file))
    train_reg, dev_reg, dev_y, embedding, opt = load_data(args)
    log.info('{} regular training examples'.format(len(train_reg)))
    log.info('{} regular dev examples'.format(len(dev_reg)))
    # log.info(opt)

    ''' load data for regularization '''
    log.info('loading entropy training data from {}'.format(
        args.ent_train_dir))
    with open(args.ent_train_dir, 'rb') as f:
        train_ent = pickle.load(f)
        # Unwrap the {'reduced': ...} container and flatten nested lists.
        if isinstance(train_ent, dict) and 'reduced' in train_ent:
            train_ent = train_ent['reduced']
        if isinstance(train_ent[0][0], list):
            train_ent = list(itertools.chain(*train_ent))
    # log.info('loading targeted training data from {}'.format(args.other_train_dir))
    # with open(args.other_train_dir, 'rb') as f:
    #     other_train_ent = pickle.load(f)
    #     if isinstance(other_train_ent, dict) and 'reduced' in train_ent:
    #         other_train_ent = other_train_ent['reduced']
    #     if isinstance(other_train_ent[0][0], list):
    #         other_train_ent = list(itertools.chain(*other_train_ent))
    # train_ent += other_train_ent

    if args.filter_long > 0:
        # x[5] is presumably the token sequence being length-filtered —
        # TODO confirm against the BatchGen field layout.
        train_ent = [x for x in train_ent if len(x[5]) < args.filter_long]

    log.info('loading entropy dev data from {}'.format(args.ent_train_dir))
    with open(args.ent_dev_dir, 'rb') as f:
        dev_ent = pickle.load(f)['reduced']
        if isinstance(dev_ent[0], list):
            # dev_ent = list(itertools.chain(*dev_ent))
            dev_ent = [x[0] for x in dev_ent]
        # if args.filter_long > 0:
        #     dev_ent = [x for x in dev_ent if len(x[5]) > args.filter_long]
    log.info('{} entropy training examples'.format(len(train_ent)))
    log.info('{} entropy dev examples'.format(len(dev_ent)))

    log.info('loading model from {}'.format(args.load_model_dir))
    checkpoint = torch.load(args.load_model_dir)
    # opt = checkpoint['config']
    state_dict = checkpoint['state_dict']
    model = DocReaderModel(vars(opt), embedding, state_dict)
    model.cuda()

    ''' initial evaluation '''
    dev_reg_batches = BatchGen(
        dev_reg, batch_size=args.batch_size,
        pos_size=args.pos_size, ner_size=args.ner_size,
        evaluation=True, gpu=args.cuda)
    dev_ent_batches = BatchGen(
        dev_ent, batch_size=args.batch_size,
        pos_size=args.pos_size, ner_size=args.ner_size,
        evaluation=True, gpu=args.cuda)
    predictions = []
    for batch in dev_reg_batches:
        predictions.extend(model.predict(batch))
    em, f1 = score(predictions, dev_y)
    ents, predictions_r = [], []
    for batch in dev_ent_batches:
        # get_all=True also returns start/end span distributions (ss, se)
        # whose per-example entropies are summed into the batch statistic.
        p, _, ss, se, _, _ = model.predict(batch, get_all=True)
        ss = ss.cpu().numpy()
        se = se.cpu().numpy()
        ents.append(scipy.stats.entropy(ss.T).sum() + \
                    scipy.stats.entropy(se.T).sum())
        predictions_r.extend(p)
    ent = sum(ents) / len(ents)
    em_r, f1_r = score(predictions_r, dev_y)
    log.info("[dev EM: {:.5f} F1: {:.5f} Ent: {:.5f}]".format(em, f1, ent))
    log.info("[dev EMR: {:.5f} F1R: {:.5f}]".format(em_r, f1_r))
    best_f1_score = f1

    ''' interleaved training '''
    train_ent_batches = BatchGen(
        train_ent, batch_size=args.batch_size,
        pos_size=args.pos_size, ner_size=args.ner_size, gpu=args.cuda)
    len_train_ent_batches = len(train_ent_batches)
    train_ent_batches = iter(train_ent_batches)
    n_reg = 0
    n_ent = 0
    for epoch in range(args.epochs):
        log.warning('Epoch {}'.format(epoch))
        train_reg_batches = BatchGen(
            train_reg, batch_size=args.batch_size,
            pos_size=args.pos_size, ner_size=args.ner_size, gpu=args.cuda)
        start = datetime.now()
        for i_reg, reg_batch in enumerate(train_reg_batches):
            model.update(reg_batch)
            n_reg += 1
            # After warm-up, interleave n_ent_per_reg entropy updates every
            # n_reg_per_ent regular updates; restart the entropy iterator
            # when exhausted.
            if n_reg > args.start_ent:
                if i_reg % args.n_reg_per_ent == 0:
                    for j in range(args.n_ent_per_reg):
                        try:
                            model.update_entropy(next(train_ent_batches),
                                                 gamma=args.gamma)
                            n_ent += 1
                        except StopIteration:
                            n_ent = 0
                            train_ent_batches = iter(
                                BatchGen(train_ent,
                                         batch_size=args.batch_size,
                                         pos_size=args.pos_size,
                                         ner_size=args.ner_size,
                                         gpu=args.cuda))
            if n_reg % args.n_report == 0:
                log.info(
                    'epoch [{:2}] batch [{}, {}] loss[{:.5f}] entropy[{:.5f}]'.format(
                        epoch, i_reg, n_ent, model.train_loss.avg,
                        -model.entropy_loss.avg / args.gamma))
        # if n_reg % args.n_eval == 0:
        # NOTE(review): evaluation placed at epoch level per the disabled
        # n_eval gate above — confirm against the original layout.
        dev_reg_batches = BatchGen(
            dev_reg, batch_size=args.batch_size,
            pos_size=args.pos_size, ner_size=args.ner_size,
            evaluation=True, gpu=args.cuda)
        dev_ent_batches = BatchGen(
            dev_ent, batch_size=args.batch_size,
            pos_size=args.pos_size, ner_size=args.ner_size,
            evaluation=True, gpu=args.cuda)

        ''' regular evaluation '''
        predictions = []
        for batch in dev_reg_batches:
            predictions.extend(model.predict(batch))
        em, f1 = score(predictions, dev_y)

        ''' entropy evaluation '''
        ents, predictions_r = [], []
        for batch in dev_ent_batches:
            p, _, ss, se, _, _ = model.predict(batch, get_all=True)
            ss = ss.cpu().numpy()
            se = se.cpu().numpy()
            ents.append(scipy.stats.entropy(ss.T).sum() + \
                        scipy.stats.entropy(se.T).sum())
            predictions_r.extend(p)
        ent = sum(ents) / len(ents)
        em_r, f1_r = score(predictions_r, dev_y)
        log.info("dev EM: {:.5f} F1: {:.5f} Ent: {:.5f}".format(em, f1, ent))
        log.info("[dev EMR: {:.5f} F1R: {:.5f}]".format(em_r, f1_r))

        ''' save best model '''
        if f1 > best_f1_score:
            best_f1_score = f1
            model_file = os.path.join(out_dir, 'best_model.pt')
            model.save(model_file, epoch)
            log.info('[save best model F1: {:.5f}]'.format(best_f1_score))

        ''' save models '''
        model_file = os.path.join(
            out_dir, 'checkpoint_epoch_{}.pt'.format(epoch))
        model.save(model_file, epoch)
        log.info("[save model {}]".format(model_file))
def main():
    """Train and evaluate a GIN-style GraphCNN on one CV fold of a dataset."""
    # Training settings
    # Note: Hyper-parameters need to be tuned in order to obtain results reported in the paper.
    parser = argparse.ArgumentParser(
        description=
        'PyTorch graph convolutional neural net for whole-graph classification'
    )
    parser.add_argument('--dataset', type=str, default="MUTAG",
                        help='name of dataset (default: MUTAG)')
    parser.add_argument('--device', type=int, default=0,
                        help='which gpu to use if any (default: 0)')
    parser.add_argument('--batch_size', type=int, default=32,
                        help='input batch size for training (default: 32)')
    parser.add_argument(
        '--iters_per_epoch', type=int, default=50,
        help='number of iterations per each epoch (default: 50)')
    parser.add_argument('--epochs', type=int, default=350,
                        help='number of epochs to train (default: 350)')
    parser.add_argument('--lr', type=float, default=0.01,
                        help='learning rate (default: 0.01)')
    parser.add_argument(
        '--seed', type=int, default=0,
        help='random seed for splitting the dataset into 10 (default: 0)')
    parser.add_argument(
        '--fold_idx', type=int, default=0,
        help='the index of fold in 10-fold validation. Should be less then 10.'
    )
    parser.add_argument(
        '--num_layers', type=int, default=5,
        help='number of layers INCLUDING the input one (default: 5)')
    parser.add_argument(
        '--num_mlp_layers', type=int, default=2,
        help=
        'number of layers for MLP EXCLUDING the input one (default: 2). 1 means linear model.'
    )
    parser.add_argument('--hidden_dim', type=int, default=64,
                        help='number of hidden units (default: 64)')
    parser.add_argument('--final_dropout', type=float, default=0.5,
                        help='final layer dropout (default: 0.5)')
    parser.add_argument(
        '--graph_pooling_type', type=str, default="sum",
        choices=["sum", "average"],
        help='Pooling for over nodes in a graph: sum or average')
    parser.add_argument(
        '--neighbor_pooling_type', type=str, default="sum",
        choices=["sum", "average", "max"],
        help='Pooling for over neighboring nodes: sum, average or max')
    parser.add_argument(
        '--learn_eps', action="store_true",
        help=
        'Whether to learn the epsilon weighting for the center nodes. Does not affect training accuracy though.'
    )
    parser.add_argument(
        '--degree_as_tag', action="store_true",
        help=
        'let the input node features be the degree of nodes (heuristics for unlabeled graph)'
    )
    parser.add_argument('--filename', type=str, default="",
                        help='output file')
    args = parser.parse_args()

    #set up seeds and gpu device
    torch.manual_seed(0)
    np.random.seed(0)
    device = torch.device(
        "cuda:" +
        str(args.device)) if torch.cuda.is_available() else torch.device("cpu")
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(0)

    graphs, num_classes = load_data(args.dataset, args.degree_as_tag)

    ##10-fold cross validation. Conduct an experiment on the fold specified by args.fold_idx.
    train_graphs, test_graphs = separate_data(graphs, args.seed, args.fold_idx)

    # Input feature width comes from the first training graph's node features.
    model = GraphCNN(args.num_layers, args.num_mlp_layers,
                     train_graphs[0].node_features.shape[1], args.hidden_dim,
                     num_classes, args.final_dropout, args.learn_eps,
                     args.graph_pooling_type, args.neighbor_pooling_type,
                     device).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    # Halve the learning rate every 50 epochs.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=50, gamma=0.5)

    for epoch in range(1, args.epochs + 1):
        # NOTE(review): scheduler.step() before the epoch's optimizer steps —
        # PyTorch >= 1.1 expects it after; kept as-is to preserve the
        # original LR schedule.
        scheduler.step()

        avg_loss = train(args, model, device, train_graphs, optimizer, epoch)
        acc_train, acc_test = test(args, model, device, train_graphs,
                                   test_graphs, epoch)

        if not args.filename == "":
            # NOTE(review): mode 'w' truncates the file every epoch, so only
            # the most recent epoch's line survives — confirm if 'a' was meant.
            with open(args.filename, 'w') as f:
                f.write("%f %f %f" % (avg_loss, acc_train, acc_test))
                f.write("\n")
        print("")

    extract_features(model, graphs)
if min_path is None: draw_document_graph(document_item, "error.png") draw_document_graph(document_item, "error2.png") print("Document graph is dumped in error.png") print("Entities are: {}".format(document_item["entities"])) raise Exception("No shortest path between entity {} and {}".format(i_ent1, i_ent2)) min_paths[i_ent1, i_ent2] = min_path return PatternPairwiseShortestPath(doc_graph, len(document_item["entities"]), min_paths) if __name__=="__main__": from util import load_data items, indmap, _observed_tuples, arities = load_data("wikismall.data.json", tuple_type="ent_index") for key in items["train"]: sample_doc = items["train"][key]["docs"][0] tokens = [] for sent in sample_doc["sentences"]: for node in sent["nodes"]: tokens.append(node["label"]) raw_sent = " ".join(tokens) ents = [ent for ent in sample_doc["entities"]] print(ents) print(raw_sent)
description="Make figure for divergence and diversity") parser.add_argument('--redo', action='store_true', help='recalculate data') params = parser.parse_args() username = os.path.split(os.getenv('HOME'))[-1] foldername = get_figure_folder(username, 'first') fn_data = foldername + 'data/' fn2_data = fn_data + 'divdiv_correlation.pickle' fn_data = fn_data + 'syn_nonsyn_divergence.pickle' if not os.path.isfile(fn_data) or params.redo: patients = ['p1', 'p2', 'p3', 'p5', 'p6', 'p8', 'p9', 'p10', 'p11'] regions = { 'structural': ['gag'], #['p17', 'p24'], 'enzymes': ['pol'], #['PR', 'RT', 'p15', 'IN'], 'accessory': ['vif', 'nef', 'vpr', 'vpu', 'tat', 'rev'], 'envelope': ['env'] #['gp41', 'gp120'], } # NOTE: these two give the same result, good data = collect_data_fabio(patients, regions) #data = collect_data_richard(patients, regions) store_data(data, fn_data) else: print("Loading data from file") data = load_data(fn_data) # this load additional data produced by script divergence_diversity_correlation data['divdiv_corr'] = load_data(fn2_data) plot_divdiv(data, fig_filename=foldername + 'divdiv')
# -*- coding:utf-8 -*- #test import tensorflow as tf import numpy as np import model import util import trainer # define log file descriptor log_file = open("log/log.txt", 'w') # load dataset data, label = util.load_data(model.DATA_PATH, model.LABEL_PATH) print(data.shape) print(label.shape) weight, bias = model.set_weights() param = { "model": model.model, "weight": weight, "bias": bias, "train_epoch": model.TRAIN_EPOCH, "learning_rate": model.LEARNING_RATE, "decay_rate": model.DECAY_RATE, "fold": model.FOLD, "train_batch_size": model.TRAIN_BATCH_SIZE, "valid_batch_size": model.VALID_BATCH_SIZE, "display_step": model.DISPLAY_STEP, "log_file": log_file }
def main():
    """Layer-2 stacker over layer-1 model predictions.

    Command-line driven:
      --dataset  which layer-1 model list to stack (all_models / model_bench /
                 de_corred_models),
      --mode     'CV' = per-label hyper-parameter tuning via grid search,
                 'OOF' = out-of-fold meta-feature generation + test predictions,
      --model    stacking learner: 'lgb', 'lr' or 'xgb'.

    Reads layer-1 OOF/test probability CSVs from ../models/<model>/, adds a few
    ad-hoc text meta-features, and writes predictions to CSV.
    """
    parser = argparse.ArgumentParser(description='training nn and make predictions')
    parser.add_argument('--dataset', type = str, default = "all_models", help = 'dataset to use for layer2 stacking')
    parser.add_argument('--mode', type = str, default = "OOF", help = 'do cv tuning or oof generation')
    parser.add_argument('--model', type = str, default = "xgb", help = 'what model to use for stacking')
    parser.add_argument('--save_flag', type = str, default = "0", help = 'versioning flag')
    parser.add_argument('--save_prediction', type = str2bool, default = "True", help = 'save prediciton or not')
    args = parser.parse_args()
    print(args)

    # pick the layer-1 model list; falls back to all_models for unknown values
    models = all_models
    if args.dataset == "all_models":
        models = all_models
    elif args.dataset == 'model_bench':
        models = model_bench
    elif args.dataset == 'de_corred_models':
        models = de_corred_models

    train, test, y, y_label_dist = load_data(processed = True)
    sub = pd.read_csv("../input/sample_submission.csv")

    # some ad hoc features
    # NOTE(review): the patterns passed to str.count() are regexes, so the
    # backslash escapes are intentional ('\.' counts periods, not "comas").
    train['comment_text'].fillna("__UNKNOWN__", inplace = True)
    test['comment_text'].fillna("__UNKNOWN__", inplace = True)
    train['num_words'] = train.comment_text.str.count('\S+')
    test['num_words'] = test.comment_text.str.count('\S+')
    train['num_comas'] = train.comment_text.str.count('\.')
    test['num_comas'] = test.comment_text.str.count('\.')
    train['num_bangs'] = train.comment_text.str.count('\!')
    test['num_bangs'] = test.comment_text.str.count('\!')
    train['num_quotas'] = train.comment_text.str.count('\"')
    test['num_quotas'] = test.comment_text.str.count('\"')
    train['avg_word'] = train.comment_text.str.len() / (1 + train.num_words)
    test['avg_word'] = test.comment_text.str.len() / (1 + test.num_words)
    # VADER compound sentiment per comment (progress_map: tqdm-wrapped map)
    sent_analyzer = SentimentIntensityAnalyzer()
    train['sentiments'] = train.comment_text.progress_map(lambda text: sent_analyzer.polarity_scores(text)['compound'])
    test['sentiments'] = test.comment_text.progress_map(lambda text: sent_analyzer.polarity_scores(text)['compound'])
    META_FEATURES = [
        'num_words', 'num_comas',
        'num_bangs', 'num_quotas',
        'avg_word', 'sentiments']

    # read in oof predictions from layer1
    train_features = pd.concat([
        pd.read_csv(inp)[LABELS] for inp in
        ["../models/{}/train_meta_probs_round_0.csv".format(model) for model in models]]
        , axis = 1)
    train_features.columns = ['_'.join([label, str(i + 1)]) for i in range(len(models)) for label in LABELS]
    train_features = pd.concat([train_features, train[META_FEATURES]], axis = 1)

    # read in avg test predicitons from layer 1
    test_features = pd.concat([
        pd.read_csv(inp)[LABELS] for inp in
        ["../models/{}/test_probs_5_bag_arith_mean_round_0.csv".format(model) for model in models]]
        , axis = 1)
    test_features.columns = ['_'.join([label, str(i + 1)]) for i in range(len(models)) for label in LABELS]
    test_features = pd.concat([test_features, test[META_FEATURES]], axis = 1)

    # === cv splits and place holders
    # I am reusing the same split from layer 1
    splitter = StratifiedKFold(n_splits = 5, shuffle = True, random_state = CV_SPLIT_SEED)
    folds = list(splitter.split(train_features, y_label_dist))

    if args.mode == 'CV':
        if args.model == 'lgb':
            lgb_params = {}
            aucs = []
            # per label cv tuning
            # Greedy coordinate-descent tuning: each param pair is grid-searched
            # in turn, and the refit best estimator seeds the next search.
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                current_param_set = {}
                lgb_model = lgb.LGBMClassifier(objective = 'binary', n_jobs = 8, class_weight = 'balanced')
                for param_pairs in [
                    {'learning_rate': [0.02, 0.03, 0.05, 0.06, 0.07],
                     'n_estimators': [100, 120, 140, 160, 180]},
                    {'num_leaves': [15, 18, 24, 27, 30],
                     'min_child_samples': [30, 40, 60, 80, 90]},
                    {'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
                     'colsample_bytree': [0.3, 0.4, 0.5, 0.6, 0.7]},
                    {'reg_alpha': [0, 0.1, 0.2, 0.3, 0.5],
                     'reg_lambda': [0.2, 0.3, 0.5, 0.7, 0.9]}
                ]:
                    grid_search = GridSearchCV(
                        estimator = lgb_model
                        , param_grid = param_pairs
                        , scoring = 'roc_auc'
                        , n_jobs = 1
                        , cv = folds
                        , refit = True
                        , verbose = 1
                        , return_train_score = True
                    )
                    results = grid_search.fit(train_features, y[:, idx])
                    current_param_set.update(results.best_params_)
                    lgb_model = results.best_estimator_
                    print(results.best_score_)
                    print(lgb_model)
                sub[label] = lgb_model.predict_proba(test_features)[:,1]
                print(results.best_score_)
                print(current_param_set)
                aucs.append(results.best_score_)
                lgb_params[label] = current_param_set
            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("lgb_stacker_ver{}.csv".format(args.save_flag), index = False)

        if args.model == 'lr':
            log_reg_params = {}
            aucs = []
            # per label cv tuning
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                log_reg = LogisticRegression(fit_intercept = True, penalty = 'l2', class_weight = 'balanced')
                param_grid = {
                    'C': [0.001, 0.05, 0.1, 1, 2, 10],
                    'tol': [0.01],
                    'solver': ['lbfgs', 'newton-cg']
                }
                grid_search = GridSearchCV(
                    estimator = log_reg
                    , param_grid = param_grid
                    , scoring = 'roc_auc'
                    , n_jobs = 8
                    , cv = folds
                    , refit = True
                    , verbose = 1
                    , return_train_score = True
                )
                results = grid_search.fit(train_features, y[:, idx])
                log_reg = results.best_estimator_
                print(results.best_score_)
                print(results.best_params_)
                sub[label] = log_reg.predict_proba(test_features)[:,1]
                aucs.append(results.best_score_)
                log_reg_params[label] = results.best_params_
            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("log_reg_stacker_ver{}.csv".format(args.save_flag), index = False)

        if args.model == 'xgb':
            xgb_params = {}
            aucs = []
            test_probs = []
            # per label cv tuning
            # Same greedy scheme as the lgb branch, one parameter (pair) at a time.
            for idx, label in enumerate(LABELS):
                print("idx {} label {} started cv at time {}".format(idx, label, datetime.now()))
                current_param_set = {}
                xgb_model = xgb.XGBClassifier(objective = 'binary:logistic', n_jobs = 8, class_weight = 'balanced')
                for param_pairs in [
                    {'learning_rate': [0.04, 0.05, 0.06]},
                    {'n_estimators': [120, 140, 150, 160]},
                    {'max_depth': [2,3,4]},
                    {'min_child_weight': [1,3,5]},
                    {'subsample': [0.8, 1]},
                    {'colsample_bytree': [0.8, 1]},
                    {'reg_alpha': [0, 0.1]},
                    {'reg_lambda': [0.9, 1]}
                ]:
                    # print(np.mean(
                    #     cross_val_score(
                    #         xgb_model,
                    #         train_features,
                    #         y[:, idx],
                    #         cv = folds,
                    #         scoring = 'roc_auc',
                    #         verbose = 2
                    #     )))
                    grid_search = GridSearchCV(
                        estimator = xgb_model
                        , param_grid = param_pairs
                        , scoring = 'roc_auc'
                        , n_jobs = 1
                        , cv = folds
                        , refit = True
                        , verbose = 2
                        , return_train_score = True
                    )
                    results = grid_search.fit(train_features, y[:, idx])
                    current_param_set.update(results.best_params_)
                    xgb_model = results.best_estimator_
                    print(results.best_score_)
                    print(xgb_model)
                sub[label] = xgb_model.predict_proba(test_features)[:,1]
                print(results.best_score_)
                print(current_param_set)
                aucs.append(results.best_score_)
                xgb_params[label] = current_param_set
            print(np.mean(aucs))
            if args.save_prediction:
                sub.to_csv("xgb_stacker_ver{}.csv".format(args.save_flag), index = False)

    if args.mode == 'OOF':
        if args.model == 'lgb':
            # per-label tuned params for this dataset (module-level dict)
            model_params = lgbm_params[args.dataset]
            train_metas = np.zeros(y.shape)
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}
            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]
                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))
                for idx, label in enumerate(LABELS):
                    print("fitting lightgbm for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = lgb.LGBMClassifier(
                        objective = 'binary',
                        n_jobs = 8,
                        class_weight = 'balanced',
                        learning_rate = model_params[label]['learning_rate'],
                        num_leaves = model_params[label]['num_leaves'],
                        n_estimators = model_params[label]['n_estimators'],
                        min_child_samples = model_params[label]['min_child_samples'],
                        subsample = model_params[label]['subsample'],
                        colsample_bytree = model_params[label]['colsample_bytree'],
                        reg_alpha = model_params[label]['reg_alpha'],
                        reg_lambda = model_params[label]['reg_lambda']
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])
                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)
                # OOF slots for this fold + per-fold test predictions
                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)
                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)
            aaa = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                aaa.append(np.mean(aucs_per_label[label]))
            print(np.mean(aaa))
            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
            ))
            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except:
                # NOTE(review): bare except intentionally treats "already exists"
                # as best-effort; it also hides real failures.
                print("path exists or failed to create")
            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)
            # arithmetic mean of the per-fold test predictions
            # NOTE(review): hard-codes 5 folds (matches n_splits above).
            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)

        if args.model == 'lr':
            # NOTE(review): unlike the lgb/xgb branches, this uses the global
            # lr_params rather than params keyed by args.dataset -- confirm.
            train_metas = np.zeros(y.shape)
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}
            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]
                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))
                for idx, label in enumerate(LABELS):
                    print("fitting logistic regression for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = LogisticRegression(
                        fit_intercept = True,
                        penalty = 'l2',
                        class_weight = 'balanced',
                        C = lr_params[label]['C'],
                        tol = lr_params[label]['tol'],
                        solver = lr_params[label]['solver'],
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])
                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)
                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)
                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)
            aaa = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                aaa.append(np.mean(aucs_per_label[label]))
            print(np.mean(aaa))
            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
            ))
            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except:
                print("path exists or failed to create")
            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)
            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)

        if args.model == 'xgb':
            # xgboost predicts from a plain ndarray here
            test_features = test_features.values
            model_params = xgbm_params[args.dataset]
            train_metas = np.zeros(y.shape)
            aucs = []
            losses = []
            test_probs = []
            classifiers = {}
            aucs_per_label = {}
            for fold_num, [train_indices, valid_indices] in enumerate(folds):
                print("=== fitting fold {} datetime {} ===".format(fold_num, datetime.now()))
                x_train, x_valid = train_features.values[train_indices,:], train_features.values[valid_indices,:]
                y_train, y_valid = y[train_indices], y[valid_indices]
                valid_preds = np.zeros(y_valid.shape)
                test_preds = np.zeros((test_features.shape[0], len(LABELS)))
                for idx, label in enumerate(LABELS):
                    print("fitting xgboost for label {} at time {}".format(label, datetime.now()))
                    classifier = "fold_{}_{}".format(fold_num, label)
                    classifiers[classifier] = xgb.XGBClassifier(
                        objective = 'binary:logistic',
                        n_jobs = 8,
                        class_weight = 'balanced',
                        learning_rate = model_params[label]['learning_rate'],
                        n_estimators = model_params[label]['n_estimators'],
                        max_depth = model_params[label]['max_depth'],
                        min_child_weight = model_params[label]['min_child_weight'],
                        subsample = model_params[label]['subsample'],
                        colsample_bytree = model_params[label]['colsample_bytree'],
                        reg_alpha = model_params[label]['reg_alpha'],
                        reg_lambda = model_params[label]['reg_lambda']
                    )
                    classifiers[classifier].fit(x_train, y_train[:, idx])
                    valid_preds[:, idx] = classifiers[classifier].predict_proba(x_valid)[:, 1]
                    test_preds[:, idx] = classifiers[classifier].predict_proba(test_features)[:, 1]
                    auc_score = roc_auc_score(y_valid[:, idx], valid_preds[:, idx])
                    gc.collect()
                    if label not in aucs_per_label:
                        aucs_per_label[label] = [auc_score]
                    else:
                        aucs_per_label[label].append(auc_score)
                train_metas[valid_indices] = valid_preds
                test_probs.append(test_preds)
                auc_score = roc_auc_score(y_valid, valid_preds)
                log_loss_score = log_loss(y_valid, valid_preds)
                print("validation auc {} log loss {}".format(auc_score, log_loss_score))
                aucs.append(auc_score)
                losses.append(log_loss_score)
            aaa = []
            for label in aucs_per_label:
                print(np.mean(aucs_per_label[label]))
                aaa.append(np.mean(aucs_per_label[label]))
            print(np.mean(aaa))
            print("mean auc score: {} - std {} , mean log loss score: {} - std {}".format(
                np.mean(aucs), np.std(aucs), np.mean(losses), np.std(losses)
            ))
            out_dir = '../models/layer2/{}-{}-{}'.format(args.model, args.dataset, args.save_flag)
            try:
                os.mkdir(out_dir)
            except:
                print("path exists or failed to create")
            pd.DataFrame(train_metas, columns = LABELS).to_csv(out_dir + "/train_meta_probs_round_0.csv", index = False)
            sub[LABELS] = np.zeros(sub[LABELS].shape)
            for i in range(5):
                sub[LABELS] += test_probs[i]
            sub[LABELS] /= 5
            sub.to_csv(out_dir + "/test_probs_5_bag_arith_mean_round_0.csv", index = False)
def test_init(self):
    """Test the initialization path: loading the training data must
    succeed and yield a non-empty dataset (any load exception fails the test).
    """
    # The original docstring was placed after this print, making it a dead
    # string statement; it is now the real method docstring (translated).
    print("test_init")
    data_x = load_data(train_file)
    # assertTrue is the idiomatic form of assertEqual(cond, True)
    self.assertTrue(len(data_x) > 0)
def main():
    """CLI entry point for the mesh-upsampling trainer.

    Parses command-line flags, loads coarse/fine training meshes plus the
    prediction inputs from disk, and hands everything to ``train_model``.
    Usage: python upsample_train.py -c -f -logdir -x -o -l -e -restore -init_w -lr_decay
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-train', action='store_true', default=True, help="train flag")
    # parser.add_argument('-eval', action='store_true', default=False, help="evaluate flag")
    # parser.add_argument('-pred', action='store_true', default=True, help="predict flag")
    # parser.add_argument('-w', action='store_true', default=False, help="load weights flag")
    parser.add_argument('-c', help="training: coarse dir")
    parser.add_argument('-f', help="training: fine scale with track dir")
    parser.add_argument('-logdir', help="logdir")
    # parser.add_argument('-tc', help="test dataset: coarse dir")
    # parser.add_argument('-tf', help="test dataset: fine scale with track dir")
    parser.add_argument('-x', help="predict input dataset dir")
    parser.add_argument('-o', help="predict output dir")
    parser.add_argument('-l', help="learning rate")
    parser.add_argument('-e', help="epochs")
    # parser.add_argument('-p', help="png file name")
    # parser.add_argument("-resume", help="bool flag, False by default")
    # parser.add_argument("-modelh5", help="load exist model")
    # parser.add_argument("-modelweighth5", help="load model weights")
    # parser.add_argument('-m', help="M")
    # parser.add_argument('-n', help="N")
    parser.add_argument('-restore', action='store_true', default=False, help="restore trained model")
    parser.add_argument('-init_w', action='store_true', default=False, help="init the weight from upsample.txt")
    parser.add_argument('-lr_decay', help="learning rate decay rate")
    # FLAGS = parser.parse_args()
    # args = parser.parse_args()
    args, unknown = parser.parse_known_args()
    if len(sys.argv) < 3:
        print("Usage: python upsample_train.py -c -f -logdir -x -o -l -e -restore -init_w -lr_decay")
        return
    # if args.m and args.n is not None:
    #     m = int(args.m)
    #     n = int(args.n)
    #     print("m and n for prediction: ", m, n)
    # else:
    #     m = 700
    #     n = 700
    #     print("No parameters m and n for prediction, use: ", m, n)
    restore = False
    init_w = False
    lr_decay_rate = 0
    if args.restore:
        restore = True
    if args.init_w:
        init_w = True
    if args.lr_decay:
        # FIX: argparse returns a string; convert like the '-l' flag above so
        # the decay rate enters train_model as a number (default stays 0).
        lr_decay_rate = float(args.lr_decay)
    if args.train:
        x_train = np.empty(0)
        y_train = np.empty(0)
        x_test = np.empty(0)
        y_test = np.empty(0)
        learning_rate = float(args.l)
        epochs = int(args.e)
        coarseDir = args.c
        fineDir = args.f
        logdir = args.logdir
        if not os.path.exists(logdir):
            os.makedirs(logdir)
        # rest pose: first non-hidden subdirectory's frame 00001_00.obj
        sdir = coarseDir
        rest_file = sdir + [f for f in os.listdir(sdir) if not f.startswith('.')][0] + "/00001_00.obj"
        dim, mtx, mtx_1 = preprocess.meshmtx_wnb(rest_file)
        rest_pos = util.load_pos(rest_file)
        print("training dataset: ")
        print(">>> " + str(coarseDir) + " >>> " + str(fineDir))
        # FIX: time.clock() was removed in Python 3.8; perf_counter() is the
        # documented replacement for wall-clock interval timing.
        t0 = time.perf_counter()
        print(">>>>>>> loading data for training >>>>>>> ")
        for dirName, subdirList, fileList in os.walk(coarseDir):
            total = len(subdirList)
            count = 0
            for subdir in subdirList:
                # print('Found directory: %s' % subdir)
                if count%40 == 0:
                    print(str(float(count)/total*100) + '%')
                count = count + 1
                x, y = util.load_data(coarseDir + subdir, fineDir + subdir, rest_pos)
                if x_train.size == 0:
                    x_train = x
                    y_train = y
                else:
                    x_train = np.vstack((x_train, x))
                    y_train = np.vstack((y_train, y))
        print(time.perf_counter() - t0, "seconds loading training data.")
        if x_train.size == 0:
            print("Error: no input training data.")
            return 0
        # load data for prediction
        x_pred = np.empty(0)
        x_coarse = np.empty(0)
        outDir = "pred/"
        # if args.pred:
        inDir = args.x
        outDir = args.o
        print(">>>>>>> loading data for prediction >>>>>>>> ")
        t1 = time.perf_counter()
        for dirName, subdirList, fileList in os.walk(inDir):
            total = len(subdirList)
            for subdir in subdirList:
                # print('Found directory: %s' % subdir)
                x_p, x_c = util.load_input_only(inDir + subdir, rest_file)
                if x_pred.size == 0:
                    x_pred = x_p
                else:
                    x_pred = np.vstack((x_pred, x_p))
                if x_coarse.size == 0:
                    x_coarse = x_c
                else:
                    x_coarse = np.vstack((x_coarse, x_c))
        print (time.perf_counter() - t1, "seconds loading test data.")
        # batch_size = x_pred.shape[0]
        # for learning_rate in [1E-1, 1E-2]:
        #     print('Starting run for learning_rate %f' % learning_rate)
        #     train_model(x_train, y_train, dim, mtx, mtx_1, epochs, learning_rate, logdir)
        train_model(x_train, y_train, x_pred, x_coarse, rest_file, mtx, mtx_1, epochs,
                    learning_rate, logdir, outDir, init_w, lr_decay_rate, restore)
    # report test-set SMAPE, optionally plot, then hand the results back
    print("test SMAPE", SMAPE)
    if plot_flag:
        util.plot(trainPred, trainY, testPred, testY)
    return trainPred, testPred, MAE, MRSE, SMAPE


if __name__ == "__main__":
    # forecasting hyper-parameters
    lag = 40            # number of past observations fed to the MLP
    batch_size = 32
    epoch = 20
    hidden_dim = 64
    lr = 1e-4
    # alternative datasets, kept for quick switching:
    # ts, data = util.load_data("./data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("./data/TAS2016.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/traffic_data_in_bits.csv", columnName="value")
    # ts, data = util.load_data("./data/beijing_pm25.csv", columnName="pm2.5")
    ts, data = util.load_data("./data/pollution.csv", columnName="Ozone")
    trainPred, testPred, mae, mrse, smape = MLP_forecasting(
        data, inputDim=lag, hiddenNum=hidden_dim, lr=lr,
        epoch=epoch, batchSize=batch_size, plot_flag=True)
    return trainPred, testPred, MAE, MRSE, SMAPE


if __name__ == "__main__":
    # forecasting hyper-parameters
    lag = 24            # number of past observations fed to the MLP
    batch_size = 32
    epoch = 20
    hidden_dim = 64
    lr = 1e-4
    freq = 4            # seasonal frequency used by the decomposition
    # alternative datasets, kept for quick switching:
    # ts, data = util.load_data("./data/NSW2013.csv", columnName="TOTALDEMAND")
    # ts, data = util.load_data("./data/bike_hour.csv", columnName="cnt")
    # ts, data = util.load_data("./data/TAS2016.csv", columnName="TOTALDEMAND")
    ts, data = util.load_data("./data/traffic_data_in_bits.csv", columnName="value")
    # ts, data = util.load_data("./data/beijing_pm25.csv", columnName="pm2.5")
    # ts, data = util.load_data("./data/pollution.csv", columnName="Ozone")
    trainPred, testPred, mae, mrse, smape = decompose_MLP_forecasting(
        ts, data, lag=lag, freq=freq, epoch=epoch,
        hidden_num=hidden_dim, lr=lr, batch_size=batch_size)
import util ##mass = "m0.001524" mass = "m0.0677" # filename with the correlators ##fff = "Pseuodoscalar_0.202_chargeAV_outcorr.gpl" fff = "gpl/" + mass + "_Rhox_0.202_chargeAV_outcorr.gpl" nt = 48 no_config = 101 ## ## new data ## corr = util.load_data(fff, nt, no_config, corr_tag) ##corr *= -1 corr /= nrm print("Normalizatio factor ", nrm, " applied") print("Computing jackknife correlators") tt, corr_mean, corr_err = util.calc_corr(corr, nt, no_config, 0.0) ## ## Dan's data ## nconfig_dan = 381 corr_dan = util.load_data("../docs/rho_vcphys_bothcharges_m0.001524.gpl", nt, nconfig_dan, "charged-up") tt_dan, corr_dan_mean, corr_dan_err = util.calc_corr(corr_dan, nt, nconfig_dan,