def main(argv=sys.argv):
    if len(argv) < 2:
        usage(argv)
    config_uri = argv[1]
    options = parse_vars(argv[2:])
    setup_logging(config_uri)
    settings = get_appsettings(config_uri, options=options)
    engine = engine_from_config(settings, 'sqlalchemy.')
    DBSession.configure(bind=engine)
    Base.metadata.drop_all(engine)
    Base.metadata.create_all(engine)
    load_data('genproc/scripts/data/genproc_checkplan2014_data.csv')
def dbscan_demo():
    print 'starting up'
    df = load_data()
    print 'load done'
    df = extract_features(df)
    print 'features done'
    dbscan(df, 0.2, 10)
def write_svm_data(num_features, num_blocks):
    svm_train_file = open("svm_train_file.dat", "w")
    svm_test_file = open("svm_test_file.dat", "w")
    data = load_data.load_data()
    # data.genes_to_rank = 3
    train_samples = 40
    features = dimension_reduction.choose_features(data, num_blocks, num_features)
    for s in range(0, data.samples):
        for g in range(0, data.genes_to_rank):
            line = ""
            line += str(int(data.ranking[g, s]))
            line += " qid:"
            line += str(int(s))
            for f in range(0, num_features):
                line += " "
                feature_index = features[g, f]
                line += str(int(feature_index))
                line += ":"
                if feature_index >= data.expression_genes:
                    copynumber_index = feature_index - data.expression_genes
                    line += str(data.copynumber[copynumber_index, s])
                elif feature_index < data.expression_genes:
                    line += str(data.expression[feature_index, s])
            line += "\n"
            if s <= train_samples:
                svm_train_file.write(line)
            else:
                svm_test_file.write(line)
    svm_train_file.close()
    svm_test_file.close()
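# For reference, each record written by write_svm_data above follows the
# SVMlight/SVM-rank line format "<target> qid:<query> <feature_index>:<value> ...".
# A minimal illustrative record (made-up values, not taken from the real dataset):
example_record = "2 qid:5 13:0.8123 442:-0.2210"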
def start(self):
    data = load_data('formatted_veltman_pbp_small.pkl', False)
    self.train_set_x, self.train_set_y = data[0]
    self.test_set_x, self.test_set_y = data[1]

    # Opening prompt
    print('\nTry your luck as an NFL coach! Guess the play call based on each '
          '(admittedly simple) game situation.')
    inpt = raw_input('Type \'q\' at any time to stop. '
                     'Press enter to begin...\n')
    n_correct = 0
    n_incorrect = 0

    # Game loop
    if inpt != 'q':
        response = ''
        while response != 'q':
            response, answer = self.ask_question()
            if response == 'q':
                self.end_game(n_correct, n_incorrect)
                continue
            response = int(response) - 1
            if response == answer:
                print('Good call, coach!\n')
                n_correct += 1
            else:
                action = self.format_action(answer)
                print('Whoops, that\'s not what your NFL counterpart decided.'
                      ' He {0}.\n'.format(action))
                n_incorrect += 1
    else:
        self.end_game(n_correct, n_incorrect)
def init(m, seed):
    if m == -1:
        m = None
    gc, mt, track = load_data(m, seed)
    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))
    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i + labels.size]
    labels = np.concatenate(((labels + '+'), (labels + '-')))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts, name='noise')
    noise.freeze_distributions()
    return sequences, labels, tied, noise
def stn_eval(model_file):
    print("model: %s" % (model_file))
    data = load.load_data(mnist_cluttered, DIM)
    values = pickle.load(open(model_file, 'r'))
    network_model, l_transform = model(DIM, DIM, NUM_CLASSES)
    lasagne.layers.set_all_param_values(network_model, values)

    X = T.tensor4()
    y = T.ivector()
    output_eval, transform_eval = lasagne.layers.get_output(
        [network_model, l_transform], X, deterministic=True)
    # create function
    eval = theano.function([X], [output_eval, transform_eval])

    # evaluation function
    def eval_func(X, y):
        output_eval, transform_eval = eval(X)
        preds = np.argmax(output_eval, axis=-1)
        acc = np.mean(preds == y)
        return acc, transform_eval

    test_acc, test_transform = eval_func(data['X_test'], data['y_test'])
    transpose_visualization(data, test_transform)
    print("test acc: %f" % (test_acc))
def run():
    config_dict = yaml.load(open(sys.argv[1], 'r'))
    print config_dict
    data_location = config_dict['data_location']
    uniq_map_file = config_dict['uniq_map_file']
    runiq_map_file = config_dict['runiq_map_file']
    vertices_map, runiq_map = load_data(data_location)
    broken, unequal = fix_similarity_symmetry(vertices_map)
    print "* Fixed similarity relation symmetry (%d unidirected, %d unequal)" % (broken, unequal)
    print "* Vertices map generated"
    _, deleted = purge_invalid_vertices(vertices_map, runiq_map, uniq_map_file, runiq_map_file)
    print "* Cleaned up vertices map (deleted %d isolated vertices)" % (deleted)
    if 'min_elems' in config_dict:
        forest = Forest(vertices_map, min_graph_elems=config_dict['min_elems'])
    else:
        forest = Forest(vertices_map)
    ccs = forest.build_connected_components()
    print "* Built connected components"
    forest.build_forest(ccs)
    print "* Built graphs out of connected components"
    forest.reduce()
    print "* Forest reduced!"
    for graph in forest.elements:
        print graph.distance_matrix()
    print len(forest.elements)
    print forest.elements_size_hist()
    forest.pickle(config_dict['pickle_dir'])
def init():
    m = 1000  # restricts number of genes, used for local testing
    gc, mt, track = load_data(m)

    state_range = [5, 10, 25, 50, 100]
    z_range = [3, 5, 10, 20]

    msequences, mlabels = df_to_sequence_list(mt.data)
    gsequences, glabels = df_to_sequence_list(gc.data)
    sequences = np.concatenate((msequences, gsequences), 0)
    labels = np.concatenate((mlabels, glabels))
    sequences = np.concatenate((sequences, -1 * sequences))

    # tie positive and negative expression sequences
    tied = {}
    for i, label in enumerate(labels):
        tied[label] = [i, i + labels.size]
    state_labels = np.concatenate(((labels + '+'), (labels + '-')))
    labels = np.concatenate((labels, labels))

    # noise model trained on all data once
    # genes/metabolites will be assigned to noise model if other models
    # fail to model it better
    noise_dist = [NormalDistribution(0, 1)]
    noise_trans = np.array([[1]])
    starts = np.array([1])
    noise = HiddenMarkovModel.from_matrix(noise_trans, noise_dist, starts)
    noise.freeze_distributions()
    return gc, mt, sequences, labels, state_labels, tied, noise, z_range, \
        state_range
def sgd_predict(dataset=DataHome, batch_size=28):
    """
    Demonstrate prediction with a log-linear model trained by stochastic
    gradient descent. This is demonstrated on MNIST.

    :type dataset: string
    :param dataset: the path of the MNIST dataset file from
                    http://www.iro.umontreal.ca/~lisa/deep/data/mnist/mnist.pkl.gz

    :type batch_size: int
    :param batch_size: number of examples per prediction minibatch
    """
    logistic_regression_model_pkl = open(train_model_route, "r")
    logistic_regression_model_state = cPickle.load(logistic_regression_model_pkl)
    W, b = logistic_regression_model_state

    datasets = load_data.load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    n_test_batches = test_set_x.get_value(borrow=True).shape[0] / batch_size

    ######################
    # BUILD ACTUAL MODEL #
    ######################
    # print '... building the model'

    # allocate symbolic variables for the data
    index = T.lscalar()  # index to a [mini]batch
    x = T.matrix("x")    # the data is presented as rasterized images
    y = T.ivector("y")   # the labels are presented as a 1D vector of [int] labels

    # construct the logistic regression class
    # Each MNIST image has size 28*28
    classifier = LogisticRegression(input=x, n_in=28 * 28, n_out=10, W=W, b=b)

    # the cost we minimize during training is the negative log likelihood of
    # the model in symbolic format
    cost = classifier.negative_log_likelihood(y)

    # compiling a Theano function that computes the mistakes that are made by
    # the model on a minibatch
    test_results = theano.function(
        inputs=[index],
        outputs=classifier.y_pred,
        givens={x: test_set_x[index * batch_size: (index + 1) * batch_size]}
    )

    test_res = [test_results(i) for i in xrange(n_test_batches)]
    print test_res
def gmm_demo():
    print 'starting up'
    df = load_data()
    print 'load done'
    df = extract_features(df)
    print 'extract done'
    features_list = list(df.columns.values)[1:]
    print 'features done'
    gmm(df, features_list)
def factorize_and_save():
    """
    1. Loads the original data.
    2. Factorizes the resulting dataframe.
    3. Saves it as a CSV file.
    """
    data = ld.load_data()
    data = ld.factorize_data(data)
    del data["status_group"]
    data.to_csv(FACTORIZED_PATH)
def counts(config, cut='llh', bintype='logdist', weight=False, zcorrect=False):

    dataList = getDataList(config, bintype)
    bins = getEbins(reco=True)

    # Build histograms of desired information
    N, Err = {}, {}
    for cfg, date in dataList[:2]:
        d = load_data(cfg, date, bintype)
        eList = getComps(d)
        c0 = d['cuts'][cut]
        r = np.log10(d['ML_energy'])
        if zcorrect:
            r -= zfix(d['zenith'], bintype=bintype)

        # Total counts
        w = d['weights'][c0] if weight else None
        w2 = d['weights'][c0]**2 if weight else None
        counts = np.histogram(r[c0], bins=bins, weights=w)[0]
        errors = np.sqrt(np.histogram(r[c0], bins=bins, weights=w2)[0])
        try:
            N['All'] += counts
            Err['All'] += errors
        except KeyError:
            N['All'] = counts
            Err['All'] = errors

        # Counts by composition
        for e in eList:
            ecut = d['llh_comp'] == e
            c1 = c0 * ecut
            w = d['weights'][c1] if weight else None
            w2 = d['weights'][c1]**2 if weight else None
            counts = np.histogram(r[c1], bins=bins, weights=w)[0]
            errors = np.sqrt(np.histogram(r[c1], bins=bins, weights=w2)[0])
            try:
                N[e] += counts
                Err[e] += errors
            except KeyError:
                N[e] = counts
                Err[e] = errors

    fig, ax = plt.subplots()
    ax.set_xlabel(r'$\log_{10}(E/\mathrm{GeV})$')
    ax.set_ylabel('Counts')

    # Plot reconstructions
    for e in eList + ['All']:
        pnt = getColor(e) + '.'
        ax.errorbar(getMids(bins), N[e], yerr=Err[e], fmt=pnt, label=e)

    ax.set_yscale('log')
    ax.legend(loc='lower left')
    plt.show()
def draw_scatter(filename, start, end):
    datav = load_data(filename, 5)[start:end]
    dataj = load_data(filename, 6)[start:end]
    datac = load_data(filename, 4)[start:end]
    mp = [dataj[i] / datav[i] for i in range(len(datav))]
    label = []
    for i in range(len(datac)):
        if i == 0:
            label.append(0)
        else:
            if datac[i] > datac[i - 1]:
                label.append(1)
            else:
                label.append(0)
    datac = [i**1 for i in datac]
    mp = [i**1 for i in mp]
    plt.scatter(datac, mp, c=label)
    plt.show()
def fit_model(formula, model_file):
    """
    Fits a logit model and saves it to disk.

    :param formula: formula for the model
    :param model_file: name of file to save the model to
    """
    data = load_data()
    model = logit(formula=formula, data=data)
    fitted = model.fit()
    fitted.save(model_file)
def test_df_columns(self):
    """
    Test for output dataframe column count in load_data module.
    """
    df = load_data.load_data()
    cols = df.columns.tolist()
    num = len(cols)
    num_assert = len(['kingdom', 'phylum', 'class', 'order', 'family',
                      'genus', 'length', 'oxygen', 'replicate', 'week',
                      'abundance'])
    self.assertEqual(num, num_assert)
def _get_number_of_participants(self):
    """
    Returns the number of participants in the dataset found in the
    specified data directory.
    """
    sys.path.insert(0, self.args.data_dir)
    print os.getcwd()
    print sys.path
    from load_data import load_data
    dataset = load_data(self.args.data_dir)
    return len(dataset['data']['Y'])
def initialize_chair(self):
    self.trX, self.trY, self.teX, self.teY = load_data()
    self.trX = self.trX.reshape(-1, 1, 48, 64)
    self.teX = self.teX.reshape(-1, 1, 48, 64)
    self.w1 = self.init_weights((32, 1, 3, 3))
    self.w2 = self.init_weights((64, 32, 3, 3))
    self.w3 = self.init_weights((128, 64, 3, 3))
    self.w4 = self.init_weights((128 * 5 * 7, 625))
    self.wo = self.init_weights((625, 2))
def select_sample(oxygen, replicate):
    dataframe = load_data.load_data()
    if (oxygen == "Low") or (oxygen == 'low'):
        dataframe = dataframe[dataframe['oxygen'] == 'Low']
    if (oxygen == "High") or (oxygen == "high"):
        dataframe = dataframe[dataframe['oxygen'] == 'High']
    dataframe = dataframe[dataframe['replicate'] == int(replicate)]
    return dataframe
def __init__(self, fn, median=True):
    self.t, self.f, self.fe, self.truth = load_data(fn, median)
    self.ivar = 1.0 / self.fe ** 2
    self.central = transit.Central(q1=self.truth["q1"], q2=self.truth["q2"])
    self.system = transit.System(self.central)
    self.body = transit.Body(period=self.truth["period"], r=self.truth["r"],
                             b=self.truth["b"], t0=self.truth["t0"])
    self.system.add_body(self.body)
def classify_and_compare(data_path_1, data_path_2):
    data_1 = ld.load_data(data_path_1)
    data_2 = ld.load_data(data_path_2)

    y = data_1["status_group"].tolist()
    del data_1["status_group"]
    del data_1["date_recorded"]
    del data_2["status_group"]
    del data_2["date_recorded"]
    x_1 = data_1.as_matrix()
    x_2 = data_2.as_matrix()

    frac_test = 0.2
    len_test = int(frac_test * len(y))
    indices = np.random.choice(range(0, len(y)), len_test)
    test_set_1 = [x_1[i] for i in indices]
    train_set_1 = [x_1[i] for i in range(0, len(y)) if i not in indices]
    test_y = [y[i] for i in indices]
    train_y = [y[i] for i in range(0, len(y)) if i not in indices]
    test_set_2 = [x_2[i] for i in indices]
    train_set_2 = [x_2[i] for i in range(0, len(y)) if i not in indices]

    classifier_1 = ensm.RandomForestClassifier(n_estimators=100)
    classifier_2 = ensm.RandomForestClassifier(n_estimators=100)
    classifier_1.fit(train_set_1, train_y)
    classifier_2.fit(train_set_2, train_y)
    prediction_1 = classifier_1.predict(test_set_1)
    prediction_2 = classifier_2.predict(test_set_2)
    print "accuracy classifier 1 =", met.accuracy_score(test_y, prediction_1)
    print "accuracy classifier 2 =", met.accuracy_score(test_y, prediction_2)
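# Note: np.random.choice as used above samples with replacement, so the test
# index list can contain duplicates and the effective test-set size can drift.
# A hedged one-line alternative (not the original behaviour) that keeps
# exactly len_test unique rows:
#
#     indices = np.random.choice(range(0, len(y)), len_test, replace=False)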
def main():
    """ Entry point for all code """
    print "starting up"
    df = load_data()
    df_vectorized = extract_features(df, column_list=FEATURES_TO_EXTRACT,
                                     fillna=True, debug=False)
    target_correlation = calculate_features_target_correlation(
        df_vectorized, df_vectorized.columns.tolist(), PREDICTION_TARGET, PCA_METHOD)
    pca = pca_bacteria(df_vectorized, PCA_COMPONENTS)
    return target_correlation, pca
def riseperiod(filename, start, end):
    data = load_data(filename, 4)[start:end]
    tmp = 0
    p = []
    for i in range(1, len(data)):
        if data[i] < data[i - 1]:
            tmp += 1
        else:
            p.append(tmp)
            tmp = 0
    print p
    p = filter(lambda x: x != 0, p)
    print p
    return np.mean(p), np.std(p)
def initialize_chair(self):
    self.trX, self.trY, self.teX, self.teY = load_data()
    self.trX = self.trX.reshape(-1, 1, self.w, self.h)
    self.teX = self.teX.reshape(-1, 1, self.w, self.h)
    w = math.ceil((float(self.w) + 4) / 2)
    w = ((w - 2) / 2 - 2) / 2
    h = math.ceil((float(self.h) + 4) / 2)
    h = ((h - 2) / 2 - 2) / 2
    self.w1 = self.init_weights((32, 1, 3, 3))
    self.w2 = self.init_weights((64, 32, 3, 3))
    self.w3 = self.init_weights((128, 64, 3, 3))
    self.w4 = self.init_weights((128 * w * h, 625))
    self.wo = self.init_weights((625, 10))
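# Worked check (a sketch, assuming the same 48x64 inputs as the fixed-size
# initialize_chair above): the generalized arithmetic reproduces the
# hard-coded 128 * 5 * 7 flatten size.
import math
w_check = ((math.ceil((float(48) + 4) / 2) - 2) / 2 - 2) / 2
h_check = ((math.ceil((float(64) + 4) / 2) - 2) / 2 - 2) / 2
assert (w_check, h_check) == (5, 7)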
def group_factorize_and_save():
    """
    1. Loads the original data.
    2. Finds the best splits of all categorical variables.
    3. Factorizes the resulting dataframe.
    4. Saves it as a CSV file.
    """
    data = ld.load_data()
    for var in VARS_TO_GROUP:
        print "\nfinding best split of variable \"", var, "\""
        data = cg.group_categories(data, var)
    # data = ld.factorize_data(data)
    del data["status_group"]
    data.to_csv(GROUPED_PATH_NAMED)
def load_dataset(self):
    if self.verbose:
        print 'loading data ... '
        start_time = time.time()

    self.xs_train, self.xs_test, self.ys_train, self.ys_test, self.categories = \
        load_data(self.data_dir, self.sample_size,
                  self.train_test_split_percentage, self.verbose)
    self.inv_categories = {v: k for k, v in self.categories.items()}

    num_val = len(self.xs_train) / 10
    self.xs_val = self.xs_train[-num_val:]
    self.ys_val = self.ys_train[-num_val:]
    self.xs_train = self.xs_train[:-num_val]
    self.ys_train = self.ys_train[:-num_val]

    if self.verbose:
        end_time = time.time()
        self.print_time(start_time, end_time, 'loading data')
def load_epo_data(data_cat, n_before=-3, n_len=100, subjects=None):
    # loading 'data_cat' data
    data, channels, markers = load_data(FS, folder_name, data_cat, subjects)

    # converting plain data to a continuous Data object
    cnt = convert_mushu_data(data, markers, FS, channels)

    # Define the markers belonging to class 1 and 2
    markers_definitions = None
    if data_cat == 'train':
        markers_definitions = {
            'class 1': (train_labels.query('Prediction == 0', engine='python')['IdFeedBack']).tolist(),
            'class 2': (train_labels.query('Prediction == 1', engine='python')['IdFeedBack']).tolist()
        }
    else:
        # marker classes don't matter for test data
        markers_definitions = {'class 1': [m[1] for m in markers], 'class 2': []}

    # segmenting the continuous Data object into epoched data:
    # epoch the data -25 ms (5 rows) and +500 ms (100 rows) around the markers
    # defined in markers_definitions
    return segment_dat(cnt, markers_definitions, [n_before * 5, (n_before + n_len) * 5])
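# Quick check of the epoching window above (assuming, as the comment states,
# 5 rows per 25 ms): the defaults n_before=-3, n_len=100 give a window of
# 15 rows before and 485 rows after each marker.
assert [-3 * 5, (-3 + 100) * 5] == [-15, 485]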
def distro(config, bintype='logdist', cut='llh', xaxis='energy', weight=False):

    # General setup
    labelDict = {'energy': r'$\log_{10}(E/\mathrm{GeV})$',
                 'zenith': r'$\cos(\theta)$',
                 'core': 'Distance from center (m)'}
    binDict = {'energy': getEbins(),
               'zenith': np.linspace(0.8, 1, 41),
               'core': np.linspace(0, 700, 71)}

    dataList = getDataList(config, bintype)
    bins = binDict[xaxis]
    xlabel = labelDict[xaxis]
    # fbins = fineBins(bins)

    # Build histograms of desired information
    for cfg, date in dataList[:1]:
        d = load_data(cfg, date, bintype)
        c0 = d['cuts'][cut]
        w = d['weights'][c0] if weight else None
        if xaxis == 'energy':
            y = np.log10(d['ML_energy'])
        if xaxis == 'zenith':
            y = np.cos(d['zenith'])
        if xaxis == 'core':
            y = np.sqrt(d['ML_x']**2 + d['ML_y']**2)
        counts = np.histogram(y[c0], bins=bins, weights=w)[0]
        try:
            h += counts
        except NameError:
            h = counts

    # Plot
    fig, ax = plt.subplots()
    x = getMids(bins)
    width = bins[1] - bins[0]
    ax.plot(x, h, ls='steps')
    ax.set_xlabel(xlabel)
    ax.set_ylabel('Counts')
    ax.set_yscale('log')
    plt.show()
def compute_feature_matrix(data_dir, functions, labels, save_file=None, verbose=False):
    """
    For each .mat EEG data file in data_dir, compute the features given by
    functions and labels and return a 2D array where each row contains the
    index of the hour the segment belongs to, the segment type
    ('preictal': 1, 'interictal': 0, 'test': -1), and its features.

    Save the resulting feature matrix if the save_file keyword is set.
    """
    X = np.zeros(len(labels) + 2)  # add 2 columns for hour and type
    data_files = []
    for f in os.listdir(data_dir):
        if f.split('.')[-1] == 'mat':
            data_files.append(f)
            if verbose:
                print f
            data = load_data.load_data(os.path.join(data_dir, f))
            new_features = compute_features(data, functions)
            if data['type'] == 'preictal':
                seg_type = 1
            elif data['type'] == 'interictal':
                seg_type = 0
            elif data['type'] == 'test':
                seg_type = -1
            else:
                seg_type = np.nan
            new_features = np.hstack(([data['hour'], seg_type], new_features))
            X = np.vstack((X, new_features))
    X = X[1:, :]

    if save_file is not None:
        columns = ['hour', 'type'] + labels
        np.savetxt(save_file, X, fmt='%.4e',
                   header='Data directory: ' + data_dir +
                          '\nColumns:\n  ' + '\n  '.join(columns))
        data_list_file = '.'.join(save_file.split('.')[:-1]) + '_data_files.txt'
        with open(data_list_file, 'w') as df:
            df.writelines('\n'.join(data_files))

    return (X, data_files)
def my_model():
    xtrain, ytrain, xtest, ytest, features = load_data()
    # ytrain = transform_to_log(ytrain)
    #
    # mosq_model = GradientBoostingRegressor(loss='ls', verbose=1, max_depth=7,
    #                                        n_estimators=20)
    # train_nmosq_model(mosq_model, xtrain, ytrain, do_grid_search=False)
    model = GradientBoostingClassifier(verbose=1, max_depth=3, n_estimators=100)
    train_has_wnv_model(model, xtrain, ytrain, do_grid_search=False,
                        feature_list=features)
    prepare_submission(model, xtrain, ytrain[:, 1], xtest, ytest,
                       feature_list=features)
    return
def __init__(self):
    config = get_config()
    self.data = load_data(config)
    print "%s data loaded..." % config["dataset"]

    nhidden_layers = len(config["hidden_sizes"])
    nhidden = config["hidden_sizes"][0]
    print "num_hidden_layers :", nhidden_layers
    print "hidden_units_per_layer :", nhidden

    X = T.fmatrix()
    Y = T.ivector()
    scaling_factors = T.fvector()

    num_input = config["num_input"]
    num_output = 10

    w_h, b_h = init_parameters(num_input, num_output, config["hidden_sizes"], scale=0.01)
    w_m, b_m = init_parameters(num_input, num_output, config["hidden_sizes"], scale=0.0)

    self.parameters = w_h + b_h
    self.momentum = w_m + b_m

    Layers = [X]
    py_x = model(X, w_h, b_h, Layers)
    y_x = T.argmax(py_x, axis=1)

    individual_cost = -1.0 * (T.log(py_x)[T.arange(Y.shape[0]), Y])
    cost = T.mean(individual_cost)
    scaled_individual_cost = scaling_factors * individual_cost
    scaled_cost = T.mean(scaled_individual_cost)

    updates = sgd(scaled_cost, self.parameters, self.momentum,
                  config["learning_rate"], config["momentum_rate"])
    squared_norm_var = compute_grad_norms(X, cost, Layers)
    accuracy = T.mean(T.eq(T.argmax(py_x, axis=1), Y))

    self.train = theano.function(
        inputs=[X, Y, scaling_factors],
        outputs=[cost, squared_norm_var, individual_cost, accuracy],
        updates=updates,
        allow_input_downcast=True)
    self.predict = theano.function(
        inputs=[X],
        outputs=[y_x, py_x],
        allow_input_downcast=True)
    self.get_attributes = theano.function(
        inputs=[X, Y],
        outputs=[cost, squared_norm_var, individual_cost, accuracy],
        allow_input_downcast=True)
def sgd_optimization_mnist(learning_rate=0.01, n_epochs=1000,
                           dataset='mnist.pkl.gz', batch_size=600,
                           optimizer='gd'):
    datasets = load_data(dataset)

    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    tmpl = [(28 * 28, 10), 10]
    flat, (Weights, bias) = climin.util.empty_with_views(tmpl)
    cli.initialize.randomize_normal(flat, 0, 1)

    if batch_size is None:
        args = itertools.repeat(([train_set_x, train_set_y], {}))
        n_train_batches = 1
    else:
        args = cli.util.iter_minibatches([train_set_x, train_set_y], batch_size, [0, 0])
        args = ((i, {}) for i in args)
        n_train_batches = train_set_x.shape[0] // batch_size

    print('... building the model')

    x = T.matrix('x')
    y = T.ivector('y')

    classifier = LogisticRegression(
        input=x,
        n_in=28 * 28,
        n_out=10,
        W=theano.shared(value=Weights, name='W', borrow=True),
        b=theano.shared(value=bias, name='b', borrow=True)
    )

    gradients = theano.function(
        inputs=[x, y],
        outputs=[
            T.grad(classifier.negative_log_likelihood(y), classifier.W),
            T.grad(classifier.negative_log_likelihood(y), classifier.b)
        ],
        allow_input_downcast=True
    )

    cost = theano.function(
        inputs=[x, y],
        outputs=classifier.negative_log_likelihood(y),
        allow_input_downcast=True
    )

    def loss(parameters, input, target):
        return cost(input, target)

    def d_loss_wrt_pars(parameters, inputs, targets):
        g_W, g_b = gradients(inputs, targets)
        return np.concatenate([g_W.flatten(), g_b])

    zero_one_loss = theano.function(
        inputs=[x, y],
        outputs=classifier.errors(y),
        allow_input_downcast=True
    )

    if optimizer == 'gd':
        print('... using gradient descent')
        opt = cli.GradientDescent(flat, d_loss_wrt_pars, step_rate=learning_rate,
                                  momentum=.95, args=args)
    elif optimizer == 'rmsprop':
        print('... using rmsprop')
        opt = cli.RmsProp(flat, d_loss_wrt_pars, step_rate=1e-4, decay=0.9, args=args)
    elif optimizer == 'rprop':
        print('... using resilient propagation')
        opt = cli.Rprop(flat, d_loss_wrt_pars, args=args)
    elif optimizer == 'adam':
        print('... using adaptive momentum estimation optimizer')
        opt = cli.Adam(flat, d_loss_wrt_pars, step_rate=0.0002, decay=0.99999999,
                       decay_mom1=0.1, decay_mom2=0.001, momentum=0,
                       offset=1e-08, args=args)
    elif optimizer == 'adadelta':
        print('... using adadelta')
        opt = cli.Adadelta(flat, d_loss_wrt_pars, step_rate=1, decay=0.9,
                           momentum=.95, offset=0.0001, args=args)
    else:
        print('unknown optimizer')
        return 1

    print('... training the model')

    # early stopping parameters
    if batch_size is None:
        patience = 250
    else:
        patience = 5000  # look at this many samples regardless
    patience_increase = 2  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is
                                   # considered significant
    validation_frequency = min(n_train_batches, patience // 2)

    best_validation_loss = np.inf
    test_loss = 0.
    valid_losses = []
    train_losses = []
    test_losses = []

    epoch = 0
    start_time = timeit.default_timer()
    for info in opt:
        iter = info['n_iter']
        epoch = iter // n_train_batches
        minibatch_index = iter % n_train_batches

        if iter % validation_frequency == 0:
            # compute zero-one loss on validation set
            validation_loss = zero_one_loss(valid_set_x, valid_set_y)
            valid_losses.append(validation_loss)
            train_losses.append(zero_one_loss(train_set_x, train_set_y))
            test_losses.append(zero_one_loss(test_set_x, test_set_y))

            print(
                'epoch %i, minibatch %i/%i, validation error %f %%, iter/patience %i/%i' % (
                    epoch,
                    minibatch_index + 1,
                    n_train_batches,
                    validation_loss * 100,
                    iter,
                    patience
                )
            )

            # if we got the best validation score until now
            if validation_loss < best_validation_loss:
                # improve patience if loss improvement is good enough
                if validation_loss < best_validation_loss * improvement_threshold:
                    patience = max(patience, iter * patience_increase)

                best_validation_loss = validation_loss

                # test it on the test set
                test_loss = zero_one_loss(test_set_x, test_set_y)
                print(
                    '    epoch %i, minibatch %i/%i, test error of best model %f %%' % (
                        epoch,
                        minibatch_index + 1,
                        n_train_batches,
                        test_loss * 100
                    )
                )

        if patience <= iter or epoch >= n_epochs:
            break

    end_time = timeit.default_timer()
    print(
        'Optimization complete with best validation score of %f %%, with test performance %f %%'
        % (best_validation_loss * 100., test_loss * 100.)
    )
    print('The code ran for %d epochs, with %f epochs/sec' %
          (epoch, 1. * epoch / (end_time - start_time)))
    print(('The code for file ' + os.path.split(__file__)[1] +
           ' ran for %.1fs' % (end_time - start_time)), file=sys.stderr)

    losses = (train_losses, valid_losses, test_losses)
    return classifier, losses
def base_train(data_tag, to_train=False, is_bin=False):
    print("Unpacking data...")
    state_len, num_classes, x_train, y_train, x_test, y_test = load_data(data_tag)
    print(f"state_len: {state_len}, num_classes: {num_classes}")
    model = build_model(state_len, num_classes)
    print("Model built.")
    time_stamp = get_time()
    print(time_stamp)

    model_save_root = f"checkpoints/{data_tag}/{MAGIC_CODE}"
    history_save_root = f"history/{data_tag}/{MAGIC_CODE}/{WORK_MAGIC_CODE}/{time_stamp}"
    os.makedirs(model_save_root, exist_ok=True)
    os.makedirs(history_save_root, exist_ok=True)
    model_basename = f"{MAGIC_CODE}-{data_tag}-{WORK_MAGIC_CODE}"
    model_save_path = f"{model_save_root}/{model_basename}-{time_stamp}.h5"
    model_universal = f"best_models/{model_basename}.h5"

    history = []
    if to_train:
        # earlystopper = EarlyStopping(patience=10, verbose=1, monitor="val_acc")
        # checkpointer = ModelCheckpoint(model_universal, verbose=1, save_best_only=True, monitor="val_acc")
        earlystopper = EarlyStopping(patience=5, verbose=1)
        checkpointer = ModelCheckpoint(model_universal, verbose=1, save_best_only=True)
        # reduce_lr_loss = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=10, verbose=1, epsilon=1e-4, mode='min')
        history = model.fit(
            x_train, [y_train, y_train],
            batch_size=batch_size,
            epochs=epochs,
            verbose=1,
            validation_data=(x_test, [y_test, y_test]),
            # callbacks=[earlystopper, checkpointer, reduce_lr_loss])
            callbacks=[earlystopper, checkpointer],
            class_weight=generate_class_weights(np.argmax(y_train, axis=1)),
        )

    with custom_object_scope({
            "Projection": Projection,
            "Proj2Prob": Proj2Prob,
            "EigenDist": EigenDist,
            "categorical_bernoulli_crossentropy": categorical_bernoulli_crossentropy,
            "FullConnectedNeuralNetwork": FullConnectedNeuralNetwork,
            "Softmax": Softmax,
            "categorical_crossentropy": categorical_crossentropy}):
        model.load_weights(model_universal)

    if to_train:
        model.save(model_save_path)
    score = model.evaluate(x_test, [y_test, y_test], verbose=0)
    print(score)
    # print('Test loss:', score[0])
    # print('Test loss 2:', score[1])
    # print('Test accuracy:', score[2])
    # print('Test accuracy 2:', score[3])

    dataset = [x_train, y_train, x_test, y_test]
    save_history(dataset, model, num_classes, history, data_tag,
                 WORK_MAGIC_CODE, MAGIC_CODE, history_save_root, time_stamp)
    cmp_res = compare_all(dataset, num_classes, model, data_tag,
                          WORK_MAGIC_CODE, MAGIC_CODE, time_stamp, is_bin=is_bin)
    # save_compare_result(cmp_res, data_tag, WORK_MAGIC_CODE, MAGIC_CODE, time_stamp)

    print("Waiting for Nutstore to sync...")
    import time
    time.sleep(5)
    shutil.copy(f"history_{data_tag}.txt", f"{history_save_root}/")
import utils
import load_data
import numpy as np
import torch

adjs, attributes = load_data.load_data("DBLP_sub")
adj = adjs[-2:].sum(0)
print(adj.shape)
for val in range(0, 15):
    print(val, len(np.where(adj == val)[0]))
def train_net(num_epochs=20, batch_size=50, learning_rate=1e-4, unseen=False,
              update_method=''):
    # Prepare Theano variables for inputs and targets
    input_var = T.tensor4('inputs')
    target_var = T.ivector('targets')
    net = vgg16.build_model(input_var, batch_size)
    network = net['prob']

    # Load the dataset
    if unseen:
        print("Loading data, unseen val/test signatories task...")
        X_train, y_train, X_val, y_val, X_test, y_test = \
            load_data.load_data_unseen_separated()
    else:
        print("Loading data, standard task...")
        X_train, y_train, X_val, y_val, X_test, y_test = load_data.load_data()

    # Create a loss expression for training, i.e., a scalar objective we want
    # to minimize (for our multi-class problem, it is the cross-entropy loss):
    prediction = lasagne.layers.get_output(network)
    all_params = lasagne.layers.get_all_params(network, trainable=True)
    # Get all the parameters we don't want to train
    fixed_params = lasagne.layers.get_all_params(net[LAST_FIXED_LAYER])
    params = [x for x in all_params if x not in fixed_params]
    loss = lasagne.objectives.categorical_crossentropy(
        prediction, target_var) + REG * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    loss = loss.mean()
    # We could add some weight decay as well here, see lasagne.regularization.

    # Create update expressions for training, i.e., how to modify the
    # parameters at each training step. Here, we'll use Stochastic Gradient
    # Descent (SGD) with Nesterov momentum, but Lasagne offers plenty more.
    # First get all the parameters
    if update_method.lower() == 'nesterov' or update_method == '':
        updates = lasagne.updates.nesterov_momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)
    elif update_method.lower() == 'momentum':
        updates = lasagne.updates.momentum(
            loss, params, learning_rate=learning_rate, momentum=0.9)
    elif update_method.lower() == 'sgd':
        updates = lasagne.updates.sgd(loss, params, learning_rate=learning_rate)
    elif update_method.lower() == 'adam':
        updates = lasagne.updates.adam(
            loss, params, learning_rate=learning_rate, beta1=0.9, beta2=0.999,
            epsilon=1e-08)
    elif update_method.lower() == 'rmsprop':
        # typically better than adaGrad
        updates = lasagne.updates.rmsprop(
            loss, params, learning_rate=learning_rate, rho=0.9, epsilon=1e-06)
    elif update_method.lower() == 'adadelta':
        updates = lasagne.updates.adadelta(
            loss, params, learning_rate=learning_rate, rho=0.9, epsilon=1e-06)
    else:
        raise IOError("Not an acceptable parameter update method.")
    # updates = lasagne.updates.adam(
    #     loss, params, learning_rate=learning_rate)

    # Create a loss expression for validation/testing. The crucial difference
    # here is that we do a deterministic forward pass through the network,
    # disabling dropout layers.
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_loss = lasagne.objectives.categorical_crossentropy(
        test_prediction, target_var) + REG * lasagne.regularization.apply_penalty(
            params, lasagne.regularization.l2)
    test_loss = test_loss.mean()
    # As a bonus, also create an expression for the classification accuracy:
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)

    # Hacky code to create the confusion matrix, which exists due to my
    # poor understanding of theano
    preds = T.argmax(test_prediction, axis=1)
    inv_preds = 1 - preds
    inv_target_var = 1 - target_var
    true_positives = T.sum(preds * target_var)  # Use mult as elementwise and
    true_negatives = T.sum(inv_preds * inv_target_var)
    false_positives = T.sum(preds * inv_target_var)
    false_negatives = T.sum(inv_preds * target_var)

    # Compile a function performing a training step on a mini-batch (by giving
    # the updates dictionary) and returning the corresponding training loss:
    train_fn = theano.function([input_var, target_var], loss, updates=updates)
    print("train_fn set up.")

    # Compile a second function computing the validation loss and accuracy:
    val_fn = theano.function([input_var, target_var], [
        test_loss, test_acc, true_positives, true_negatives, false_positives,
        false_negatives
    ])

    # Finally, launch the training loop.
    print("Starting training...")
    # We iterate over epochs:
    val_loss_per_epoch = []
    train_loss_per_epoch = []
    for epoch in range(num_epochs):
        # In each epoch, we do a full pass over the training data:
        train_err = 0
        train_batches = 0
        start_time = time.time()
        for batch in iterate_minibatches(X_train, y_train, batch_size):
            inputs, targets = batch
            train_err += train_fn(inputs, targets)
            train_batches += 1

        # And a full pass over the validation data:
        val_err = 0
        val_acc = 0
        val_far = 0
        val_frr = 0
        val_batches = 0
        for batch in iterate_minibatches(X_val, y_val, batch_size):
            inputs, targets = batch
            err, acc, t_p, t_n, f_p, f_n = val_fn(inputs, targets)
            val_err += err
            val_acc += acc
            val_frr += float(f_n) / (t_p + f_n)
            val_far += float(f_p) / (f_p + t_n)
            val_batches += 1

        # Then we print the results for this epoch:
        print("Epoch {} of {} took {:.3f}s".format(epoch + 1, num_epochs,
                                                   time.time() - start_time))
        print("  training loss:\t\t{:.6f}".format(train_err / train_batches))
        print("  validation loss:\t\t{:.6f}".format(val_err / val_batches))
        print("  validation accuracy:\t\t{:.2f} %".format(val_acc / val_batches * 100))
        print("  validation far:\t\t{:.2f} %".format(val_far / val_batches * 100))
        print("  validation frr:\t\t{:.2f} %".format(val_frr / val_batches * 100))
        val_loss_per_epoch.append(val_err / val_batches)
        train_loss_per_epoch.append(train_err / train_batches)

    print("Val loss per epoch:", val_loss_per_epoch)
    print("Train loss per epoch:", train_loss_per_epoch)

    # After training, we compute and print the test error:
    test_err = 0
    test_acc = 0
    test_far = 0
    test_frr = 0
    test_batches = 0
    for batch in iterate_minibatches(X_test, y_test, batch_size):
        inputs, targets = batch
        err, acc, t_p, t_n, f_p, f_n = val_fn(inputs, targets)
        test_err += err
        test_acc += acc
        test_frr += float(f_n) / (t_p + f_n)
        test_far += float(f_p) / (f_p + t_n)
        test_batches += 1
    print("Final results:")
    print("  test loss: withheld until final submission lolol")
    print("  test accuracy: withheld until final submission lolol")
from theano import config
import theano.sandbox.cuda

config.floatX = 'float32'
print(config.floatX)
theano.sandbox.cuda.use("gpu0")

import load_data
import prepare_images
import rotate_image
# `shuffle` was not imported in the original snippet; scikit-learn's
# sklearn.utils.shuffle matches the call signature used below (an assumption).
from sklearn.utils import shuffle

pizza_eng_names, pizza_imgs = prepare_images.load_photos()

channels, height, width = 3, 32, 32
batch_size = 20

labels, onehotencoder = load_data.load_data()

labels_list = []
j = 0
image_list = []
for pizza_img in pizza_imgs:
    lst = load_data.resize_rotate_flip(pizza_img, (height, width))
    print(len(lst))
    image_list.extend(lst)
    lbls = []
    for i in range(len(lst)):
        lbls.append(shuffle(labels[j], random_state=i))
    labels_list.extend(lbls)
    j += 1
k_size = [1, 2, 3, 4, 5, 6, 7, 8]

''' Decoder config '''
de_embed = 256
de_H = 256
de_layers = 1
de_bi = False
en_Hbi = en_H * (2 if en_bi == True else 1)

''' File path '''
th_en_ref = "th-en/ted_test_th-en.en.tok_seg"
th_vi_ref = "th-vi/ted_test_th-vi.vi.tok"

###########################
### Load Data and Dict ####
###########################
train_data1, train_target1, val_data1, val_target1, inp_dict1, tgt_dict1 = load_data(
    lang_pair1, source_type, tgt_type)
train_data2, train_target2, val_data2, val_target2, inp_dict2, tgt_dict2 = load_data(
    lang_pair2, source_type, tgt_type)

# combine dicts
raw_inp_dict = {**inp_dict2, **inp_dict1}
raw_tgt_dict = {**tgt_dict2, **tgt_dict1}
inp_dict, tgt_dict = {}, {}
count, count2 = 0, 0
for k, v in raw_inp_dict.items():
    inp_dict[k] = count
    count += 1
for k2, v2 in raw_tgt_dict.items():
    tgt_dict[k2] = count2
    count2 += 1
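# In the merges above, keys present in both dicts end up with the entry from
# *_dict1, because later items in a {**a, **b} literal win. A tiny check:
assert {**{'a': 2}, **{'a': 1}} == {'a': 1}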
### WandB debug ###
import wandb

import logConfig, load_data, accuracy_loss, train, preprocessing, plot

train_X, train_Y, test_X, test_Y, labels = load_data.load_data()

(N, w, h), n_labels = train_X.shape, len(labels)

# Number of datapoints to train
n = 100

# Dimension of datapoints
d = w * h

# Data preprocessing
(train_x, train_y), (val_x, val_y), (test_x, test_y) = preprocessing.pre_process(
    d, n_labels, train_X, train_Y, test_X, test_Y)


def mainDebug(config=None):
    run = wandb.init(config=config)
    config = wandb.config

    hl = [config.hidden_layer_size] * config.hidden_layers  # Hidden layers
    ol = [len(train_y[0])]  # Output layers
    n_hl = len(hl)

    logConfig.logConfig(config)
# -*- coding: utf-8 -*-
"""
Created on Tue Nov 17 15:15:48 2020

@author: groes
"""
import neural_network as nn
import numpy as np
import load_data
import utils

data = load_data.load_data()

X = data['data']
y = data['target']

X_train, X_test, y_train, y_test = utils.split_data(X, y, 0.3)

unittest_mod = nn.new_neural_network(0.001)
unittest_mod.create_input_layer(784)
unittest_mod.add_hidden_layer(256)
# unittest_mod.add_hidden_layer(256)
unittest_mod.add_hidden_layer(128)
# unittest_mod.add_hidden_layer(64)
unittest_mod.add_output_layer(10)

unittest_mod.new_train(X_train, y_train, 5, batch_size=32, optimiser="Adam")
unittest_mod.accuracy_score(X_test, y_test)

y_test[0]
                    default=1111,
                    help='torch seed for randomization')
args = parser.parse_args()

torch.manual_seed(args.seed)
np.random.seed(args.seed)

if not os.path.exists(args.save_folder):
    os.makedirs(args.save_folder)
if not os.path.exists(args.save_folder + '/imgs'):
    os.makedirs(args.save_folder + '/imgs')

# loading useful data
print('\nLOADING CORPUS')
model = load_model(args.model_path)
sentences, labels = load_data(args.tree_data, 'open_nodes')
corpus = data.Corpus(args.training_data)

if args.gated_forward:
    print('USING GATED FORWARD')
    model_values = utils.get_model_values(model)

data_load_failed = False
if args.load_data:
    try:
        print('LOADING DATA')
        hidden_states = np.load(args.hidden_location).item()
        cell_states = np.load(args.cell_location).item()
        targets = np.load(args.targets_location)
        depth_targets = np.load(args.depth_targets_location)
"""
Set the following three parameters. Plotting will run subsequently.

scale: int, scales daily data. 7: one week. 30: one month. 0.5: half day.
no_periods: int, determines the number of scaled-day periods.
measures: list, takes the desired measures from header_dict.
header_dict: dict, output from load_data.ipynb.
files: list, output from load_data.ipynb.
"""
import matplotlib.dates as mdates
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap

from load_data import load_data

# Init variables
df, files, header_dict = load_data()
scale = 7  # 1 = single day, 7 = week, etc.
no_periods = 52  # number of periods, e.g. 10 weeks: scale = 7, no_periods = 10
measures = ['no2', 'no', 'pm10']


def fix_series(series, missing, flag):
    """
    Calls itself recursively and patches the minimal datetime value with the
    previous one, then continues with the maximum. Et cetera.

    Not error proof: a series that is entirely missing will recurse to the
    limit. Make sure not to pass an empty series.
    """
    if len(missing) == 0:
        return series
    if flag == 'min':
        if missing.min() < series['datetime'].min():
            pass
def run(_run):
    # Load configs; if parameters are unspecified, fill in a default
    config = _run.config

    run = config.get('fit_params')
    model_params = config.get('model_params')
    data_params = config.get('data_params')

    batch_size = data_params.get('batch_size')
    augmentations = data_params.get('augmentations')
    buffer_size = data_params.get('buffer_size')  # the buffer size for shuffling
    use_sampling = data_params.get('use_sampling')
    class_target_prob = 1 / model_params.get('num_classes')

    print("[!] list of parameter configurations")
    pprint(config)

    # Load data and define generators ------------------------------------------
    print("[!] loading datasets \n")
    x_train, x_val, x_test, probs = load_data()

    # get a rough estimate: there are 100 files per TFRecord, except for one
    # TFRecord per item, so this estimate might not be 100% correct
    num_training = len(x_train) * 100

    # TF parsing functions
    print("[!] Creating dataset iterators \n")

    # Load the dataset iterators
    train_dataset = create_training_dataset(x_train, batch_size, buffer_size,
                                            augmentations, use_sampling, probs,
                                            class_target_prob, **model_params)
    val_dataset = validate(x_val, batch_size, **model_params)
    test_dataset = validate(x_test, batch_size, **model_params)

    # we need the actual labels from the TFRecords, but they take INCREDIBLY
    # long to parse; parse through them once and create a csv file with a list
    # of all the labels
    # note: the tf parsing requires that there is no randomness (shuffling) in
    # the validation/test labels
    if not os.path.exists('../datasets/data/valid/val_labels.csv'):
        print(os.path.exists('../datasets/data/valid/val_labels.csv'))
        print("[!] creating validation label file in ../datasets/data/valid/val_labels.csv")
        create_label_csv(val_dataset, '../datasets/data/valid/val_labels.csv')
    else:
        print("[!] validation labels csv exists")

    if not os.path.exists('../datasets/data/test/test_labels.csv'):
        print("[!] creating test label file in ../datasets/data/test/test_labels.csv")
        create_label_csv(test_dataset, '../datasets/data/test/test_labels.csv')
    else:
        print("[!] test labels csv exists")

    # load the file with validation labels
    # getting labels from a TFRecord with lots of other data is horribly slow...
    print("[!] Loading validation labels for callbacks")
    val_labels = pd.read_csv('../datasets/data/valid/val_labels.csv')
    val_labels = np.squeeze(val_labels.to_numpy())

    # Model definitions --------------------------------------------------------
    print("[!] compiling model and adding callbacks \n")

    # function for building the model
    model_func = model_dict[run.get('model')]

    # invoke the user function
    model = model_func(**model_params)
    model.summary()

    # compile the model with categorical crossentropy: one-hot encoded labels!
    model.compile(optimizer=tf.keras.optimizers.Adam(run.get('lr')),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    # Model callbacks -----------------------------------------------------------

    # ReduceLROnPlateau
    if run.get('reduce_lr_on_plateau'):
        reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1,
                                      patience=3, min_lr=10e-7, verbose=1)
    else:
        reduce_lr = Callback()

    # Model checkpoints
    now = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    aug_string = 'aug' if augmentations == True else 'noaug'
    modelcheckpoint_name = lambda x: "checkpoints/model-{}-{}-{}-{}-{}.hdf5".format(
        run.get('model'), x, aug_string,
        'ch_' + str(len(model_params.get('channels'))), now)

    modelcheckpoint = ModelCheckpoint(modelcheckpoint_name('best_loss'),
                                      monitor='val_loss', verbose=1,
                                      save_best_only=True,
                                      save_weights_only=True)

    # Model early stopping
    earlystopping = EarlyStopping(monitor='val_loss', patience=10)

    # tensorboard and metric callbacks
    log_dir = "logs/fit/{}-{}-{}-{}".format(
        run.get('model'), aug_string,
        'ch_' + str(len(model_params.get('channels'))), now)

    file_writer = tfsum.create_file_writer(log_dir)
    tensorboard_cb = tf.keras.callbacks.TensorBoard(log_dir=log_dir,
                                                    histogram_freq=1,
                                                    profile_batch=0)
    f1_metric = Metrics(val_dataset, val_labels, save_best=True,
                        save_name=modelcheckpoint_name('best_f1'),
                        writer=file_writer)

    # Model training and evaluation ---------------------------------------------
    print("[!] fitting model \n")

    model.fit(
        train_dataset.repeat(),
        epochs=run.get('epochs'),
        steps_per_epoch=int(num_training / batch_size),
        validation_data=val_dataset,
        validation_steps=None,
        shuffle=True,
        verbose=1,
        callbacks=[tensorboard_cb, f1_metric, LogMetrics(), modelcheckpoint,
                   earlystopping, reduce_lr, MemoryCallback()]
    )

    print("[!] done running, terminating program")
    def activation(self, N):
        x = (self.location + self.posn0 - self.t0 + N) % self.num_posns
        return x == 0


def _parse(d):
    d = d.split(' ')
    disc_number = int(d[1][1:])
    num_posns = int(d[3])
    t0 = int(d[6][d[6].index('=') + 1:-1])
    posn0 = int(d[-1][:-1])
    return (disc_number, num_posns, t0, posn0)


if __name__ == "__main__":
    data = load_data('./input/day15.txt')

    print('Part 1')
    discs = [Disc(*_parse(d)) for d in data]
    N = 0
    while not all(d.activation(N) for d in discs):
        N += 1
    print('\tFirst N is: {}'.format(N))

    print('\nPart 2')
    discs2 = [Disc(*_parse(d)) for d in data]
    discs2.append(Disc(discs2[-1].location + 1, 11, 0, 0))
    N2 = 0
    while not all(d.activation(N2) for d in discs2):
        N2 += 1
    print('\tFirst N is: {}'.format(N2))
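# A possible speed-up for the brute-force search above (a sketch, not the
# original code): satisfy one disc at a time, stepping by the product of the
# moduli already satisfied. This is valid when the num_posns values are
# pairwise coprime, which holds for the usual puzzle input.
def first_activation_time(discs):
    N, step = 0, 1
    for d in discs:
        while not d.activation(N):
            N += step
        step *= d.num_posns
    return N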
from model import unet
from load_data import load_data
from show_prediction import predict
from callbacks import keras_callback

train_gen, val_gen, x_test, y_test = load_data(datapath='../data/processed/')

if __name__ == '__main__':
    model = unet()
    model.summary()
    model.fit(
        train_gen,
        epochs=100,
        validation_data=val_gen,
        callbacks=[keras_callback()]
    )
    model.save('../models/checkpoint.h5')
    predict(model, x_test, y_test)
import torch

from load_data import load_data
from model.vgg16 import vgg16
from model.tiny import TinyClassifier2d
from model.resnet50 import resnet50
from train import train_model

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)

train_loader, validate_loader, _ = load_data()

# VGG16 Network
# vgg16 = vgg16()
# train_model(vgg16, 'vgg16', train_loader, validate_loader, 3, device, one_batch=True)

# Tiny Residual Network
# tiny = TinyClassifier2d()
# train_model(tiny, 'tiny', train_loader, validate_loader, 3, device, one_batch=True)

# ResNet50 Network
resnet50 = resnet50()
train_model(resnet50, 'resnet50', train_loader, validate_loader, 1, device, one_batch=True)
        return self._train(state, reward, result_state)

    def predict(self, state):
        return self._predict(state)

    def q_values(self, state):
        return self._q_values(state)

    def bellman_error(self, state, reward, result_state):
        return self._bellman_error(state, reward, result_state)


if __name__ == "__main__":
    dataset = "controllerTuples.json"
    states, actions, result_states, rewards = load_data.load_data(dataset)
    classifier = NeuralNet(states, n_in=9, n_out=9)
    best_error = 10000000.0
    # print "Initial Model: " + str(classifier._model(states).shape)
    # print "Initial Model: " + str(np.max(classifier._model(states), axis=1, keepdims=True).shape)
    for i in range(20000):
        for start, end in zip(range(0, len(states), 128),
                              range(128, len(states), 128)):
            # cost = train(states[start:end], rewards[start:end], result_states[start:end])
            _states = states[start:end]
            _rewards = rewards[start:end]
            _result_states = result_states[start:end]
            """
            print _states.shape
def train(args):
    dataset = args.dataset
    print("loading data from: {}".format(dataset))
    adjs, features = load_data.load_data(dataset)
    node_num = adjs.shape[1]
    attribute_num = features.shape[2]
    time_length = adjs.shape[0]
    print("finish loading: node_number:{}; time_length:{}; attribute_number:{}".format(
        node_num, time_length, attribute_num))

    """ set parameters """
    pre_len = args.pre_len  # how many time steps to predict
    train_len = time_length - pre_len

    """ preprocess data """
    # preserve original data
    adjs_ori = torch.from_numpy(adjs).type(torch.float) + torch.eye(node_num)
    feats_ori = torch.from_numpy(features).type(torch.float)

    # process data: divide testing/validating/training sets
    adjs_train, val_adjs, val_adjs_negative, test_adjs, test_adjs_negative = \
        utils.mask_adjs_test(adjs=adjs)
    fea_train, val_feas, val_feas_false, \
        test_feas, test_feas_false = utils.mask_attributes_test(features)

    adjs_train_label = torch.from_numpy(adjs_train).type(torch.float) + torch.eye(node_num)
    adjs_train = utils.preprocess_adjs(adjs_train)
    adjs_train = torch.from_numpy(adjs_train).type(torch.float)
    # node_features = torch.eye(node_num).unsqueeze(0).repeat(time_length, 1, 1)  # identity matrix as features
    node_features = torch.from_numpy(features).type(torch.float)
    attributes = torch.from_numpy(features.transpose([0, 2, 1])).type(torch.float)
    # batch_size = 1

    """ instantiate a CDN model """
    myModel = MyModel(node_num=node_num, feat_num=attribute_num,
                      b_size=args.belief_size, pre_hid_size=args.pre_hidden_size,
                      hid_size=args.hidden_size, pre_out_size=args.pre_out_size,
                      z_size=args.emb_size, hid_decoder_size=args.decoder_hidden,
                      flag=args.co_embedding)

    # Adam optimizer
    optimizer = optim.Adam(myModel.parameters(), lr=args.lr,
                           weight_decay=args.weight_decay)

    # begin training
    print("=" * 30)
    print("begin training")
    for epoch in range(args.epochs):
        myModel.train()
        optimizer.zero_grad()
        myModel.forward(adjs=adjs_train[0:train_len],
                        node_features=node_features[0:train_len],
                        attr_features=attributes[0:train_len])
        # randomly choose two successive time steps t1 and t2
        t_1 = np.random.choice(train_len - 1)
        t_2 = t_1 + np.random.choice([1])
        loss, loss_fea_rec, loss_adj_rec, kl_loss, log_loss, adj_t2_prob, feature_t2_prob \
            = myModel.calculate_loss(t_1, t_2, adjs_ori=adjs_train_label,
                                     graph_feats_ori=feats_ori)

        # evaluate on the validation set
        roc_adj, ap_adj = get_roc_score_adj(val_adjs[t_2], val_adjs_negative[t_2],
                                            adj_t2_prob, t_2, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(val_feas[t_2], val_feas_false[t_2],
                                               feature_t2_prob, t_2, feats_ori)
        print("epoch:{} loss_train:{:.5f} "
              "t1:{:2} t2:{:2} "
              "loss_fea_rec:{:.5f} loss_adj_rec:{:.5f} "
              "kl_loss:{:.5f} log_loss:{:.5f} "
              "roc_adj:{:.5f} ap_adj:{:.5f} "
              "roc_fea:{:.5f} ap_fea:{:.5f}".format(epoch, loss.item(), t_1, t_2,
                                                    loss_fea_rec, loss_adj_rec,
                                                    kl_loss, log_loss,
                                                    roc_adj, ap_adj,
                                                    roc_feat, ap_feat))

        # periodically evaluate predictions on the dynamic network
        if epoch % 300 == 0:
            print("=" * 30)
            print("begin testing")
            print(" time_length : {} train_length : {} predict_length : {}".format(
                time_length, train_len, pre_len))
            # predict the future observations
            adjs_pre, features_pre, adj_last, features_last = myModel.predict(
                t_final=-1, pre_len=pre_len)
            # calculate the scores
            for t in range(train_len, time_length):
                adj_t_prob = adjs_pre[t - train_len]
                feature_t_prob = features_pre[t - train_len]
                roc_adj, ap_adj = get_roc_score_adj(val_adjs[t], val_adjs_negative[t],
                                                    adj_t_prob, t, adjs_ori)
                roc_feat, ap_feat = get_roc_score_feat(val_feas[t], val_feas_false[t],
                                                       feature_t_prob, t, feats_ori)
                print(" roc_adj:{:.5f} ap_adj:{:.5f}"
                      " roc_fea:{:.5f} ap_fea:{:.5f}".format(roc_adj, ap_adj,
                                                             roc_feat, ap_feat))
            # use the last embedding to reconstruct and predict the links and associations
            print("using the last time")
            for t in range(train_len - 1, time_length):
                adj_t_prob = adj_last
                feature_t_prob = features_last
                roc_adj, ap_adj = get_roc_score_adj(val_adjs[t], val_adjs_negative[t],
                                                    adj_t_prob, t, adjs_ori)
                roc_feat, ap_feat = get_roc_score_feat(val_feas[t], val_feas_false[t],
                                                       feature_t_prob, t, feats_ori)
                print(" roc_adj:{:.5f} ap_adj:{:.5f}"
                      " roc_fea:{:.5f} ap_fea:{:.5f}".format(roc_adj, ap_adj,
                                                             roc_feat, ap_feat))
            print("finish testing")
            print("=" * 30)

        # update parameters
        loss.backward()
        optimizer.step()

    print("=" * 30)
    print("finish training")

    print("=" * 30)
    print("begin testing")
    print(" time_length : {} train_length : {} predict_length : {}".format(
        time_length, train_len, pre_len))
    # predict the future observations
    adjs_pre, features_pre, adj_last, features_last = myModel.predict(
        t_final=-1, pre_len=pre_len)
    # calculate the scores
    print("using the delta way to predict")
    for t in range(train_len, time_length):
        adj_t_prob = adjs_pre[t - train_len]
        feature_t_prob = features_pre[t - train_len]
        roc_adj, ap_adj = get_roc_score_adj(test_adjs[t], test_adjs_negative[t],
                                            adj_t_prob, t, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(test_feas[t], test_feas_false[t],
                                               feature_t_prob, t, feats_ori)
        print(" roc_adj:{:.5f} ap_adj:{:.5f}"
              " roc_fea:{:.5f} ap_fea:{:.5f}".format(roc_adj, ap_adj,
                                                     roc_feat, ap_feat))
    print("using the latest embeddings to predict")
    for t in range(train_len - 1, time_length):
        adj_t_prob = adj_last
        feature_t_prob = features_last
        roc_adj, ap_adj = get_roc_score_adj(test_adjs[t], test_adjs_negative[t],
                                            adj_t_prob, t, adjs_ori)
        roc_feat, ap_feat = get_roc_score_feat(test_feas[t], test_feas_false[t],
                                               feature_t_prob, t, feats_ori)
        print(" roc_adj:{:.5f} ap_adj:{:.5f}"
              " roc_fea:{:.5f} ap_fea:{:.5f}".format(roc_adj, ap_adj,
                                                     roc_feat, ap_feat))
    print("finish testing")
    print("=" * 30)
    return 0
# Parameters
data_directory = '../../data/generated-data-r-10-n-02/'
booking_file = '../../data/booking.csv'
users_file = '../../data/user.csv'
rating_thresholds = []
true_objects_indexes = [0, 1, 2, 3, 4, 5, 6, 7]
false_objects_indexes = [8, 9]

file_names = os.listdir(data_directory)
ids_vector = [int(name.split('-')[0]) for name in file_names]
categories_vector = [name.split('-')[1] for name in file_names]
ratings_vector = [int(name.split('.')[0].split('-')[2]) for name in file_names]
name_vector = [data_directory + name for name in file_names]

ratings_matrix, images_indexes_for_id, ids_indexes, users_matrix = load_data(
    data_directory, booking_file, users_file, rating_thresholds)

features, new_ratings_vector, new_categories_vector, new_ids_vector, \
    new_paths_vector, text_indexes = divide_texts(name_vector, ratings_vector,
                                                  categories_vector, ids_vector, n=10)
ratings_vector = new_ratings_vector
ids_vector = new_ids_vector

scores_auc = []
scores_rmse = []
for i in range(10):
    cv_results_file = '../results/cv-generated-data-r-10-n-02-z-random-' + str(i) + '.csv'
    selection = ObjectSelection(show_selection_results=False, selection_algorithm='random')
    selection.transform(ids=ids_vector,
"""
@author: Admin
"""
import torch
from load_data import load_data
from learning_function import learning_function
from torchsummary import summary
from plot import plot
from Unet import UNet
from ict import ICT
from config import config
import transform

#####################################################################################################
######################################## load data ##################################################
#####################################################################################################

l_train = load_data("data", "l_train")
u_train = load_data("data", "u_train")
test = load_data("data", "test")

#####################################################################################################
##################################### transformation ################################################
#####################################################################################################

transform_fn = transform.transform(*config["transform"])

#####################################################################################################
###################################### student model ################################################
#####################################################################################################

S_model = UNet(2, transform_fn)
# summary(S_model, (3, 480, 640))

#####################################################################################################
for layer in base_model.layers:
    layer.trainable = False


# YOLO v1 loss function for x, y, w, h
def loss(y_true, y_pred):
    a = K.abs(y_pred[:, 0] - y_true[:, 0]) + K.abs(y_pred[:, 1] - y_true[:, 1])
    b = K.abs(K.sqrt(y_pred[:, 2]) - K.sqrt(y_true[:, 2])) + \
        K.abs(K.sqrt(y_pred[:, 3]) - K.sqrt(y_true[:, 3]))
    value = K.mean(a + b, axis=-1)
    return value


model.compile(optimizer='rmsprop', loss=loss)

# fine-tune the fully connected layers
x_train, y_train = load_data()
print(x_train.shape, y_train.shape)
model.fit(x_train, y_train, epochs=2, batch_size=16, verbose=1)

# second round of fine-tuning
# make_data_set()
# x_train, y_train = load_data()
# for layer in model.layers[:11]:
#     layer.trainable = True
# for layer in model.layers[11:]:
#     layer.trainable = True
for layer in model.layers:
    layer.trainable = True

model.compile(optimizer=SGD(lr=0.0001, momentum=0.9), loss=loss)
model.fit(x_train, y_train, epochs=4, batch_size=16, verbose=1)
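# Caveat: K.sqrt yields NaN for negative predicted widths/heights, which can
# poison training. A common guard (an assumption, not part of the original
# loss) is to clamp before the square root, e.g.:
#
#     b = K.abs(K.sqrt(K.maximum(y_pred[:, 2], 0.)) - K.sqrt(y_true[:, 2])) + \
#         K.abs(K.sqrt(K.maximum(y_pred[:, 3], 0.)) - K.sqrt(y_true[:, 3]))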
def main():
    # camera intrinsics
    fx = 3551.342810
    fy = 3522.689669
    cx = 2033.513326
    cy = 1455.489194
    K = np.float64([[fx, 0, cx],
                    [0, fy, cy],
                    [0, 0, 1]])
    # radial and tangential distortion coefficients
    D = np.float64([-0.276796, 0.113400, -0.000349, -0.000469])

    # load images
    dataset1_dir = '/home/linjian/dataset/docking_dataset/image/Data_trajectory/2018-08-22/16h-26m-42s load/'
    filelist1 = sorted(glob.glob(dataset1_dir + '*.jpg'))
    img_num = len(filelist1)

    # load the wheel speed, used as the absolute scale
    loaded_data = load_data(dataset1_dir)
    scale = loaded_data.get_speed()

    # initialization
    R = np.eye(3)
    t = np.zeros((1, 3))
    rotation_array = [R]
    transformation_array = [t]
    pose_array = [t]

    # bag of visual words
    detector = cv2.ORB_create()
    bovw_class = bovw(detector)

    # loop closure
    loopclosure_class = loopclosure()

    # per-keyframe relative scales, keypoints and descriptors
    relative_scale_list = []
    keypoints_list = []
    descriptors_list = []

    # initialize the first image pair
    img1 = cv2.imread(filelist1[0])
    img2 = cv2.imread(filelist1[1])
    keyframe_index = 1

    for i in range(1, img_num):
        # match the current pair using the camera parameters
        matching_class = matching(K, D)
        matching_class.load_image(img1, img2)
        detector = cv2.ORB_create()
        enough_match, matches = matching_class.match_images(detector)

        # skip frames with too few matches or almost no motion
        if matches == -1 or scale[i - 1] < 0.01:
            print('not a good keyframe')
            keyframe_index += 1
            if keyframe_index > img_num - 1:
                break
            img2 = cv2.imread(filelist1[keyframe_index])
            continue

        kp1_match, kp2_match = matches
        keypoints_list.append(matching_class.kp1)
        descriptors_list.append(matching_class.des1)

        # estimate the relative scale between consecutive keyframes
        try:
            relative_scale = comput_relative_scale(kp1_match, kp2_match)
            relative_scale_list.append(relative_scale)
            print("for the", i, "image relative scale is", relative_scale)
            print("for the", i, "image absolute scale is", scale[i - 1])
            print("for the", i, "image calculated absolute scale is",
                  scale[i - 2] / relative_scale_list[i - 1] * relative_scale)
        except Exception:
            print("An exception occurred")

        # add the descriptors to the bag of visual words
        bovw_class.add_histogram(matching_class.des1)

        # accumulate rotation and translation into the current pose
        dR = matching_class.getRotation()
        rotation_array.append(dR)
        dt = np.transpose(matching_class.getTransformation())
        transformation_array.append(dt)
        R = dR.dot(R)
        t = t + dt.dot(R) * scale[i - 1]
        pose_array.append(t)

        # look for a loop closure candidate
        lc_index, lc_cost = bovw_class.find_lc(matching_class.des2)
        print(lc_cost)
        if lc_cost < 0.01:
            img_lc = cv2.imread(filelist1[lc_index])
            cv2.imshow('Loop closure matched', img_lc)
            # scale check: first compute the good matches, then the relative scale
            # lc_scale = comput_relative_scale(,)
            # print('lc scale is ', scale[i-2]/relative_scale_list[i-1]*lc_scale)

        cv2.waitKey(1)
        img1 = img2
        keyframe_index += 1
        if keyframe_index > img_num - 1:
            break
        img2 = cv2.imread(filelist1[keyframe_index])

    bovw_class.save_bovw_lib()
    save_to_pickle(filelist1, "image_file_list")

    # convert lists to arrays and plot the trajectory
    rotation_array = np.asarray(rotation_array)
    transformation_array = np.asarray(transformation_array)
    pose_array = np.asarray(pose_array)
    mapmax = np.amax(pose_array) + 2
    mapmin = np.amin(pose_array) - 2
    # plot_camera_pose3d(pose_array)
    # plot_camera_pose2d(pose_array)
    plot_pose(pose_array, mapmax, mapmin)
    print('there are', len(pose_array), 'camera poses')
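The comput_relative_scale helper is not shown in the source. The textbook way to recover relative scale in monocular visual odometry is from ratios of distances between the same pairs of triangulated 3D points in consecutive frames; a hedged sketch under that assumption (the function name and (N, 3)-array inputs are illustrative, not the project's actual signature):

import numpy as np

def relative_scale_from_points(pts_prev, pts_curr):
    """Ratio of mutual 3D distances between the same triangulated points
    seen in two consecutive frames. Illustrative only; the project's
    comput_relative_scale implementation is not shown."""
    ratios = []
    n = len(pts_prev)
    for a in range(n - 1):
        for b in range(a + 1, n):
            d_prev = np.linalg.norm(pts_prev[a] - pts_prev[b])
            d_curr = np.linalg.norm(pts_curr[a] - pts_curr[b])
            if d_curr > 1e-9:
                ratios.append(d_prev / d_curr)
    # the median is robust to outlier matches
    return float(np.median(ratios))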
# options
parser.add_argument('-snapshot', type=str, default=None,
                    help='filename of model snapshot [default: None]')
parser.add_argument('-predict', type=str, default=None,
                    help='predict the sentence given')
parser.add_argument('-test', action='store_true', default=False,
                    help='train or test')
args = parser.parse_args()

# load data
load_data(load_path)

print("\nLoading data...")
issue1_field = data.Field(lower=True)
issue2_field = data.Field(lower=True)
label_field = data.Field(sequential=False)
pairid_field = data.Field(lower=True)
train_data, dev_data, test_data = mydatasets.MR.splits(
    issue1_field, issue2_field, label_field, pairid_field)
issue1_field.build_vocab(train_data, dev_data, test_data)
issue2_field.build_vocab(train_data, dev_data, test_data)
label_field.build_vocab(train_data, dev_data, test_data)
pairid_field.build_vocab(train_data, dev_data, test_data)
print(len(train_data), len(dev_data), len(test_data))
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm

from load_data import load_data
from moving_average import moving_average

N = 365

if __name__ == "__main__":
    _, dates, data = load_data()
    temperature = data[:, 0]
    temperature_max = data[:, 1]
    precipitation = data[:, 3]

    first_year, last_year = dates[0].year, dates[-1].year
    x = range(first_year, last_year + 1)
    y = range(0, N)
    dt2day_of_year = lambda dt: dt.timetuple().tm_yday

    # fill with NaN so days without a reading do not hold arbitrary
    # values (np.empty leaves uninitialized memory behind)
    result = np.full((len(y), len(x)), np.nan)
    for i, val in enumerate(precipitation):
        dt = dates[i]
        x_i = dt.year - first_year
        y_i = dt2day_of_year(dt) - 1
        if y_i < N:
            result[y_i, x_i] = val
    result = np.log10(result)
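The snippet stops before anything is drawn. Continuing from the result matrix built above, one way to render it as a heatmap (the colormap and labels are illustrative, not from the source):

# Render the day-of-year x year matrix; continues the snippet above.
fig, ax = plt.subplots(figsize=(8, 6))
im = ax.imshow(result, aspect='auto', cmap=cm.viridis,
               extent=[first_year, last_year + 1, N, 0])
ax.set_xlabel('year')
ax.set_ylabel('day of year')
fig.colorbar(im, ax=ax, label='log10(precipitation)')
plt.show()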
    return data


def getRangeList(data):
    data = sorted([_parse_range(d) for d in data])
    datarange = [Range(*d) for d in data]
    datarange = _parse_intvls(datarange)
    return datarange


def _count_allowed(data):
    # gap below the first range, gaps between ranges, gap above the last one
    ctr = data[0].a
    for j in range(len(data) - 1):
        ctr += data[j + 1].a - data[j].b - 1
    ctr += 2**32 - 1 - data[-1].b
    return ctr


if __name__ == "__main__":
    data = load_data('./input/day20.txt')
    data = getRangeList(data)

    print('Part 1')
    print('\tFirst IP is: {}.'.format(data[0].b + 1))  # Answer: 32259706

    print('\nPart 2')
    ctr = _count_allowed(data)  # the merged range list computed above
    print('\tNumber allowed IPs: {}.'.format(ctr))  # Answer: 113
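The Range, _parse_range, and _parse_intvls helpers are assumed above but not shown; a plausible sketch, assuming Range is a namedtuple with start a and end b, and that _parse_intvls merges overlapping or adjacent intervals in a sorted list:

from collections import namedtuple

Range = namedtuple('Range', ['a', 'b'])

def _parse_range(line):
    # "1753-1923" -> (1753, 1923)
    lo, hi = line.strip().split('-')
    return int(lo), int(hi)

def _parse_intvls(ranges):
    # merge overlapping or adjacent intervals; input must be sorted by start
    merged = [ranges[0]]
    for r in ranges[1:]:
        last = merged[-1]
        if r.a <= last.b + 1:
            merged[-1] = Range(last.a, max(last.b, r.b))
        else:
            merged.append(r)
    return merged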
def lstm_model_headline_body_combin(body_length, numb_epoch):
    fexc = Preprocessing()
    data = load_data()

    # Load train data from files
    data.set_path(path='fnc-1-master')
    train_stance_data = data.get_headline_body_stance()
    train_bodies_data = data.get_body_id_text()
    train_headlines, train_bodies, train_stances = data.get_mapped_id_body(
        train_stance_data, train_bodies_data)

    # Remove punctuation and stop words from the headline and body of train data
    train_headlines_cl = fexc.get_clean_data(train_headlines)
    train_bodies_cl = fexc.get_clean_data(train_bodies)
    train_stances_cl = fexc.get_clean_data(train_stances)

    # Convert labels to integers
    train_stances_in = fexc.convert_lable_int(train_stances_cl)

    # Load the test data
    data.set_name("test")
    test_stance_data = data.get_headline_body_stance()
    test_bodies_data = data.get_body_id_text()
    test_headlines, test_bodies = data.get_mapped_id_body(
        test_stance_data, test_bodies_data, data_type="test")

    # Remove punctuation and stop words from the headline and body of test data
    test_headlines_cl = fexc.get_clean_data(test_headlines)
    test_bodies_cl = fexc.get_clean_data(test_bodies)
    # test_headlines_cl = fexc.remove_stop_words_list(test_headlines_cl)
    test_bodies_cl = fexc.remove_stop_words_list(test_bodies_cl)

    # Fit the tokenizer on all text
    alltext = train_headlines_cl + train_bodies_cl + test_headlines_cl + test_bodies_cl
    token = Tokenizer(num_words=30000)
    token.fit_on_texts(alltext)
    print('Number of unique words: ' + str(len(token.word_index.keys())))
    word_index = token.word_index

    # Combine the headlines and bodies of the training data,
    # convert them to sequences and pad
    train_data = fexc.combine_heading_body(train_headlines_cl, train_bodies_cl)
    train_data = token.texts_to_sequences(train_data)
    train_data = pad_sequences(train_data,
                               maxlen=MAX_HEADLINE_LENGTH + int(body_length))

    # One-hot encode the labels
    onehotencoder = OneHotEncoder()
    train_stances_in = onehotencoder.fit_transform(train_stances_in).toarray()

    # Split into train and validation sets
    train_data, val_data, train_stances_final, stances_val = \
        train_test_split(train_data, train_stances_in,
                         test_size=0.2, random_state=42)

    # Same sequence/padding pipeline for the test data
    test_data = fexc.combine_heading_body(test_headlines_cl, test_bodies_cl)
    test_data = token.texts_to_sequences(test_data)
    test_data = pad_sequences(test_data,
                              maxlen=MAX_HEADLINE_LENGTH + int(body_length))

    # Load the GloVe embeddings and build the embedding matrix
    embeddings_index = models.get_embeddings_index(GLOVE_DIR)
    print('Found %s word vectors.' % len(embeddings_index))
    embedding_matrix = models.get_embedding_matrix(
        embedding_dim=EMBEDDING_DIM,
        embeddings_index=embeddings_index,
        word_index=word_index)

    # Build the model
    fake_nn = models.lstm_with_combine_headline_body(
        headline_length=MAX_HEADLINE_LENGTH,
        body_length=int(body_length),
        embedding_dim=EMBEDDING_DIM,
        word_index=word_index,
        embedding_matrix=embedding_matrix,
        activation='relu',
        drop_out=0.5,
        numb_layers=300,
        cells=200)

    # Early stopping and model checkpoint
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    bst_model_path = 'Fake_news_nlp.h5'
    model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True,
                                       save_weights_only=True)

    # Fit the model
    fake_hist = fake_nn.fit(train_data, train_stances_final,
                            batch_size=128,
                            epochs=int(numb_epoch),
                            shuffle=True,
                            validation_data=(val_data, stances_val),
                            callbacks=[early_stopping, model_checkpoint])

    # Store the training and validation accuracy and loss for plotting
    lstm_data = []
    with open(os.path.join(OBJECT_DUMP,
                           "lstm_headline_body_combine" + str(body_length) + ".txt"),
              'wb') as bow_hist:
        lstm_data.append(fake_hist.history['acc'])
        lstm_data.append(fake_hist.history['val_acc'])
        lstm_data.append(fake_hist.history['loss'])
        lstm_data.append(fake_hist.history['val_loss'])
        pickle.dump(lstm_data, bow_hist)

    # Predict the labels for the test data
    result = fake_nn.predict([test_data], batch_size=128)
    result_str = fexc.convert_lable_string(result)

    # Write the predictions to the result file
    with io.open(TEST_FILE, mode='r', encoding='utf8') as read_file:
        test_stance = csv.DictReader(read_file)
        with io.open(RESULT_FILE + "_" + str(body_length) + ".csv",
                     mode='w', encoding='utf8') as write_file:
            writer = csv.DictWriter(write_file,
                                    fieldnames=['Headline', 'Body ID', 'Stance'])
            writer.writeheader()
            for sample, prediction in zip(test_stance, result_str):
                writer.writerow({
                    'Body ID': sample['Body ID'],
                    'Headline': sample['Headline'],
                    'Stance': prediction
                })

    # Print the accuracy, competition score and confusion matrix
    print_result("fnc-1-master/competition_test_stances.csv",
                 RESULT_FILE + "_" + str(body_length) + ".csv")
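convert_lable_int and convert_lable_string belong to the project's Preprocessing class and are not shown; given the four FNC-1 stances, a plausible stand-in pair looks like this (illustrative only, not the project's actual code):

# Hypothetical stand-ins for fexc.convert_lable_int / convert_lable_string.
import numpy as np

STANCES = ['agree', 'disagree', 'discuss', 'unrelated']  # FNC-1 label set

def convert_lable_int(labels):
    # shape (n, 1) so OneHotEncoder can consume it directly
    return np.array([[STANCES.index(l)] for l in labels])

def convert_lable_string(one_hot_predictions):
    # argmax over the softmax output recovers the stance name
    return [STANCES[i] for i in np.argmax(one_hot_predictions, axis=1)]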
    x = add([Lambda(slice_last)(x), x_rnn])
    return x


if __name__ == '__main__':
    # Example usage
    import keras  # for keras.regularizers below
    from keras.layers import Input, Dense, Dropout
    from keras.models import Model
    from keras.callbacks import ReduceLROnPlateau
    from keras.optimizers import SGD
    from load_data import load_data
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import MinMaxScaler

    x, y = load_data()

    input = Input(shape=(200, 84))
    output = make_residual_lstm_layers(input, rnn_width=128, rnn_depth=4,
                                       rnn_dropout=0.4)
    output = Dropout(0.4)(output)
    output = Dense(2, activation='softmax',
                   kernel_regularizer=keras.regularizers.l2(0.02))(output)
    model = Model(inputs=input, outputs=output)
    # the source is cut off mid-call here; metrics=['accuracy'] is a
    # plausible completion of the compile arguments
    model.compile(optimizer=SGD(0.01, nesterov=True),
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
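The first line above is the tail of make_residual_lstm_layers; a reconstruction of the full helper, following a widely shared Keras residual-LSTM recipe rather than the author's confirmed code:

from keras.layers import LSTM, Lambda, add

def make_residual_lstm_layers(input, rnn_width, rnn_depth, rnn_dropout):
    """Stack LSTM layers with residual connections between them.
    Reconstruction of the helper whose last two lines appear above."""
    x = input
    for i in range(rnn_depth):
        return_sequences = i < rnn_depth - 1
        x_rnn = LSTM(rnn_width, dropout=rnn_dropout,
                     recurrent_dropout=rnn_dropout,
                     return_sequences=return_sequences)(x)
        if return_sequences:
            # add the residual connection only once the widths match
            x = add([x, x_rnn]) if i > 0 else x_rnn
        else:
            # the final LSTM returns one vector, so take the last timestep
            # of the previous sequence output before adding
            def slice_last(t):
                return t[..., -1, :]
            x = add([Lambda(slice_last)(x), x_rnn])
    return x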
args = paras()
args.train_path = 'data/train.csv'
args.dev_path = 'data/dev.csv'
args.test_path = in_path
args.to_test_path = 'data/to_test.csv'
args.w2v_model_path = 'data/w2v_train.save'
args.data_path = 'data/atec_nlp_sim_train.csv'
args.res_path = out_path

# load data
text_field, label_field, train_data, train_iter,\
    dev_data, dev_iter, test_data, test_iter = load_data(args)
# text_field.build_vocab(train_data, dev_data)

args.embed_num = 7563
args.embed_dim = 300
args.word_Embedding = True

# look up a pretrained vector for every vocabulary word,
# counting out-of-vocabulary words along the way
embedding_dict = Word2Vec.load(args.w2v_model_path)
word_vec_list = []
oov = 0
for idx, word in enumerate(text_field.vocab.itos):
    try:
        vector = np.array(embedding_dict[str(word.encode('utf-8'))],
                          dtype=float).reshape(1, args.embed_dim)
    except KeyError:
        # the source is cut off at this except; counting the word as
        # out-of-vocabulary is the usual fallback
        oov += 1
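The loop above is truncated at the except; a common continuation builds the full embedding matrix, giving out-of-vocabulary words a small random vector (the function name and the init range are assumptions, not from the source):

# Illustrative continuation: stack one row per vocabulary word into a
# matrix that can initialize the model's embedding layer.
import numpy as np

def build_embedding_rows(itos, embedding_dict, embed_dim):
    rows, oov = [], 0
    for word in itos:
        try:
            vec = np.array(embedding_dict[word], dtype=float).reshape(1, embed_dim)
        except KeyError:
            # unknown word: small uniform random vector
            vec = np.random.uniform(-0.25, 0.25, (1, embed_dim))
            oov += 1
        rows.append(vec)
    print('OOV words: {} / {}'.format(oov, len(itos)))
    return np.concatenate(rows, axis=0)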
import numpy as np
from sklearn import linear_model
from sklearn import kernel_ridge
from sklearn import svm

from load_data import load_data
from write_submission import write_submission
from expand_features import expand_features
# from matplotlib import pyplot as plt


def rmse(predictions, targets):
    return np.sqrt(((predictions - targets) ** 2).mean())


# load data
[Xtr, Ytr, Xte, testID] = load_data()

# expand features
Xtr_expanded = expand_features(Xtr)
Xte_expanded = expand_features(Xte)
print('Xtr shape', Xtr_expanded.shape,
      'Ytr shape', Ytr.shape,
      'Xte shape', Xte_expanded.shape)

# ridge regression with built-in cross-validation over the alphas
# (note: the normalize= argument was removed in scikit-learn 1.2,
# so this requires an older release)
clf = linear_model.RidgeCV(alphas=[1e-3, 1e-2, 1e-1], normalize=True,
                           store_cv_values=True).fit(Xtr_expanded, Ytr)
ridge_preds = clf.predict(Xtr_expanded)
print('RMSE Ridge (train):', rmse(ridge_preds, Ytr))
print(clf.alpha_)
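expand_features is project code that is not shown; a hypothetical stand-in using scikit-learn's PolynomialFeatures, which adds squares and pairwise products of the raw columns:

# Hypothetical stand-in for expand_features; the real helper is not shown.
from sklearn.preprocessing import PolynomialFeatures

def expand_features(X, degree=2):
    # append all monomials of the raw features up to the given degree
    return PolynomialFeatures(degree=degree, include_bias=False).fit_transform(X)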
import numpy

from load_data import load_data
from nn import nn
from ova import ova
from pca import pca
from rfe import rfe
from tsne import tsne

# numpy.set_printoptions(threshold=1000000)

(train_values, train_values_rfe, train_classes, train_classes_binary,
 test_values, test_values_rfe, test_classes, test_classes_binary,
 class_desc) = load_data()

# stack train and test splits for the unsupervised analyses below
all_values = numpy.concatenate((train_values, test_values))
all_classes = numpy.concatenate((train_classes, test_classes))
all_classes_binary = numpy.concatenate(
    (train_classes_binary, test_classes_binary))
all_values_rfe = numpy.concatenate((train_values_rfe, test_values_rfe))

# t-SNE on the training data
# tsne(train_values, train_classes, class_desc, 0)
import os

import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import roc_curve, auc
import tensorflow.keras.backend as K

np.random.seed(2020)

import hyper_params as hp
from dl_models import build_model_predict
from dl_models import build_model_ae
from load_data import load_data, impute_data, calc_impute_values

if __name__ == '__main__':
    # load the dataset
    hp = hp.create_hparams()
    op_mode = hp.op_mode
    print(hp.outcome)
    pdirname = os.path.dirname(__file__)
    clin_params, outcomes, patients_id = load_data(hp.outcome,
                                                   pdirname + hp.dataset_path)
    # imputation will change clin_params, so keep a separate copy
    orig_clin_params = clin_params

    # cross-validation: training/validation (70%) + test (30%)
    sss1 = StratifiedShuffleSplit(n_splits=1, test_size=0.30, random_state=0)
    # 0.28 x 0.7: validation (20%) + training (50%)
    sss2 = StratifiedShuffleSplit(n_splits=10, test_size=0.28, random_state=0)

    print('Positives = {:.1f}'.format(np.sum(outcomes))
          + ' Negatives = {:.1f}'.format(np.sum(1 - outcomes))
          + ' Total = {:.1f}'.format(len(outcomes)))

    ###########################################################################
    # Stage 1: pre-training of the autoencoder
    ###########################################################################
    if op_mode == 'pretrain':  # '==', not 'is': string identity is unreliable
        idx = 0
        # dummy for-loop: only one split (test vs train/valid) is performed
        for train_valid_index, test_index in sss1.split(clin_params, outcomes):
            # 70% of the data; the other 30% are held out entirely
            trv_params = orig_clin_params[train_valid_index]
            trv_outcomes = outcomes[train_valid_index]
            # loop over the 10-fold cross-validation splits
            for train_index, valid_index in sss2.split(trv_params, trv_outcomes):
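impute_data and calc_impute_values are imported but never shown; the usual pattern computes the imputation statistics on training rows only and applies them everywhere, e.g. this illustrative mean-imputation pair (not the project's actual code):

# Illustrative versions of calc_impute_values / impute_data. Computing
# the per-column means on the training split alone avoids leaking test
# statistics into the model.
import numpy as np

def calc_impute_values(params):
    # per-column mean, ignoring NaNs
    return np.nanmean(params, axis=0)

def impute_data(params, impute_values):
    out = params.copy()
    rows, cols = np.where(np.isnan(out))
    out[rows, cols] = impute_values[cols]
    return out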
import os
from os import path, makedirs

import pandas as pd
from xgboost import XGBClassifier
from scipy.sparse import hstack
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from azureml.logging import get_azureml_logger

import feature_engineering as fe
from load_data import load_data

# load data
(app_events, app_labels, events, gender_age_train, gender_age_test,
 label_categories, brand_model) = load_data()

# initialize logger
run_logger = get_azureml_logger()
run_logger.log("amlrealworld.distributed-tuning.single-vm", "true")

# joblib's default temporary directory is too small; point it elsewhere
os.environ["JOBLIB_TEMP_FOLDER"] = "/tmp"

#################################################################
# Feature engineering
#################################################################

# one-hot encoding of phone brand and model
train_brand, test_brand, train_model, test_model = fe.one_hot_brand_model(
    brand_model, gender_age_train, gender_age_test)

# weekday and hour features (also one-hot encoded)
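fe.one_hot_brand_model is project-specific; a hypothetical sketch with pandas, assuming brand_model carries 'device_id', 'phone_brand', and 'device_model' columns and that the train/test frames carry 'device_id':

# Hypothetical sketch of fe.one_hot_brand_model; the project's actual
# implementation is not shown.
import pandas as pd
from scipy.sparse import csr_matrix

def one_hot_brand_model(brand_model, train, test):
    brands = pd.get_dummies(brand_model.set_index('device_id')['phone_brand'])
    models = pd.get_dummies(brand_model.set_index('device_id')['device_model'])
    # align the dummy rows with the train/test device order; devices
    # missing from brand_model become all-zero rows
    train_brand = csr_matrix(brands.reindex(train['device_id']).fillna(0).values)
    test_brand = csr_matrix(brands.reindex(test['device_id']).fillna(0).values)
    train_model = csr_matrix(models.reindex(train['device_id']).fillna(0).values)
    test_model = csr_matrix(models.reindex(test['device_id']).fillna(0).values)
    return train_brand, test_brand, train_model, test_model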