def main(): """ Fit models and make predictions. We'll use one-hot encoding to transform our categorical features into binary features. y and X will be numpy array objects. """ model = linear_model.LogisticRegression(C=3) # the classifier we'll use # === load data in memory === # print "loading data" cwd = os.getcwd() trainDataLoc = cwd + '/../data/train.csv' testDataLoc = cwd + '/../data/test.csv' y, X = load_data(trainDataLoc) y_test, X_test = load_data(testDataLoc, use_labels=False) # === one-hot encoding === # # we want to encode the category IDs encountered both in # the training and the test set, so we fit the encoder on both encoder = preprocessing.OneHotEncoder() encoder.fit(np.vstack((X, X_test))) X = encoder.transform(X) # Returns a sparse matrix (see numpy.sparse) X_test = encoder.transform(X_test) # if you want to create new features, you'll need to compute them # before the encoding, and append them to your dataset after # === training & metrics === # mean_auc = 0.0 n = 10 # repeat the CV procedure 10 times to get more precise results for i in range(n): # for each iteration, randomly hold out 20% of the data as CV set X_train, X_cv, y_train, y_cv = cross_validation.train_test_split( X, y, test_size=.20, random_state=i * SEED) # if you want to perform feature selection / hyperparameter # optimization, this is where you want to do it # train model and make predictions model.fit(X_train, y_train) preds = model.predict_proba(X_cv)[:, 1] # compute AUC metric for this CV fold fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds) roc_auc = metrics.auc(fpr, tpr) print "AUC (fold %d/%d): %f" % (i + 1, n, roc_auc) mean_auc += roc_auc print "Mean AUC: %f" % (mean_auc / n) # === Predictions === # # When making predictions, retrain the model on the whole training set model.fit(X, y) preds = model.predict_proba(X_test)[:, 1] #filename = raw_input("Enter name for submission file: ") filename = 'LogisticRegressionResults' save_results(preds, filename + ".csv")
def main():
    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'
    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    encoder = preprocessing.OneHotEncoder()
    encoder.fit(np.vstack((X, X_test)))
    X = encoder.transform(X)  # Returns a sparse matrix (see numpy.sparse)
    X_test = encoder.transform(X_test)

    # model = findBestModel(X, y)  Best model is rbf, gamma = 1, c = 1
    X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(
        X, y, test_size=.20, random_state=SEED)
    model = svm.SVC(C=1, probability=True, kernel='rbf', gamma=1)
    model.fit(X_train, y_train)
    preds = model.predict_proba(X_cv)[:, 1]

    # compute AUC metric for this CV fold
    fpr, tpr, thresholds = metrics.roc_curve(y_cv, preds)
    roc_auc = metrics.auc(fpr, tpr)
    print "AUC : %f" % (roc_auc)

    preds = model.predict_proba(X_test)[:, 1]
    save_results(preds, "SVM_classifier.csv")
def solve_challenge(packed_challenge, key, mac_key):
    """ Solve a challenge that was produced by generate_challenge with the
        given key and mac_key. Raises InvalidSignature in the event of a
        message authentication code mismatch. """
    mac, hash_function, package = load_data(packed_challenge)
    if verify_mac(mac_key, package, mac, hash_function):
        challenge, bytes_per_hash, unencrypted_data = load_data(package)
        return (decrypt(challenge, key, hmac_factory(hash_function),
                        getattr(hashlib, hash_function)().digest_size,
                        output_block_size=bytes_per_hash),
                unencrypted_data)
    else:
        raise InvalidSignature("Message authentication code mismatch")
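# Hedged usage sketch for solve_challenge above. generate_challenge is known
# only from the docstring, so treating it as taking (key, mac_key) and
# returning the packed challenge is an assumption; the key values below are
# placeholders, not values from the original project.
key = b"sixteen byte key"          # hypothetical encryption key
mac_key = b"another secret key"    # hypothetical MAC key
packed_challenge = generate_challenge(key, mac_key)   # assumed counterpart API
answer, unencrypted_data = solve_challenge(packed_challenge, key, mac_key)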
def run_main():
    """
    Main function
    """
    # number of clusters
    n_cluster = 8

    # convergence threshold
    cutoff = 0.002

    samples = load_data()
    clusters = kmeans(samples, n_cluster, cutoff)

    # print the results
    # for i, c in enumerate(clusters):
    #     for sample in c.samples:
    #         print('cluster--{}, sample--{}'.format(i, sample))

    # visualise the results
    plt.subplot()
    color_names = list(mcolors.cnames)
    for i, c in enumerate(clusters):
        x = []
        y = []
        # random.choice
        color = [color_names[i % 100 + 10]] * len(c.samples)
        for sample in c.samples:
            x.append(sample.coords[2])
            y.append(sample.coords[1])
        plt.scatter(x, y, c=color)
    plt.show()
def compile_jets(data_path, n_events, p_granularity, q_granularity, batch_size):
    # Load in jets from file
    [
        daughters, endings, mothers,
        (discrete_p_splittings, discrete_q_splittings), mother_momenta
    ] = load_data(data_path,
                  n_events=n_events,
                  batch_size=batch_size,
                  split_p_q=True,
                  p_granularity=p_granularity,
                  q_granularity=q_granularity)

    # this unpacking is necessary to remove it from the tuple and put it into a
    # list
    x = [[*a] for a in zip(daughters, mother_momenta,
                           [m[1] for m in mothers],
                           [q[0] for q in discrete_q_splittings])]

    # temporary hack having to do with mask values; this will change later.
    for i in range(len(mothers)):
        mothers[i][0][mothers[i][0] == -1] = 0
        discrete_p_splittings[i][0][discrete_p_splittings[i][0] ==
                                    p_granularity**4] = 0
        discrete_q_splittings[i][0][discrete_q_splittings[i][0] ==
                                    q_granularity] = 0

    y = [[*a] for a in zip([e[0] for e in endings],
                           [m[0] for m in mothers],
                           [d[0] for d in discrete_p_splittings],
                           [q[0] for q in discrete_q_splittings])]

    return x, y
def __init__(self, batch_size, seed=1234):
    """
    Inputs:
      batch_size: int
      seed: int, random seed
    """
    self.batch_size = batch_size
    self.random_state = np.random.RandomState(seed)
    self.validate_random_state = np.random.RandomState(0)

    # Load data
    (self.train_x, self.train_y, self.validate_x, self.validate_y, _,
     _) = load_data()

    print(self.train_x.shape, self.validate_x.shape)

    self.train_audio_names = np.arange(len(self.train_x))
    self.validate_audio_names = np.arange(len(self.validate_x))

    # Calculate scalar
    (self.mean, self.std) = calculate_scalar(self.train_x)
def test_project_extent(self):
    """Test project_extent"""
    layer = load_data('point-nyc.shp')
    QgsMapLayerRegistry.instance().addMapLayer(layer)
    geojson = projestions_geoms.project_extent()
    self.assert_valid_extent(geojson)
def get_key_ideas(pos, lang, patterns_path):
    # load the patterns
    patterns = load_data(patterns_path)
    patterns = patterns["default_patterns_" + lang]
    # exceptions = patterns["default_exceptions_"+lang]

    # get the key ideas
    key_ideas = []
    tokens = [w[0] for w in pos]
    words = []
    for w in pos:
        if w[1][0] in ['J', 'V', 'N']:
            words.append(w[1][0])
        else:
            words.append(w[0])
    # patterns = [x[0] for x in self.patterns]
    for p in patterns:
        start = search(words, p)
        if start:
            key_idea = tokens[start:start + len(p)]
            key_ideas.append(key_idea)
    return key_ideas
def process_documents(source_path, target_path, processes=None, content_index=0):
    """
    Pre-processes all documents within a CSV file.

    :param Path source_path: Filename for the source CSV file
    :param Path target_path: Filename for destination file
    :param list processes: List of pre-processing functions,
        (document_content) -> (value, modified_content)
    :param int content_index: Index of the document content
    """
    data = load_data(source_path, index_col=None).values[:, content_index]

    if processes is None:
        processes = standard_processes

    processor = partial(apply_process, processes=processes)

    workers = Pool(n_threads)                  # Define workers
    documents = workers.map(processor, data)   # Apply processing
    workers.close()                            # Close document queue
    workers.join()                             # Wait for processes to finish

    contexts, indexes = split_into_contexts(documents)
    save_contexts(contexts, indexes, target_path)
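# Hedged usage sketch for process_documents above. The file names are
# hypothetical; passing processes=None falls back to standard_processes,
# exactly as the function does.
from pathlib import Path

process_documents(
    source_path=Path("data/raw_documents.csv"),  # hypothetical input CSV
    target_path=Path("data/contexts.pkl"),       # hypothetical output file
    processes=None,                              # use standard_processes
    content_index=0,                             # document text is in column 0
)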
def load(self):
    # Load labels
    da_labels = utils.load_labels(self.data_path, self.da_labels_file)
    ap_labels = utils.load_labels(self.data_path, self.ap_labels_file)

    # If unable to load labels inform user and exit
    if not da_labels and not ap_labels:
        print("Unable to load label lists...Exiting program.")
        exit()

    # Load default data
    default_data = utils.load_data(self.data_path, 'default')

    # Load JSON file
    data = utils.load_data(self.data_path, self.dialogue_file)

    # If file is not valid or invalid JSON
    if not data:
        # Try default data
        if default_data:
            data = default_data
        # Else exit
        else:
            print("Unable to load default JSON data...Exiting program.")
            exit()

    # Create dialogue object
    dialogues = utils.load_dialogues(data)

    # If JSON is not valid or keys missing
    if not dialogues:
        # Try default dialogues
        default_dialogues = utils.load_dialogues(default_data)
        if default_dialogues:
            # TODO popup to tell user loading default data.
            dialogues = default_dialogues
        # Else exit
        else:
            print("Unable to load default JSON data...Exiting program.")
            exit()

    # Create the dialogue model
    model = DialogueModel(data['dataset'], ap_labels, da_labels, dialogues)

    return model
def test_get_layer_geom(self):
    """Test layer_geom"""
    layer = load_data('point-nyc.shp')
    geojson = projestions_geoms.layer_geom(layer)
    self.assertNotEqual(geojson, '')
    geojson = json.loads(geojson)
    self.assertEqual(geojson['type'], 'FeatureCollection')
def test_get_layer_geom_large(self):
    """Test layer_geom with many features"""
    layer = load_data('many-points-nyc.shp')
    geojson = projestions_geoms.layer_geom(layer)
    self.assertNotEqual(geojson, '')
    geojson = json.loads(geojson)
    self.assertEqual(geojson['type'], 'FeatureCollection')
    self.assertLessEqual(len(geojson['features']),
                         settings.PROJESTIONS_MAX_FEATURES)
def __init__(self, batch_size): """Data generator for test data. """ super(TestDataGenerator, self).__init__(batch_size=batch_size) # Load test data (_, _, _, _, self.test_x, self.test_y) = load_data() self.test_audio_names = np.arange(len(self.test_x))
def test_map_canvas_extent(self):
    """Test map_canvas_extent"""
    layer = load_data('point-nyc.shp')
    QgsMapLayerRegistry.instance().addMapLayer(layer)
    iface = QGIS_APP[2]
    mapCanvas = iface.mapCanvas()
    mapCanvas.zoomToFullExtent()
    geojson = projestions_geoms.map_canvas_extent(mapCanvas)
    self.assert_valid_extent(geojson)
def main():
    cwd = os.getcwd()
    trainDataLoc = cwd + '/../data/train.csv'
    testDataLoc = cwd + '/../data/test.csv'
    y, X = load_data(trainDataLoc)
    y_test, X_test = load_data(testDataLoc, use_labels=False)

    clf = xgb.XGBClassifier(max_depth=15,
                            n_estimators=200,
                            learning_rate=.4,
                            colsample_bytree=.8,
                            seed=SEED)

    # fitting
    clf.fit(X, y,
            early_stopping_rounds=100,
            eval_metric="logloss",
            eval_set=[(X_test, y_test)])

    # print y_pred
    preds = clf.predict_proba(X_test)[:, 1]
    save_results(preds, "XGBoost_classifier.csv")
def run_analysis(dset):
    ids, X, y = load_data(dset)
    X = DataClean([["[^a-z]", " "],
                   [" [ ]+", " "], ], html_clean=True).fit(X).transform(X)
    labels = list(set(y))
    for label in labels:
        Xlabel = X[y == label]
        Xlabel_str = ' '.join(Xlabel.tolist())
        generate_wordcloud(Xlabel_str, label, dset, "white")
        generate_wordcloud(Xlabel_str, label, dset, "black")
        print "Label %d : %s" % (label, Xlabel[0])
def Main():
    train_loader, validation_loader, test_loader = utilities.load_data(
        directory)
    model, optimizer, criterion = utilities.net_setup(structure, dropout,
                                                      hidden_layer1, lr,
                                                      device)
    utilities.train_network(model, optimizer, criterion, epochs, 20,
                            train_loader, device)
    utilities.save_checkpoint(path, structure, hidden_layer1, dropout, lr)
    print(
        "**************Training Complete !! Thanks for the patience******************"
    )
def main(diff_path_neg, diff_path_pos, adv_path_neg, adv_path_pos,
         ind_neg_path, ind_pos_path):
    diff_neg = np.load(diff_path_neg)
    neg_ind = np.load(ind_neg_path)
    l1_norm_neg = cal_l1(diff_neg[neg_ind])
    l2_norm_neg = cal_l2(diff_neg[neg_ind])
    l_inf_neg = cal_l_inf(diff_neg[neg_ind])

    diff_pos = np.load(diff_path_pos)
    pos_ind = np.load(ind_pos_path)
    l1_norm_pos = cal_l1(diff_pos[pos_ind])
    l2_norm_pos = cal_l2(diff_pos[pos_ind])
    l_inf_pos = cal_l_inf(diff_pos[pos_ind])

    print neg_ind.shape[0], "negative adversarial samples have been made"
    print pos_ind.shape[0], "positive adversarial samples have been made"
    print "l1 norm of negative samples is:", l1_norm_neg
    print "l_inf norm of negative samples is:", l_inf_neg
    print "l_2 norm of negative samples is:", l2_norm_neg
    print "l1 norm of positive samples is:", l1_norm_pos
    print "l_inf norm of positive samples is:", l_inf_pos
    print "l_2 norm of positive samples is:", l2_norm_pos

    X_pos, Y_pos, X_neg, Y_neg = load_data()
    neg_cor_index = np.load('./neg_cor_index.npy')
    pos_cor_index = np.load('./pos_cor_index.npy')
    X_pos = X_pos[pos_cor_index]
    X_neg = X_neg[neg_cor_index]

    xadv_pos = np.load(adv_path_pos)[pos_ind]
    pdf_pos = cal_normal_pdf(X_pos, xadv_pos)
    stand_pdf = cal_normal_pdf(X_pos, X_pos)
    print "Gaussian Observation: pdf mean of positive samples is ", pdf_pos.mean()
    a = pdf_pos - stand_pdf.mean() > 0
    print "Gaussian Observation: prob that pdf of positive samples is higher than standard pdf mean is ", a.mean()

    xadv_neg = np.load(adv_path_neg)[neg_ind]
    pdf_neg = cal_normal_pdf(X_neg, xadv_neg)
    stand_pdf = cal_normal_pdf(X_neg, X_neg)
    print "Gaussian Observation: pdf mean of negative samples is ", pdf_neg.mean()
    a = pdf_neg - stand_pdf.mean() > 0
    print "Gaussian Observation: prob that pdf of negative samples is higher than standard pdf mean is ", a.mean()

    sess = tf.Session()
    print "KL Divergence of positive samples is :", sess.run(KL(X_pos, xadv_pos))
    print "KL Divergence of negative samples is :", sess.run(KL(X_neg, xadv_neg))
def main():
    # Run load_data with the command line file path
    trainloader, testloader, validloader = utilities.load_data(root)

    # Run network_setup with the command line structure, dropout, hidden
    # layer size, and learning rate
    model, criterion, optimizer = utilities.network_setup(
        structure, dropout, hiddenlayer1, learnrate)

    # Run the deep_learning training function with the model, criterion, and
    # optimizer from network_setup and the command line arguments
    utilities.deep_learning(model, criterion, optimizer, trainloader, epochs,
                            40)

    # Save a checkpoint of the trained model for later use.
    utilities.save_checkpoint(model, path, structure, hiddenlayer1, dropout,
                              learnrate)
    print("Training complete. Model saved at {}".format(path))
def make_analyze():
    try:
        # Load the data
        data = request.get_json()
    except Exception as e:
        raise e

    if data == {}:
        return (bad_request())
    else:
        # Get the text and the language
        try:
            lang = data['lang']
        except:
            try:
                lang = detect_language(data['text'])
                print(lang)
            except:
                responses = jsonify(
                    "Error in analyze: language field is missing")
                return responses
        try:
            text = data['text']  # we assume text is tokenized
        except:
            responses = jsonify("Error in analyze: text is missing")
            return responses
        if lang not in ['en', 'es', 'ar', 'ro', 'fr']:
            responses = jsonify(
                message="Language not available. Language must be in "
                        "['en','es','ar','ro','fr']")
            return responses

        filename = os.path.join(os.path.dirname(__file__),
                                'models-registry.json')
        registry = load_data(filename)

        analysis = analyze(text, lang, registry)
        # print(analysis[0])

        # Send the response codes
        responses = jsonify(concepts=analysis[0],
                            key_ideas=analysis[1],
                            topics=analysis[2])
        responses.status_code = 200
        return responses
def get_topics(text, lang, topics_path):
    # initialization
    embeddings = Embeddings(emb_dict[lang])

    # get the topics dictionary from the path
    topics_dicts = load_data(topics_path)
    topics_dict = topics_dicts[lang]
    topics = list(topics_dict.keys())
    if lang == 'en':
        # cl = 0.7  # when a topic is "close"
        cl = 0.5
    else:
        cl = 0.5

    # now vectorize the topics
    vect_dict_topics = [
        (w, np.mean(to_vector_single_nonzeros(topics_dict[w], embeddings,
                                              len(topics_dict[w])), axis=0))
        for w in topics
    ]
    # print(vect_dict_topics)

    # get topics
    assigned_topics = []
    dists = []
    if len(to_vector_single_nonzeros(text, embeddings, len(text))) > 0:
        vectorized_text = np.mean(to_vector_single_nonzeros(
            text, embeddings, len(text)), axis=0)
    else:
        vectorized_text = np.zeros((300, ) * 1)
    for v in vect_dict_topics:
        dists.append(spatial.distance.cosine(
            vectorized_text, v[1]))  # measure distance to all topics
    good_topics = [
        topics[i].upper() for i in range(len(topics)) if dists[i] < cl
    ]  # choose close topics
    if not good_topics:
        good_topics.append('OTHER')
    # assigned_topics.append(topic)
    assigned_topics.append(good_topics)
    return assigned_topics
def main():
    # Get CLI arguments
    args = get_input_args()

    # Prep data
    train_transform = utilities.transform_data('train')
    test_transform = utilities.transform_data('test')

    # Dataloaders
    trainloader = utilities.load_data(args.data_directory + '/' + 'train',
                                      train_transform)
    validationloader = utilities.load_data(args.data_directory + '/' + 'valid',
                                           test_transform)

    # Setup and train model
    model, optimizer, criterion = functions.model_setup(
        args.arch, args.hidden_units, args.learning_rate)
    trained_model = functions.train_model(optimizer, criterion, model,
                                          trainloader, validationloader,
                                          args.gpu, args.epochs)

    # Save the model
    functions.save_checkpoint(trained_model, args.save_dir)
def main():
    model_file_path = "output" + os.sep + "linear_regression_model_mv.sav"
    ignored_columns = ['ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX',
                       'PIRATIO', 'B', 'LSTAT']

    X, Y = load_data('input' + os.sep + 'housing.csv', False, ignored_columns)
    X = preprocess(X, "normalize")
    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    train(X_train, y_train, model_file_path)
    y_predicted = predict(X_test, model_file_path)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)
    print("rmse ratio:", rmse_ration)
def solve_with_options(algorithm_to_run, seed, run_time, inst):
    print(
        f'''Running algorithm {algorithm_to_run} on file {inst} with a time limit of {run_time} seconds and a random seed of {seed}'''
    )
    np.random.seed(seed)
    random.seed(np.random.randint(999999))
    instance_name, city_data = load_data(inst)
    tracer = Tracer(method=algorithm_to_run,
                    instance=instance_name,
                    seed=seed,
                    cutoff=run_time)
    score, solution = None, None
    if algorithm_to_run == 'LS1':
        score, solution = genetic_algorithm.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'BnB':
        score, solution = BnB.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'LS2':
        score, solution = two_opt.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'LS3':
        score, solution = genetic_algorithm_opt_2_hybrid.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)
    elif algorithm_to_run == 'Approx':
        score, solution = nearest_neighbor.solve(
            data=city_data,
            timer=early_stop_checker(seconds=run_time),
            tracer=tracer)

    if not os.path.exists('output'):
        os.makedirs('output')
    save_solution_file(score,
                       solution,
                       method=algorithm_to_run,
                       instance=instance_name,
                       seed=seed,
                       cutoff=run_time)
    tracer.write_to('output/')
def main(input_path, output_path, ignored_columns, preprocess_type,
         training_data_rate, step_length, threshold_rate, max_loop_num,
         dynamic_step):
    print("input:", input_path)
    print("output:", output_path)
    print("\n")
    if ignored_columns is not None:
        print("ignored_columns:", ignored_columns)
        print("\n")
    print("preprocess_type:", preprocess_type)
    print("training_data_rate:", training_data_rate)
    print("\n")
    print("threshold_rate:", threshold_rate)
    print("max_loop_num:", max_loop_num)
    print("step_length:", step_length)
    if dynamic_step:
        print("dynamic stepping ...")
    else:
        print("static stepping ...")
    print("\n")

    start_time = datetime.now()

    X, Y = load_data(input_path, True, ignored_columns)
    X = preprocess(X, preprocess_type)
    X_train, y_train, X_test, y_test = split_dataset(X, Y, training_data_rate)

    threshold = gen_threshold(Y, threshold_rate)
    train(X_train, y_train, output_path, step_length, threshold, max_loop_num,
          dynamic_step)

    Y_pred = predict(output_path, X_test)
    rmse_ration = calculate_rmse_ration(y_test, Y_pred)
    print("rmse ratio (rmse / y_mean) is:", rmse_ration, "\n")

    end_time = datetime.now()
    execution_duration = end_time - start_time
    print("execution duration:", execution_duration, "\n")

    return
def test(model, directory):
    """
    This command loads the images from the given directory and evaluates a model.

    'model1' corresponds to the linear sklearn model
    'model2' corresponds to the linear tensorflow model
    'model3' corresponds to the lenet tensorflow model

    :param model: the model to be used: 'model1', 'model2', or 'model3'
    :param directory: the directory where images are saved
    :return:
    """
    data, labels, _, one_hot_labels = load_data(directory, IMAGE_EXTENSION)
    data_reshaped = data.reshape((data.shape[0], 3072))
    if model == MODEL1:
        sk_linear.predict(data_reshaped, MODEL1_PATH, labels)
    elif model == MODEL2:
        tf_linear.predict(data_reshaped, MODEL2_PATH, Y_test=one_hot_labels)
    elif model == MODEL3:
        tf_lenet.predict(data, MODEL3_PATH, Y_test=one_hot_labels)
def predict():
    # load the saved model
    classifier = pickle.load(open('best_model.pkl'))

    # compile a predictor function
    predict_model = theano.function(inputs=[classifier.input],
                                    outputs=classifier.predicted_label)

    # We can test it on some examples from the test dataset
    dataset = '/home/tao/Projects/machine-learning/data/mnist.pkl.gz'
    datasets = load_data(dataset)
    test_set_x, test_set_y = datasets[2]
    test_set_x = test_set_x.get_value()

    predicted_values = predict_model(test_set_x[:10])
    print("Predicted values for the first 10 examples in test set:")
    print(predicted_values)
    print("Ground truth label values for the first 10 examples in test set:")
    print(test_set_y.eval()[:10])
def load_port(plot_comps=True, return_raw=False):
    stem = "Data Sets\\Daily_portfolio\\"
    # names = {"GBPEUR=X.csv": ['Adj Close'],
    #          "GBPJPY=X.csv": ['Adj Close'],
    #          "GBPNZD=X.csv": ['Adj Close'],
    #          "GBPUSD=X.csv": ['Adj Close'],
    #          "AAPL.csv": ['Adj Close'],
    #          }
    names = {
        "Crude.csv": ['Adj Close'],
        "TOT.csv": ['Adj Close'],
        "CVX.csv": ['Adj Close'],
        # "Gold.csv": ['Adj Close'],
        "AAPL.csv": ['Adj Close'],
        "INTC.csv": ['Adj Close'],
        "AMD.csv": ['Adj Close'],
        # "W=F.csv": ['Adj Close'],
    }

    data_frame = load_data(stem, names)

    # Take only series values from the data frame
    data = data_frame.values[1:, :].astype('float')
    # data_pos = np.where(data <= 0, 0.05, data)
    data_pos = np.abs(data)

    # Take difference
    data_returns = np.log(data_pos[:, 1:]) - np.log(data_pos[:, :-1])

    # Take the dates from the data frame for plotting
    dates = data_frame.values[0, 1:]

    if plot_comps:
        plot_components(data_returns, dates=dates, global_lims=[-0.2, 0.2])

    if return_raw:
        return data_returns, dates, data_pos
    else:
        return data_returns, dates
def main():
    ignored_columns = [
        'ZN', 'CHAS', 'NOX', 'RM', 'DIS', 'RAD', 'TAX', 'PIRATIO', 'B', 'LSTAT'
    ]

    X, Y = load_data('input' + os.sep + 'housing.csv', True, ignored_columns)
    X = preprocess(X, "normalize")
    X_train, y_train, X_test, y_test = split_dataset(X, Y)

    path = 'output' + os.sep + 'lsm_multivariant.csv'
    lsm(X_train, y_train, path)
    y_predicted = predict(path, X_test)

    rmse_ration = calculate_rmse_ration(y_test, y_predicted)
    print("rmse ratio:", rmse_ration)

    return
def test_RegressionOnSubset(self):
    Xtrain, ytrain, Xtest, ytest = utilities.load_data()
    columns = ['Longitude', 'Latitude']

    est = ensemble.RandomForestRegressor()
    est.fit(Xtrain, ytrain)
    predict_est = est.predict(Xtest)
    mad_est = np.mean(np.abs(predict_est - ytest))
    msd_est = np.mean(np.square(predict_est - ytest))

    meta_est = utilities.RegressionOnSubset(est, columns)
    pipe = pipeline.Pipeline([('RegressionOnSubset', meta_est),
                              ('Regression', ensemble.RandomForestRegressor())])
    pipe.fit(Xtrain, ytrain)
    predict_pipe = pipe.predict(Xtest)
    mad_pipe = np.mean(np.abs(predict_pipe - ytest))
    msd_pipe = np.mean(np.square(predict_pipe - ytest))

    self.assertTrue(mad_pipe < mad_est)
    self.assertTrue(msd_pipe < msd_est)
def load_oil(plot_comps=True, return_raw=False):
    stem = "Data Sets\\Oil\\"
    names = {
        "BP.L.csv": ['Adj Close'],
        "CVX.csv": ['Adj Close'],
        "OGZPY.csv": ['Adj Close'],
        "PBR.csv": ['Adj Close'],
        # "PSX.csv": ['Adj Close'],
        "RDSA.L.csv": ['Adj Close'],
        "SLB.csv": ['Adj Close'],
        "TOT.csv": ['Adj Close'],
        "XOM.csv": ['Adj Close'],
        "Crude.csv": ['Adj Close'],
    }

    data_frame = load_data(stem, names)

    # Take only series values from the data frame
    data = data_frame.values[1:, :].astype('float')
    # data_pos = np.where(data <= 0, 0.05, data)
    data_pos = np.abs(data)

    # Take difference
    data_returns = np.log(data_pos[:, 1:]) - np.log(data_pos[:, :-1])

    # Calculate the number of time series
    num_series = len(data[:, 0])

    # Take the dates from the data frame for plotting
    dates = data_frame.values[0, 1:]

    if plot_comps:
        plot_components(data_returns, dates=dates, global_lims=[-0.2, 0.2])

    if return_raw:
        return data_returns, dates, data_pos
    else:
        return data_returns, dates
def train(model, directory):
    """
    This command loads the images from the given directory and trains a chosen model.

    'model1' corresponds to the linear sklearn model
    'model2' corresponds to the linear tensorflow model
    'model3' corresponds to the lenet tensorflow model

    :param model: the model to be used: 'model1', 'model2', or 'model3'
    :param directory: the directory where the training data is saved
    :return:
    """
    data, labels, class_weights, one_hot_labels = load_data(
        directory, IMAGE_EXTENSION)
    data_reshaped = data.reshape((data.shape[0], 3072))
    if model == MODEL1:
        sk_linear.train(data_reshaped, labels, MODEL1_PATH)
    elif model == MODEL2:
        tf_linear.model(data_reshaped, one_hot_labels, MODEL2_PATH)
    elif model == MODEL3:
        tf_lenet.model(data,
                       one_hot_labels,
                       epochs=400,
                       class_weights=class_weights,
                       model_path=MODEL3_PATH)
def deserialize(stream):
    sub_structs, packed_structure = utilities.load_data(stream)
    sub_structs = ast.literal_eval(sub_structs)
    struct = unpack_structure(packed_structure)
    _type, count = struct.__class__.__name__.split('_', 1)
    if _type == "dict":
        output = {}
        for attribute, __type in struct._fields_:
            output[attribute] = getattr(struct, attribute)
    elif _type == "tuple":
        output = tuple(getattr(struct, attribute)
                       for attribute, __type in struct._fields_)
    elif _type == "list":
        output = list(getattr(struct, attribute)
                      for attribute, __type in struct._fields_)
    else:
        raise ValueError()

    for key in sub_structs:
        print "Deserializing: "
        print
        print output[key]
        output[key] = deserialize(output[key])
    return output
def refresh(self, instance):
    print('The button <refresh> is being pressed')

    # Load JSON file
    data = utils.load_data(self.data_path, self.dialogue_file)

    # Get the current dialogue's id
    target_id = self.model.current_dialogue.dialogue_id

    # Loop over the dialogues and utterances in the data
    for dialogue in data['dialogues']:

        # If the id's match get the utterances
        if dialogue['dialogue_id'] == target_id:

            utterances = []
            for utterance in dialogue['utterances']:

                # Create a new utterance
                tmp_utterance = Utterance(utterance['text'],
                                          utterance['speaker'])

                # Set utterance labels if not blank
                # (compare strings with != rather than 'is not')
                if utterance['ap_label'] != "":
                    tmp_utterance.set_ap_label(utterance['ap_label'])
                if utterance['da_label'] != "":
                    tmp_utterance.set_da_label(utterance['da_label'])

                # Add to utterance list
                utterances.append(tmp_utterance)

            # Update current dialogue with the utterances
            self.model.current_dialogue.set_utterances(utterances)
            break

    # Update dialogue_box
    self.update_dialogue()
def unpack_structure(packed_data):
    name, fields, packed_bytes = utilities.load_data(packed_data)
    print "\nUnpacking structure", packed_data
    print
    print "Name: ", name
    print "Fields: ", fields
    print "Packed data: ", packed_bytes
    fields = ast.literal_eval(fields)
    format_characters = ''.join(_type for name, _type in fields)
    print "Extracting c types from format characters: ", format_characters
    c_types = get_ctypes_from_format(format_characters)
    print "Got c types: ", c_types
    fields = [(field_info[0], c_types[index])
              for index, field_info in enumerate(fields)]
    print "Unpacking fields: ", format_characters, len(packed_bytes), packed_bytes
    # fields = [(name, format_to_type[character]) for name, character in fields]
    values = struct.unpack(format_characters, packed_bytes)
    _values = []
    for value, _type in zip(values, (field[1] for field in fields)):
        if _type == ctypes.c_void_p:
            _values.append(None)
        else:
            _values.append(value)
    struct_type = new_struct_type_from_ctypes(name, *fields)
    return struct_type(*values)
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

import utilities

# Load data from input file
X = utilities.load_data('data_multivar.txt')

# Estimating the bandwidth
bandwidth = estimate_bandwidth(X, quantile=0.1, n_samples=len(X))

# Compute clustering with MeanShift
meanshift_estimator = MeanShift(bandwidth=bandwidth, bin_seeding=True)
meanshift_estimator.fit(X)
labels = meanshift_estimator.labels_
centroids = meanshift_estimator.cluster_centers_
num_clusters = len(np.unique(labels))

print "Number of clusters in input data =", num_clusters

###########################################################
# Plot the points and centroids
import matplotlib.pyplot as plt
from itertools import cycle

plt.figure()

# specify marker shapes for different clusters
markers = '.*xv'
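# The snippet above is cut off right after the marker string. A minimal sketch
# of the per-cluster scatter loop that the markers are typically used for
# (variable names taken from above; the exact plotting style in the original
# is an assumption):
for i, marker in zip(range(num_clusters), cycle(markers)):
    # plot the points that belong to the current cluster
    plt.scatter(X[labels == i, 0], X[labels == i, 1], marker=marker, color='k')

    # plot the centroid of the current cluster
    centroid = centroids[i]
    plt.plot(centroid[0], centroid[1], marker='o', markerfacecolor='k',
             markeredgecolor='k', markersize=15)

plt.title('Clusters and their centroids')
plt.show()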
        scores_normalized = []
        num_labels = len(self.labels)
        for score in scores:
            norm_score = float((score - scores_min)) / (scores_max - scores_min)
            if norm_score == 1.0:
                norm_score -= 0.001
            elif norm_score == 0.0:
                norm_score += 0.001
            scores_normalized.append(norm_score)
        ypred = [self.labels[int(floor(score * num_labels))]
                 for score in scores_normalized]
        return ypred


if __name__ == '__main__':
    ids, X, y = load_data("cornell")
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            ["[^a-z]", " "],    # only letters
            [" [ ]+", " "],     # remove extra spaces
        ], html_clean=True)),
        ('classifier', DictSimple()),
    ])
    cross_validate((X, y), pipeline, accuracy_score)
    # Cornell
    # accuracy_score : 0.357580308161 +/- 0.156942834821
    # Confusion Matrix
    # [[  1.74000000e+02   4.14600000e+03   2.67400000e+03   7.30000000e+01
    #     5.00000000e+00]
    #  [  1.95000000e+02   1.44850000e+04   1.22810000e+04   2.97000000e+02
                nwords += 1
            except:
                continue
        return feat_vect / nwords

    def transform(self, X):
        Xtf = np.vstack([self.sentence2vector(x) for x in X])
        return Xtf

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)


if __name__ == '__main__':
    _, unlabelledData = load_data("unsupervised")
    ids, X, y = load_data("stanford")
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            ["[^a-z]", " "],    # only letters
            [" [ ]+", " "],     # remove extra spaces
        ], html_clean=False)),
        ('w2v', Glove2AverageVector(data_src=unlabelledData)),
        ('classifier', RandomForestClassifier(n_estimators=100))
    ])
    cross_validate((X, y), pipeline, accuracy_score)
    # num_features=100,window=10,learning_rate=0.05,epochs=10
    # Stanford
    # NB
    # accuracy_score : 0.72772 +/- 0.00562665086886
# ^_^ coding:utf-8 ^_^

import numpy as np
import matplotlib.pyplot as plt
from sklearn import svm, grid_search, cross_validation
from sklearn.metrics import classification_report

import utilities

# Load the data
input_file = 'data_multivar.txt'
X, y = utilities.load_data(input_file)

# Split the dataset
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    X, y, test_size=0.25, random_state=5)

# Set the parameter grid searched by cross-validation
parameter_grid = [
    {'kernel': ['linear'], 'C': [1, 10, 50, 600]},
    {'kernel': ['poly'], 'degree': [2, 3]},
    {'kernel': ['rbf'], 'gamma': [0.01, 0.001], 'C': [1, 10, 50, 600]},
]

# Define the metrics to use
metrics = ['precision', 'recall_weighted']

# Search for the optimal parameters for each metric
for metric in metrics:
    print(u"Searching for the optimal parameters for metric {}:".format(metric))

    classifier = grid_search.GridSearchCV(svm.SVC(C=1),
                                          parameter_grid,
                                          cv=5,
                                          scoring=metric)
    classifier.fit(X_train, y_train)
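# Note: sklearn.grid_search and sklearn.cross_validation were removed in
# recent scikit-learn releases. A minimal sketch of the same grid search with
# the current sklearn.model_selection API (reusing X, y, and parameter_grid
# from above):
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=5)

for metric in ['precision', 'recall_weighted']:
    classifier = GridSearchCV(SVC(C=1), parameter_grid, cv=5, scoring=metric)
    classifier.fit(X_train, y_train)
    print("Best parameters for {}: {}".format(metric, classifier.best_params_))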
# loading each of the trained models
u_model = JUNIPR_class.JUNIPR_energy(p_granularity,
                                     q_granularity,
                                     model_path=up_model_path).model
d_model = JUNIPR_class.JUNIPR_energy(p_granularity,
                                     q_granularity,
                                     model_path=down_model_path).model

ud_path_probs = [[up_path, u_log_probs], [down_path, d_log_probs]]

for ud in range(len(ud_path_probs)):
    [
        daughters, endings, mothers,
        (discrete_p_splittings, discrete_q_splittings), mother_momenta
    ] = load_data(ud_path_probs[ud][0],
                  n_events=n_events,
                  batch_size=batch_size,
                  split_p_q=True,
                  p_granularity=p_granularity,
                  q_granularity=q_granularity)

    # zeros = [[0] * 100 for d in daughters]
    for i in range(len(ud_path_probs[ud][1])):
        ud_path_probs[ud][1][i] = np.zeros((2, len(daughters), 100))

    for i in range(len(mothers)):
        mothers[i][0][mothers[i][0] == -1] = 0

    for n in range(len(daughters)):
        for i_m, model in enumerate([u_model, d_model]):  # for charge
            e, m, b, q = model.predict_on_batch(x=[
                daughters[n], mother_momenta[n], mothers[n][1],
def train_convnet(train_size=200, valid_size=60, iterations=10000,
                  momentum_decay=0.9, learning_rate=0.7, filter_size=10,
                  n_hidden=500, n_filters=6, output_size=21, plot=False):
    theano.config.compute_test_value = 'off'

    # initialize some stuff
    # probably eventually want to un-hard-code this
    nbins_out = output_size
    batch_size = train_size
    rng = np.random.RandomState(4321)

    # load the data
    datasets = utilities.load_data("data/train_skies_grid.npz", train_size,
                                   valid_size, flip=False, rotate=False)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # get the shape of the input image from data
    nbins = train_set_x.get_value().shape[3]

    # prepare theano objects
    data = T.tensor4('x')
    data.tag.test_value = train_set_x.get_value()
    target = T.matrix('y')
    target.tag.test_value = train_set_y.get_value()

    # create the net
    conv_net_params = [rng, [batch_size, 3, nbins, nbins],
                       (n_filters, 3, filter_size, filter_size),
                       n_hidden, nbins_out]
    cls = ConvNet(data, *conv_net_params)

    val_params = copy.copy(conv_net_params)
    val_params[1][0] = valid_size
    val = ConvNet(data, *val_params)

    # Sanity check to make sure the net works
    cost = theano.function(inputs=[],
                           outputs=[cls.cost(target),
                                    cls.softmax_layer.output,
                                    cls.hidden_layer.output,
                                    cls.conv_layer.output,
                                    cls.output_layer.predict(target[:, 0],
                                                             target[:, 1])],
                           givens={data: train_set_x, target: train_set_y}
                           # ,mode=PrintEverythingMode()
                           )
    print "Testing to make sure forward propagation works"
    print cost()

    # Setup learning rule
    # Currently using gradient descent with momentum
    grads = T.grad(cls.cost(target), cls.params)
    updates = {}
    momentum = {}
    for p, g in zip(cls.params, grads):
        momentum[p] = theano.shared(np.zeros_like(p.get_value()))
        updates[p] = p + learning_rate * (momentum_decay * momentum[p]
                                          - (1 - momentum_decay) * g)
        updates[momentum[p]] = momentum_decay * momentum[p] \
            - (1 - momentum_decay) * g

    train_model = theano.function(inputs=[],
                                  outputs=[cls.cost(target), grads[0]],
                                  givens={data: train_set_x,
                                          target: train_set_y},
                                  updates=updates)

    validation_cost = theano.function(inputs=[],
                                      outputs=val.cost(target),
                                      givens={data: valid_set_x,
                                              target: valid_set_y})

    # do the actual training
    print "Training"
    val_score = []
    train_score = []
    for i in xrange(iterations):
        if i % 100 == 0:
            # check the score on the validation set every 100 epochs
            # note that this returns the cost *without* the L1 penalty
            val.copy_params(cls)
            vc = validation_cost()
            print "Validation Cost:", vc
            val_score.append(vc)
            # print "Validation Prediction\n", validation_pred()[-1]
            tc = train_model()
            print tc[0], np.cast[np.ndarray](tc[1])
            train_score.append(tc)
            if i > 1500:
                # check stopping condition
                # linear least squares to last 10 points in train_score
                # see np.linalg.lstsq for explanation of how this works
                A = np.vstack([np.arange(10) * 100, np.ones(10)]).T
                y = np.asarray(train_score[-10:])
                slope, intercept = np.linalg.lstsq(A, y)[0]
                if -slope < .1:
                    print "{} iterations".format(i)
                    print "Final slope: ", slope
                    print "Final intercept: ", intercept
                    break
        train_model()

    # import pdb
    # pdb.set_trace()
    print "Final Training Cost: {}".format(train_model())
    print "Final Validation Cost: {}".format(validation_cost())
    print "Validation predictions"
    print validation_pred()

    # save the model parameters
    cls.save_params("test_weights_regress.npy")

    if plot:
        plt.figure()
        plt.plot(val_score)
        plt.plot(train_score)
        plt.legend(["Validation Cost", "Training Cost"])
        plt.show()
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from sklearn import neighbors, datasets

from utilities import load_data

# Load input data
input_file = 'data_nn_classifier.txt'
data = load_data(input_file)
X, y = data[:, :-1], data[:, -1].astype(np.int)

# Plot input data
plt.figure()
plt.title('Input datapoints')
markers = '^sov<>hp'
mapper = np.array([markers[i] for i in y])
for i in range(X.shape[0]):
    plt.scatter(X[i, 0], X[i, 1], marker=mapper[i],
                s=50, edgecolors='black', facecolors='none')

# Number of nearest neighbors to consider
num_neighbors = 10

# step size of the grid
h = 0.01

# Create a K-Neighbours Classifier model and train it
classifier = neighbors.KNeighborsClassifier(num_neighbors, weights='distance')
classifier.fit(X, y)
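# The snippet above is cut off after the classifier is fit. A minimal sketch
# of the decision-boundary visualisation that the grid step size h above is
# normally used for (variable names from above; the colour maps are an
# assumption, not the original's choice):
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
x_grid, y_grid = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

# classify every point on the grid and reshape back to the grid layout
predicted_values = classifier.predict(np.c_[x_grid.ravel(), y_grid.ravel()])
predicted_values = predicted_values.reshape(x_grid.shape)

plt.figure()
plt.pcolormesh(x_grid, y_grid, predicted_values, cmap=cm.Pastel1)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, edgecolors='black', cmap=cm.Paired)
plt.xlim(x_grid.min(), x_grid.max())
plt.ylim(y_grid.min(), y_grid.max())
plt.title('K Nearest Neighbors classifier boundaries')
plt.show()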
        for word in sentence_tokens:
            if word in word_vocab:
                feat_vect[self.word_centroid_dict[word]] += 1
        return feat_vect

    def transform(self, X):
        Xtf = np.vstack([self.sentence2vector(x) for x in X])
        return Xtf

    def fit_transform(self, X, y=None):
        self.fit(X, y)
        return self.transform(X)


if __name__ == '__main__':
    _, unlabelledData = load_data("unsupervised")
    ids, X, y = load_data("cornell")
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            ["[^a-z]", " "],    # only letters
            [" [ ]+", " "],     # remove extra spaces
        ], html_clean=False)),
        ('w2v', Word2VecKMeans(data_src=unlabelledData)),
        ('classifier', BernoulliNB())
    ])
    cross_validate((X, y), pipeline, accuracy_score)
    # Stanford
    # NB
    # accuracy_score : 0.81932 +/- 0.00511171204197
    # Confusion Matrix
            for candidate in candidates:
                candidate_feature = self.extract_features(candidate, text,
                                                          doc_word_counts)
                if candidate_feature != -1:
                    candidate_features[candidate] = candidate_feature
            candidate_features_lst.append(candidate_features)
        return candidate_features_lst


def stem_y(y_true):
    stemmer = PorterStemmer()
    for idx in xrange(len(y_true)):
        for idx_cand in xrange(len(y_true[idx])):
            y_true[idx][idx_cand] = ' '.join(
                [stemmer.stem(word) for word in y_true[idx][idx_cand].split()])
    return y_true


if __name__ == '__main__':
    ids, X, y = load_data()
    to_stem = True
    # ids = ids[:50]
    # X = X[:50]
    # y = y[:50]
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            # ["\.", " . "],
            ["[^a-z-]", " "],   # only letters,fullstops,hyphens(Note!)
            [" [ ]+", " "],     # remove extra spaces
        ])),
        ('candidate_features', CandidateFeatureExtractor()),
        ('keyword_selector', PairwiseRankingSVM(keyword_count=10,
                                                keyword_maxlen=5,
                                                stem=to_stem))
    ])
    # pipeline.fit(X,y)
    # pprint(pipeline.predict(X))
from utilities import load_data, cross_validate
from utilities import DataClean
from sklearn.naive_bayes import BernoulliNB
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

if __name__ == '__main__':
    ids, X, y = load_data("stanford")
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            ["[^a-z]", " "],    # only letters
            [" [ ]+", " "],     # remove extra spaces
        ], html_clean=True)),
        ('tf', TfidfVectorizer(use_idf=False, stop_words="english")),
        ('classifier', BernoulliNB())
    ])
    cross_validate((X, y), pipeline, accuracy_score)
    # Cornell
    # accuracy_score : 0.561444222777 +/- 0.00476207774317
    # Confusion Matrix
    # [[  744.  2936.  2872.   420.   100.]
    #  [  967.  6398. 17320.  2216.   372.]
    #  [  435.  4617. 68438.  5425.   667.]
    #  [  271.  1767. 18586. 10745.  1558.]
    #  [   71.   337.  2807.  4697.  1294.]]
    # Stanford
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans

import utilities

# Load data
data = utilities.load_data('data_perf.txt')

scores = []
range_values = np.arange(2, 10)

for i in range_values:
    # Train the model
    kmeans = KMeans(init='k-means++', n_clusters=i, n_init=10)
    kmeans.fit(data)
    score = metrics.silhouette_score(data, kmeans.labels_,
                                     metric='euclidean',
                                     sample_size=len(data))

    print "\nNumber of clusters =", i
    print "Silhouette score =", score

    scores.append(score)

# Plot scores
plt.figure()
plt.bar(range_values, scores, width=0.6, color='k', align='center')
plt.title('Silhouette score vs number of clusters')

# Plot data
def test_get_layer_extent(self):
    """Test layer_extent"""
    layer = load_data('point-nyc.shp')
    geojson = projestions_geoms.layer_extent(layer)
    self.assert_valid_extent(geojson)
                    kp_words = list(takewhile(lambda x: x in keywords,
                                              words[i:i + 10]))
                    if len(kp_words) != len(set(kp_words)):
                        continue  # No repetitions
                    avg_pagerank = sum(word_ranks[w]
                                       for w in kp_words) / float(len(kp_words))
                    keyphrases[' '.join(kp_words)] = avg_pagerank
                    # to ensure merged keywords are not overlapping
                    j = i + len(kp_words)
                keywords_lst.append([
                    x[0] for x in sorted(keyphrases.iteritems(),
                                         key=lambda x: x[1],
                                         reverse=True)[:self.keyword_count]
                ])
            else:
                keywords_lst.append(keywords)
        return keywords_lst


if __name__ == '__main__':
    ids, docs, keywords_doc = load_data()
    ids = ids
    docs = docs
    keywords_doc = keywords_doc
    pipeline = Pipeline([
        ('cleaner', DataClean(clean_list=[
            ["[^a-z\.-]", " "],  # only letters,fullstops
            [" [ ]+", " "],      # remove extra spaces
        ])),
        ('keyword_selector', TextRank_KeywordSelection(keyword_count=10,
                                                       stem=True))
    ])
    cross_validate((docs, keywords_doc), pipeline, keyword_prf, stem_y=True)
    # keyword_prf_onegram - top 10 keywords - NounAdj Heuristic Word Extracter
    # precision_score : 0.460607928569 +/- 0.0223582417735
    # recall_score : 0.101528291878 +/- 0.00369494108571
import numpy as np
import matplotlib.pyplot as plt

import utilities

# Load input data
input_file = 'data_multivar.txt'
X, y = utilities.load_data(input_file)

###############################################
# Separate the data into classes based on 'y'
class_0 = np.array([X[i] for i in range(len(X)) if y[i] == 0])
class_1 = np.array([X[i] for i in range(len(X)) if y[i] == 1])

# Plot the input data
plt.figure()
plt.scatter(class_0[:, 0], class_0[:, 1], facecolors='black',
            edgecolors='black', marker='s')
plt.scatter(class_1[:, 0], class_1[:, 1], facecolors='None',
            edgecolors='black', marker='s')
plt.title('Input data')

###############################################
# Train test split and SVM training
from sklearn import model_selection
#### This is the URL for the parameters for AdaBoost Classifier


def create_test_submission(filename, prediction):
    content = ['id,ACTION']
    for i, p in enumerate(prediction):
        content.append('%i,%f' % (i + 1, p))
    f = open(filename, 'w')
    f.write('\n'.join(content))
    f.close()
    print 'Saved'


cwd = os.getcwd()
trainDataLoc = cwd + '/../data/train.csv'
testDataLoc = cwd + '/../data/test.csv'
y, X = load_data(trainDataLoc)
y_test, X_test = load_data(testDataLoc, use_labels=False)

print("encoding")
encoder = preprocessing.OneHotEncoder()
print("fitting")
encoder.fit(np.vstack((X, X_test)))
X = encoder.transform(X)  # Returns a sparse matrix (see numpy.sparse)
X_test = encoder.transform(X_test)

print("about to classify")
clf = AdaBoostClassifier(base_estimator=None, n_estimators=900,
                         learning_rate=1.8)
scores = clf.fit(X, y)

# """
# X_train, X_cv, y_train, y_cv = cross_validation.train_test_split(X, y, test_size=.20, random_state=SEED)
# model = svm.SVC(C=1, probability=True, kernel='rbf')
    parser.add_argument('--train_labels_path',
                        nargs='?',
                        type=str,
                        default='data/wine_train_labels.csv',
                        help='Path to training labels')
    parser.add_argument('--test_set_path',
                        nargs='?',
                        type=str,
                        default='data/wine_test.csv',
                        help='Path to the test set csv')
    parser.add_argument('--test_labels_path',
                        nargs='?',
                        type=str,
                        default='data/wine_test_labels.csv',
                        help='Path to the test labels csv')

    args = parser.parse_args()
    mode = args.mode[0]

    return args, mode


if __name__ == '__main__':
    args, mode = parse_args()  # get arguments from the command line

    # load the data
    train_set, train_labels, test_set, test_labels = load_data(
        train_set_path=args.train_set_path,
        train_labels_path=args.train_labels_path,
        test_set_path=args.test_set_path,
        test_labels_path=args.test_labels_path)

    if mode == 'feature_sel':
        selected_features = feature_selection(train_set, train_labels)
        print_features(selected_features)
    elif mode == 'knn':
        predictions = knn(train_set, train_labels, test_set, args.k)
        print_predictions(predictions)
    elif mode == 'alt':
        predictions = alternative_classifier(train_set, train_labels, test_set)
        print_predictions(predictions)
    elif mode == 'knn_3d':
        predictions = knn_three_features(train_set, train_labels, test_set,
                                         args.k)
        print_predictions(predictions)
    elif mode == 'knn_pca':
        prediction = knn_pca(train_set, train_labels, test_set, args.k)
def train_regress_net(train_size=200, valid_size=60, iterations=10000,
                      momentum_decay=0.9, learning_rate=0.7, filter_size=10,
                      n_hidden=500, n_filters=6, plot=False):
    theano.config.compute_test_value = 'off'
    theano.config.DebugMode.check_strides = 0

    # initialize some stuff
    nbins_out = 6
    batch_size = 8 * train_size
    rng = np.random.RandomState(4321)

    # load the data
    datasets = utilities.load_data("data/train_skies_grid.npz", train_size,
                                   valid_size, flip=True, rotate=True)
    train_set_x, train_set_y = datasets[0]
    valid_set_x, valid_set_y = datasets[1]
    test_set_x, test_set_y = datasets[2]

    # get the shape of the input image from data
    nbins = train_set_x.get_value().shape[3]

    # prepare theano objects
    data = T.tensor4('x')
    # data.tag.test_value = train_set_x.get_value()
    target = T.matrix('y')
    # target.tag.test_value = train_set_y.get_value()

    # create the net
    net_params = [rng, [batch_size, 3, nbins, nbins],
                  (n_filters, 3, filter_size, filter_size),
                  n_hidden, nbins_out, .001]
    cls = RegressNetWithDropoutTrain(data, *net_params)

    # create a validation net
    val_params = copy.copy(net_params)
    val_params[1][0] = valid_size
    val = RegressNetWithDropoutPredict(data, *val_params)

    # Sanity check to make sure the net works
    cost = theano.function(inputs=[],
                           outputs=cls.cost(target),
                           givens={data: train_set_x, target: train_set_y})
    print "Testing to make sure forward propagation works"
    print cost()

    # Setup learning rule
    # Currently using gradient descent with momentum
    grads = T.grad(cls.cost(target), cls.params)
    lr = T.scalar('lr')
    updates = {}
    momentum = {}
    learning_rate_scales = [1., 1., 1., 1.]
    for p, g, ls in zip(cls.params, grads, learning_rate_scales):
        momentum[p] = theano.shared(np.zeros_like(p.get_value()))
        updates[p] = p + ls * lr * (momentum_decay * momentum[p]
                                    - (1 - momentum_decay) * g)
        updates[momentum[p]] = momentum_decay * momentum[p] \
            - (1 - momentum_decay) * g

    # compile the training function in theano
    # train_model_debug = theano.function(inputs=[],
    #     outputs=[cls.cost(target), cls.output, cls.conv_layer.output,
    #              cls.hidden_layer.output],
    #     givens={data: train_set_x, target: train_set_y},
    #     updates=updates
    #     # ,mode="DebugMode"
    #     )
    train_model = theano.function(inputs=[lr],
                                  outputs=cls.cost(target),
                                  givens={data: train_set_x,
                                          target: train_set_y},
                                  updates=updates
                                  # ,mode="DebugMode"
                                  )

    validation_cost = theano.function(inputs=[],
                                      outputs=val.cost(target),
                                      givens={data: valid_set_x,
                                              target: valid_set_y})

    validation_pred = theano.function(inputs=[],
                                      outputs=val.output,
                                      givens={data: valid_set_x})

    # do the actual training
    print "Training"
    val_score = []
    train_score = []
    for i in xrange(iterations):
        if i % 100 == 0:
            # check the score on the validation set every 100 epochs
            # note that this returns the cost *without* the L1 penalty
            val.copy_params(cls)
            vc = validation_cost()
            print "Validation Cost:", vc
            val_score.append(vc)
            print "Validation Prediction\n", validation_pred()[-1]
            print "Actual value: ", valid_set_y.get_value()[-1]
            # print "Linear weights"
            # print cls.params[-2].get_value()
            tc = train_model(max([learning_rate * i / 1000., learning_rate]))
            print tc
            train_score.append(tc)
            if i > 1500:
                # check stopping condition
                # linear least squares to last 10 points in train_score
                # see np.linalg.lstsq for explanation of how this works
                A = np.vstack([np.arange(10) * 100, np.ones(10)]).T
                y = np.asarray(train_score[-10:])
                slope, intercept = np.linalg.lstsq(A, y)[0]
                if abs(slope) < 1:
                    print "{} iterations".format(i)
                    print "Final slope: ", slope
                    print "Final intercept: ", intercept
                    break
        train_model(max([learning_rate * i / 1000., learning_rate]))

    # import pdb
    # pdb.set_trace()
    print "Final Training Cost: {}".format(train_model(0.))
    print "Final Validation Cost: {}".format(validation_cost())
    print "Validation predictions"
    print validation_pred()

    # save the model parameters
    cls.save_params("test_weights_regress.npy")

    if plot:
        plt.figure()
        plt.plot(val_score)
        plt.plot(train_score)
        plt.legend(["Validation Cost", "Training Cost"])
        plt.show()