def main():
    #load data
    df = load_data('../../assignment10_data/restaurants.csv',
                   ['CAMIS', 'BORO', 'GRADE', 'GRADE DATE'])
    df = clean_data(df)  #clean data

    #question 4
    sum_nyc, sum_boro = grade_sum(df)  #calculate sum of test_grade in nyc and in each borough
    print 'The sum of test_grade in NYC is: {} \n'.format(sum_nyc)
    print 'The sum of test_grade in each borough is: \n {}'.format(sum_boro)

    #question 5
    grade_overtime_plot(df, 'nyc')  #grade over time plot for nyc
    #grade over time plot for each borough
    for borough in ['BRONX', 'BROOKLYN', 'MANHATTAN', 'QUEENS', 'STATEN ISLAND']:
        df_boro = df[df['BORO'] == borough]
        grade_overtime_plot(df_boro, borough.lower())

    #question 6
    df1 = load_data('../../assignment10_data/restaurants.csv',
                    ['CAMIS', 'CUISINE DESCRIPTION'])
    type_name = get_top_10_nyc(df1)
    df2 = load_data('../../assignment10_data/restaurants.csv',
                    ['CAMIS', 'CUISINE DESCRIPTION', 'GRADE', 'GRADE DATE'])
    df2 = clean_data(df2)
    df2 = df2[df2['CUISINE DESCRIPTION'].isin(type_name)]
    df_sum = top_10_grade_overtime(df2, type_name)  #calculate score over time for each restaurant type
    top_10_plot(df_sum)  #score over time plot
    top_10_colormap(df_sum)  #plot correlation between any two restaurant types in NYC in a color map
def main():
    print("Loading the classifier")
    classifier = utility.load_model("fullsgd_model_rev{}".format(revision))

    print("Reading in the training data")
    train = utility.load_data("training", "finalinput")
    truth = train['votes_useful_log']
    del train['votes_useful_log']

    print("Predicting the training data")
    logpred = np.ravel(classifier.predict(train.values[:, 1:]))
    score = utility.rmsle_log(logpred, truth)
    print "Score:", score

    print("Reading in the test data")
    test = utility.load_data("test", "finalinput")
    del test['votes_useful_log']

    print("Predicting the test data")
    logpred = np.ravel(classifier.predict(test.values[:, 1:]))
    pred = np.exp(np.array(logpred, dtype=np.float64)) - 1
    test['votes'] = pred

    print("Writing out a new submission file")
    utility.write_submission(test, "fullsgd_sub_rev{}.csv".format(revision))
def main():
    revision = 1

    print("Loading the classifier")
    classifier = utility.load_model("train_rtext_rev{}".format(revision))

    print("Reading in the training data")
    train = utility.load_data("training", "rtext")

    print("Predicting the rest of the training data")
    pred = np.ravel(classifier.predict(list(train['rtext_bcat'])))
    score = utility.rmsle_log(pred, train['votes_useful_log'])
    print "Score:", score

    print("Writing out new training data")
    del train['rtext_bcat']
    train['votes_useful_log_rtextpred_sgd'] = pd.Series(pred, index=train.index)
    utility.save_data(train, "training", "rtext_sgd_rev{}".format(revision))

    print("Reading in the test data")
    test = utility.load_data("test", "rtext")
    tepred = np.ravel(classifier.predict(list(test['rtext_bcat'])))

    print("Writing out new test data")
    del test['rtext_bcat']
    test['votes_useful_log_rtextpred_sgd'] = pd.Series(tepred, index=test.index)
    utility.save_data(test, "test", "rtext_sgd_rev{}".format(revision))

    test['votes'] = pd.Series(np.exp(tepred) + 1, index=test.index)
    print("Writing out a new submission file")
    utility.write_submission(test, "rtextsgd_sub_rev{}".format(revision))
def main():
    """
    This function presents the results of this assignment. Users are asked whether they want to see:
    1) Income distribution across all countries for a given year:
       users need to input a year from 1800 to 2012. Results will be saved as a .png file.
    2) Income distribution by region in recent years:
       users need to input the first year, last year and year gap in a year range,
       and select a plot type, boxplot or histograms. Results will be saved as a .pdf file.
    """
    #load countries and income data
    countries = load_data('countries.csv')
    income = load_data('indicator gapminder gdp_per_capita_ppp.csv')
    #transform income data set
    income = trans_data(income)
    try:
        while raw_input('To see income distribution across all countries? (y/n) ') == 'y':
            try:
                year = raw_input('Which year? ')  #select a year
                income_distr(income, year)
            except:
                print 'Please input a year from 1800 to 2012'
        while raw_input('To see income distribution by region in recent years? (y/n) ') == 'y':
            try:
                from_year = int(raw_input('From which year? '))  #input the first year
                to_year = int(raw_input('To which year? '))  #input the last year
                year_gap = int(raw_input('Year gap? '))  #input a year gap
                pltype = raw_input('Plot type: boxplots or histograms? (b/h) ')  #select a plot type
                if pltype == 'b':
                    #create a pdf file to save plots
                    pp = PdfPages('results/Income by region from {0} to {1}_boxplot.pdf'.format(from_year, to_year))
                    for i in xrange(from_year, to_year + 1, year_gap):
                        fig = income_region(1, str(i))
                        pp.savefig(fig)
                elif pltype == 'h':
                    pp = PdfPages('results/Income by region from {0} to {1}_hist.pdf'.format(from_year, to_year))
                    for i in xrange(from_year, to_year + 1, year_gap):
                        fig = income_region(0, str(i))
                        plt.suptitle('{}'.format(i))
                        pp.savefig(fig)
                pp.close()  #close the pdf file
            except:
                print 'please input years from 1800 to 2012 and try again!'
    except(KeyboardInterrupt):
        print 'Bye!'
        sys.exit()
def on_load(self):
    self.savable_environment = utility.load_data("lisp_state")
    if not self.savable_environment:
        self.savable_environment = Environment(self.globals)
    else:
        self.savable_environment.parent = self.globals
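# Illustration only: the `utility` module itself is not included in these snippets.
# The plugin-style on_load hooks in this collection call load_data(key) or
# load_data(key, default), so a minimal pickle-backed sketch compatible with those
# call sites might look like the following. The "data/" directory layout is an
# assumption, not the project's actual storage scheme.
import os
import pickle


def load_data(key, default=None):
    """Return the object pickled under data/<key>, or `default` if the file is missing."""
    path = os.path.join("data", key)  # hypothetical location of the persisted state
    if not os.path.exists(path):
        return default
    with open(path, "rb") as f:
        return pickle.load(f)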
def test_without_changes():
    """
    Plots the rectangles on top of the image, without converting to a 5-D representation.
    Useful only for debugging of the 5-D transform.
    Returns:
    """
    path = "../debug_dataset"  # Only add a couple of pictures to this path
    images, pos_rectangles, neg_rectangles = load_data(path)
    df = pd.DataFrame(columns=["filenames", "p1", "p2", "p3", "p4"])
    for filename in pos_rectangles.filenames.unique():
        points = pos_rectangles[pos_rectangles["filenames"] == filename]
        points = points.loc[:, ['x', 'y']]
        # every four consecutive points describe one rectangle
        for i in range(0, len(points), 4):
            x1, y1 = points.iloc[i][0], points.iloc[i][1]
            x2, y2 = points.iloc[i + 1][0], points.iloc[i + 1][1]
            x3, y3 = points.iloc[i + 2][0], points.iloc[i + 2][1]
            x4, y4 = points.iloc[i + 3][0], points.iloc[i + 3][1]
            new_row = [
                filename,
                np.asarray([int(x1), int(y1)]),
                np.asarray([int(x2), int(y2)]),
                np.asarray([int(x3), int(y3)]),
                np.asarray([int(x4), int(y4)])
            ]
            df.loc[len(df)] = new_row
    print(df)
    for i, j in images.iterrows():
        rectangles = df[df["filenames"] == j["filenames"]]
        plot(j["images"], j["filenames"], rectangles)
def run(train_file, test_file, output_file):
    train, labels, test = utils.load_data(train_file, test_file)
    clf = XGBoost(max_iterations=500,
                  max_depth=12,
                  min_child_weight=4.9208250938262745,
                  row_subsample=.9134478530382129,
                  min_loss_reduction=.5132278416508804,
                  column_subsample=.730128689911957,
                  step_size=.1)
    clf.fit(train, labels)
    predictions = clf.predict_proba(test)
    utils.save_prediction(output_file, predictions)
def optimal_svm(optimal_c):
    """
    This function calculates the AUC for the optimal C chosen from model selection.
    """
    #load datasets
    train_X, train_y = load_data('train_X.csv', 'train_y.csv')
    test_X, test_y = load_data('test_X.csv', 'test_y.csv')
    train_X_pca = data_pca(0.95, train_X, train_X)
    test_X_pca = data_pca(0.95, train_X, test_X)
    train_y = np.array(train_y).ravel()
    test_y = np.array(test_y).ravel()
    #set up model with the optimal C
    my_svm = svm.SVC(kernel='linear', C=optimal_c, class_weight='auto')
    predicted_y = my_svm.fit(train_X_pca, train_y).decision_function(test_X_pca)
    fpr, tpr, tr = roc_curve(test_y, predicted_y)
    print auc(fpr, tpr)
def main():
    revision = 4

    print("Loading the classifier")
    classifier = utility.load_model("train_rtext_rev{}".format(revision))

    print("Reading in the training data")
    train = utility.load_data("training", "rtext")

    print("Predicting the rest of the training data")
    bunch = 50000
    pred = np.zeros(len(train))
    for ibunch in range(int(len(train) / bunch)):
        beg = ibunch * bunch
        end = (ibunch + 1) * 50000
        mtrain = train.ix[beg:end - 1]
        mpred = np.ravel(classifier.predict(list(mtrain['rtext_bcat'])))
        pred[beg:end] = mpred
    beg = int(len(train) / bunch) * bunch
    mtrain = train.ix[beg:]
    mpred = np.ravel(classifier.predict(list(mtrain['rtext_bcat'])))
    pred[beg:] = mpred
    score = utility.rmsle_log(pred, train['votes_useful_log'])
    print "Score:", score

    print("Writing out new training data")
    del train['rtext_bcat']
    train['votes_useful_log_rtextpred'] = pd.Series(pred, index=train.index)
    utility.save_data(train, "training", "rtext_rev{}".format(revision))

    print("Reading in the test data")
    test = utility.load_data("test", "rtext")
    tepred = np.ravel(classifier.predict(list(test['rtext_bcat'])))

    print("Writing out new test data")
    del test['rtext_bcat']
    test['votes_useful_log_rtextpred'] = pd.Series(tepred, index=test.index)
    utility.save_data(test, "test", "rtext_rev{}".format(revision))

    test['votes'] = pd.Series(np.exp(tepred) + 1, index=test.index)
    print("Writing out a new submission file")
    utility.write_submission(test, "rtextrf_sub_rev{}.csv".format(revision))
def train_and_evaluate(output_dir, hparams):
    # Main orchestrator of training and evaluation, calling models from estimator_model.py
    shutil.rmtree(output_dir, ignore_errors=True)
    tf.compat.v1.summary.FileWriterCache.clear()

    # Set log configuration, export to local file
    date_string = datetime.datetime.now().strftime("%m%d_%H%M")
    filename = 'training log/train_estimator_log_' + date_string + '.txt'
    logging.basicConfig(filename=filename, level=20)

    ((train_text, train_label), (eval_text, eval_label)) = utility.load_data("warehouse/store/", CLASSES)

    # Create vocabulary from training corpus
    tokenizer = tf.keras.preprocessing.text.Tokenizer(num_words=VOCAB_SIZE + 1, oov_token='<unk>')
    tokenizer.fit_on_texts(train_text)

    # Save token dictionary to use during prediction time
    with open('tokenizer.pickle', 'wb') as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # Create estimator config
    run_config = tf.estimator.RunConfig(save_checkpoints_steps=EVAL_INTERVAL,
                                        log_step_count_steps=20,
                                        save_summary_steps=50)

    estimator = cnn_estimator(hparams['model_version'], output_dir, run_config,
                              hparams['learning_rate'], hparams['grad_clip_rate'],
                              EMBEDDING_DIM, hparams['filters'],
                              hparams['dropout_rate'], hparams['kernel_size'],
                              hparams['pool_size'], hparams['strides'],
                              hparams['padding_type'], hparams['fc_layer_nodes'],
                              hparams['growth_rate'],
                              word_index=tokenizer.word_index,
                              embedding_path=hparams['embedding_path'])

    train_steps = hparams['num_epochs'] * len(train_text) / hparams['batch_size']
    train_spec = tf.estimator.TrainSpec(
        input_fn=input_function(train_text, train_label, tokenizer,
                                hparams['batch_size'],
                                mode=tf.estimator.ModeKeys.TRAIN),
        max_steps=train_steps)

    # Create exporter configuration
    exporter = tf.estimator.LatestExporter('exporter', serving_input_fn)

    eval_spec = tf.estimator.EvalSpec(
        input_fn=input_function(eval_text, eval_label, tokenizer,
                                hparams['batch_size'],
                                mode=tf.estimator.ModeKeys.EVAL),
        steps=None,
        exporters=exporter,
        throttle_secs=5)  # evaluate every N seconds

    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    return True
def predict(config_file):
    """
    Main function that runs predictions
    Args:
        config_file [str]: path to config file
    Returns:
        None
    """
    ##################
    # configure logger
    ##################
    logger = set_logger("./log/predict.log")

    ##################
    # Load config from config file
    ##################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    model_path = Path(config["predict"]["model_path"])
    processed_test = config["predict"]["processed_test"]
    predicted_file = config["predict"]["predicted_file"]
    export_result = config["predict"]["export_result"]
    logger.info(f"config: {config['predict']}")

    ##################
    # Load model & test set
    ##################
    # Load model
    logger.info(f"-------------------Load the trained model-------------------")
    with open(model_path, "rb") as f:
        trained_model = load(f)

    # Load test set
    logger.info(f"Load the test data from {processed_test}")
    X, y, cols = load_data(processed_test)
    logger.info(f"cols: {cols}")
    logger.info(f"X: {X.shape}")
    logger.info(f"y: {y.shape}")

    ##################
    # Make prediction and evaluate
    ##################
    logger.info(f"-------------------Predict and evaluate-------------------")
    y_hat = trained_model.predict(X)
    logger.info(f"Classification report: \n {classification_report(y, y_hat)}")
    output = pd.DataFrame(y)
    output["prediction"] = y_hat
    if export_result:
        output.to_csv(predicted_file, index=False)
        logger.info(f"Export prediction to : {predicted_file}")
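# Illustration only (not from the original project): predict() above reads
# config["predict"]["model_path"], "processed_test", "predicted_file" and
# "export_result", so parse_config() is expected to return something of roughly
# this shape. All file names below are hypothetical placeholders.
example_config = {
    "predict": {
        "model_path": "models/trained_model.joblib",  # hypothetical path
        "processed_test": "data/processed_test.csv",  # hypothetical path
        "predicted_file": "data/predictions.csv",     # hypothetical path
        "export_result": True,
    }
}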
def update_history(opt: Namespace, epoch: int, loss: float, split: str) -> "model":
    train_hist = utility.load_data(os.path.join(opt.save, 'train_hist.pt'))
    val_hist = utility.load_data(os.path.join(opt.save, 'val_hist.pt'))
    if split == 'train':
        train_hist[epoch] = loss
        torch.save(train_hist, os.path.join(opt.save, 'train_hist.pt'))
        with open(os.path.join(opt.save, 'train_hist'), 'w') as f:
            f.write(utility.hist_to_str(train_hist))
        return train_hist
    elif split == 'val':
        val_hist[epoch] = loss
        torch.save(val_hist, os.path.join(opt.save, 'val_hist.pt'))
        with open(os.path.join(opt.save, 'val_hist'), 'w') as f:
            f.write(utility.hist_to_str(val_hist))
        return val_hist
    else:
        logging.error('Unknown split: ' + split)
def main():
    in_arg = utility.get_input_args()
    print(in_arg)
    device = utility.set_device_type(in_arg.gpu)
    image_datasets, dataloaders = utility.load_data(in_arg.data_dir)
    cat_to_name = utility.load_category_map()
    model, criterion, optimizer = modelbuilder.build_model(in_arg, device)
    train_model(model, dataloaders['train'], dataloaders['valid'], in_arg.epochs,
                40, criterion, optimizer, device)
    modelbuilder.save_model(in_arg, model, image_datasets['train'], optimizer)
def test_load_data():
    X, y, cols = load_data("data/winequality.csv")
    # type check
    assert isinstance(X, np.ndarray)
    assert isinstance(y, pd.core.series.Series)
    assert isinstance(cols, list)
    # shape check
    assert X.shape[0] == y.shape[0], "# rows of features = # rows of target"
    assert X.shape[1] >= 1, "# features >= 1"
    assert len(cols) == 12, "data should have 12 cols"
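# Illustration only: a minimal load_data() that would satisfy the contract
# exercised by test_load_data() above (features as an ndarray, target as a
# Series, all column names as a list). The comma separator and the target
# column name "quality" are assumptions based on the winequality dataset;
# the project's actual implementation is not shown here.
import numpy as np
import pandas as pd


def load_data(path):
    df = pd.read_csv(path)                    # separator assumed to be a comma
    cols = list(df.columns)                   # 12 column names expected by the test
    y = df["quality"]                         # assumed target column
    X = df.drop(columns=["quality"]).values   # remaining 11 feature columns as ndarray
    return X, y, cols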
def main():
    print("Reading in the training data")
    train = utility.load_data("training", "finalinput")
    truth = np.ravel(np.array(train['votes_useful_log']))
    del train['votes_useful_log']

    print("Extracting features and training review text model")
    classifier = get_pipeline()
    classifier.fit(train.values[:, 1:], np.array(truth))

    print("Saving the classifier")
    utility.save_model(classifier, "fullsgd_model_rev{}".format(revision))
def save_labels(path_to_data, path_to_labels):
    _, pos_rectangles, neg_rectangles = load_data(path_to_data)
    pos_rectangles = to_five_dimensional(pos_rectangles)
    neg_rectangles = to_five_dimensional(neg_rectangles)
    saved_path = Path(path_to_labels)
    pos_label_path = saved_path / "pos_labels.csv"
    neg_label_path = saved_path / "neg_labels.csv"
    if not saved_path.exists():
        Path.mkdir(saved_path, parents=True)
    pos_rectangles.to_csv(pos_label_path)
    neg_rectangles.to_csv(neg_label_path)
def get_future_trade_date_index(start: datetime, end: datetime) -> np.ndarray:
    """
    Return the standardized time-series index used to compare all futures products.

    Night-session hours are not the same across products (some have no night session
    at all), so timelines become misaligned when computing daily P&L; gold (au) is
    used as the reference here.
    """
    df = load_data(
        'AU888.SHFE',
        '1h',
        start,
        end,
    )
    df['date'] = df.index.map(lambda dt: dt.date())
    return df['date'].drop_duplicates().values
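# Illustration only: a hypothetical call to the helper above. The dates are
# arbitrary and assume hourly AU888.SHFE bars are available for that range.
from datetime import datetime

trade_dates = get_future_trade_date_index(datetime(2023, 1, 1), datetime(2023, 6, 30))
print(len(trade_dates), trade_dates[:5])  # number of trading days and the first few dates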
def main():
    dataset, labels = load_data(args.XLSX)
    features = [row[:-1] for row in dataset]
    mean_vector, e_values, e_vectors, norm_Features = calculate_eigen_of_cov_mat(features)
    scree_graph(e_values)
    k = 20
    print('eigen value 1:', e_values[0])
    print('eigen vector 1:\n', e_vectors[0])
    print('eigen value 2:', e_values[1])
    print('eigen vector 2:\n', e_vectors[1])
    choose_k_largest_eigen_transform(norm_Features, e_vectors, mean_vector, k)
def main():
    trabus = utility.load_data("training", "business")
    tesbus = utility.load_data("test", "business")
    bus = pd.concat((trabus, tesbus))
    for cat in delbuscats:
        if hasattr(bus, cat):
            del bus[cat]
    bus['procbcat'] = pd.Series(map(process_bcat, bus['categories']), bus.index)
    del bus['categories']

    for s in ["training", "test"]:
        rev = utility.load_data(s, "review")
        for cat in delrevcats:
            if hasattr(rev, cat):
                del rev[cat]
        if hasattr(rev, 'votes_useful'):
            rev['votes_useful_log'] = np.log(rev.votes_useful + 1)
        rev = pd.merge(rev, bus, 'inner')
        rev['rtext_bcat'] = rev['text'] + rev['procbcat']
        del rev['procbcat']
        del rev['text']
        utility.save_data(rev, s, 'rtext')
def main():
    revision = 4

    print("Reading in the training data")
    train = utility.load_data("training", "rtext")
    inds = random.sample(range(len(train)), 100000)
    mtrain = train.ix[inds]

    print("Extracting features and training review text model")
    classifier = get_pipeline()
    classifier.fit(list(mtrain['rtext_bcat']), list(mtrain['votes_useful_log']))

    print("Saving the classifier")
    utility.save_model(classifier, "train_rtext_rev{}".format(revision))
def main():
    dataset, labels = load_data(args.XLSX)
    class_values = list(set(row[-1] for row in dataset))
    root = get_best_feature_to_split(dataset, 'gain_ratio')
    split(root, class_values, max_depth=3, min_size=5, depth=1, mode='gain_ratio')
    print(root)
def main():
    #load datasets
    train_X, train_Y = load_data('train_X.csv', 'train_y.csv')
    train_X_pca = data_pca(0.95, train_X, train_X)
    train = train_X_pca
    train['Y'] = train_Y
    #set a list of values for the hyperparameter C
    c = [10**i for i in range(-9, 2)]
    #conduct cross-validation and return the AUCs for each sample and each C
    aucs = xValSVM(train, 'Y', 5, c)
    #calculate the average and standard error of the AUC for each C
    avg, stderr = avg_stderr(aucs, c)
    #plot the results of cross-validation
    plotxValSVM(avg, stderr, c)
def test():
    """
    Plots the rectangles on top of the image, to check that the 5-D transformation works.
    Returns:
    """
    path = "../debug_dataset"  # Only add a couple of pictures to this path
    images, pos_rectangles, neg_rectangles = load_data(path)
    pos_rectangles = to_five_dimensional(pos_rectangles)
    replicated_imgs = replicate_images(images, pos_rectangles)
    x_train, y_train, x_test, y_test = split_train_test_data(replicated_imgs, pos_rectangles)
    df = to_four_points(pos_rectangles)
    for i, j in x_test.iterrows():
        rectangles = df[df.filenames == j["filenames"]]
        plot(j["images"], j["filenames"], rectangles)
def main():
    # Get command line input
    results_argparse = argparse_train()

    # Load and transform image data
    image_datasets, dataloaders = load_data(results_argparse.data_directory)

    # Create the model
    model = create_model(results_argparse.arch, results_argparse.hidden_units,
                         results_argparse.prob_dropout)

    # Train the model
    model, optimizer = train_model(model, dataloaders['train'],
                                   results_argparse.learning_rate,
                                   results_argparse.epochs,
                                   results_argparse.gpu,
                                   print_every=40,
                                   validloader=dataloaders['valid'])

    # Save model
    model.class_to_idx = image_datasets['train'].class_to_idx
    save_model(model, optimizer)
def on_load(self):
    self.notebook = utility.load_data('notes', {})
import utility
import numpy as np
import pandas as pd
import multiprocessing
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

nweakgbms = 18

dftest = utility.load_data('test')
dftrain = utility.load_data('train')
dftestpreds = pd.DataFrame(dftest.id)
dftrainpreds = pd.DataFrame({'id': np.arange(len(dftrain)), 'ACTION': dftrain.ACTION})

y = np.array(dftrain.ACTION)
del dftrain['ACTION']
X = np.array(dftrain)
Xtest = np.array(dftest)[:, 1:]


def train_weakgbm(i):
    cols = np.ones(9)
    cols[i % X.shape[1]] = 0
    smallX = np.compress(cols, X, axis=1)
    X_cvtrain, X_cvtest, y_cvtrain, y_cvtest = train_test_split(
def on_load(self):
    self.__aliases = utility.load_data("stockaliases", {})
def on_load(self):
    self.location = utility.load_data('festern_bbq', "okänt")
def on_load(self):
    self.reminders = utility.load_data("reminders", [])
def on_load(self):
    self.places = utility.load_data("places", {})
min_delta = setup["Delta"]
patience = setup["Patience"]
optimizer = setup["Optimizer"]
loss = setup["Loss"]
activation_function = setup["Activation"]
# layers for decoder(/encoder)
# if 2, the overall net will have 4 layers
layers_num = setup["Layer"]
cell_type = setup["Cell"]
start_dim = setup["Starting"]

# ---------------------------------------------------- #
# deep autoencoders for gene clustering
# ---------------------------- LOAD DATA -------------------------------- #
print "Loading the data."
dataset, _ = utility.load_data(dataset_code, cell_type, withLabel=True)

while start_dim < dataset.shape[1] / 2 and start_dim != 20000:
    dataset = dataset.loc[:, dataset.var() > dataset.var().median()]
    print "Pre-filtering dimensions " + str(dataset.shape[1])

# reduce initial dimensions
exit_flag = False
while True and start_dim != 20000:
    min_var = dataset.var().min()
    for i in dataset:
        if dataset.shape[1] <= start_dim:
            exit_flag = True
            break
        if dataset[i].var() == min_var:
            del (dataset[i])
    if exit_flag:
        break
def mask_load(self):
    self.url_masks = utility.load_data("urlmasks", {})
def load_urls(self):
    self.url_list = utility.load_data("urls", [])
parser.add_argument('--sample_percent', type=float, default=1.0)
parser.add_argument('--layers', type=str, default='100')
parser.add_argument('--feature_output_path', type=str, default=None)
args = parser.parse_args()

load_path = args.load_path
image_dir = args.image_dir
batch_size = args.batch_size
epochs = args.epochs
save_name = args.save_name
sample_percent = args.sample_percent
feature_output_path = args.feature_output_path
layers = list(map(int, args.layers.split(',')))

X, labels, file_names = load_data(image_dir, sample_percent=sample_percent, return_names=True)
length, width = X[0].shape
input_size = length * width
X = torch.Tensor(X).view(-1, input_size).type(torch.float32)
X /= 255

if load_path is None:
    model = AutoEncoder([input_size] + layers)
    model.train(X=X, batch_size=batch_size, epochs=epochs, verbose=True)
else:
    model = AutoEncoder.load(load_path)

if feature_output_path is not None:
    print('Saving learned features...')
    new_features = model(X)
def on_load(self):
    self.compliments = utility.load_data("compliments", [])
def on_load(self):
    self.insults = utility.load_data("insults", [])
def on_load(self): self.places = utility.load_data("postnr_addresses", {})
import time, datetime

import matplotlib
# Force matplotlib to not use any Xwindows backend.
matplotlib.use('Agg')
import matplotlib.pyplot as plt

from constants import DEV_MODE, IMAGE_SIZE, SIMPLE_3LAYERS_FILENAME, LENET5_CNN_FILENAME
from lasagne_neuralnet import NeuralNet1, NeuralNet2, load_model_if_exists, predict, plot_neural_net, reshape_data
from utility import clean_data, data_preview, get_label_args, load_data, save_result, scale_data
from visualization import plot_images, plot_label_distribution, plot_lasagne_learning_curves

# Count running time
starttime = datetime.datetime.now()

# Load and preview data
train, test, label = load_data(dev_mode=DEV_MODE)
data_preview(train, test, label)
plot_images(train, label)

# Fill NA with average.
# TODO: remove abnormal samples
label = clean_data(label)
label_min, label_max, label_median, label_var = get_label_args(label)
plot_label_distribution(label)

# Convert dataframe to array and normalize the data
train_array, label_array, test_array = scale_data(train, test, label)

# Train and predict
# Simple 3-layers neural nets
def on_load(self):
    self.favorites = utility.load_data("favorites", {})
def printPOS(pos_words):
    #pos_words is a list of (word, tag)
    s = ""
    t = ""
    for p in pos_words:
        l = len(p[0]) if len(p[0]) > len(p[1]) else len(p[1])
        s = s + p[0].rjust(l) + ' '
        t = t + p[1].rjust(l) + ' '
    print '-----------'
    print s
    print t
    print ""


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: python showTaggedSentences.py <input file>"
        sys.exit(0)
    qaTests = load_data(sys.argv[1])
    showAllTaggedSentences(qaTests)
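# Illustration only: with a hypothetical tagged sentence, printPOS lines each
# tag up under its word by right-justifying both to the longer of the two.
#
#   printPOS([('The', 'DT'), ('dog', 'NN'), ('barks', 'VBZ')])
#
#   -----------
#   The dog barks
#    DT  NN   VBZ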
        self.i2o = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)

    def forward(self, input_tensor, hidden_tensor):
        combined = torch.cat((input_tensor, hidden_tensor), 1)  # doubt
        hidden = self.i2h(combined)
        output = self.i2o(combined)
        output = self.softmax(output)
        return output, hidden

    def init_hidden(self):
        return torch.zeros(1, self.hidden_size)


category_lines, all_categories = load_data()
n_categories = len(all_categories)
print(n_categories)

n_hidden = 128
rnn = RNN(N_LETTERS, n_hidden, n_categories)

input_tensor = letter_to_tensor("A")
hidden_tensor = rnn.init_hidden()
output, next_hidden = rnn(input_tensor, hidden_tensor)
print(output.size())
print(next_hidden.size())

input_tensor = line_to_tensor('Albert')
hidden_tensor = rnn.init_hidden()
output, next_hidden = rnn(input_tensor[0], hidden_tensor)
def load_games(self):
    self.games = utility.load_data("games", {})
def on_load(self):
    self.watch_list = utility.load_data("rss_watch_list", [])
def on_load(self):
    self.id_directory = utility.load_data('schema_id', {})
    self.id_presets = utility.load_data('schema_fav', {})
def load_urls(self):
    self.url_lists = utility.load_data("urls", {})
def train(config_file):
    """
    Main function that trains and persists the model based on the training set.
    Args:
        config_file [str]: path to config file
    Returns:
        None
    """
    ################
    # config logger
    ################
    logger = set_logger("../log/train.log")

    ###############################
    # Load config from config file
    ###############################
    logger.info(f"Load config from {config_file}")
    config = parse_config(config_file)

    keypoints_csv = Path(config['common']['labels_csv_path'])
    val_split = config['common']['val_split']
    train_img_scr_path = config['common']['img_source_path']
    test_img_scr_path = config['common']['img_source_path']
    image_width = config['common']['in_image_width']
    image_height = config['common']['in_image_height']
    epochs = config['train']['epochs']
    train_batch_size = config['train']['batch_size']
    weight_path = config['common']['weight_path']
    no_of_aug = config['train']['no_of_aug']
    test_batch_size = config['test']['batch_size']

    ############
    # Load Data
    ############
    logger.info(f"----------------Load the data----------------")
    selected_img, keypoint_df = load_data(keypoints_csv)
    logger.info(f"Number of selected images are {selected_img.shape}")
    logger.info(f"Few of the selected images are {selected_img[0:5]}")

    ####################################
    # Get train and test data generators
    ####################################
    X_train, y_train, X_test, y_test = train_test_split(selected_img, keypoint_df, val_split)
    train_gen = Car(x_set=X_train,
                    y_set=y_train,
                    mode='Train',
                    data_path=train_img_scr_path,
                    image_width=image_width,
                    image_height=image_height,
                    batch_size=train_batch_size,
                    augmentations='Self',
                    no_of_aug=no_of_aug)
    test_gen = Car(x_set=X_test,
                   y_set=y_test,
                   mode='Test',
                   data_path=test_img_scr_path,
                   image_width=image_width,
                   image_height=image_height,
                   batch_size=test_batch_size)

    #####################
    # Set and train model
    #####################
    logger.info(f"-------------------------Initiate Model---------------------")
    model = KeyPointModel().getModel()
    logger.info(f"--------------------Model Summary---------------------------")
    logger.info(f"{model.summary}")

    # compile the model
    model.compile(loss='mean_squared_error',
                  optimizer='adam',
                  metrics=['mean_absolute_error'])

    # modelCheckPoint = ModelCheckpoint('car-{val_loss:.2f}.h5', monitor='val_loss', verbose=1, save_best_only=True, save_weights_only=True)
    earlyS = EarlyStopping(monitor='val_loss', min_delta=1, patience=3, restore_best_weights=True)
    reducelr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=2, min_lr=1e-7)

    history = model.fit(x=train_gen,
                        validation_data=test_gen,
                        callbacks=[earlyS, reducelr],
                        epochs=epochs)
    logger.info(history)

    logger.info("------------Saving Weights--------------")
    model.save_weights(weight_path)
            decoding2.decoding(j) + '\n')
    scores = model.evaluate(X, Y)
    f.write("\n%s: %.2f%% \n%s: %.2f%% \n%s: %.2f%%" %
            (model.metrics_names[1], scores[1] * 100,
             model.metrics_names[2], scores[2] * 100,
             model.metrics_names[3], scores[3] * 100))
    f.close()
    return print("Predictions saved in the file: " + name)


# 0. Load Data
path = '../data/prop_55.csv'  # len 464
# path = '../data/prop_65.csv'
# path = '../data/prop_75.csv'
# path = '../data/prop_85.csv'
# path = '../data/prop_95.csv'
X, Y = ut.load_data(path)
batch = len(X)

# 1. Define Model
model = Sequential()
model.add(Dense(460, input_shape=(460, ), activation='sigmoid'))

# 2. Compile model
keras_metrics = KerasMetrics()
model.compile(optimizer='adam',
              loss='mean_squared_error',
              metrics=[keras_metrics.fbeta_score,
                       keras_metrics.recall,
                       keras_metrics.precision])
def load_refs(self):
    self.references = utility.load_data("spotify", {})
def train():
    batch_size = TRAIN_BATCH_SIZE
    num_labels = 10
    train_images = {}
    train_labels = {}
    test_images = {}
    test_labels = {}
    validation_images = {}
    validation_labels = {}
    for data in utility.Data:
        path = utility.generate_data_path(TRAINING_DATA_SIZE, data, label=False)
        train_images[data] = utility.load_data(path)
        path = utility.generate_data_path(TRAINING_DATA_SIZE, data, label=True)
        train_labels[data] = utility.load_data(path, label=True)
        path = utility.generate_data_path(utility.Size.TEST, data, label=False)
        test_images[data] = utility.load_data(path)
        path = utility.generate_data_path(utility.Size.TEST, data, label=True)
        test_labels[data] = utility.load_data(path, label=True)
        path = utility.generate_data_path(utility.Size.VALIDATION, data, label=False)
        validation_images[data] = utility.load_data(path)
        path = utility.generate_data_path(utility.Size.VALIDATION, data, label=True)
        validation_labels[data] = utility.load_data(path, label=True)

    train_size = (train_images[utility.Data.CUSTOM].shape[0] +
                  train_images[utility.Data.STREET].shape[0] +
                  train_images[utility.Data.MNIST].shape[0])
    total_batch = int(train_size / batch_size)

    # Boolean for MODE of train or test
    #is_training = tf.placeholder(tf.bool, name='MODE')
    data_type = tf.placeholder(tf.string)

    # tf input
    x = tf.placeholder(tf.float32)
    y_ = tf.placeholder(tf.float32, [None, 10])  #answer
    y = cnn_model.CNN(x, data_type)

    # Get loss of model
    with tf.name_scope("LOSS"):
        loss = cnn_model.loss(y, y_)

    # Define optimizer
    with tf.name_scope("ADAM"):
        # Optimizer: set up a variable that's incremented once per batch and
        # controls the learning rate decay.
        batch = tf.Variable(0)
        learning_rate = tf.train.exponential_decay(
            1e-4,                # Base learning rate.
            batch * batch_size,  # Current index into the dataset.
            train_size,          # Decay step.
            0.95,                # Decay rate.
            staircase=True)
        # Use simple momentum for the optimization.
        train_step = tf.train.AdamOptimizer(learning_rate).minimize(
            loss, global_step=batch)

    # Get accuracy of model
    with tf.name_scope("ACC"):
        correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    # Create a summary to monitor learning_rate tensor
    tf.summary.scalar('learning_rate', learning_rate)
    # Create a summary to monitor accuracy tensor
    tf.summary.scalar('acc', accuracy)
    # Merge all summaries into a single op
    merged_summary_op = tf.summary.merge_all()

    # Add ops to save and restore all the variables
    saver = tf.train.Saver()
    sess = tf.InteractiveSession()
    sess.run(tf.global_variables_initializer(), feed_dict={})

    # op to write logs to Tensorboard
    summary_writer = tf.summary.FileWriter(LOGS_DIRECTORY, graph=tf.get_default_graph())

    max_acc = 0.
    start_time = time.time()

    # Loop for epoch
    for epoch in range(training_epochs):
        type_order = utility.generate_random_order(train_size / (3 * TRAIN_BATCH_SIZE))

        # Random shuffling
        for data in utility.Data:
            [validation_images[data], validation_labels[data]] = utility.shuffle_data(
                validation_images[data], validation_labels[data])
            [train_images[data], train_labels[data]] = utility.shuffle_data(
                train_images[data], train_labels[data])

        count = {
            utility.Data.CUSTOM: 0,
            utility.Data.MNIST: 0,
            utility.Data.STREET: 0
        }

        # Loop over all batches
        for i in range(total_batch):
            set_type = type_order[i]
            # Compute the offset of the current minibatch in the data.
            offset = (count[set_type] * batch_size) % (train_size)
            count[set_type] = count[set_type] + 1
            batch_xs = train_images[set_type][offset:(offset + batch_size), :]
            batch_ys = train_labels[set_type][offset:(offset + batch_size), :]

            assert set_type.value in [
                utility.Data.CUSTOM.value, utility.Data.MNIST.value,
                utility.Data.STREET.value
            ]

            # Run optimization op (backprop), loss op (to get loss value)
            # and summary nodes
            _, train_accuracy, summary = sess.run(
                [train_step, accuracy, merged_summary_op],
                feed_dict={
                    x: batch_xs,
                    y_: batch_ys,
                    data_type: set_type.value
                })

            # Write logs at every iteration
            summary_writer.add_summary(summary, epoch * total_batch + i)

            # Display logs
            if i % display_step == 0:
                print("Epoch:", '%04d,' % (epoch + 1),
                      "batch_index %4d/%4d, training accuracy %.5f" %
                      (i, total_batch, train_accuracy))

            # Get accuracy for validation data;
            # need to average the 3 validation data entries
            if i % validation_step == 0:
                # Calculate accuracy
                validation_accuracy_a = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.CUSTOM],
                        y_: validation_labels[utility.Data.CUSTOM],
                        data_type: utility.Data.CUSTOM.value
                    })
                validation_accuracy_b = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.STREET],
                        y_: validation_labels[utility.Data.STREET],
                        data_type: utility.Data.STREET.value
                    })
                validation_accuracy_c = sess.run(
                    accuracy,
                    feed_dict={
                        x: validation_images[utility.Data.MNIST],
                        y_: validation_labels[utility.Data.MNIST],
                        data_type: utility.Data.MNIST.value
                    })
                validation_accuracy = (validation_accuracy_a +
                                       validation_accuracy_b +
                                       validation_accuracy_c) / 3
                print("Epoch:", '%04d,' % (epoch + 1),
                      "batch_index %4d/%4d, validation accuracy %.5f" %
                      (i, total_batch, validation_accuracy))

                # Save the current model if the maximum accuracy is updated
                if validation_accuracy > max_acc:
                    max_acc = validation_accuracy
                    save_path = saver.save(sess, MODEL_DIRECTORY)
                    print("Model updated and saved in file: %s" % save_path)

    print("Optimization Finished!")
    print("--- %s seconds ---" % (time.time() - start_time))

    # Restore variables from disk
    saver.restore(sess, MODEL_DIRECTORY)

    # TESTING
    # TODO: edit testing
    for data in utility.Data:
        labels = test_labels[data]
        images = test_images[data]
        test_size = labels.shape[0]
        batch_size = TEST_BATCH_SIZE
        total_batch = int(test_size / batch_size)
        acc_buffer = []
        for i in range(total_batch):
            offset = (i * batch_size) % (test_size)
            batch_xs = images[offset:(offset + batch_size), :]
            batch_ys = labels[offset:(offset + batch_size), :]
            y_final = sess.run(y,
                               feed_dict={
                                   x: batch_xs,
                                   y_: batch_ys,
                                   data_type: data.value
                               })
            correct_prediction = numpy.equal(numpy.argmax(y_final, 1),
                                             numpy.argmax(batch_ys, 1))
            acc_buffer.append(numpy.sum(correct_prediction) / batch_size)
        print("test accuracy for the stored model for %s images: %g" %
              (data, numpy.mean(acc_buffer)))
def on_load(self):
    self._api_key = utility.load_data("betacie_api_key", "readonly")
def main():
    usr = pd.concat((utility.load_data('training', 'user'),
                     utility.load_data('test', 'user')))
    for cat in usrdelcats:
        if hasattr(usr, cat):
            del usr[cat]
    usr = usr.rename(columns={'average_stars': 'user_average_stars',
                              'review_count': 'user_review_count'})

    bus = pd.concat((utility.load_data('training', 'business'),
                     utility.load_data('test', 'business')))
    for cat in busdelcats:
        if hasattr(bus, cat):
            del bus[cat]
    bus = bus.rename(columns={'stars': 'business_average_stars',
                              'review_count': 'business_review_count'})

    rtxttag = 'rtext_rev{}'.format(rtext_rev)
    rtxt_tr = utility.load_data('training', rtxttag)
    rtxt_te = utility.load_data('test', rtxttag)
    rtxt_te.index = rtxt_te.index + len(rtxt_tr)
    rtxt = pd.concat((rtxt_tr, rtxt_te))

    sgdtag = 'rtext_sgd_rev{}'.format(rtext_sgd_rev)
    sgdtxt_tr = utility.load_data('training', sgdtag)
    sgdtxt_te = utility.load_data('test', sgdtag)
    sgdtxt_te.index = sgdtxt_te.index + len(sgdtxt_tr)
    sgdtxt = pd.concat((sgdtxt_tr, sgdtxt_te))

    tesrev = utility.load_data('test', 'review')
    trarev = utility.load_data('training', 'review')
    revlist = [trarev, tesrev]
    for i in range(len(revlist)):
        revlength = revlist[i]['text'].apply(lambda t: len(t.split()))
        revlist[i]['review_length'] = revlength
        for col in revlist[i].columns:
            if not col in ['review_id', 'stars', 'date', 'review_length']:
                del revlist[i][col]
        revlist[i] = revlist[i].rename(columns={'stars': 'review_stars'})
        revlist[i] = pd.merge(revlist[i], rtxt, 'left')
        revlist[i] = pd.merge(revlist[i], sgdtxt, 'left')
        revlist[i] = pd.merge(revlist[i], usr, 'left')
        revlist[i] = pd.merge(revlist[i], bus, 'left')
        revlist[i] = revlist[i].fillna(-1)

    for c in normedcols:
        norm_col(revlist, c)

    dates = [pd.to_datetime('2013-01-19'), pd.to_datetime('2013-03-12')]
    for i in range(len(revlist)):
        ddiff = dates[i] - revlist[i]['date']
        ddiff = ddiff.apply(lambda x: x / np.timedelta64(1, 'D'))
        revlist[i]['datediff'] = ddiff
    norm_col(revlist, 'datediff')

    for i in range(len(revlist)):
        for c in delcols:
            if hasattr(revlist[i], c):
                del revlist[i][c]

    utility.save_data(revlist[0], 'training', 'finalinput')
    utility.save_data(revlist[1], 'test', 'finalinput')