def main():
    # Load the data
    content_data = data_pro.main()

    # Title features
    title_feats = title_transform(content_data)

    # Source, reporter, and weekday features (one-hot encoded)
    source_feats = pd.get_dummies(content_data['source']).values
    reporter_feats = pd.get_dummies(content_data['reporter']).values
    week_feats = pd.get_dummies(content_data['week']).values

    # Topology feature: reuse the cached l1 norm if present, otherwise recompute it
    try:
        content_data['l1_norm'] = pd.read_csv('l1_norm.csv')['l1_norm']
    except FileNotFoundError:
        print('l1_norm.csv not found, recalculating')
        content_data['l1_norm'] = content_topology(content_data)

    # Author statistics are computed on the training split only
    train_data, _, _ = utils.split(content_data)
    sh_ratio_dict, total_count_dict = utils.compute_author(train_data)
    content_data['sh_ratio_range'] = content_data['author'].apply(
        lambda x: fill_way(x, sh_ratio_dict))
    content_data['total_count_range'] = content_data['author'].apply(
        lambda x: fill_way(x, total_count_dict))

    return title_feats, source_feats, reporter_feats, week_feats, content_data
def main():
    # Download the raw data and extract features for training and test
    data_praw.main()
    print("Downloaded data!")
    data_processing.main()
    print("Extracted features from training!")
    data_processing.test(TEST_IN_PATHS, TEST_OUT_PATH)
    print("Extracted features from test!")

    # Load the preprocessed data and the trained classifier
    with open('data_preprocessed.pickle', 'rb') as data_file:
        training_data = pickle.load(data_file)
    with open('test_preprocessed.pickle', 'rb') as data_file:
        test_data = pickle.load(data_file)
    with open('classifier_model.pickle', 'rb') as model_file:
        model = pickle.load(model_file)

    training_labels = training_data['labels']
    new_training_data = remove_extra_features(training_data)
    results = classify(test_data['data'], new_training_data, training_labels)

    # Compare predictions against the ground truth and report accuracy
    score = 0
    for idx, res in enumerate(results):
        if res == GROUND_TRUTH[idx]:
            score += 1
    print("Accuracy: ", score / len(GROUND_TRUTH))
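# Hypothetical sketch (not the project's classify(), which is defined elsewhere): a
# minimal nearest-neighbour classifier with the same call signature, to illustrate how
# test rows could be labeled from the training features and labels loaded above.
import numpy as np

def classify_sketch(test_rows, training_rows, training_labels):
    training_rows = np.asarray(training_rows, dtype=float)
    predictions = []
    for row in np.asarray(test_rows, dtype=float):
        # Label each test row with the label of its closest training row (Euclidean distance)
        nearest = int(np.argmin(np.linalg.norm(training_rows - row, axis=1)))
        predictions.append(training_labels[nearest])
    return predictions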
def main():
    DIR = "../TaxiBJ"
    X_hour, X_day, X_week, X_extra, y, timeslots = data_processing.main(DIR, True)

    num_epochs = 1
    batch_size = 16
    learning_rate = 1e-3

    model = Attention()
    # summary(model, input_size=[(18, 32, 32), (1, 1, 21)])
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=1e-5)

    train_dataset, test_dataset = train_test_split(X_hour, X_day, X_week,
                                                   X_extra, y, timeslots)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
    test_dataloader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=True)

    # Training loop
    for epoch in range(num_epochs):
        for X1, X2, y, timeslot in train_dataloader:
            X1 = X1.float()
            X2 = X2.float()
            y = y.float()

            output = model(X1, X2)
            # m = output[0][0].detach().numpy()
            # path = '../data/{}_{}.original.png'.format(2016021515, epoch)
            # print('drawing fig')
            # draw_fig(m, path)
            loss = criterion(output, y)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print('epoch [{}/{}], loss:{}'.format(epoch + 1, num_epochs, loss.data))

    torch.save(model, "model")

    # Report the loss on each test batch
    for X1, X2, y, timeslot in test_dataloader:
        X1 = X1.float()
        X2 = X2.float()
        y = y.float()
        output = model(X1, X2)
        loss = criterion(output, y)
        print(loss.data)

    return model
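# Optional sketch (not part of the original script): aggregating the per-batch test
# losses into a single average instead of printing each one. Assumes the same `model`,
# `criterion`, and `test_dataloader` objects created in main() above.
def evaluate(model, criterion, test_dataloader):
    model.eval()
    total_loss, n_batches = 0.0, 0
    with torch.no_grad():  # gradients are not needed for evaluation
        for X1, X2, y, timeslot in test_dataloader:
            output = model(X1.float(), X2.float())
            total_loss += criterion(output, y.float()).item()
            n_batches += 1
    return total_loss / max(n_batches, 1)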
def main():
    # Get data from the data_engineering step
    df = dp.main()
    df.cache()

    # Class imbalance sample rates
    sample_fractions = {
        "ChurnedInTimeBin": 0.065,
        "WillChurnInNextBin": 0.05,
        "WillChurnSoon": 0.12
    }

    # Use areaUnderPR as the evaluation metric
    evaluator = BinaryClassificationEvaluator(metricName='areaUnderPR')

    # Loop through the labels we're going to build models for
    for label, sample_fraction in sample_fractions.items():
        print("Building model for predicting this label: ", label)

        # Define features and labels in a dataframe for modeling
        dataset = define_features_and_label(df, label)

        # Downsample the majority class because of the class imbalance
        sampled_data = dataset.sampleBy('label', fractions={0: sample_fraction, 1: 1.0})
        dataset.unpersist()
        sampled_data.unpersist()

        # 80/20 split of the dataset for training/testing
        train, test = sampled_data.randomSplit([0.8, 0.2], seed=147309)
        train.cache()
        test.cache()

        # Train, cross-validate, and save models
        # Logistic regression
        bestLR = lr_model(train, test, evaluator)
        bestLR.write().overwrite().save("./Model/" + label + ".LRmodel")
        del bestLR

        # Random forest
        bestRF = rf_model(train, test, evaluator)
        bestRF.write().overwrite().save("./Model/" + label + ".RFmodel")
        del bestRF

        # Gradient-boosted trees
        bestGBT = gbt_model(train, test, evaluator)
        bestGBT.write().overwrite().save("./Model/" + label + ".GBTmodel")
        del bestGBT

        train.unpersist()
        test.unpersist()

    df.unpersist()
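# Hypothetical sketch (not the project's lr_model, which is defined elsewhere): one way a
# cross-validated logistic regression could be fit and scored with the same evaluator,
# assuming the training DataFrame carries 'features' and 'label' columns as produced by
# define_features_and_label.
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

def lr_model_sketch(train, test, evaluator):
    lr = LogisticRegression(featuresCol='features', labelCol='label')
    grid = (ParamGridBuilder()
            .addGrid(lr.regParam, [0.01, 0.1])
            .addGrid(lr.elasticNetParam, [0.0, 0.5])
            .build())
    cv = CrossValidator(estimator=lr, estimatorParamMaps=grid,
                        evaluator=evaluator, numFolds=3)
    cv_model = cv.fit(train)
    # Report the held-out areaUnderPR for the best model found by cross-validation
    print("Test areaUnderPR:", evaluator.evaluate(cv_model.transform(test)))
    return cv_model.bestModel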
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.optimizers import Adam

import data_processing
from data_post_process import bin_to_families, get_family_membership_idx, \
    get_train_test_idx
from helper_functions import plot_parity

processed_data = data_processing.main()
X, y = processed_data['X'], processed_data['y']

bin_bounds = [2, 5]
n_bins = len(bin_bounds) + 1
descriptor_names = processed_data['descriptor_names']
family_one_hotx = processed_data['family_one_hotx']

y, y_ordinal = bin_to_families(y, bin_upper_bounds=bin_bounds)

# Get class membership ids to split into train/test independently within each family
class_membership_idx = get_family_membership_idx(y_ordinal)

# pyplot parameters
plt.rcParams['svg.fonttype'] = 'none'
plt.rc('xtick', labelsize=16)
plt.rc('ytick', labelsize=16)
def main():
    # Filter warnings that pollute the project stdout.
    filter_warnings()
    # Rationale: produce cleaner results.

    # Set the random seed for the entire project.
    du.set_random_seed(0)
    # Rationale: ensure reproducibility of the results.

    # Flush previous runs.
    # constants.flush_project_results(constants.TMP_PATH,
    #                                 constants.OUTPUT_PATH)
    # Rationale: provide a clean state for the project to run and enforce
    # reproducibility of the results.

    # Download, load and save data.
    data_loading.main()
    dataframe = data_loading.load_data(constants.DATASET_PATH,
                                       constants.TMP_PATH)
    data_loading.save_data(dataframe, constants.TMP_PATH,
                           constants.DATASET_PATH)
    # Rationale: *Loading*: load data in the main module and pass it as a first
    # argument to every other defined function (that relates to the data set),
    # thus saving precious time on data loading. *Saving*: for big data sets,
    # saving the dataset in a fast-read format (such as HDF5) saves time.

    # Load and combine data processing pipelines.
    data_processing.main(dataframe, nan_strategy='drop')
    # Rationale: prepare data to be fed into the models.
    # Different algorithms make use of different data structures. For instance,
    # XGBoost allows for NaNs; data transformations usually don't.

    # Perform exploratory data analyses.
    data_exploration.main(dataframe)
    # Rationale: conduct exploratory data analyses.

    # Data split.
    # Removed.
    # Rationale: module 'models' should execute this.

    # Perform grid search.
    # Iteration over processed data sets may occur here since they are model
    # dependent.
    grid_search.main(constants.MODELS, constants.GRIDS)
    best_combination_of_datasets_and_grids = (
        grid_search.dict_of_best_datasets_and_grids(constants.MODELS,
                                                    constants.GRIDS))
    best_datasets = best_combination_of_datasets_and_grids['best_datasets']
    best_grids = best_combination_of_datasets_and_grids['best_grids']
    # Rationale: perform grid search as part of machine learning best
    # practices.

    # Summary of what was executed so far:
    # 1) Setting of the random seed for reproducibility.
    # 2) Flushing of intermediate results for a clean run.
    # 3) Data loading and data saving.
    # 4) Conduction of exploratory data analyses.
    # 5) Grid search of the best model hyperparameters.

    # To conclude the project, the grand finale: model selection and
    # evaluation/comparison.
    models.main(constants.MODELS, best_datasets, best_grids,
                constants.MODEL_FITTING_PARAMETERS)
#####
# Runs the API and data processing modules
#####

from api_connect import url_dict, callAPI
import data_processing as dp

if __name__ == "__main__":
    callAPI(url_dict)  # connects to the API
    dp.main()          # processes the raw tables
def main():
    # Import the processed data
    processed_data = data_processing.main()
    X, y = processed_data['X'], processed_data['y']
    feature_names = processed_data['descriptor_names']
    X = torch.tensor(X, device=device)
    y = torch.tensor(y.reshape(-1, 1), device=device)

    # Normalize X and y
    y_scale = y.std(dim=0)
    y_mean = y.mean(dim=0)
    X = (X - X.mean(dim=0)) / X.std(dim=0)
    y = (y - y_mean) / y_scale

    # 40% of the data is used as initial training points; the rest are test data
    initial_data_size = int(0.4 * X.shape[0])
    initial_idx = list(np.random.choice(X.shape[0], initial_data_size,
                                        replace=False))
    X_test, X_train = tensor_pop(X, to_pop=initial_idx)
    y_test, y_train = tensor_pop(y, to_pop=initial_idx)

    # Set up the GPR model
    gpr_model, gpr_mll = get_gpr_model(X=X_train, y=y_train)

    # Plot model performance
    plot_testing(gpr_model, X_test=X, X_train=X_train, y_train=y_train,
                 y_test=y, x_dim=13, feature_names=feature_names)

    # Bayesian optimization loop
    N_OPT_STEPS = 25
    opt_bounds = torch.stack([X.min(dim=0).values, X.max(dim=0).values])
    max_val, upper_confidence, lower_confidence = [], [], []
    for _ in range(N_OPT_STEPS):
        # Get the point with the maximum posterior mean and its confidence region
        print('X_train ', len(X_train), 'X_test ', len(X_test),
              'max y_train', y_train.max() * y_scale + y_mean)
        gpr_model.eval()
        posterior = gpr_model.posterior(X)
        lower, upper = posterior.mvn.confidence_region()
        max_posterior, index = posterior.mean.max(dim=0)
        print(max_posterior * y_scale + y_mean, 'max post')
        max_val.append(float(max_posterior * y_scale + y_mean))
        upper_confidence.append(float(upper[index] * y_scale + y_mean))
        lower_confidence.append(float(lower[index] * y_scale + y_mean))

        updated_model = optimize_loop(model=gpr_model,
                                      loss=gpr_mll,
                                      X_train=X_train, y_train=y_train,
                                      X_test=X_test, y_test=y_test,
                                      bounds=opt_bounds)
        gpr_model, gpr_mll = updated_model['model'], updated_model['loss']
        X_train, y_train = updated_model['X_train'], updated_model['y_train']
        X_test, y_test = updated_model['X_test'], updated_model['y_test']
        X_new, y_new = updated_model['X_new'], updated_model['y_new']

        # Plot model performance with the newly acquired point highlighted
        plot_testing(gpr_model, X_test=X, X_train=X_train, y_train=y_train,
                     y_test=y, x_dim=13, feature_names=feature_names,
                     X_new=X_new, y_new=y_new)

    # Plot the best posterior value found at each optimization step
    plt.plot(list(range(N_OPT_STEPS)), max_val,
             'go--', linewidth=2, markersize=12)
    plt.fill_between(list(range(N_OPT_STEPS)), lower_confidence,
                     upper_confidence, alpha=0.5, label='95% Credibility')
    plt.show()
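# Hypothetical sketch (the project's optimize_loop is defined elsewhere): one common way
# to pick the next candidate with BoTorch, maximizing Expected Improvement over the best
# normalized y value observed so far, using the same 2 x d bounds tensor built in main().
from botorch.acquisition import ExpectedImprovement
from botorch.optim import optimize_acqf

def suggest_next_point(gpr_model, y_train, bounds):
    acq = ExpectedImprovement(model=gpr_model, best_f=y_train.max())
    candidate, _ = optimize_acqf(acq_function=acq, bounds=bounds, q=1,
                                 num_restarts=10, raw_samples=64)
    return candidate  # shape (1, d); evaluate it and append it to the training set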
def launch_process_data(self):
    print("Launching data processing program")
    data_processing.main()