def get_test_data():
    '''
    Input: None
    Output: tuple - feature matrix X and the corresponding ids from the test set
    '''
    df = load_data(train=False)
    df = transform_data(df)
    X, ids = prep_for_modeling(df, column='id',
                               columns_to_drop=columns_to_drop[:-1])
    return X, ids
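# A minimal usage sketch, assuming an already-fitted `model` (hypothetical)
# with a scikit-learn-style predict(), and pandas imported as pd:
X_test, test_ids = get_test_data()
preds = model.predict(X_test)
pd.DataFrame({'id': test_ids, 'prediction': preds}).to_csv(
    'predictions.csv', index=False)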
def main():
    training_data, test_data, output_rsts = load_data()

    #######################################
    # Training
    #######################################
    input_layer_size = len(training_data[0][0])
    net = NN((input_layer_size, 30, 2), output_rsts)
    net.SGD(training_data, mini_batch_size=10, epochs=30, eta=3.0,
            test_data=test_data)
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import (Conv1D, Dense, Dropout,
                                     GlobalAveragePooling1D, Input,
                                     concatenate)
from tensorflow.keras.models import Model

import data_prep as dp  # assumed: `dp` is the project's data_prep module
# dir_ (the output directory) is defined elsewhere in the project.


def get_model(ticker):
    # Parallel Conv1D branches with different kernel sizes, each reduced by
    # global average pooling, then concatenated into one feature vector.
    n_sizes = [14, 11, 8, 5, 3, 2]
    n_filters = 64

    es = EarlyStopping(monitor="val_loss", min_delta=10e-3, patience=10)
    red = ReduceLROnPlateau()

    inputs = Input((20, 11))
    convs = []
    for size in n_sizes:
        conv = Conv1D(n_filters, size, strides=1, activation='relu')(inputs)
        # conv_max = GlobalMaxPooling1D()(conv)
        conv_avg = GlobalAveragePooling1D()(conv)
        # concat = concatenate([conv_max, conv_avg])
        convs.append(conv_avg)
    conv = concatenate(convs)

    dense = Dense(128)(conv)
    dropout = Dropout(0.2)(dense)
    dense = Dense(64)(dropout)
    dropout = Dropout(0.2)(dense)
    dense = Dense(32)(dropout)
    output = Dense(11)(dense)

    model = Model(inputs=[inputs], outputs=[output])
    model.compile('rmsprop', loss='mse')
    model.summary()

    X_train, y_train, X_val, y_val = dp.load_data(ticker)
    model.fit(x=X_train, y=y_train, epochs=100, batch_size=3200,
              validation_data=(X_val, y_val), callbacks=[es, red])

    # Re-expose the penultimate (32-unit) layer as the embedding output.
    new_model = Model(inputs=[model.layers[0].output],
                      outputs=[model.layers[-2].output])
    new_model.save('{}/timeseries_embeddor'.format(dir_))
    return new_model
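# A minimal sketch of reusing the saved embeddor (assumes `dir_` is set and
# the input batch matches the (20, 11) window shape above):
import numpy as np
import tensorflow as tf

embeddor = tf.keras.models.load_model('{}/timeseries_embeddor'.format(dir_))
windows = np.random.rand(5, 20, 11).astype('float32')  # placeholder batch
embeddings = embeddor.predict(windows)  # -> shape (5, 32)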
import json
import os
import sys

import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import models

import data_prep
# train and demo_test are project helpers defined elsewhere.


def main():
    # Load the config file; fall back to the default profile.
    config = {}
    if len(sys.argv) < 2:
        print("No argv provided, using DEFAULT config profile.")
        with open('./config/default.json') as jsonConfig:
            config = json.load(jsonConfig)
    else:
        print("Using " + sys.argv[1] + " config profile.")
        with open('./config/' + sys.argv[1] + '.json') as jsonConfig:
            config = json.load(jsonConfig)

    # Get the CUDA device if one is available.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Create the checkpoint path.
    if not os.path.exists("./checkpoints"):
        os.makedirs("./checkpoints")

    # Default hyperparameters.
    num_epoch = config['num_epoch']
    batch_size = config['batch_size']

    if config['cnn_type'] == "resnet50":
        model = models.resnet50(pretrained=True).to(device)
        # model = models.resnet50(pretrained=False).to(device)

        # Feature extraction; disable to finetune the whole model.
        # for name, param in model.named_parameters():
        #     if ("layer4" not in name) and ("layer3" not in name) \
        #             and ("layer2" not in name):
        #         param.requires_grad = False
        #         print(name)

        num_ftrs = model.fc.in_features
        model.fc = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(num_ftrs, 2)).to(device)

        # params_to_update = []
        # for name, param in model.named_parameters():
        #     if param.requires_grad:
        #         params_to_update.append(param)
        #         print("\t", name)

        new_train_dataset, new_test_dataset, _ = data_prep.load_data()

        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=config["lr"])

        trainloader = torch.utils.data.DataLoader(
            new_train_dataset, batch_size=batch_size, shuffle=True)
        unknownloader = torch.utils.data.DataLoader(
            new_test_dataset, batch_size=22, shuffle=False)

        model.train()
        train(trainloader, unknownloader, model, criterion, optimizer,
              device, num_epoch)
        exit()

    elif config['cnn_type'] == "demo":
        print("Demo Time!")
        model = models.resnet50(pretrained=False).to(device)
        # model = models.vgg11_bn(pretrained=True).to(device)
        num_ftrs = model.fc.in_features
        model.fc = nn.Sequential(
            # nn.Dropout(0.5),
            nn.Linear(num_ftrs, 2)).to(device)
        # num_ftrs = model.classifier[6].in_features
        # model.classifier[6] = nn.Linear(num_ftrs, 2).to(device)

        # Load the 99%-accurate checkpoint.
        if torch.cuda.is_available():
            model.load_state_dict(torch.load('checkpoints/100'))
        else:
            model.load_state_dict(
                torch.load('checkpoints/100', map_location='cpu'))

        new_train_dataset, new_test_dataset, raw_unknown_data = \
            data_prep.load_data()
        unknownloader = torch.utils.data.DataLoader(
            new_test_dataset, batch_size=22, shuffle=False)
        # my_raw_unknown_dataset = MyDataset(raw_unknown_data, unknown_labels, 224)
        # raw_unknownloader = torch.utils.data.DataLoader(
        #     my_unknown_dataset, batch_size=5, shuffle=False)

        demo_test(unknownloader, raw_unknown_data, model, device)
        exit()

    else:
        print("No Model Provided!")
        exit()
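# A minimal sketch of a config profile matching the keys this script reads
# (cnn_type, batch_size, num_epoch, lr); the values here are assumptions.
# Saved as ./config/default.json:
#
#     {
#         "cnn_type": "resnet50",
#         "batch_size": 32,
#         "num_epoch": 30,
#         "lr": 0.001
#     }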
import pickle

import numpy as np
import pandas as pd

import data_prep
# random_forest, dt, svc, ada, and test_clf are project helpers defined elsewhere.


def main(enc='le'):
    X_train, X_test, y_train, y_test = data_prep.load_data()
    data_prep.describe_df(pd.DataFrame(X_train))

    if enc == 'ohe':
        # One-hot encode every column except indices 1, 4, and 12.
        X_train = data_prep.ohe_encode(
            X_train, np.delete(list(range(0, X_train.shape[1])), [1, 4, 12]))
        ohe = pickle.load(open('../res/ohe.pkl', 'rb'))
        X_test = ohe.transform(X_test)

    models = [("Random Forest", random_forest), ("Decision Tree", dt),
              ("SVC", svc), ("Adaboost", ada)]

    # =========================================================================
    # No feature engineering
    # =========================================================================
    for name, fit in models:
        print(name + ".....")
        clf = fit(X_train, y_train)
        print(test_clf(clf, X_test, y_test))

    # =========================================================================
    # PCA
    # =========================================================================
    X_train_pca = data_prep.feature_selection_pca(X_train, 17)
    pca = pickle.load(open('../res/pca.pkl', 'rb'))
    X_test_pca = pca.transform(X_test)

    print("With PCA")
    for name, fit in models:
        print(name + ".....")
        clf = fit(X_train_pca, y_train)
        print(test_clf(clf, X_test_pca, y_test))

    # =========================================================================
    # Select K features
    # =========================================================================
    if enc == "le":
        np.random.seed(9)
        X_train = pd.DataFrame(X_train)
        X_test = pd.DataFrame(X_test)
        feat = data_prep.feature_selection(X_train, y_train, 15)
        X_train_feat = X_train[feat]
        X_test_feat = X_test[feat]

        print("With Features selector")
        for name, fit in models:
            print(name + ".....")
            clf = fit(X_train_feat, y_train)
            print(test_clf(clf, X_test_feat, y_test))
import numpy as np
import cv2
import tensorflow as tf
from tensorflow.keras.utils import CustomObjectScope
from tqdm import tqdm
# load_data, tf_dataset, and iou are project helpers defined elsewhere.


def read_image(path):
    # The original def line is missing, so this name is assumed. Reads a
    # grayscale image, resizes to 256x256, scales to [0, 1], and adds a
    # trailing channel axis.
    x = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    x = cv2.resize(x, (256, 256))
    x = x / 255.0
    x = np.expand_dims(x, axis=-1)
    return x


def mask_parse(mask):
    # Collapse to 2-D, then stack into 3 identical channels: (H, W) -> (H, W, 3).
    mask = np.squeeze(mask)
    mask = [mask, mask, mask]
    mask = np.transpose(mask, (1, 2, 0))
    return mask


if __name__ == "__main__":
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data("dataset/")
    print(len(train_x), len(test_x))

    batch = 8
    test_dataset = tf_dataset(test_x, test_y, batch=batch)
    test_steps = len(test_x) // batch
    if len(test_x) % batch != 0:
        test_steps += 1

    with CustomObjectScope({'iou': iou}):
        model = tf.keras.models.load_model("files/model.h5")
    model.evaluate(test_dataset, steps=test_steps)

    for i, (x, y) in tqdm(enumerate(zip(test_x, test_y)), total=len(test_x)):
        ...  # loop body truncated in the original; a possible body is sketched below
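# A possible body for the truncated loop above (a sketch, not the original):
# predict a mask for each test image and save input, ground truth, and
# prediction side by side. Assumes the model outputs (256, 256, 1) masks and
# that a "results/" directory exists.
for i, (x, y) in tqdm(enumerate(zip(test_x, test_y)), total=len(test_x)):
    image = read_image(x)
    mask = read_image(y)
    y_pred = model.predict(np.expand_dims(image, axis=0))[0] > 0.5
    sep = np.ones((256, 10, 3)) * 255.0  # white separator strip
    combined = np.concatenate([
        np.concatenate([image] * 3, axis=-1) * 255.0, sep,
        mask_parse(mask) * 255.0, sep,
        mask_parse(y_pred) * 255.0
    ], axis=1)
    cv2.imwrite(f"results/{i}.png", combined)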
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

import data_prep

classifier_idx = ['SVC', 'DT', 'RF', 'MLP', 'ADA', 'NB']
classifiers = [
    SVC(gamma=1, C=0.001),
    DecisionTreeClassifier(max_depth=7, random_state=0),
    RandomForestClassifier(n_estimators=1000, criterion='entropy',
                           random_state=0),
    MLPClassifier(activation='relu', learning_rate='adaptive'),
    AdaBoostClassifier(DecisionTreeClassifier(random_state=0, max_depth=10),
                       algorithm="SAMME", n_estimators=1000, random_state=0),
    GaussianNB()]

X_train, X_test, y_train, y_test = data_prep.load_data()
# data_prep.describe_df(pd.DataFrame(X_train))


def process_clf(X_train, y_train, X_test, y_test, filename):
    # Fit each classifier and record accuracy, per-class recall, and the
    # confusion matrix to a CSV file.
    record = pd.DataFrame()
    for count, clf in enumerate(classifiers):
        print('Working on ' + classifier_idx[count])
        clf = clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        acc = sum(pred == y_test) / len(y_test)
        recall = recall_score(y_test, pred, average=None)
        rec = pd.DataFrame({'accuracy': acc,
                            'recall': recall,
                            'confusion_mat': str(confusion_matrix(y_test, pred)),
                            'classifier': classifier_idx[count]})
        record = pd.concat([record, rec])
    record.to_csv(filename)
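# Example run of the helper above (the output filename is hypothetical):
process_clf(X_train, y_train, X_test, y_test, 'baseline_results.csv')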
import pandas as pd

from data_prep import load_data


def category_column_correlation(column, target):
    '''
    Input: Series - column to be dummied, Series - target, undummified
    Output: DataFrame - correlation coefficient matrix between the dummies
            of column and target (non-redundant)
    '''
    column_dummies = pd.get_dummies(column)
    target_dummies = pd.get_dummies(target)
    correlation_df = pd.concat([column_dummies, target_dummies], axis=1).corr()
    # .ix was removed from pandas; .loc does the same row/column selection.
    return correlation_df.loc[column_dummies.columns, target_dummies.columns]


if __name__ == '__main__':
    df = load_data()
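# A toy check of the helper (made-up data, not from the project):
color = pd.Series(['red', 'blue', 'red', 'green', 'blue', 'red'])
label = pd.Series(['yes', 'no', 'yes', 'no', 'no', 'yes'])
print(category_column_correlation(color, label))
# Rows are the color dummies, columns the label dummies; each cell is the
# Pearson correlation between the two indicator columns.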
import tensorflow as tf
import sys
from datetime import datetime

now = datetime.now()

# Supplementary files
import data_prep as data

print("--- Dependencies Loaded ---")

data_type = ""
if len(sys.argv) == 2:
    data_type = sys.argv[1].lower()
    if data_type == "raw":
        print("Loading raw data")
        m_data, idx_q, idx_a = data.load_data(True)
    elif data_type == "clean":
        print("Loading clean data")
        m_data, idx_q, idx_a = data.load_data(False)
    else:
        sys.exit("Error, incorrect command line arguments. Please supply "
                 "either 'raw' or 'clean' for which dataset should be used.")
else:
    sys.exit("Error, incorrect command line arguments. Please supply "
             "either 'raw' or 'clean' for which dataset should be used.")

(trX, trY), (teX, teY), (vaX, vaY) = data.split_dataset(idx_q, idx_a)
print("--- Data Loaded ---")

# Hyperparameters
epochs = 500
batch_size = 64
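# Invocation sketch (the script filename is hypothetical):
#   python train.py raw     # load the raw dataset
#   python train.py clean   # load the cleaned dataset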
import numpy as np
import tensorflow as tf
from tensorflow.keras.metrics import Precision, Recall
# load_data, tf_dataset, and build_model are project helpers defined elsewhere.


def iou(y_true, y_pred):
    # Intersection-over-union computed in numpy and wrapped with
    # tf.numpy_function so it can serve as a Keras metric; the 1e-15 terms
    # guard against division by zero on empty masks.
    def f(y_true, y_pred):
        intersection = (y_true * y_pred).sum()
        union = y_true.sum() + y_pred.sum() - intersection
        x = (intersection + 1e-15) / (union + 1e-15)
        x = x.astype(np.float32)
        return x
    return tf.numpy_function(f, [y_true, y_pred], tf.float32)


if __name__ == "__main__":
    (train_x, train_y), (valid_x, valid_y), (test_x, test_y) = load_data("dataset/")
    print(len(train_x), len(test_x))

    batch = 8
    lr = 1e-4
    epochs = 50

    train_dataset = tf_dataset(train_x, train_y, batch=batch)
    valid_dataset = tf_dataset(valid_x, valid_y, batch=batch)

    model = build_model()
    optimizer = tf.keras.optimizers.Adam(lr)
    metrics = ['acc', Recall(), Precision(), iou]
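# A quick numpy sanity check of the IoU formula above (toy masks; eager
# execution assumed, which is the TF2 default):
a = np.array([1.0, 1.0, 0.0, 0.0], dtype=np.float32)
b = np.array([1.0, 0.0, 1.0, 0.0], dtype=np.float32)
# intersection = 1, union = 2 + 2 - 1 = 3, so IoU is about 0.333
print(float(iou(a, b)))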
import os

import torchvision
from PIL import ImageFile

import data_prep
import util

# %%
# Work around some truncated images: https://stackoverflow.com/a/23575424
ImageFile.LOAD_TRUNCATED_IMAGES = True

# %%
# Prepare data.
data_dir = "data"
all_photos_dir = os.path.join(data_dir, "photos")
split_photos_dir = os.path.join(data_dir, "from")

df = data_prep.load_data(os.path.join(data_dir, "_chat.txt"))
data_prep.show_plots(df)
dataloaders, dataset_sizes, class_names = \
    data_prep.prepare_loaders(split_photos_dir)

# %%
# View some data.
# Get a batch of training data.
inputs, classes = next(iter(dataloaders["train"]))

# Make a grid from the batch.
out = torchvision.utils.make_grid(inputs)
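# %%
# A minimal sketch for displaying the grid; assumes matplotlib is available
# and that the batch images are roughly in [0, 1] (clip absorbs any
# normalization overshoot).
import matplotlib.pyplot as plt
import numpy as np

npimg = np.clip(out.numpy().transpose((1, 2, 0)), 0, 1)  # CHW -> HWC
plt.imshow(npimg)
plt.title(", ".join(class_names[int(c)] for c in classes))
plt.axis("off")
plt.show()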
import numpy as np
# load_data, create_spline_list, get_cov_lists, get_priors, add_ier_prior,
# MR_BeRT, ratioInit, and get_parameter_samples are project helpers.


def run_ap_model(outcome, ier_prior=False, measure='log_ratio',
                 include_smoking=False, include_shs=True, mono=False,
                 cvcv=False, oap_gold_standard=False, n_splines=100,
                 n_ns_knots=4, n_s_knots=4, n_bins=200):
    # Load data; CVD outcomes are age-adjusted.
    if outcome.startswith('cvd'):
        age_adjust = True
    else:
        age_adjust = False
    df, model_cols, obs_mean, obs_std, study_sizes, N = load_data(
        outcome, measure=measure, age_adjust=age_adjust,
        include_smoking=include_smoking, include_shs=include_shs)

    # if outcome == 'lri':
    #     model_cols = ['child']
    #     add_age = False
    # elif outcome.startswith('cvd'):
    #     model_cols = ['incidence']
    #     add_age = False
    # else:
    #     model_cols = []
    #     add_age = False
    model_cols = []
    add_age = False
    if oap_gold_standard:
        model_cols = model_cols + ['other_ap']

    # Check for NAs in the model columns.
    for model_col in model_cols:
        # df.loc[df[model_col].isnull(), model_col] = 0
        if len(df.loc[df[model_col].isnull()]) > 0:
            problem_nid_list = df.loc[df[model_col].isnull(), 'nid'].tolist()
            problem_nid = ', '.join(str(nid) for nid in problem_nid_list)
            raise ValueError(
                f'Missing value for {model_col} in NID(s) {problem_nid}')
    if add_age:
        assert df['median_age_fup'].max() > 0, \
            'Age included model with no age data.'

    # Create the exposure spline(s).
    ns_spline_mat = df.loc[df.ier_source != 'AS',
                           ['conc_den', 'conc']].values.flatten()
    spline_list = create_spline_list(ns_spline_mat, degree=3,
                                     n_knots=n_ns_knots, l_linear=False,
                                     r_linear=True, n_splines=n_splines,
                                     width_pct=0.2, l_zero=True)
    if include_smoking:
        # Just use the tail end for smoking.
        s_spline_mat = df.loc[df.ier_source == 'AS', 'conc'].values
        s_spline_list = create_spline_list(s_spline_mat, degree=3,
                                           n_knots=n_s_knots,
                                           n_splines=n_splines,
                                           width_pct=0.2, l_zero=False)
        for i in range(n_splines):
            spline_list[i].knots = np.hstack(
                [spline_list[i].knots, s_spline_list[i].knots])

    # Covariates and priors.
    x_cov_list, z_cov_list = get_cov_lists(df, model_cols, measure=measure,
                                           add_age=add_age)
    prior_list = get_priors(outcome=outcome, measure=measure,
                            n_ns_knots=n_ns_knots,
                            exp_spline=spline_list[0],
                            age_decreasing=False, cvcv=cvcv, mono=mono)
    if ier_prior:
        prior_list = add_ier_prior(prior_list, outcome, spline_list,
                                   n_ns_knots)

    # Run the meta-regression.
    mr = MR_BeRT(obs_mean=obs_mean, obs_std=obs_std,
                 study_sizes=study_sizes, x_cov_list=x_cov_list,
                 z_cov_list=z_cov_list, spline_list=spline_list,
                 inlier_percentage=0.9)
    mr.addPriors(prior_list)
    if measure == 'log_ratio':
        x0 = ratioInit(mr, 0)
    else:
        x0 = None
    mr.fitModel(x0=x0)
    mr.scoreModel(np.array([0.4, 0.6]))
    given_samples = get_parameter_samples(mr, len(mr.spline_list) * 10)

    # if include_smoking:
    #     exp_pred_array = np.linspace(spline_list[0].knots[0],
    #                                  spline_list[0].knots[n_ns_knots - 1],
    #                                  int(n_bins / 2) + 1)
    #     s_exp_pred_array = np.linspace(spline_list[0].knots[n_ns_knots - 1],
    #                                    spline_list[0].knots[-1],
    #                                    int(n_bins / 2))
    #     exp_pred_array = np.hstack([exp_pred_array[:-1], s_exp_pred_array])
    #     exp_pred_array = np.unique(exp_pred_array)
    # else:
    #     exp_pred_array = np.linspace(spline_list[0].knots[0],
    #                                  spline_list[0].knots[-1], n_bins)
    # Log-like exposure grid: finer resolution at low concentrations.
    exp_pred_array = np.hstack([
        np.arange(0, 10, 0.01),
        np.arange(10, 100, 0.1),
        np.arange(100, 1000),
        np.arange(1000, 10010, 10)
    ])
    if add_age:
        age_pred_array = np.percentile(df['median_age_fup'], 50)
        age_pred_array = np.repeat(age_pred_array, n_bins)
    else:
        age_pred_array = None
    pred_x_cov_list, pred_z_cov_list = get_cov_lists(
        model_cols=model_cols, measure=measure, add_age=add_age,
        linear=False, pred=True, pred_ref=True,
        exp_pred_array=exp_pred_array, age_pred_array=age_pred_array)
    if len(x_cov_list) > 1:
        pred_x_cov_list_alt, pred_z_cov_list_alt = get_cov_lists(
            model_cols=model_cols, measure=measure, add_age=add_age,
            linear=False, pred=True, pred_ref=False,
            exp_pred_array=exp_pred_array, age_pred_array=age_pred_array)
        for i in range(len(pred_x_cov_list)):
            pred_x_cov_list[i]['mat'] = np.hstack(
                [pred_x_cov_list[i]['mat'], pred_x_cov_list_alt[i]['mat']])
        for i in range(len(pred_z_cov_list)):
            pred_z_cov_list[i]['mat'] = np.hstack(
                [pred_z_cov_list[i]['mat'], pred_z_cov_list_alt[i]['mat']])
    pred_x_cov_list_data_l, pred_z_cov_list_data_l = get_cov_lists(
        model_cols=model_cols, measure=measure, add_age=add_age,
        linear=False, pred=True, pred_ref=True,
        exp_pred_array=df['conc_den'].values)

    if measure == 'log_ratio':
        ref_point = spline_list[0].knots[0]
    elif measure == 'diff':
        ref_point = None
    y_samples = mr.predictData(
        pred_x_cov_list, pred_z_cov_list, sample_size=n_splines * 10,
        pred_study_sizes=[len(pred_x_cov_list[0]['mat'])],
        include_random_effect=True, ref_point=ref_point,
        **given_samples)[0]
    y_samples = np.vstack(y_samples)
    y_samples_fe = mr.predictData(
        pred_x_cov_list, pred_z_cov_list, sample_size=n_splines * 10,
        pred_study_sizes=[len(pred_x_cov_list[0]['mat'])],
        include_random_effect=False, ref_point=ref_point,
        **given_samples)[0]
    y_samples_fe = np.vstack(y_samples_fe)
    y_samples_fe_data_l = mr.predictData(
        pred_x_cov_list_data_l, pred_z_cov_list_data_l,
        sample_size=n_splines * 10, pred_study_sizes=mr.study_sizes,
        include_random_effect=False, ref_point=ref_point,
        **given_samples)[0]
    y_samples_fe_data_l = np.vstack(y_samples_fe_data_l)

    return (df, mr, given_samples, pred_x_cov_list, y_samples,
            y_samples_fe, y_samples_fe_data_l)
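# A minimal invocation sketch (the outcome label 'cvd_ihd' is hypothetical;
# valid labels depend on what load_data accepts):
df, mr, samples, pred_x_covs, y_draws, y_draws_fe, y_draws_fe_data = \
    run_ap_model('cvd_ihd', ier_prior=True, measure='log_ratio')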