from datetime import datetime

import data_io
import train


def main():
    test = data_io.read_test()

    ## deal with the NAs, and add features
    train.feature_eng(test)

    ## predict the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(True)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    b_fnames = train.get_features(test, True)
    b_test_f = test[b_fnames].values
    # keep the probabilities as a numpy array so the score combination
    # below stays elementwise (converting to a list here would make
    # 4 * b_prob repeat the list and + concatenate)
    b_prob = -1.0 * classifier.predict_proba(b_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## predict the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_io.load_model(False)
    print("Time used,")
    print(datetime.now() - tstart)

    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    c_fnames = train.get_features(test, False)
    c_test_f = test[c_fnames].values
    c_prob = -1.0 * classifier.predict_proba(c_test_f)[:, 1]
    print("Time used,")
    print(datetime.now() - tstart)

    ## Making Recommendations: booking weighted 4:1 against clicking
    recommendations = zip(test["srch_id"], test["prop_id"],
                          4 * b_prob + c_prob)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_io.write_submission(recommendations)
    print("Time used,")
    print(datetime.now() - tstart)

from datetime import datetime

import data_import
import train


def main():
    ## load test data set and do feature engineering
    test = data_import.load_test()
    train.feature_eng(test)

    ## load classifier for the booking_bool
    print("Loading the Booking classifier..")
    tstart = datetime.now()
    classifier = data_import.load_model(True)
    print("Time used: " + str(datetime.now() - tstart) + "\n")

    ## predict the booking_bool
    print("Making predictions on the booking_bool..")
    tstart = datetime.now()
    book_feature_names = train.get_features(test)
    book_X = test[book_feature_names].values
    # keep the predictions as a numpy array so the score combination
    # below is elementwise rather than list repetition/concatenation
    book_Y_pred = -1.0 * classifier.predict_proba(book_X)[:, 1]
    print("Time used: " + str(datetime.now() - tstart) + "\n")

    ## load classifier for the click_bool
    print("Loading the Click classifier..")
    tstart = datetime.now()
    classifier = data_import.load_model(False)
    print("Time used: " + str(datetime.now() - tstart) + "\n")

    ## predict the click_bool
    print("Making predictions on the click_bool..")
    tstart = datetime.now()
    click_feature_names = train.get_features(test)
    click_X = test[click_feature_names].values
    click_Y_pred = -1.0 * classifier.predict_proba(click_X)[:, 1]
    print("Time used: " + str(datetime.now() - tstart) + "\n")

    ## build results where the 3rd column is a score combining the
    ## likelihoods of booking and clicking (booking weighted 4:1)
    results = zip(test["srch_id"], test["prop_id"],
                  4 * book_Y_pred + click_Y_pred)

    print("Writing predictions to file..")
    tstart = datetime.now()
    data_import.write_submission(results)
    print("Time used: " + str(datetime.now() - tstart) + "\n")
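
# data_io.write_submission / data_import.write_submission is not shown in
# these excerpts. Below is a minimal, hypothetical stand-in consistent with
# how it is called above; the output column names and the ascending sort on
# the negated scores are assumptions, not taken from the source.
import csv


def write_submission(recommendations, path="submission.csv"):
    # sort by search id, then ascending on the negated score, so the
    # properties with the highest predicted probability come first
    rows = sorted(recommendations, key=lambda row: (row[0], row[2]))
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["SearchId", "PropertyId"])
        writer.writerows((srch_id, prop_id) for srch_id, prop_id, _ in rows)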

def initialize_model():
    if not os.path.exists(cfg.MODEL_BIN):
        car_features, notcar_features = get_features()
        svc, X_scaler = train_model(car_features, notcar_features)
        model = {'svc': svc, 'X_scaler': X_scaler}
        # Save the model on disk
        with open(cfg.MODEL_BIN, 'wb') as f:
            pickle.dump(model, f)
    else:
        svc, X_scaler = load_model()
    return svc, X_scaler
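
# load_model() is referenced above but not shown. Since initialize_model()
# pickles a dict with 'svc' and 'X_scaler' keys to cfg.MODEL_BIN, a matching
# loader would plausibly be the following sketch (not necessarily the
# source's actual implementation):
import pickle


def load_model():
    # unpickle the dict written by initialize_model and unpack it
    with open(cfg.MODEL_BIN, 'rb') as f:
        model = pickle.load(f)
    return model['svc'], model['X_scaler']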

import os
import sys

import jellyfish as jelly  # assuming `jelly` is the jellyfish library
                           # (older releases expose jaro_winkler)
import joblib
import numpy as np
from sklearn.metrics import auc, precision_recall_curve


def main(args):
    pairs = []
    features, labels = [], []
    dist_predictions = []
    val = {'True': 1, 'False': 0}

    sys.stdout.write('> Computing features for test data ...')
    with open(args['flashprofile_output'], 'r') as f:
        data = f.read().split('\n')[:-1]
        # every third line carries the ground-truth label and the
        # FlashProfile score
        dist_predictions.append(
            ('FlashProfile',
             np.fromiter((float(s.split(' :: ')[0].split(' @ ')[1])
                          for s in data[::3]), float)))
        labels.extend(val[s.split('|')[0].strip()] for s in data[::3])
        # the remaining lines come in consecutive pairs of strings;
        # zipping the iterator with itself pairs them up
        strings = iter(s[11:-1] for (i, s) in enumerate(data) if i % 3 > 0)
        for s1, s2 in zip(strings, strings):
            features.append(get_features(s1, s2))
            pairs.append((s1, s2))
    print('\r> Feature vector computation DONE (on %d points)\n' % len(pairs))

    dist_predictions.append(
        ('JaroWinkler', [jelly.jaro_winkler(*p) for p in pairs]))

    for pair in args['sim-dis-combination']:
        num_sim_pairs, num_dis_pairs = pair.split(',')
        num_sim_pairs, num_dis_pairs = int(num_sim_pairs), int(num_dis_pairs)
        model = joblib.load(
            os.path.join(args['root_dir'], 'logs',
                         'RandomForest.%d.%d.pkl' % (num_sim_pairs,
                                                     num_dis_pairs)))
        dist_predictions.append(('RF.%d.%d' % (num_sim_pairs, num_dis_pairs),
                                 model.predict(features)))

    for (dfile, predictions) in dist_predictions:
        with open(os.path.join(args['root_dir'], 'logs',
                               'Similarity.%sPR.log' % dfile), 'w') as f:
            f.write('precision\trecall\n')
            precision, recall, _ = precision_recall_curve(labels, predictions)
            for pr in zip(precision, recall):
                f.write('%f\t%f\n' % pr)
        # note: the `reorder` argument was removed from sklearn.metrics.auc
        # in newer scikit-learn releases
        vauc = auc(recall, precision, reorder=True)
        print('AUC(%s) = %f' % (dfile, vauc))
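
# The zip(strings, strings) call above pairs consecutive items because both
# arguments are the *same* iterator, so each output tuple advances it twice.
# A quick self-contained illustration:
lines = iter(['a', 'b', 'c', 'd'])
print(list(zip(lines, lines)))  # [('a', 'b'), ('c', 'd')]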

                  type='string',
                  dest='config',
                  default='train_config_threelayer.yml',
                  help='configuration file')
(options, args) = parser.parse_args()
yamlConfig = parse_config(options.config)

if os.path.isdir(options.outputDir):
    # raise Exception('output directory must not exist yet')
    raw_input("Warning: output directory exists. Press Enter to continue...")
else:
    os.mkdir(options.outputDir)

X_train_val, X_test, y_train_val, y_test, labels = get_features(
    options, yamlConfig)

model_constraint = getattr(models, yamlConfig['KerasModelRetrain'])

# Instantiate a new model with the added custom constraints
if 'L1RegR' in yamlConfig:
    keras_model = model_constraint(Input(shape=X_train_val.shape[1:]),
                                   y_train_val.shape[1],
                                   l1Reg=yamlConfig['L1Reg'],
                                   l1RegR=yamlConfig['L1RegR'],
                                   h5fName=options.dropWeights)
else:
    keras_model = model_constraint(Input(shape=X_train_val.shape[1:]),
                                   y_train_val.shape[1],
                                   l1Reg=yamlConfig['L1Reg'],
                                   h5fName=options.dropWeights)
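
# parse_config() is not shown in this excerpt. A minimal stand-in matching
# how it is used above (returning the YAML configuration as a dict):
import yaml


def parse_config(config_file):
    # read the training configuration YAML into a plain dict
    with open(config_file) as f:
        return yaml.safe_load(f)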

    # Declare what we will be optimizing, and how:
    "spec": {
        "metric": "ROC",
        "objective": "maximize",
    },
}

parameters = open("parameters.yml")
yamlparameters = yaml.load(parameters, Loader=yaml.FullLoader)

opt = Optimizer(config,
                api_key=yamlparameters["comet_api_key"],
                project_name="NNqhmv6",
                auto_metric_logging=True)

X_train, X_test, y_train, y_test = get_features(yamlparameters["DataDir"])

for experiment in opt.get_experiments():
    keras_model = models.qdense_model(
        Input(shape=X_train.shape[1:]),
        l1Reg=experiment.get_parameter("Regularization"),
        bits=14,
        ints=2)
    # keras_model = models.dense_model(Input(shape=X_train.shape[1:]),
    #                                  l1Reg=experiment.get_parameter("Regularization"))
    startlearningrate = experiment.get_parameter("learning_rate")
    adam = Adam(lr=startlearningrate,
                beta_1=experiment.get_parameter("learning_beta1"),
                beta_2=experiment.get_parameter("learning_beta2"),
                amsgrad=experiment.get_parameter("Adagrad"))
    keras_model.compile(optimizer=adam,
                        loss='binary_crossentropy')
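
# The excerpt above begins inside the Optimizer's config dict. For Comet ML's
# Optimizer, the enclosing structure would look roughly like the sketch below;
# the algorithm choice and the parameter search ranges are illustrative
# guesses, not values from the source:
config = {
    "algorithm": "bayes",
    "parameters": {
        "learning_rate": {"type": "float", "min": 1e-4, "max": 1e-1,
                          "scalingType": "loguniform"},
        "Regularization": {"type": "float", "min": 1e-6, "max": 1e-2},
    },
    # Declare what we will be optimizing, and how:
    "spec": {
        "metric": "ROC",
        "objective": "maximize",
    },
}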

def add_embeddings(CLASSES, model_name, fold_index, checkPoint_start,
                   features_file):
    # Get the model
    device = torch.device('cuda')
    model = model_whale(num_classes=CLASSES * 2,
                        inchannels=4,
                        model_name=model_name).to(device)

    # Find result dir
    resultDir = './result/{}_{}'.format(model_name, fold_index)
    checkPoint = os.path.join(resultDir, 'checkpoint')

    # Load the pretrained weights
    if checkPoint_start != 0:
        # the optimizer checkpoint is loaded here but not used afterwards
        ckp = torch.load(
            os.path.join(checkPoint,
                         '%08d_optimizer.pth' % (checkPoint_start)))
        model.load_state_dict(
            torch.load(
                os.path.join(checkPoint,
                             '%08d_model.pth' % (checkPoint_start))))

    # Load image data
    to_add = pd.read_csv('./input/embed_split_{}_add.csv'.format(fold_index))

    # Only do if necessary: split the embedding images into a "test" half
    # and an "add" half and save them to separate files
    if 0:
        to_add = pd.read_csv('./input/embed_split_{}.csv'.format(fold_index))
        data_test = to_add[1::2]
        outfile = "./input/embed_split_{}_test.csv".format(fold_index)
        data_test.to_csv(outfile, index=None)
        to_add = to_add[::2]
        outfile = "./input/embed_split_{}_add.csv".format(fold_index)
        to_add.to_csv(outfile, index=None)

    names_embed = to_add['Image'].tolist()
    labels_embed = to_add['Id'].tolist()
    batch_size = 16
    mode = 'embed'
    print("\nNumber of images to add:", len(names_embed))

    # Setup dataloader
    dst_embed = WhaleTestDataset(names_embed, labels_embed, mode=mode,
                                 transform=transform)
    dataloader_embed = DataLoader(dst_embed,
                                  shuffle=False,
                                  drop_last=False,
                                  batch_size=batch_size,
                                  num_workers=8,
                                  collate_fn=embed_collate)

    # Load the existing embeddings
    infile = "train_features{}.csv".format(features_file)
    embeddings = torch.Tensor(pd.read_csv(infile).to_numpy()).float()
    infile2 = "train_ids{}.csv".format(features_file)
    ids = torch.Tensor(pd.read_csv(infile2).to_numpy()).long()

    # Get the features to add
    new_ids, feats = get_features(dataloader_embed, model, CLASSES * 2)

    # Concatenate and save the features
    added_feats = torch.cat([embeddings, torch.Tensor(feats).float()], 0)
    added_ids = torch.cat([ids.view(-1), torch.Tensor(new_ids).long()], 0)
    outfile = "train_ids{}_added.csv".format(features_file)
    outfile2 = "train_features{}_added.csv".format(features_file)
    df1 = pd.DataFrame(added_ids.numpy())
    df2 = pd.DataFrame(added_feats.numpy())
    # Keep track of id, vector and some info about where this
    # was gotten (model, fold, iteration?)
    df1.to_csv(outfile, index=None)
    df2.to_csv(outfile2, index=None)
    print("Files {} and {} created with added ids and features.".format(
        outfile, outfile2))
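
# A call to this helper might look like the following; the class count,
# backbone name, fold, and checkpoint iteration are illustrative values,
# not ones taken from the source:
if __name__ == '__main__':
    add_embeddings(CLASSES=5004,
                   model_name='senet154',
                   fold_index=0,
                   checkPoint_start=60000,
                   features_file='')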