def _load_features(dataset, data_path, block_size=128):
    """Load the features and the associated metadata for a dataset.

    The metadata is read from a CSV file and returned as a DataFrame.
    Each DataFrame entry corresponds to an instance in the dataset.

    Args:
        dataset (Dataset): Information about the dataset.
        data_path (str): Path to directory containing feature vectors.

    Returns:
        tuple: Tuple containing the array of feature vectors and the
            metadata of the dataset.
    """
    import features
    import utils

    # Load feature vectors from disk
    features_path = os.path.join(data_path, dataset.name + '.h5')
    x, n_blocks = utils.timeit(
        lambda: features.load_features(features_path, block_size, block_size // 4),
        f'Loaded features of {dataset.name} dataset')

    # Reshape feature vectors: NxTxF -> NxTxFx1
    x = np.expand_dims(x, axis=-1)

    # Load metadata and duplicate entries based on number of blocks
    df = pd.read_csv(dataset.metadata_path, index_col=0)
    df = df.loc[np.repeat(df.index, n_blocks)]

    return x, df
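# --- Hedged usage sketch (not part of the original source) ---
# Illustrates how the metadata is aligned with feature blocks above:
# np.repeat(df.index, n_blocks) repeats each clip's metadata row once per
# block extracted from that clip. The toy DataFrame and block counts below
# are hypothetical.
import numpy as np
import pandas as pd

df = pd.DataFrame({'label': ['dog_bark', 'siren']}, index=['clip_a', 'clip_b'])
n_blocks = [3, 2]  # number of blocks extracted from each clip
expanded = df.loc[np.repeat(df.index, n_blocks)]
print(expanded)  # 'clip_a' appears 3 times, 'clip_b' twice, matching x's first axis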
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('featurefile')
    parser.add_argument('modelfile', nargs='?', default='generated/model.pickle')
    parser.add_argument('outfile', nargs='?')
    args = parser.parse_args()

    if args.outfile is None:
        args.outfile = args.featurefile.replace('.feat', '') + '.prob'

    print_err("Loading saved classifier")
    clf, feat_indices, feat_ind_remaining, affil_median = pickle.load(
        open(args.modelfile, 'rb'))

    ids, X = feat.load_features(args.featurefile)
    X = X[:, feat_ind_remaining]

    # affil_ind = feat_indices.index('affil_sharedidf')
    # X[np.isnan(X[:, affil_ind]), affil_ind] = affil_median
    X[np.isnan(X)] = 0.

    print_err("Making predictions")
    predictions = clf.predict_proba(X)[:, 1]
    # predictions = clf.predict(X)
    predictions = list(predictions)

    print_err("Writing predictions")
    writer = csv.writer(open(args.outfile, 'wb'))
    for i, ((id1, id2), prob) in enumerate(zip(ids, predictions)):
        writer.writerow([id1, id2, '{:g}'.format(prob)])
        if (i + 1) % 10000 == 0:
            print_err(i + 1, 'rows done')
def _load_dataset(dataset):
    """Load input data and the associated metadata for a dataset.

    Args:
        dataset: Structure encapsulating dataset information.

    Returns:
        tuple: Tuple containing:

            x (np.ndarray): The input data of the dataset.
            df (pd.DataFrame): The metadata of the dataset.
    """
    import features

    # Load feature vectors and reshape to 4D tensor
    features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5')
    x, n_chunks = utils.timeit(lambda: features.load_features(features_path),
                               'Loaded features of %s dataset' % dataset.name)
    x = np.expand_dims(x, -1)
    assert x.ndim == 4

    # Load metadata and duplicate entries based on number of chunks
    df = io.read_metadata(dataset.metadata_path)
    return x, df
def locate_cup(scene, prefix, already_found=[]):
    print prefix

    # Load saved features
    try:
        filename = "features/{0}.txt".format(prefix)
        features = load_features(filename)
    except IOError:
        return already_found

    # Get scene features, but hide features that have already been found
    scene = mask_image(scene, already_found)
    img2 = scene["img"]
    kp2, des2 = scene["kp"], scene["des"]
    if not len(kp2):
        return already_found

    # Find matches and a decent homography
    matched = find_matches(features, des2)
    found, dst, matches = find_homography(matched, features, kp2)
    if not found:
        return already_found

    # ... unless it's already been found
    for polygon in already_found:
        if (dst == polygon).all():
            return already_found

    return locate_cup(scene, prefix, already_found + [dst])
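# --- Hedged aside (not part of the original source) ---
# The duplicate check above uses NumPy elementwise comparison:
# (dst == polygon).all() is True only when every corner coordinate matches.
# A small self-contained illustration with hypothetical corner coordinates:
import numpy as np

dst = np.array([[0, 0], [100, 0], [100, 150], [0, 150]], dtype=np.float32)
already = [np.array([[0, 0], [100, 0], [100, 150], [0, 150]], dtype=np.float32)]
duplicate = any((dst == polygon).all() for polygon in already)
print(duplicate)  # True: this quadrilateral was already located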
def test(**kwargs):
    opt.parse(kwargs, show_config=True)

    if opt.hdf5:
        from datasets import Train_Dataset_HDF5 as Train_Dataset
        from datasets import Test_Dataset_HDF5 as Test_Dataset
    else:
        from datasets import Train_Dataset_IMAGE as Train_Dataset
        from datasets import Test_Dataset_IMAGE as Test_Dataset

    reiddataset_downloader(opt.data_dir, opt.dataset_name, opt.hdf5)

    num_classes = Train_Dataset(train_val='train',
                                data_dir=opt.data_dir,
                                dataset_name=opt.dataset_name).num_ids

    test_dataloaders = {
        x: DataLoader(Test_Dataset(query_gallery=x,
                                   data_dir=opt.data_dir,
                                   dataset_name=opt.dataset_name),
                      batch_size=opt.batch_size,
                      shuffle=False,
                      num_workers=opt.num_workers)
        for x in ['query', 'gallery']
    }

    model = getattr(models, opt.model)(num_classes)
    model.load(opt.load_epoch_label)

    # Remove the final fc layer and classifier layer
    model.model.fc = nn.Sequential()
    model.classifier = nn.Sequential()

    # Change to test mode
    model = model.eval()
    model = model.cuda()

    if opt.load_features:
        all_features = load_features()
    else:
        all_features = extract_features(model, test_dataloaders, opt.flip)
        save_features(all_features)

    query_feature = all_features['query'][0]
    gallery_feature = all_features['gallery'][0]

    print('-' * 30)
    rank = ranking(query_feature, gallery_feature)
    print('-' * 30)

    query_label = all_features['query'][1]
    query_cam = all_features['query'][2]
    query_name = all_features['query'][3]
    gallery_label = all_features['gallery'][1]
    gallery_cam = all_features['gallery'][2]
    gallery_name = all_features['gallery'][3]

    result, CMC, mAP = evaluate(rank, query_label, query_cam,
                                gallery_label, gallery_cam)

    save_result(result, query_name, gallery_name, CMC, mAP)
def _load_data(dataset, is_training=False):
    """Load input data, target values and file names for a dataset.

    The input data is assumed to be a dataset of feature vectors. These
    feature vectors are standardized using a scaler that is either loaded
    from disk (if it exists) or computed on-the-fly. The latter is only
    possible if the input data is training data, which is indicated by the
    `is_training` parameter.

    Target values and file names are read from the metadata file.

    Args:
        dataset: Structure encapsulating dataset information.
        is_training (bool): Whether the input data is training data.

    Returns:
        x (np.ndarray): The input data.
        y (np.ndarray): The target values.
        names (list): The associated file names.
    """
    import data_augmentation as aug
    import features

    features_path = os.path.join(cfg.extraction_path, dataset.name + '.h5')
    x = utils.timeit(lambda: features.load_features(features_path),
                     'Loaded features of %s dataset' % dataset.name)

    # Clip dynamic range to 90 dB
    x = np.maximum(x, x.max() - 90.0)

    # Load scaler from file if cached, or else compute it.
    scaler_path = cfg.scaler_path
    if os.path.exists(scaler_path) or not is_training:
        with open(scaler_path, 'rb') as f:
            scaler = pickle.load(f)
    else:
        scaler = utils.timeit(lambda: utils.compute_scaler(x),
                              'Computed standard scaler')
        with open(scaler_path, 'wb') as f:
            pickle.dump(scaler, f)

    x = utils.timeit(lambda: utils.standardize(x, scaler),
                     'Standardized %s features' % dataset.name)

    names, y = utils.timeit(lambda: utils.read_metadata(dataset.metadata_path),
                            'Loaded %s metadata' % dataset.name)
    if dataset == cfg.training_set and cfg.enable_augmentation:
        names, y = aug.expand_metadata((names, y))

    return x, y, names
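# --- Hedged sketch (not part of the original source) ---
# utils.compute_scaler / utils.standardize are not shown above; this is one
# plausible way to implement the cache-or-compute scaler step with
# scikit-learn, assuming x has shape (clips, frames, features). The helper
# name below is hypothetical.
import os
import pickle

from sklearn.preprocessing import StandardScaler


def load_or_fit_scaler(x, scaler_path, is_training):
    """Load a cached scaler, or fit one on training data and cache it."""
    if os.path.exists(scaler_path) or not is_training:
        with open(scaler_path, 'rb') as f:
            return pickle.load(f)
    # Fit on all frames: collapse the (clips, frames) axes into one
    scaler = StandardScaler().fit(x.reshape(-1, x.shape[-1]))
    with open(scaler_path, 'wb') as f:
        pickle.dump(scaler, f)
    return scaler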
def predict_parc():
    # Load the features.
    features, attribution_ids = f.load_features(PARC_FEATURES_PATH)

    # Load the model.
    model = m.load_model('svr')

    # Make predictions. Convert percentage to decimal.
    predictions = model.predict(features) / 100.

    # Pair ids with predictions and sort on predicted value.
    results = sorted(zip(attribution_ids, predictions), key=lambda x: x[1])

    predictions_path = os.path.join(DATA_DIR, 'parc-verifiability', 'predictions.tsv')
    open(predictions_path, 'w').write('\n'.join(
        ['%s\t%f' % (attr_id, score) for attr_id, score in results]))

    return results
def get_data_ddi(path: str,
                 skip_invalid_smiles: bool = True,
                 args: Namespace = None,
                 features_path: List[str] = None,
                 max_data_size: int = None,
                 use_compound_names: bool = None,
                 logger: Logger = None):
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    df = pd.read_csv(path, index_col=False)
    dictionaries = []
    for line_no, line in df.iterrows():
        dictionary = dict(line)
        dictionaries.append(dictionary)

    data = DDIDataset([
        DDIDatapoint(
            dictionary=dictionary,
            args=args,
            features_1=features_data[i] if features_data is not None else None,
            features_2=features_data[i] if features_data is not None else None,
        ) for i, dictionary in tqdm(enumerate(dictionaries), total=len(dictionaries))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        # data = filter_invalid_smiles(data)
        data = filter_invalid_smiles_ddi(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if data.data[0].features_1 is not None and data.data[0].features_2 is not None:
        features_dim_1 = len(data.data[0].features_1)
        features_dim_2 = len(data.data[0].features_2)
        assert features_dim_1 == features_dim_2
        args.features_dim = features_dim_1

    return data
def dice(prediction, y):
    dices = [dice_score_img(p, t) for p, t in zip(prediction, y)]
    mean = np.mean(dices)
    std = np.std(dices)
    return mean, std, dices


def dice_score_img(p, y):
    return np.sum(p[y == 1]) * 2.0 / (np.sum(p) + np.sum(y))


def features_to_images(features, dim=0):
    images = util.chunks(features, 384 * 512)
    for im in images:
        end_image = im[:, dim].reshape((512, 384))
        print np.mean(end_image)


if __name__ == "__main__":
    print "\nLoading X"
    X_train, X_test = features.load_features("balanced")
    print "Loading Y"
    y_train, y_test = features.load_y("balanced")

    # train(X_train, X_test, y_train, y_test, LogisticRegression(), predict_black=True, name="logreg")
    # train(X_train, X_test, y_train, y_test, AdaBoostClassifier(n_estimators=200, random_state=42), predict_black=True, name="adaboost200")
    # train(X_train, X_test, y_train, y_test, RandomForestClassifier(n_estimators=250, n_jobs=-1, random_state=42), use_probability=True, predict_black=True, name="rf200")
    # train(X_train, X_test, y_train, y_test, SVC(verbose=2, max_iter=10000), use_probability=False, name="svmrbf")
    train(X_train, X_test, y_train, y_test,
          SVC(kernel="linear", verbose=2, max_iter=10000),
          use_probability=False, name="svmlinear")
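# --- Hedged worked example (not part of the original source) ---
# dice_score_img computes the Dice coefficient 2*|P intersect Y| / (|P| + |Y|)
# for binary masks. With the toy masks below the overlap is 2 pixels,
# |P| = 3 and |Y| = 3, so the score is 2*2 / (3+3) = 0.667.
import numpy as np

p = np.array([1, 1, 1, 0, 0])  # predicted mask
y = np.array([0, 1, 1, 1, 0])  # ground-truth mask
print(np.sum(p[y == 1]) * 2.0 / (np.sum(p) + np.sum(y)))  # 0.666...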
    std = np.std(X, axis=0)

    for d in tqdm(range(len(X_train))):
        X_train[d] = (X_train[d] - mean) / std
    for d in tqdm(range(len(X_test))):
        X_test[d] = (X_test[d] - mean) / std

    # Make once fully black vectors fully black again
    X_test = X_test * mask

    return X_train, X_test


if __name__ == "__main__":
    print "\nLoading X"
    X_train, X_test = features.load_features()
    print "Loading Y"
    y_train, y_test = features.load_y()
    print "Removing fully black features"
    X_train, y_train = remove_completely_black(X_train, y_train)
    print "Normalizing features"
    X_train, X_test = normalize_features(X_train, X_test)
    print "Balancing classes"
    X_train, y_train = balance_classes(X_train, y_train)
    print "Writing to file"
    features.write_features((X_train, X_test), "balanced")
    features.write_y((y_train, y_test), "balanced")
from classifier import *
from features import load_features, save_prediction

"""
Parameters
"""
classifier = 'LSTM'
method = 'we'
database = 'twitter'
language = 'en'
query = 'Trump'
extended = True

"""
Load the data
"""
# x_train, x_test, y_train, y_test = load_features(database, language, method)
x_test = load_features(database, language, method, query, extended)
# x_test = np.concatenate((x_train, x_test), axis=0)

"""
Load a classifier
"""
# load_file = 'data/model/untrained_' + classifier
load_file = 'data/model/best_trained_' + classifier + '_' + method + '_' + language
model = load_classifier(classifier, load_file)

'''
"""
Train the classifier
"""
epochs = 5
batch_size = 16
validation_data = (x_test, y_test)
save_file = 'data/model/trained_' + classifier + '_' + method + '_' + language
model = train_classifier(classifier, model, x_train, y_train, epochs, batch_size,
                         validation_data, save_file=save_file + '.h5')
model = load_classifier(classifier, save_file)
'''
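"""
Hedged sketch (not in the original fragment): predicting with the loaded model
"""
# Assumes `model` is a Keras model, as suggested by the '.h5' save file above;
# the 0.5 threshold is illustrative only.
# y_prob = model.predict(x_test, batch_size=16)
# y_pred = (y_prob > 0.5).astype(int)
# save_prediction is imported above, but its call signature is not shown in
# this fragment, so the saving step is left out here.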
# Parameters
classifier = 'LSTM'
method = 'we'
language = 'fr'
epochs = 5
batch_size = 32
duration = 3600 * 6

# Initialization
t0 = t.time()
best_model, best_mse = None, 0
save_fname = 'data/model/test_3_random_trained_' + classifier + '_' + method + '_' + language

# Load the data
x_train, x_test, y_train, y_test = load_features(language, method)
validation_data = (x_test, y_test)

# Loop for the specified duration
cmpt = 1
while t.time() - t0 < duration:
    print("Trial number {}:".format(cmpt))

    # Create a random classifier
    new_model, new_info = create_random_classifier(classifier)

    # Train the classifier
    print("Training...")
    new_model, history = train_classifier(classifier, new_model,
def main(): print("Connecting to services...") couchdb = CouchDB(user=os.environ["COUCHDB_USERNAME"], auth_token=os.environ["COUCHDB_PASSWORD"], url="http://%s:5984/" % os.environ["COUCHDB_HOST"], connect=False, auto_renew=True) redis = Redis(os.environ["REDIS_HOST"], 6379, 0) # Load features print("Loading features...") start_time = time() load_features(os.environ["MAP_PATH"], couchdb) print("Loading Time: %.2fs" % (time() - start_time)) # Load classifier print("Loading classifier...") start_time = time() model, tokenizer = load_model(os.environ["MODEL_PATH"]) print("Loading Time: %.2fs" % (time() - start_time)) while True: try: # Select a location backfill = False couchdb.connect() # Use most out of date location result = couchdb["features"].get_query_result(selector={ "newest": { "$or": [{ "$exists": False }, { "$lt": int((datetime.utcnow() - timedelta(days=1)).timestamp()) }] }, "status": { "$or": [{ "$exists": False }, { "$ne": "in_use" }] } }, sort=[{ "newest": "asc" }], limit=1).all() if len(result) == 0: # Backfill historical data if all are in date backfill = True result = couchdb["features"].get_query_result(selector={ "oldest": { "$or": [{ "$exists": False }, None] }, "status": { "$or": [{ "$exists": False }, { "$ne": "in_use" }] } }, limit=1).all() if len(result) == 0: result = couchdb["features"].get_query_result( selector={ "oldest": { "$gt": datetime(2006, 4, 1).timestamp() }, "status": { "$or": [{ "$exists": False }, { "$ne": "in_use" }] } }, sort=[{ "oldest": "desc" }], limit=1).all() if len(result) == 0: print("No jobs...") couchdb.disconnect() sleep(3600) continue # Mark location as "in use" doc = couchdb["features"][result[0]["_id"]] doc["status"] = "in_use" doc.save() try: # Process tweets at that location feature = result[0] print("Calling %sfor feature: %s..." % ("backfill " if backfill else "", feature["_id"])) call_for_feature(feature, model, tokenizer, couchdb, redis, backfill) print() finally: # Mark location as "available" couchdb.connect() doc = couchdb["features"][result[0]["_id"]] doc["status"] = "available" doc.save() except Exception as e: print(e) sleep(random() * 0.3 + 0.1) finally: couchdb.disconnect()
def train_model(features_files, feature_columns, classifier, model_args,
                outlier_sigma=None, scale_features=True, submission_file=None,
                save_settings=False, plot=False, normalize_probs=None,
                n_cv=10, f_cv=0.3, verbose=False):
    """
    Fit a classification model (classifier, using arguments in model_args)
    to the features in columns feature_columns in the file(s) in
    features_files. Use CV with n_cv random training-CV sample splittings,
    each containing a fraction f_cv in the CV subsample, to estimate AUC
    for the fit.
    """
    settings = locals()

    hour_column = 0
    type_column = 1

    # read in feature matrix from file(s)
    X = features.load_features(features_files)

    # remove outliers
    if outlier_sigma is not None:
        X, retained_indices = features.remove_outliers(X, n_sigma=outlier_sigma)

    # scale features
    if scale_features:
        X = features.scale_features(X)

    # set up model
    model = classifier(**model_args)

    # set up plot
    if plot:
        fig = plt.figure(figsize=(8, 4))
        fig.set_tight_layout(True)
        ax0 = plt.subplot(121)
        ax1 = plt.subplot(122)

    # initialize plot arrays
    n_learn = np.zeros(10)
    learn_cv_avg = np.zeros(len(n_learn))
    learn_train_avg = np.zeros(len(n_learn))
    fp_rate_avg = np.linspace(0, 1, num=100)
    tp_rate_avg = np.zeros(len(fp_rate_avg))

    # loop over training-CV sample splittings
    auc_values = []
    for i_cv in range(n_cv):
        cv_indices = cv.cv_split_by_hour(X, n_pre_hrs=f_cv)
        if verbose:
            print "\nCV iteration", i_cv + 1
            print len(cv_indices["train"]), "training instances"
            print len(cv_indices["cv"]), "CV instances"

        # get feature matrices and class arrays for training and CV samples
        train_features_all, cv_features_all = [X[cv_indices[k], :]
                                               for k in ["train", "cv"]]
        train_features, cv_features = [y[:, np.array(feature_columns)]
                                       for y in [train_features_all, cv_features_all]]
        train_class = train_features_all[:, type_column]
        cv_class = cv_features_all[:, type_column]

        # compute learning curve
        if plot:
            learn_mask, n_train, learn_train, learn_cv = learning_curve(
                model, (train_features, train_class), (cv_features, cv_class),
                n=len(n_learn), normalize_probs=normalize_probs)
            if len(learn_mask) > 0:
                n_learn[learn_mask] += 1
                learn_train_avg[learn_mask] += learn_train
                learn_cv_avg[learn_mask] += learn_cv
                ax0.plot(n_train, learn_train, linestyle="-", color=(1, 0.6, 0.6))
                ax0.plot(n_train, learn_cv, linestyle="-", color=(0.7, 0.7, 0.7))

        # predict probabilities
        train_prob, cv_prob = predict_probs(model, train_class, train_features,
                                            cv_features, normalize_probs)
        check_for_nan(train_prob)
        check_for_nan(cv_prob)
        if verbose:
            try:
                model_coef = model.coef_
                print "Feature coefficients:", model_coef
            except:
                pass

        # compute AUC
        auc = roc_auc_score(cv_class, cv_prob)
        auc_values.append(auc)
        if verbose:
            print "training AUC =", roc_auc_score(train_class, train_prob)
            print "CV AUC =", auc

        # plot ROC curve
        if plot:
            fp_rate, tp_rate, thresholds = roc_curve(cv_class, cv_prob)
            tp_rate_avg += np.interp(fp_rate_avg, fp_rate, tp_rate)
            ax1.plot(fp_rate, tp_rate, linestyle="-", color=(0.7, 0.7, 0.7))

    # compute mean and std. dev. of AUC over CV iterations
    auc_mean = np.mean(auc_values)
    auc_std = np.std(auc_values)
    if verbose:
        print "\nAverage AUC:", auc_mean, "+/-", auc_std

    # update submission CSV file
    if submission_file is not None:
        train_features_all = X[(X[:, type_column] == 0) |
                               (X[:, type_column] == 1), :]
        train_features = train_features_all[:, np.array(feature_columns)]
        train_class = train_features_all[:, type_column]
        test_features_all = X[X[:, type_column] == -1, :]
        test_features = test_features_all[:, np.array(feature_columns)]
        train_prob, test_prob = predict_probs(model, train_class, train_features,
                                              test_features, normalize_probs)
        check_for_nan(train_prob, message="Replacing NaN probabilities with 0.")
        check_for_nan(test_prob, message="Replacing NaN probabilities with 0.")
        for i, ff in enumerate(features_files):
            data_list_file = ".".join(ff.split(".")[:-1]) + "_data_files.txt"
            with open(data_list_file, "r") as df:
                if i == 0:
                    data_files = np.array(df.readlines())
                else:
                    data_files = np.concatenate((data_files, df.readlines()), axis=0)
        if outlier_sigma is not None:
            data_files = data_files[retained_indices]
        test_files = []
        for f in data_files:
            if "test" in f:
                test_files.append(f.strip())
        submission.update_submission(dict(zip(test_files, test_prob)), submission_file)

    # save settings
    if save_settings:
        if submission_file is not None:
            settings_file = ".".join(submission_file.split(".")[:-1]) + "_settings.txt"
            open_mode = "a"
        else:
            settings_file = "train_model_settings.txt"
            open_mode = "w"
        with open(settings_file, open_mode) as sf:
            for s in ["features_files", "feature_columns", "classifier",
                      "model_args", "outlier_sigma", "scale_features",
                      "submission_file", "normalize_probs"]:
                if s in settings:
                    sf.write(s + ": " + str(settings[s]) + "\n")
            sf.write("AUC = {0:.2f}+/-{1:.2f}\n\n".format(auc_mean, auc_std))

    # plot average learning curves and ROC curve
    if plot:
        n_train_array = (len(cv_indices["train"]) / float(len(n_learn)) *
                         np.array(range(1, len(n_learn) + 1)))
        ax0.plot(n_train_array, learn_train_avg / (n_learn + 1.0e-3), "r-", linewidth=3)
        ax0.plot(n_train_array, learn_cv_avg / (n_learn + 1.0e-3), "k-", linewidth=3)
        tp_rate_avg /= float(n_cv)
        ax1.plot(fp_rate_avg, tp_rate_avg, "k-", linewidth=3)

        # display plot
        ax0.set_ylim((0.5, 1))
        ax0.set_xlabel("number of training instances")
        ax0.set_ylabel("AUC")
        ax1.plot(np.linspace(0, 1), np.linspace(0, 1), "k:", linewidth=2)
        ax1.set_xlabel("false positive rate")
        ax1.set_ylabel("true positive rate")
        plt.show(block=False)

    return (model, auc_mean, auc_std)
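# --- Hedged usage sketch (not part of the original source; the file name,
# feature columns, and classifier choice below are hypothetical; per the code
# above, column 0 holds the hour and column 1 the instance type) ---
from sklearn.ensemble import RandomForestClassifier

model, auc_mean, auc_std = train_model(
    features_files=["features_matrix.csv"],
    feature_columns=[2, 3, 4, 5],       # columns after the hour/type columns
    classifier=RandomForestClassifier,  # instantiated as classifier(**model_args)
    model_args={"n_estimators": 200},
    n_cv=10,
    f_cv=0.3,
    verbose=True)
print "mean CV AUC = {0:.3f} +/- {1:.3f}".format(auc_mean, auc_std)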
def get_data(path: str,
             skip_invalid_smiles: bool = True,
             args: Namespace = None,
             features_path: List[str] = None,
             max_data_size: int = None,
             use_compound_names: bool = None,
             logger: Logger = None) -> MoleculeDataset:
    """
    Gets smiles string and target values (and optionally compound names if provided) from a CSV file.

    :param path: Path to a CSV file.
    :param skip_invalid_smiles: Whether to skip and filter out invalid smiles.
    :param args: Arguments.
    :param features_path: A list of paths to files containing features. If provided, it is used
        in place of args.features_path.
    :param max_data_size: The maximum number of data points to load.
    :param use_compound_names: Whether file has compound names in addition to smiles strings.
    :param logger: Logger.
    :return: A MoleculeDataset containing smiles strings and target values along with other info
        such as additional features and compound names when desired.
    """
    debug = logger.debug if logger is not None else print

    if args is not None:
        # Prefer explicit function arguments but default to args if not provided
        features_path = features_path if features_path is not None else args.features_path
        max_data_size = max_data_size if max_data_size is not None else args.max_data_size
        use_compound_names = use_compound_names if use_compound_names is not None else args.use_compound_names
    else:
        use_compound_names = False

    max_data_size = max_data_size or float('inf')

    # Load features
    if features_path is not None:
        features_data = []
        for feat_path in features_path:
            features_data.append(load_features(feat_path))  # each is num_data x num_features
        features_data = np.concatenate(features_data, axis=1)
    else:
        features_data = None

    skip_smiles = set()

    # Load data
    with open(path) as f:
        reader = csv.reader(f)
        next(reader)  # skip header

        lines = []
        for line in reader:
            smiles = line[0]

            if smiles in skip_smiles:
                continue

            lines.append(line)

            if len(lines) >= max_data_size:
                break

    data = MoleculeDataset([
        MoleculeDatapoint(
            line=line,
            args=args,
            features=features_data[i] if features_data is not None else None,
            use_compound_names=use_compound_names
        ) for i, line in tqdm(enumerate(lines), total=len(lines))
    ])

    # Filter out invalid SMILES
    if skip_invalid_smiles:
        original_data_len = len(data)
        data = filter_invalid_smiles(data)

        if len(data) < original_data_len:
            debug(f'Warning: {original_data_len - len(data)} SMILES are invalid.')

    if data.data[0].features is not None:
        args.features_dim = len(data.data[0].features)

    return data
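# --- Hedged usage sketch (not part of the original source; the CSV path and
# features file below are hypothetical, and the Namespace carries only the
# fields that get_data itself reads; the MoleculeDatapoint constructor may
# expect additional attributes) ---
from argparse import Namespace

args = Namespace(features_path=None, max_data_size=None, use_compound_names=False)
data = get_data(path='data/molecules.csv',
                args=args,
                features_path=['data/molecule_features.npz'],
                max_data_size=10000,
                logger=None)
print(f'Loaded {len(data)} molecules; features_dim = {args.features_dim}')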
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('trainfile', nargs='?', default='data/train.csv')
    parser.add_argument('featfile', nargs='?', default='generated/train.feat')
    parser.add_argument('outfile', nargs='?', default='generated/model.pickle')
    parser.add_argument('--clf', default='rf')
    parser.add_argument('--removefeat', nargs='+', default=[])
    parser.add_argument('--cv', action='store_true')
    parser.add_argument('--folds', default=3)
    parser.add_argument('--gridsearch', action='store_true')
    parser.add_argument('--usegrid', action='store_true')
    args = parser.parse_args()

    if args.removefeat:
        feat_to_remove = args.removefeat
    else:
        feat_to_remove = [
            # 'conferences',
            # 'journals',
            # 'affiliations',
            # 'jaro_distance'
        ]

    n_jobs = min(multiprocessing.cpu_count(), 8)

    params = {
        # Random Forest
        'rf': {
            'max_features': 3,
            'n_estimators': 300,
            'min_samples_split': 1,
            'min_samples_leaf': 1,
        },
        # GBM
        'gbm': {
            'n_estimators': 20000,
            'learning_rate': 1e-03,
            'max_depth': 3,
        }
    }

    params_grid = {
        'rf': {
            'min_samples_split': [1, 2],
            'min_samples_leaf': [1, 2],
            'n_estimators': [130, 200, 250, 300, 500, 750, 1000, 1250],  # [130, 400, 1000]
            'max_features': [3, 4, 5, 6, 7, 8, 9]  # [4, 6, 9]
        },
        'gbm': {
            # 'n_estimators': [500, 200],
            # 'learning_rate': [1e-04],
            # 'max_depth': [7]
            'n_estimators': [15000, 20000] + [17500],
            'learning_rate': [1e-04, 1e-03, 1e-02] + [5e-03],
            'max_depth': [7, 16] + [3, 5, 6, 8, 12, 14, 18]
        }
    }

    params_fixed = {
        'rf': {
            'random_state': 100,
            'n_jobs': n_jobs,  # -1 = no. of cores on machine
            'oob_score': True,
            'verbose': 0,
            'compute_importances': True
        },
        'gbm': {
            'min_samples_split': 1,
            'min_samples_leaf': 2,
            'subsample': 0.5,
            'verbose': 0
        }
    }

    for k, v in params_fixed.iteritems():
        params[k].update(v)

    if args.usegrid or args.gridsearch:
        print params_grid[args.clf]
    else:
        print params[args.clf]

    X_ids, X = feat.load_features(args.featfile)
    idmap = {id: i for i, id in enumerate(X_ids)}

    feat_indices = feat.FeaturesGenerator.fields
    feat_ind_remaining = [i for i, faid in enumerate(feat_indices)
                          if faid not in feat_to_remove]
    feat_indices = [v for v in feat_indices if v not in feat_to_remove]
    X = X[:, feat_ind_remaining]
    print feat_indices

    print_err("Loading training dataset labels")
    Y, Y_ids = loadTrainingLabels(args.trainfile, set(X_ids))
    training_indices = [idmap[id] for id in Y_ids]
    X = X[training_indices]

    # Filling in missing values
    # affil_ind = feat_indices.index('affil_sharedidf')
    # affil_median = sp.stats.nanmedian(X[:, affil_ind])
    affil_median = 0
    # X[np.isnan(X[:, affil_ind]), affil_ind] = affil_median
    # X[np.isnan(X[:, affil_ind]), affil_ind] = 0.
    X[np.isnan(X)] = 4.

    if args.clf == 'rf':
        clf = RandomForestClassifier()
    elif args.clf == 'gbm':
        clf = GradientBoostingClassifier()
    clf.set_params(**params[args.clf])

    if args.usegrid or args.gridsearch:
        print_err("Running grid search for best parameters")
        kwargs = {'n_jobs': n_jobs}
        if args.clf == 'rf':
            clf.set_params(n_jobs=1)
        elif args.clf == 'gbm':
            kwargs['loss_func'] = zero_one_loss
        clf_grid = grid(clf, params_grid[args.clf], X, Y, folds=args.folds, **kwargs)
        pprint(clf_grid.grid_scores_)
        print(clf_grid.best_score_)
        print(clf_grid.best_params_)
        if args.usegrid:
            clf = clf_grid.best_estimator_
    elif args.cv:
        print_err("Running cross-validation")
        m_cv(clf, X, Y, args.folds)

    if not args.cv and (not args.gridsearch or args.usegrid):
        print_err("Fitting data for training")
        clf.fit(X, Y)

        # for GBM
        if hasattr(clf, 'train_score_'):
            print_err("Train Score:", clf.train_score_[-1])

        print_err("OOB Score (CV-estimate):", clf.oob_score_)

        print_err("Saving trained model")
        pickle.dump((clf, feat_indices, feat_ind_remaining, affil_median),
                    open(args.outfile, 'wb'), pickle.HIGHEST_PROTOCOL)
def classify():
    print('-' * 30)
    print('TRAINING TYPE: {0}'.format(TRAINING_TYPE))
    print('-' * 30)

    # Load data and masks
    features = load_features()
    features_info = load_features_info()
    masks = load_masks()

    num_of_images = len(features_info)
    num_of_training_pixels = 0
    num_of_validation_pixels = 0

    assert (num_of_images > NUM_OF_VALIDATION_IMAGES)

    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES):
        num_of_training_pixels += features_info[i]['num_of_pixels']
    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES, num_of_images):
        num_of_validation_pixels += features_info[i]['num_of_pixels']

    print('Training data: {0} \nValidation data: {1} '.format(
        num_of_training_pixels, num_of_validation_pixels))

    # Standardize data
    x_train = features[:num_of_training_pixels]
    scaler = preprocessing.StandardScaler().fit(x_train)
    x_train = scaler.transform(x_train)
    x_train = preprocessing.normalize(x_train)

    x_validation = scaler.transform(features[num_of_training_pixels:])
    x_validation = preprocessing.normalize(x_validation)

    y_train = masks[:num_of_training_pixels]

    clf = choose_training_type()

    if TRAINING_TYPE != 'FROM_SAVED_CLASSIFIER':
        print('-' * 30)
        print('Training started...')
        start_time = time.time()
        clf.fit(x_train, y_train)
        print('-' * 30)
        print('Training ended: {:.2f} s'.format(time.time() - start_time))
        print('-' * 30)

        if TRAINING_TYPE == 'GRID_SEARCH':
            print("GRID SEARCH RESULTS\n")
            print('Best parameters: {}\n'.format(clf.best_params_))
            means = clf.cv_results_['mean_test_score']
            for mean, params in zip(means, clf.cv_results_['params']):
                print('Mean score: {:0.3f} Parameters: {}'.format(mean, params))
            print('-' * 30)

            scores = clf.cv_results_['mean_test_score'].reshape(
                len(GRID_SEARCH_PARAMETERS['base_estimator__C']),
                len(GRID_SEARCH_PARAMETERS['base_estimator__gamma']))

            mpl_style(dark=True)
            # plt.figure(figsize=(10, 10))
            for ind, i in enumerate(GRID_SEARCH_PARAMETERS['base_estimator__C']):
                plt.plot(GRID_SEARCH_PARAMETERS['base_estimator__gamma'],
                         scores[ind],
                         label='C parameter: ' + str(i))
            plt.title('GRID SEARCH RESULTS')
            plt.xlabel('Gamma parameter')
            plt.ylabel('Mean score')
            plt.grid('on')
            plt.legend()
            plt.savefig('grid_search_results_figure.png', bbox_inches='tight', dpi=200)

        print('Saving model...')
        start_time = time.time()
        dump(clf, os.path.join('saved_models/', 'SVM_classifier.joblib'))
        print('Saving ended: {:.2f} s'.format(time.time() - start_time))
        print('-' * 30)

    print('Predicting started...')
    print('-' * 30)
    start_time = time.time()
    predicted_masks = clf.predict(x_validation)
    print('Predicting ended: {:.2f} s'.format(time.time() - start_time))
    print('-' * 30)

    previous_mask_pixels = 0
    current_num_of_pixels = 0
    masks_predicted = 0

    # Saving predicted and truth masks as pairs.
    # The flat prediction vector needs to be converted back into 2D masks.
    for i in range(num_of_images - NUM_OF_VALIDATION_IMAGES, num_of_images):
        current_num_of_pixels = features_info[i]['num_of_pixels']

        predicted_mask = np.asarray(predicted_masks)[
            previous_mask_pixels:previous_mask_pixels + current_num_of_pixels]
        predicted_mask = predicted_mask.reshape(features_info[i]['height'],
                                                features_info[i]['width'])
        predicted_mask = ndimage.binary_opening(predicted_mask)
        predicted_mask = ndimage.binary_closing(predicted_mask)

        masks_predicted += 1
        plt.imsave(os.path.join('predictions/',
                                'predicted_mask_' + str(masks_predicted) + '.png'),
                   predicted_mask, cmap='gray')

        offset = num_of_training_pixels + previous_mask_pixels
        truth_mask = masks[offset:offset + current_num_of_pixels].reshape(
            features_info[i]['height'], features_info[i]['width'])
        plt.imsave(os.path.join('truth/',
                                'truth_mask_' + str(masks_predicted) + '.png'),
                   truth_mask, cmap='gray')

        # Accumulate the pixel offset across validation images (the original
        # assignment overwrote it, which misaligns slices beyond the second image)
        previous_mask_pixels += current_num_of_pixels