# Assumes: import os; import numpy as np; import pandas as pd; from typing import Optional, Tuple;
# plus the project helpers preprocess/predict/validate (the module-level `predict`
# is a separate function from this method, despite the shared name).
def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {
        "line_id": [],
        "prediction": [],
    }
    # Stream the test set in chunks to bound memory usage.
    for X in pd.read_csv(
        test_csv,
        encoding="utf-8",
        low_memory=False,
        dtype=self.config["dtype"],
        parse_dates=self.config["parse_dates"],
        chunksize=self.config["nrows"],
    ):
        result["line_id"] += list(X["line_id"])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)

    # Score against the ground-truth file when it sits next to the test set.
    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
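# The variants above and below all lean on the same project helpers, whose
# implementations live elsewhere in the repo. The stubs here are only a sketch of
# the contract the methods assume: names and signatures are inferred from the call
# sites, not taken from the actual source.

def preprocess(X: pd.DataFrame, config: dict) -> None:
    """Assumed: transforms X in place (encoding, feature drops) using state fitted at train time."""
    ...

def predict(X: pd.DataFrame, config: dict) -> np.ndarray:
    """Assumed: returns one prediction per row of the preprocessed frame."""
    ...

def validate(result: pd.DataFrame, target_csv: str, mode: str) -> np.float64:
    """Assumed: joins predictions with the target file and returns the score for the given mode."""
    ...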
def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    # This variant loads the whole frame once, then slices it into fixed-size chunks.
    df = read_df(test_csv, self.config)

    result = {
        "line_id": list(df["line_id"]),
        "prediction": [],
    }

    def chunker(seq, size):
        return (seq[pos:pos + size] for pos in range(0, len(seq), size))

    for chunk in chunker(df, 100000):
        X = chunk.copy()  # copy so preprocess can mutate without touching df
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.sort_values("line_id", inplace=True)
    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
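# Minimal standalone check of the chunker pattern used above: positional slicing
# (seq[pos:pos + size]) row-slices a DataFrame just like a list, so the generator
# yields pieces of at most `size` rows. The data here is hypothetical, not from the repo.
import pandas as pd

def chunker(seq, size):
    return (seq[pos:pos + size] for pos in range(0, len(seq), size))

df = pd.DataFrame({"line_id": range(10), "value": range(10)})
sizes = [len(chunk) for chunk in chunker(df, 4)]
assert sizes == [4, 4, 2]  # the last chunk carries the remainder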
# This variant keys on "id" rather than "line_id" and skips validation, so it
# returns only the prediction frame.
def predict(self, test_csv: str, prediction_csv: str) -> pd.DataFrame:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {
        "id": [],
        "prediction": [],
    }
    for X in pd.read_csv(
        test_csv,
        encoding="utf-8",
        low_memory=False,
        dtype=self.config["dtype"],
        parse_dates=self.config["parse_dates"],
        chunksize=self.config["nrows"],
    ):
        result["id"] += list(X["id"])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)
    return result
def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)

    result = {"line_id": [], "prediction": []}
    if "holiday_detect" in self.config:
        result["datetime"] = []

    for X in pd.read_csv(
        test_csv,
        encoding="utf-8",
        low_memory=False,
        dtype=self.config["dtype"],
        parse_dates=self.config["parse_dates"],
        chunksize=self.config["nrows"],
    ):
        result["line_id"] += list(X["line_id"])
        if "holiday_detect" in self.config:
            dt_fea = self.config["holiday_detect"]  # name of the datetime feature
            result["datetime"] += list(X[dt_fea])
        preprocess(X, self.config)
        result["prediction"] += list(predict(X, self.config))

    result = pd.DataFrame(result)

    # Post-process for holidays: zero the prediction on holidays and weekends
    # (weekday() is 5 for Saturday, 6 for Sunday).
    if "holiday_detect" in self.config:
        holiday = self.config["holiday"]
        for idx, row in result.iterrows():
            dt = row["datetime"]
            dt_str = str(dt).split(" ")[0].strip()  # "YYYY-MM-DD" part of the timestamp
            if dt_str in holiday or dt.weekday() == 5 or dt.weekday() == 6:
                result.loc[idx, "prediction"] = 0
        result.drop(["datetime"], axis=1, inplace=True)

    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"])
    else:
        score = None

    return result, score
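# The row-wise loop above can be expressed as a single boolean mask. This sketch is
# a vectorized equivalent, assuming `holiday` is a set of "YYYY-MM-DD" strings, as the
# string-splitting in the loop implies; the function name is hypothetical.
import pandas as pd

def zero_out_holidays(result: pd.DataFrame, holiday: set) -> pd.DataFrame:
    dt = pd.to_datetime(result["datetime"])
    is_weekend = dt.dt.weekday >= 5  # Saturday (5) or Sunday (6)
    is_holiday = dt.dt.strftime("%Y-%m-%d").isin(holiday)
    result.loc[is_weekend | is_holiday, "prediction"] = 0
    return result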
# Also assumes: import time.
def predict(self, test_csv: str, prediction_csv: str) -> Tuple[pd.DataFrame, Optional[np.float64]]:
    self.config["task"] = "predict"
    self.config.tmp_dir = os.path.dirname(prediction_csv) + "/tmp"
    os.makedirs(self.config.tmp_dir, exist_ok=True)
    self.config["prediction_csv"] = prediction_csv
    self.config["line_id"] = []
    self.config["start_time"] = time.time()

    result = {
        "line_id": [],
        "prediction": [],
    }

    # This variant reads the whole test set at once instead of chunking.
    X = pd.read_csv(
        test_csv,
        encoding="utf-8",
        low_memory=False,
        dtype=self.config["dtype"],
        parse_dates=self.config["parse_dates"],
    )
    self.config["line_id"] = X["line_id"].values
    result["line_id"] = X["line_id"].values

    X = preprocess(X, self.config)
    X = X[self.config["columns"]]  # reorder columns to match the training layout
    result["prediction"] = predict(X, self.config)

    result = pd.DataFrame(result)
    result.to_csv(prediction_csv, index=False)

    target_csv = test_csv.replace("test", "test-target")
    if os.path.exists(target_csv):
        score = validate(result, target_csv, self.config["mode"], self.config)
    else:
        score = None

    return result, score
def main(
    running_on_google_colab,
    datadir,
    rootdir,
    outdir,
    percent_of_data,
    regression,
    dataname,
    tag,
    train,
    new_model,
    callback_timeout,
    epochs,
    batch_size,
    model_function,
    use_fret_for_training,
    exclude_alex_fret,
):
    gpu_available = tf.test.is_gpu_available()

    if new_model:
        print("**Training new model**")
    else:
        print("**Training most recent model**")

    rootdir = Path(rootdir)
    if running_on_google_colab:
        rootdir = "./gdrive/My Drive/Colab Notebooks/DeepFRET-Model"
        rootdir = Path(rootdir)

    outdir = rootdir.joinpath(outdir).expanduser()
    datadir = rootdir.joinpath(datadir).expanduser()

    X, labels = lib.utils.load_npz_data(
        top_percentage=percent_of_data,
        path=datadir,
        set_names=("X_" + dataname, "y_" + dataname),
    )
    n_classes = len(np.unique(labels))

    if not regression:
        # Use labels as classification target
        y = lib.ml.class_to_one_hot(labels, num_classes=n_classes)
        y = lib.ml.smoothe_one_hot_labels(y, amount=0.05)
    else:
        # Use E_true column as regression target
        y = np.expand_dims(X[..., 3], axis=-1)

    if use_fret_for_training:
        # Use E_raw column as input, clipped to [-2, 2]
        X = np.expand_dims(X[..., 4], axis=-1)
        X = X.clip(-2, 2)
    else:
        X = X[..., 0:2] if exclude_alex_fret else X[..., 0:3]
        X = lib.utils.sample_max_normalize_3d(X)

    print("X: ", X.shape)
    print("y: ", y.shape)

    print("Splitting dataset...")
    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y, test_size=0.2, random_state=1
    )

    model_name = "{}_best_model.h5".format(dataname)
    model = lib.model.get_model(
        n_features=X.shape[-1],
        n_classes=n_classes,
        train=train,
        new_model=new_model,
        model_name=model_name,
        model_path=outdir,
        gpu=gpu_available,
        tag=tag,
        regression=regression,
        model_function=model_function,
    )

    if tag is not None:
        dataname += "_" + tag
        model_name = model_name.replace("best_model", tag + "_best_model")

    if train:
        callbacks = lib.ml.generate_callbacks(
            patience=callback_timeout, outdir=outdir, name=dataname
        )
        model.fit(
            x=X_train,
            y=y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
        )
        try:
            lib.plotting.plot_losses(logpath=outdir, outdir=outdir, name=dataname)
        except IndexError:
            pass

        # Convert the final model to a CPU-compatible copy
        if gpu_available:
            print("Converting model from GPU to CPU-compatible")
            cpu_model = model_function(
                gpu=False,
                n_features=X.shape[-1],
                regression=regression,
                n_classes=n_classes,
            )
            lib.ml.gpu_model_to_cpu(
                trained_gpu_model=model,
                untrained_cpu_model=cpu_model,
                outdir=outdir,
                modelname=model_name,
            )

    print("Evaluating...")
    y_pred = model.predict(X_val)

    if not regression:
        lib.plotting.plot_confusion_matrices(
            y_target=y_val,
            y_pred=y_pred,
            y_is_binary=False,
            targets_to_binary=[4, 5, 6, 7, 8],
            outdir=outdir,
            name=dataname,
        )
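# lib.ml.smoothe_one_hot_labels is project code; a common implementation of label
# smoothing with amount=0.05 looks like the sketch below. This is an assumption
# about what the helper does, not the actual DeepFRET source.
import numpy as np

def smooth_one_hot(y: np.ndarray, amount: float) -> np.ndarray:
    # Pull each one-hot target `amount` of the way toward the uniform distribution,
    # so rows still sum to 1 but no class is exactly 0 or 1.
    n_classes = y.shape[-1]
    return y * (1.0 - amount) + amount / n_classes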
    num_workers=1)  # tail of the DataLoader construction; the preceding arguments are truncated here
print("Test set size: {}.".format(len(dataset)))

# Load model
resnet = torchvision.models.resnet18(pretrained=True)
if use_metadata:
    # Drop the final fc layer and feed the 512-d features into the metadata head.
    model_base = torch.nn.Sequential(*list(resnet.children())[:-1])
    model = MetadataModel(model_base, base_out_dim=512)
else:
    resnet.fc = torch.nn.Linear(512, 2)  # two-class head
    model = resnet
# model.to(lib.model.device)
model = load_model(model, weight_path)

results = predict(model, data_loader, use_metadata=use_metadata)
results = np.concatenate(results, axis=0)

test_csv = pd.read_csv(csv_path)
# test_csv = test_csv.loc[int(len(test_csv)*data_sample_size) ,:]
submission_csv = pd.DataFrame({
    'image_name': test_csv['image_name'],
    'target': results,
})
print(submission_csv.head())
submission_csv.to_csv(Path(dirs.csv) / "submission.csv", index=False)
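# `predict` above is a project helper; this sketch shows the per-batch pattern its
# call implies (a list of NumPy arrays, one per batch, concatenated afterwards).
# The function name, signature, and softmax-on-logits choice are assumptions, and
# the metadata path is ignored for brevity.
import torch

@torch.no_grad()
def predict_batches(model, data_loader):
    model.eval()
    outputs = []
    for images, _ in data_loader:
        logits = model(images)
        # Probability of the positive class for a 2-logit head.
        probs = torch.softmax(logits, dim=1)[:, 1]
        outputs.append(probs.cpu().numpy())
    return outputs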