Esempio n. 1
0
    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score ``test_csv`` and write the predictions to ``prediction_csv``.

        The test file is read with ``chunksize=self.config["nrows"]``; every
        chunk is passed to ``preprocess`` and then to ``predict``, and its
        ``line_id`` values are collected next to the predictions.

        Args:
            test_csv: Path of the CSV file to score. It must contain a
                ``line_id`` column.
            prediction_csv: Path the ``line_id``/``prediction`` table is
                written to. A ``tmp`` directory is created next to it and
                stored in ``self.config.tmp_dir``.

        Returns:
            A ``(result, score)`` pair. ``result`` is the prediction frame.
            ``score`` is the value returned by ``validate`` when a
            ``test-target`` file exists next to the test file, else ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp directory beside the prediction file even
        # when prediction_csv has no directory part; plain string
        # concatenation would produce the filesystem-level "/tmp" then.
        self.config.tmp_dir = os.path.join(
            os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        line_ids = []
        predictions = []
        reader = pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"])
        for X in reader:
            # Collect the ids before preprocess gets a chance to alter X.
            line_ids.extend(X["line_id"])
            preprocess(X, self.config)
            predictions.extend(predict(X, self.config))

        result = pd.DataFrame({
            "line_id": line_ids,
            "prediction": predictions,
        })
        result.to_csv(prediction_csv, index=False)

        # Rewrite only the file name: a directory that happens to contain
        # "test" must not be renamed, or the target file is never found.
        test_dir, test_name = os.path.split(test_csv)
        target_csv = os.path.join(
            test_dir, test_name.replace("test", "test-target"))
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score
Esempio n. 2
0
    def predict(self, test_csv: str, prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score ``test_csv`` and write the predictions to ``prediction_csv``.

        The whole test file is loaded with ``read_df`` and then processed in
        row slices of 100000 rows. Each slice is copied, passed to
        ``preprocess`` and then to ``predict``.

        Args:
            test_csv: Path of the CSV file to score. The loaded frame must
                contain a ``line_id`` column.
            prediction_csv: Path the ``line_id``/``prediction`` table is
                written to. A ``tmp`` directory is created next to it and
                stored in ``self.config.tmp_dir``.

        Returns:
            A ``(result, score)`` pair. ``result`` is the prediction frame
            sorted by ``line_id``. ``score`` is the value returned by
            ``validate`` when a ``test-target`` file exists next to the test
            file, else ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp directory beside the prediction file even
        # when prediction_csv has no directory part; plain string
        # concatenation would produce the filesystem-level "/tmp" then.
        self.config.tmp_dir = os.path.join(
            os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        df = read_df(test_csv, self.config)
        line_ids = list(df["line_id"])
        predictions = []

        chunk_size = 100000
        for start in range(0, len(df), chunk_size):
            # Work on a copy so preprocess never touches the loaded frame.
            X = df[start:start + chunk_size].copy()
            preprocess(X, self.config)
            predictions.extend(predict(X, self.config))

        result = pd.DataFrame({
            "line_id": line_ids,
            "prediction": predictions,
        })
        result.sort_values("line_id", inplace=True)
        result.to_csv(prediction_csv, index=False)

        # Rewrite only the file name: a directory that happens to contain
        # "test" must not be renamed, or the target file is never found.
        test_dir, test_name = os.path.split(test_csv)
        target_csv = os.path.join(
            test_dir, test_name.replace("test", "test-target"))
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score
Esempio n. 3
0
    def predict(self, test_csv: str, prediction_csv: str) -> pd.DataFrame:
        """Score ``test_csv`` and write the predictions to ``prediction_csv``.

        The test file is read with ``chunksize=self.config["nrows"]``; every
        chunk is passed to ``preprocess`` and then to ``predict``, and its
        ``id`` values are collected next to the predictions.

        Args:
            test_csv: Path of the CSV file to score. It must contain an
                ``id`` column.
            prediction_csv: Path the ``id``/``prediction`` table is written
                to. A ``tmp`` directory is created next to it and stored in
                ``self.config.tmp_dir``.

        Returns:
            The prediction frame with the columns ``id`` and ``prediction``.
            No score is computed by this method.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp directory beside the prediction file even
        # when prediction_csv has no directory part; plain string
        # concatenation would produce the filesystem-level "/tmp" then.
        self.config.tmp_dir = os.path.join(
            os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        ids = []
        predictions = []
        reader = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
            chunksize=self.config["nrows"],
        )
        for X in reader:
            # Collect the ids before preprocess gets a chance to alter X.
            ids.extend(X["id"])
            preprocess(X, self.config)
            predictions.extend(predict(X, self.config))

        result = pd.DataFrame({
            "id": ids,
            "prediction": predictions,
        })
        result.to_csv(prediction_csv, index=False)

        return result
Esempio n. 4
0
    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score ``test_csv`` and write the predictions to ``prediction_csv``.

        The test file is read with ``chunksize=self.config["nrows"]``; every
        chunk is passed to ``preprocess`` and then to ``predict``. When the
        config has a ``holiday_detect`` entry, the column it names is kept
        alongside the predictions and every prediction that falls on a
        Saturday, a Sunday, or a date listed in ``self.config['holiday']`` is
        overwritten with 0 before the result is saved.

        Args:
            test_csv: Path of the CSV file to score. It must contain a
                ``line_id`` column.
            prediction_csv: Path the ``line_id``/``prediction`` table is
                written to. A ``tmp`` directory is created next to it and
                stored in ``self.config.tmp_dir``.

        Returns:
            A ``(result, score)`` pair. ``result`` is the prediction frame.
            ``score`` is the value returned by ``validate`` when a
            ``test-target`` file exists next to the test file, else ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp directory beside the prediction file even
        # when prediction_csv has no directory part; plain string
        # concatenation would produce the filesystem-level "/tmp" then.
        self.config.tmp_dir = os.path.join(
            os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        result = {"line_id": [], "prediction": []}
        if 'holiday_detect' in self.config:
            result["datetime"] = []

        reader = pd.read_csv(test_csv,
                             encoding="utf-8",
                             low_memory=False,
                             dtype=self.config["dtype"],
                             parse_dates=self.config["parse_dates"],
                             chunksize=self.config["nrows"])
        for X in reader:
            # Collect the raw columns before preprocess can alter X.
            result["line_id"].extend(X["line_id"])
            if 'holiday_detect' in self.config:
                dt_fea = self.config['holiday_detect']
                result["datetime"].extend(X[dt_fea])

            preprocess(X, self.config)
            result["prediction"].extend(predict(X, self.config))

        result = pd.DataFrame(result)

        # Post-process for holidays: force the prediction to 0 on weekends and
        # on configured holiday dates.
        if 'holiday_detect' in self.config:
            holiday = self.config['holiday']
            for idx, dt in result["datetime"].items():
                # Keep only the date part of the "YYYY-MM-DD HH:MM:SS" text.
                day = str(dt).split(' ')[0].strip()
                # weekday() 5 and 6 are Saturday and Sunday.
                if day in holiday or dt.weekday() in (5, 6):
                    result.loc[idx, 'prediction'] = 0

            result.drop(["datetime"], axis=1, inplace=True)

        result.to_csv(prediction_csv, index=False)

        # Rewrite only the file name: a directory that happens to contain
        # "test" must not be renamed, or the target file is never found.
        test_dir, test_name = os.path.split(test_csv)
        target_csv = os.path.join(
            test_dir, test_name.replace("test", "test-target"))
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"])
        else:
            score = None

        return result, score
Esempio n. 5
0
    def predict(self, test_csv: str,
                prediction_csv: str) -> (pd.DataFrame, Optional[np.float64]):
        """Score ``test_csv`` and write the predictions to ``prediction_csv``.

        The test file is read in one piece, passed to ``preprocess``, reduced
        to ``self.config["columns"]`` in that order, and passed to
        ``predict``. The prediction path, the ``line_id`` values and the
        start time are stored in ``self.config`` along the way.

        Args:
            test_csv: Path of the CSV file to score. It must contain a
                ``line_id`` column.
            prediction_csv: Path the ``line_id``/``prediction`` table is
                written to. A ``tmp`` directory is created next to it and
                stored in ``self.config.tmp_dir``.

        Returns:
            A ``(result, score)`` pair. ``result`` is the prediction frame.
            ``score`` is the value returned by ``validate`` when a
            ``test-target`` file exists next to the test file, else ``None``.
        """
        self.config["task"] = "predict"
        # os.path.join keeps the tmp directory beside the prediction file even
        # when prediction_csv has no directory part; plain string
        # concatenation would produce the filesystem-level "/tmp" then.
        self.config.tmp_dir = os.path.join(
            os.path.dirname(prediction_csv), "tmp")
        os.makedirs(self.config.tmp_dir, exist_ok=True)

        self.config["prediction_csv"] = prediction_csv
        # Reset before reading so a failed read leaves no stale ids behind.
        self.config["line_id"] = []

        self.config["start_time"] = time.time()

        X = pd.read_csv(
            test_csv,
            encoding="utf-8",
            low_memory=False,
            dtype=self.config["dtype"],
            parse_dates=self.config["parse_dates"],
        )
        line_ids = X["line_id"].values
        self.config["line_id"] = line_ids

        X = preprocess(X, self.config)
        X = X[self.config["columns"]]  # for right columns order

        result = pd.DataFrame({
            "line_id": line_ids,
            "prediction": predict(X, self.config),
        })
        result.to_csv(prediction_csv, index=False)

        # Rewrite only the file name: a directory that happens to contain
        # "test" must not be renamed, or the target file is never found.
        test_dir, test_name = os.path.split(test_csv)
        target_csv = os.path.join(
            test_dir, test_name.replace("test", "test-target"))
        if os.path.exists(target_csv):
            score = validate(result, target_csv, self.config["mode"],
                             self.config)
        else:
            score = None

        return result, score
Esempio n. 6
0
def main(
    running_on_google_colab,
    datadir,
    rootdir,
    outdir,
    percent_of_data,
    regression,
    dataname,
    tag,
    train,
    new_model,
    callback_timeout,
    epochs,
    batch_size,
    model_function,
    use_fret_for_training,
    exclude_alex_fret,
):
    """Load a dataset, optionally train a model on it, and evaluate it.

    Args:
        running_on_google_colab: When true, ``rootdir`` is replaced by a
            fixed Google Drive path.
        datadir: Data directory, joined onto ``rootdir``.
        rootdir: Project root directory.
        outdir: Output directory, joined onto ``rootdir``.
        percent_of_data: Passed as ``top_percentage`` to the data loader.
        regression: When true, column 3 of ``X`` is the target; otherwise
            the labels are one-hot encoded and smoothed.
        dataname: Dataset name; selects the ``X_<dataname>`` and
            ``y_<dataname>`` sets and names the model file.
        tag: Optional tag appended to ``dataname`` and inserted into the
            model file name; ``None`` for no tag.
        train: When true, the model is fitted before evaluation.
        new_model: Forwarded to ``lib.model.get_model``; also selects the
            banner message that is printed.
        callback_timeout: Passed as ``patience`` to the callback generator.
        epochs: Number of epochs passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        model_function: Callable that builds a model; forwarded to
            ``lib.model.get_model`` and used to build the CPU model.
        use_fret_for_training: When true, only column 4 of ``X`` is used as
            input, clipped to the range [-2, 2].
        exclude_alex_fret: When true (and ``use_fret_for_training`` is
            false), the first two columns of ``X`` are used instead of the
            first three.
    """
    gpu_available = tf.test.is_gpu_available()

    if new_model:
        print("**Training new model**")
    else:
        print("**Training most recent model**")

    if running_on_google_colab:
        # On Colab the given root is ignored in favour of this Drive path.
        rootdir = "./gdrive/My Drive/Colab Notebooks/DeepFRET-Model"

    rootdir = Path(rootdir)
    outdir = rootdir.joinpath(outdir).expanduser()
    datadir = rootdir.joinpath(datadir).expanduser()

    X, labels = lib.utils.load_npz_data(
        top_percentage=percent_of_data,
        path=datadir,
        set_names=("X_" + dataname, "y_" + dataname),
    )
    n_classes = len(np.unique(labels))

    # The target must be derived before X is reduced to its input columns.
    if not regression:
        # Use labels as classification target
        y = lib.ml.class_to_one_hot(labels, num_classes=n_classes)
        y = lib.ml.smoothe_one_hot_labels(y, amount=0.05)
    else:
        # Use E_true column as regression target
        y = np.expand_dims(X[..., 3], axis=-1)

    if use_fret_for_training:
        # Use E_raw column as input
        X = np.expand_dims(X[..., 4], axis=-1)
        # clip takes (min, max); with the bounds swapped every value would
        # collapse to -2.
        X = X.clip(-2, 2)
    else:
        X = X[..., 0:2] if exclude_alex_fret else X[..., 0:3]
        X = lib.utils.sample_max_normalize_3d(X)

    print("X: ", X.shape)
    print("y: ", y.shape)

    print("Splitting dataset...")
    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y, test_size=0.2, random_state=1)

    model_name = "{}_best_model.h5".format(dataname)

    # get_model receives the untagged names together with the tag itself.
    model = lib.model.get_model(
        n_features=X.shape[-1],
        n_classes=n_classes,
        train=train,
        new_model=new_model,
        model_name=model_name,
        model_path=outdir,
        gpu=gpu_available,
        tag=tag,
        regression=regression,
        model_function=model_function,
    )

    if tag is not None:
        dataname += "_" + tag
        model_name = model_name.replace("best_model", tag + "_best_model")

    if train:
        callbacks = lib.ml.generate_callbacks(patience=callback_timeout,
                                              outdir=outdir,
                                              name=dataname)
        model.fit(
            x=X_train,
            y=y_train,
            validation_data=(X_val, y_val),
            epochs=epochs,
            batch_size=batch_size,
            callbacks=callbacks,
        )
        try:
            lib.plotting.plot_losses(logpath=outdir,
                                     outdir=outdir,
                                     name=dataname)
        except IndexError as err:
            # Plotting is best-effort; report the failure instead of hiding it.
            print("Skipping loss plot (IndexError: {})".format(err))

        # Convert the final GPU model to a CPU-compatible one
        if gpu_available:
            cpu_model = model_function(
                gpu=False,
                n_features=X.shape[-1],
                regression=regression,
                n_classes=n_classes,
            )
            lib.ml.gpu_model_to_cpu(
                trained_gpu_model=model,
                untrained_cpu_model=cpu_model,
                outdir=outdir,
                modelname=model_name,
            )
            # Report only once the conversion has actually run.
            print("Converted model from GPU to CPU-compatible")

    print("Evaluating...")
    y_pred = model.predict(X_val)

    if not regression:
        lib.plotting.plot_confusion_matrices(
            y_target=y_val,
            y_pred=y_pred,
            y_is_binary=False,
            targets_to_binary=[4, 5, 6, 7, 8],
            outdir=outdir,
            name=dataname,
        )
                                          num_workers=1)

# Report how many samples are in the test dataset.
print("Test set size: %d." % len(dataset))

# Load model: start from torchvision's pretrained ResNet-18.
resnet = torchvision.models.resnet18(pretrained=True)
if not use_metadata:
    # Image-only variant: replace the final layer with a 2-output linear layer.
    resnet.fc = torch.nn.Linear(512, 2)
    model = resnet
else:
    # Metadata variant: drop the last child module and wrap the remainder.
    model_base = torch.nn.Sequential(*list(resnet.children())[:-1])
    model = MetadataModel(model_base, base_out_dim=512)
# model.to(lib.model.device)

# Restore the weights stored at weight_path.
model = load_model(model, weight_path)

# Run inference and join the returned pieces along the first axis.
results = np.concatenate(
    predict(model, data_loader, use_metadata=use_metadata),
    axis=0,
)

test_csv = pd.read_csv(csv_path)
# test_csv = test_csv.loc[int(len(test_csv)*data_sample_size) ,:]

# Pair each image name from the test CSV with its predicted target.
submission_csv = pd.DataFrame(
    {'image_name': test_csv['image_name'], 'target': results}
)

print(submission_csv.head())
submission_csv.to_csv(Path(dirs.csv).joinpath("submission.csv"), index=False)