Example 1
def main():

    # get experiment arguments
    args, config_dataset, _ = get_args()

    # [STEP 0 and 1] load the .mat files (sample-level) and partition the datasets (segment-level)
    preprocess_pipeline(args)

    # [STEP 2] create HAR datasets
    dataset = SensorDataset(**config_dataset, prefix="train", verbose=True)
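Note: the excerpt above does not include `preprocess_pipeline` itself; its step comment suggests it loads the sample-level .mat files and partitions them into segments that `SensorDataset` later reads by prefix. Purely as an illustration of that kind of segment-level partitioning, a sliding-window segmenter might look like the sketch below (the function name, window parameters, and majority-label rule are assumptions, not the repository's code).

import numpy as np

def sliding_windows(samples, labels, win_len, step):
    """Partition a sample-level sensor stream into fixed-length segments."""
    segments, segment_labels = [], []
    for start in range(0, len(samples) - win_len + 1, step):
        window = samples[start:start + win_len]
        # label each segment with the majority label inside its window
        values, counts = np.unique(labels[start:start + win_len], return_counts=True)
        segments.append(window)
        segment_labels.append(values[np.argmax(counts)])
    return np.asarray(segments), np.asarray(segment_labels)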
Example 2
def get_data(path):
    X_train, y_train, ID_train = read_unprocessed_data(
        path, filename="twitter-training-data.txt")
    X_test, y_test, ID_test = read_unprocessed_data(
        path, filename="twitter-test.txt")

    X_train = preprocess.preprocess_pipeline(X_train)
    X_test = preprocess.preprocess_pipeline(X_test)

    X = X_train + X_test
    y = y_train + y_test

    return X_train, X_test, X, y_train, y_test, y, ID_train, ID_test
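The `preprocess.preprocess_pipeline` used above is not part of the excerpt; from the call sites it takes a list of raw tweets and returns a list of cleaned strings. A minimal list-in/list-out cleaner of that shape could look like the sketch below (the specific cleaning steps are assumptions, not the original implementation).

import re

def preprocess_pipeline(texts):
    """Sketch of a tweet cleaner: lowercase, strip URLs and mentions, collapse whitespace."""
    cleaned = []
    for text in texts:
        text = text.lower()
        text = re.sub(r"https?://\S+", " ", text)   # drop URLs
        text = re.sub(r"@\w+", " ", text)           # drop user mentions
        text = re.sub(r"[^a-z0-9#\s]", " ", text)   # keep words and hashtags
        text = re.sub(r"\s+", " ", text).strip()    # collapse runs of whitespace
        cleaned.append(text)
    return cleaned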
Example 3
def main():

    # get experiment arguments
    args, config_dataset, config_model = get_args()

    # [STEP 0 and 1] load the .mat files (sample-level) and partition the datasets (segment-level)
    preprocess_pipeline(args)

    if args.train_mode:

        # [STEP 2] create HAR datasets
        dataset = SensorDataset(**config_dataset, prefix="train")
        dataset_val = SensorDataset(**config_dataset, prefix="val")

        # [STEP 3] create HAR models
        if torch.cuda.is_available():
            model = create(args.model, config_model).cuda()
            torch.backends.cudnn.benchmark = True
            sys.stdout = Logger(
                os.path.join(model.path_logs,
                             f"log_main_{args.experiment}.txt"))

        # show args
        print("##" * 50)
        print(paint(f"Experiment: {model.experiment}", "blue"))
        print(
            paint(
                f"[-] Using {torch.cuda.device_count()} GPU: {torch.cuda.is_available()}"
            ))
        print(args)
        get_info_params(model)
        get_info_layers(model)
        print("##" * 50)

        # [STEP 4] train HAR models
        model_train(model, dataset, dataset_val, args)

    # [STEP 5] evaluate HAR models
    dataset_test = SensorDataset(**config_dataset, prefix="test")
    if not args.train_mode:
        config_model["experiment"] = "inference"
        model = create(args.model, config_model).cuda()
    model_eval(model, dataset_test, args)
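One caveat in this example: `model` is created only inside the `torch.cuda.is_available()` branch, so on a CPU-only machine the logging, training, and evaluation calls that follow would raise a NameError. A device-agnostic variant of that step is sketched below; `create`, `args`, and `config_model` are the names used in the example above, and the sketch is not the repository's own code.

import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = create(args.model, config_model).to(device)   # create() as called in the example
if device.type == "cuda":
    torch.backends.cudnn.benchmark = True             # only relevant on GPU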
Example 4
def sur():
    ori_df = pd.read_excel(output_root_dir + "/all-nf.xlsx")

    pp_df = preprocess_pipeline(ori_df)
    pp_df.to_excel(preprocess_output_dir + "/all-preprocessed.xlsx",
                   index=False)
    pp_df.to_excel(output_root_dir + "/all-nf.xlsx", index=False)

    options = ["freq", "cite"]
    for opt in options:
        a_df = analyzer_pipeline(pp_df, opt)
        plot_pipeline(a_df, opt)
Example 5
def sec():

    ori_df = pd.read_excel(output_root_dir + "/all.xlsx")

    pp_df = preprocess_pipeline(ori_df)
    pp_df.to_excel(preprocess_output_dir + "/all-preprocessed.xlsx",
                   index=False)

    f_df = filter_pipeline(pp_df)
    f_df.to_excel(filter_output_auto_dir + "/all-no_filtered.xlsx",
                  index=False)
    pp_df[~pp_df["title"].isin(f_df["title"])].to_excel(
        filter_output_auto_dir + "/all-filtered.xlsx", index=False)

    # # do filter manually
    f_df.to_excel(output_root_dir + "/all-nf.xlsx", index=False)
    pp_df[~pp_df["title"].isin(f_df["title"])].to_excel(
        filter_output_manual_dir + "/all-f.xlsx", index=False)

    # # f_df = pd.read_excel(filter_output_manual_dir + "/all-no_filtered.xlsx")
    options = ["freq", "cite"]
    for opt in options:
        a_df = analyzer_pipeline(f_df, opt)
        plot_pipeline(a_df, opt)
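Examples 4 and 5 pass a whole DataFrame through `preprocess_pipeline` and later join and filter rows on the "title" column. The actual implementation is not shown; a minimal DataFrame-in/DataFrame-out sketch consistent with that usage might be the following (every cleaning step here is an assumption).

import pandas as pd

def preprocess_pipeline(df: pd.DataFrame) -> pd.DataFrame:
    """Sketch of a DataFrame-level cleaner; the concrete steps are assumed."""
    out = df.copy()
    out = out.drop_duplicates()                           # remove exact duplicate rows
    out = out.dropna(subset=["title"])                    # "title" is used downstream for filtering
    out["title"] = out["title"].astype(str).str.strip()   # normalize the filter/join key
    return out.reset_index(drop=True)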
Example 6
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description="Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument(
        "--outdir", type=str, required=True, help="Path of output directory."
    )
    parser.add_argument(
        "--dpgmmdir", type=str, default="", help="Path of dpgmm directory."
    )
    parser.add_argument(
        "--checkpoints",
        type=str,
        default="",
        nargs="+",
        help="list of checkpoints file.",
    )
    parser.add_argument(
        "--config", type=str, required=True, help="Path of config file."
    )
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
    parser.add_argument("--is_save", type=int, default=-1, help="is save")
    args = parser.parse_args()
    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    # global setting
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    seed_everything(config["seed"])
    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")
    for key, value in config.items():
        logging.info(f"{key} = {value}")
    # preprocess
    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")
    logging.info("Successfully load input files.")
    train, test = preprocess_pipeline(
        train_features,
        test_features,
        config,
        path="",
        is_concat=config.get("is_concat", False),
    )
    drop_cols = train.columns[train.std() < 0.2]
    train.drop(columns=drop_cols, inplace=True)
    test.drop(columns=drop_cols, inplace=True)
    top_feats = np.arange(train.shape[1])
    drop_idx = train["cp_type"] == 0
    train = train.loc[drop_idx].reset_index(drop=True)
    del train_targets["sig_id"]
    train_targets = train_targets.loc[drop_idx].reset_index(drop=True)
    targets = [col for col in train_targets.columns]
    train = train.values
    test = test.values
    train_targets = train_targets.values
    ntargets = train_targets.shape[1]

    logging.info("Successfully preprocessed.")
    # for GPU/CPU
    kfold = MultilabelStratifiedKFold(
        n_splits=config["n_fold"], random_state=config["seed"], shuffle=True
    )
    eval_set = MoaDataset(test, None, top_feats, mode="test")
    eval_loader = {
        "eval": DataLoader(
            eval_set,
            batch_size=config["batch_size"],
            num_workers=config["num_workers"],
            pin_memory=config["pin_memory"],
            shuffle=False,
        ),
    }
    model_class = getattr(
        node,
        # keep compatibility
        config.get("model_type", "NODE"),
    )

    oof_targets = np.zeros((len(train), ntargets))
    preds = np.zeros((config["n_fold"], len(test), ntargets))
    for n, (tr, te) in enumerate(kfold.split(train_targets, train_targets)):
        logging.info(f"Start to train fold {n}.")
        xval = train[te]
        yval = train_targets[te]
        dev_set = MoaDataset(xval, yval, top_feats)
        dev_loader = {
            "eval": DataLoader(
                dev_set,
                batch_size=config["batch_size"],
                num_workers=config["num_workers"],
                pin_memory=config["pin_memory"],
                shuffle=False,
            ),
        }
        model = model_class(
            input_dim=len(top_feats),
            out_dim=config["out_dim"],
            **config["model_params"],
        ).to(device)
        # develop data
        trainer = TabTrainer(
            steps=0,
            epochs=0,
            data_loader=dev_loader,
            model=model.to(device),
            criterion={},
            optimizer={},
            scheduler={},
            config=config,
            device=device,
            add_name=f"{n}fold",
        )
        trainer.load_checkpoint(args.checkpoints[n])
        logging.info(f"Successfully load checkpoint from {args.checkpoints[n]}.")
        oof_targets[te] = trainer.inference()
        logging.info(f"Successfully inference dev data at fold{n}.")
        fold_score = mean_log_loss(yval, oof_targets[te])
        logging.info(
            f"fold{n} score: {fold_score:.6f}, Step:{trainer.steps}, Epoch:{trainer.epochs}."
        )
        # eval data
        trainer = TabTrainer(
            steps=0,
            epochs=0,
            data_loader=eval_loader,
            model=model.to(device),
            criterion={},
            optimizer={},
            scheduler={},
            config=config,
            device=device,
            add_name=f"{n}fold",
        )
        trainer.load_checkpoint(args.checkpoints[n])
        logging.info(f"Successfully load checkpoint from {args.checkpoints[n]}.")
        # run training loop
        preds[n] = trainer.inference()
        logging.info(f"Successfully inference eval data at fold{n}.")
    # calculate oof score
    cv_score = mean_log_loss(train_targets, oof_targets)
    logging.info(f"CV score: {cv_score:.6f}")
    if args.is_save > 0:
        train_targets_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv")
        train_targets_df.loc[drop_idx, targets] = oof_targets
        oof_path = os.path.join(args.outdir, "oof.csv")
        train_targets_df.to_csv(oof_path, index=False)
        logging.info(f"saved at {oof_path}")
        # calculate eval data's submission file
        preds_mean = preds.mean(axis=0)
        ss = pd.read_csv("../input/lish-moa/sample_submission.csv")
        ss[targets] = preds_mean
        ss.loc[test_features["cp_type"] == "ctl_vehicle", targets] = 0
        sub_path = os.path.join(args.outdir, "submission.csv")
        ss.to_csv(sub_path, index=False)
        logging.info(f"saved at {sub_path}")
Example 7
def main():
    """Run training process."""
    parser = argparse.ArgumentParser(
        description=
        "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)."
    )
    parser.add_argument("--outdir",
                        type=str,
                        required=True,
                        help="Path of output directory.")
    parser.add_argument("--resume",
                        type=str,
                        default="",
                        help="Path of resumed model file.")
    parser.add_argument("--config",
                        type=str,
                        required=True,
                        help="Path of config file.")
    parser.add_argument("--dpgmmdir",
                        type=str,
                        default="",
                        help="Path of dpgmm directory.")
    parser.add_argument("--verbose", type=int, default=1, help="verbose")
    args = parser.parse_args()
    # load and save config
    with open(args.config) as f:
        config = yaml.load(f, Loader=yaml.Loader)
    config.update(vars(args))
    # global setting
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    seed_everything(config["seed"])
    # set logger
    if args.verbose > 1:
        logging.basicConfig(
            level=logging.DEBUG,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    elif args.verbose > 0:
        logging.basicConfig(
            level=logging.INFO,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
    else:
        logging.basicConfig(
            level=logging.WARN,
            stream=sys.stdout,
            format=
            "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
        )
        logging.warning("Skip DEBUG/INFO messages")
    for key, value in config.items():
        logging.info(f"{key} = {value}")
    # preprocess
    train_features = pd.read_csv("../input/lish-moa/train_features.csv")
    train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv")

    # train_nontargets = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv")
    test_features = pd.read_csv("../input/lish-moa/test_features.csv")
    logging.info("Successfully load input files.")
    # dpgmm_dir = args.outdir
    train, test = preprocess_pipeline(
        train_features,
        test_features,
        config,
        path=args.dpgmmdir,
        is_concat=config.get("is_concat", False),
    )
    logging.info(f"{train.shape}\n{train.head()}")
    logging.info(f"{test.shape}\n{test.head()}")
    drop_cols = train.columns[train.std() < 0.2]
    train.drop(columns=drop_cols, inplace=True)
    test.drop(columns=drop_cols, inplace=True)
    top_feats = np.arange(train.shape[1])
    drop_idx = train["cp_type"] == 0
    train = train.loc[drop_idx].reset_index(drop=True)
    del train_targets["sig_id"]
    # from IPython import embed

    # embed()
    train_targets = train_targets.loc[drop_idx].reset_index(drop=True)
    train = train.values
    test = test.values
    train_targets = train_targets.values
    logging.info("Successfully preprocessed.")
    resumes = [""]
    resumes += [
        f"{config['outdir']}/best/best_loss{fold}fold.pkl"
        for fold in range(config["n_fold"] - 1)
    ]
    logging.info(f"resumes: {resumes}")

    if config.get("loss_type", "BCELoss") == "SmoothBCEwLogits":
        loss_class = SmoothBCEwLogits
    else:
        loss_class = getattr(
            torch.nn,
            # keep compatibility
            config.get("loss_type", "BCELoss"),
        )
    criterion = loss_class(**config["loss_params"]).to(device)
    model_class = getattr(
        node,
        # keep compatibility
        config.get("model_type", "NODE"),
    )

    # for GPU/CPU
    kfold = MultilabelStratifiedKFold(n_splits=config["n_fold"],
                                      random_state=config["seed"],
                                      shuffle=True)
    for n, (tr, te) in enumerate(kfold.split(train_targets, train_targets)):
        logging.info(f"Start to train fold {n}.")
        xtrain, xval = train[tr], train[te]
        ytrain, yval = train_targets[tr], train_targets[te]

        train_set = MoaDataset(xtrain, ytrain, top_feats)
        val_set = MoaDataset(xval, yval, top_feats)
        logging.info(
            f"train_set:{train_set[0]['X'].shape}, val_set:{val_set[0]['X'].shape}"
        )

        data_loader = {
            "train":
            DataLoader(
                train_set,
                batch_size=config["batch_size"],
                num_workers=config["num_workers"],
                pin_memory=config["pin_memory"],
                shuffle=True,
            ),
            "dev":
            DataLoader(
                val_set,
                batch_size=config["batch_size"],
                num_workers=config["num_workers"],
                pin_memory=config["pin_memory"],
                shuffle=False,
            ),
        }
        model = model_class(
            input_dim=len(top_feats),
            out_dim=config["out_dim"],
            **config["model_params"],
        ).to(device)

        if config["optimizer_type"] == "QHAdam":
            optimizer_class = QHAdam
        else:
            optimizer_class = getattr(
                torch.optim,
                # keep compatibility
                config.get("optimizer_type", "Adam"),
            )
        optimizer = optimizer_class(params=model.parameters(),
                                    **config["optimizer_params"])

        scheduler_class = getattr(
            torch.optim.lr_scheduler,
            # keep compatibility
            config.get("scheduler_type", "StepLR"),
        )
        scheduler = scheduler_class(optimizer=optimizer,
                                    **config["scheduler_params"])
        trainer = TabTrainer(
            steps=0,
            epochs=0,
            data_loader=data_loader,
            model=model.to(device),
            criterion=criterion,
            optimizer=optimizer,
            scheduler=scheduler,
            config=config,
            device=device,
            add_name=f"{n}fold",
            train=True,
        )
        # resume from checkpoint
        if len(resumes[n]) != 0:
            trainer.load_checkpoint(resumes[n])
            trainer.steps = 0
            logging.info(f"Successfully resumed from {resumes[n]}.")

        # run training loop
        try:
            logging.info("Start training!")
            trainer.run()
        except KeyboardInterrupt:
            trainer.save_checkpoint(
                os.path.join(config["outdir"],
                             f"checkpoint-{trainer.steps}steps.pkl"))
            logging.info(
                f"Successfully saved checkpoint @ {trainer.steps}steps.")