def main():
    # get experiment arguments
    args, config_dataset, _ = get_args()

    # [STEP 0 and 1] load the .mat files (sample-level) and partition the datasets (segment-level)
    preprocess_pipeline(args)

    # [STEP 2] create HAR datasets
    dataset = SensorDataset(**config_dataset, prefix="train", verbose=True)
def get_data(path):
    X_train, y_train, ID_train = read_unprocessed_data(
        path, filename="twitter-training-data.txt")
    X_test, y_test, ID_test = read_unprocessed_data(
        path, filename="twitter-test.txt")
    X_train = preprocess.preprocess_pipeline(X_train)
    X_test = preprocess.preprocess_pipeline(X_test)
    X = X_train + X_test
    y = y_train + y_test
    return X_train, X_test, X, y_train, y_test, y, ID_train, ID_test
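# A minimal usage sketch for get_data() above; the "data/" directory is an
# assumption (not taken from the source) and only illustrates how the eight
# return values unpack.
(X_train, X_test, X,
 y_train, y_test, y,
 ID_train, ID_test) = get_data("data/")
print(f"{len(X_train)} train / {len(X_test)} test tweets, {len(X)} total")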
def main():
    # get experiment arguments
    args, config_dataset, config_model = get_args()

    # [STEP 0 and 1] load the .mat files (sample-level) and partition the datasets (segment-level)
    preprocess_pipeline(args)

    if args.train_mode:
        # [STEP 2] create HAR datasets
        dataset = SensorDataset(**config_dataset, prefix="train")
        dataset_val = SensorDataset(**config_dataset, prefix="val")

        # [STEP 3] create HAR models (a CUDA-capable device is assumed to be available)
        if torch.cuda.is_available():
            model = create(args.model, config_model).cuda()
            torch.backends.cudnn.benchmark = True
            sys.stdout = Logger(
                os.path.join(model.path_logs, f"log_main_{args.experiment}.txt"))

        # show args
        print("##" * 50)
        print(paint(f"Experiment: {model.experiment}", "blue"))
        print(
            paint(
                f"[-] Using {torch.cuda.device_count()} GPU: {torch.cuda.is_available()}"
            ))
        print(args)
        get_info_params(model)
        get_info_layers(model)
        print("##" * 50)

        # [STEP 4] train HAR models
        model_train(model, dataset, dataset_val, args)

    # [STEP 5] evaluate HAR models
    dataset_test = SensorDataset(**config_dataset, prefix="test")
    if not args.train_mode:
        config_model["experiment"] = "inference"
        model = create(args.model, config_model).cuda()
    model_eval(model, dataset_test, args)
def sur():
    ori_df = pd.read_excel(output_root_dir + "/all-nf.xlsx")
    pp_df = preprocess_pipeline(ori_df)
    pp_df.to_excel(preprocess_output_dir + "/all-preprocessed.xlsx", index=False)
    pp_df.to_excel(output_root_dir + "/all-nf.xlsx", index=False)

    options = ["freq", "cite"]
    for opt in options:
        a_df = analyzer_pipeline(pp_df, opt)
        plot_pipeline(a_df, opt)
def sec():
    ori_df = pd.read_excel(output_root_dir + "/all.xlsx")
    pp_df = preprocess_pipeline(ori_df)
    pp_df.to_excel(preprocess_output_dir + "/all-preprocessed.xlsx", index=False)

    f_df = filter_pipeline(pp_df)
    f_df.to_excel(filter_output_auto_dir + "/all-no_filtered.xlsx", index=False)
    pp_df[~pp_df["title"].isin(f_df["title"])].to_excel(
        filter_output_auto_dir + "/all-filtered.xlsx", index=False)

    # do filter manually
    f_df.to_excel(output_root_dir + "/all-nf.xlsx", index=False)
    pp_df[~pp_df["title"].isin(f_df["title"])].to_excel(
        filter_output_manual_dir + "/all-f.xlsx", index=False)
    # f_df = pd.read_excel(filter_output_manual_dir + "/all-no_filtered.xlsx")

    options = ["freq", "cite"]
    for opt in options:
        a_df = analyzer_pipeline(f_df, opt)
        plot_pipeline(a_df, opt)
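# Hypothetical driver tying the two stages above together; the ordering
# (sec() -> manual filtering of all-nf.xlsx -> sur()) is inferred from the
# file names and is an assumption, not documented behaviour.
if __name__ == "__main__":
    sec()    # preprocess + automatic filtering; writes all-nf.xlsx for manual review
    # sur()  # re-run the frequency/citation analysis after the manual pass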
def main(): """Run training process.""" parser = argparse.ArgumentParser( description="Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)." ) parser.add_argument( "--outdir", type=str, required=True, help="Path of output directory." ) parser.add_argument( "--dpgmmdir", type=str, default="", help="Path of dpgmm directory." ) parser.add_argument( "--checkpoints", type=str, default="", nargs="+", help="list of checkpoints file.", ) parser.add_argument( "--config", type=str, required=True, help="Path of config file." ) parser.add_argument("--verbose", type=int, default=1, help="verbose") parser.add_argument("--is_save", type=int, default=-1, help="is save") args = parser.parse_args() # load and save config with open(args.config) as f: config = yaml.load(f, Loader=yaml.Loader) config.update(vars(args)) # global setting device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") seed_everything(config["seed"]) # set logger if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, stream=sys.stdout, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) elif args.verbose > 0: logging.basicConfig( level=logging.INFO, stream=sys.stdout, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) else: logging.basicConfig( level=logging.WARN, stream=sys.stdout, format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) logging.warning("Skip DEBUG/INFO messages") for key, value in config.items(): logging.info(f"{key} = {value}") # preprocess train_features = pd.read_csv("../input/lish-moa/train_features.csv") train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv") test_features = pd.read_csv("../input/lish-moa/test_features.csv") logging.info("Successfully load input files.") train, test = preprocess_pipeline( train_features, test_features, config, path="", is_concat=config.get("is_concat", False), ) drop_cols = train.columns[train.std() < 0.2] train.drop(columns=drop_cols, inplace=True) test.drop(columns=drop_cols, inplace=True) top_feats = np.arange(train.shape[1]) drop_idx = train["cp_type"] == 0 train = train.loc[drop_idx].reset_index(drop=True) del train_targets["sig_id"] train_targets = train_targets.loc[drop_idx].reset_index(drop=True) targets = [col for col in train_targets.columns] train = train.values test = test.values train_targets = train_targets.values ntargets = train_targets.shape[1] logging.info("Successfully preprocessed.") # for GPU/CPU kfold = MultilabelStratifiedKFold( n_splits=config["n_fold"], random_state=config["seed"], shuffle=True ) eval_set = MoaDataset(test, None, top_feats, mode="test") eval_loader = { "eval": DataLoader( eval_set, batch_size=config["batch_size"], num_workers=config["num_workers"], pin_memory=config["pin_memory"], shuffle=False, ), } model_class = getattr( node, # keep compatibility config.get("model_type", "NODE"), ) oof_targets = np.zeros((len(train), ntargets)) preds = np.zeros((config["n_fold"], len(test), ntargets)) for n, (tr, te) in enumerate(kfold.split(train_targets, train_targets)): logging.info(f"Start to train fold {n}.") xval = train[te] yval = train_targets[te] dev_set = MoaDataset(xval, yval, top_feats) dev_loader = { "eval": DataLoader( dev_set, batch_size=config["batch_size"], num_workers=config["num_workers"], pin_memory=config["pin_memory"], shuffle=False, ), } model = model_class( input_dim=len(top_feats), out_dim=config["out_dim"], **config["model_params"], ).to(device) # develop data trainer = TabTrainer( steps=0, epochs=0, 
data_loader=dev_loader, model=model.to(device), criterion={}, optimizer={}, scheduler={}, config=config, device=device, add_name=f"{n}fold", ) trainer.load_checkpoint(args.checkpoints[n]) logging.info(f"Successfully load checkpoint from {args.checkpoints[n]}.") oof_targets[te] = trainer.inference() logging.info(f"Successfully inference dev data at fold{n}.") fold_score = mean_log_loss(yval, oof_targets[te]) logging.info( f"fold{n} score: {fold_score:.6f}, Step:{trainer.steps}, Epoch:{trainer.epochs}." ) # eval data trainer = TabTrainer( steps=0, epochs=0, data_loader=eval_loader, model=model.to(device), criterion={}, optimizer={}, scheduler={}, config=config, device=device, add_name=f"{n}fold", ) trainer.load_checkpoint(args.checkpoints[n]) logging.info(f"Successfully load checkpoint from {args.checkpoints[n]}.") # run training loop preds[n] = trainer.inference() logging.info(f"Successfully inference eval data at fold{n}.") # calculate oof score cv_score = mean_log_loss(train_targets, oof_targets) logging.info(f"CV score: {cv_score:.6f}") if args.is_save > 0: train_targets_df = pd.read_csv("../input/lish-moa/train_targets_scored.csv") train_targets_df.loc[drop_idx, targets] = oof_targets oof_path = os.path.join(args.outdir, "oof.csv") train_targets_df.to_csv(oof_path, index=False) logging.info(f"saved at {oof_path}") # calculate eval data's submission file preds_mean = preds.mean(axis=0) ss = pd.read_csv("../input/lish-moa/sample_submission.csv") ss[targets] = preds_mean ss.loc[test_features["cp_type"] == "ctl_vehicle", targets] = 0 sub_path = os.path.join(args.outdir, "submission.csv") ss.to_csv(sub_path, index=False) logging.info(f"saved at {sub_path}")
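# For reference only: a minimal sketch approximating the mean_log_loss metric used
# above (mean binary cross-entropy over all label columns). The real helper is not
# shown in this snippet, so the clipping value and exact reduction are assumptions.
import numpy as np


def mean_log_loss_sketch(y_true, y_pred, eps=1e-15):
    # clip predictions away from 0/1 to keep the log terms finite
    p = np.clip(y_pred, eps, 1 - eps)
    return float(-np.mean(y_true * np.log(p) + (1 - y_true) * np.log(1 - p)))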
def main(): """Run training process.""" parser = argparse.ArgumentParser( description= "Train Parallel WaveGAN (See detail in parallel_wavegan/bin/train.py)." ) parser.add_argument("--outdir", type=str, required=True, help="Path of output directory.") parser.add_argument("--resume", type=str, default="", help="Path of resumed model file.") parser.add_argument("--config", type=str, required=True, help="Path of config file.") parser.add_argument("--dpgmmdir", type=str, default="", help="Path of dpgmm directory.") parser.add_argument("--verbose", type=int, default=1, help="verbose") args = parser.parse_args() # load and save config with open(args.config) as f: config = yaml.load(f, Loader=yaml.Loader) config.update(vars(args)) # global setting device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") seed_everything(config["seed"]) # set logger if args.verbose > 1: logging.basicConfig( level=logging.DEBUG, stream=sys.stdout, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) elif args.verbose > 0: logging.basicConfig( level=logging.INFO, stream=sys.stdout, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) else: logging.basicConfig( level=logging.WARN, stream=sys.stdout, format= "%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", ) logging.warning("Skip DEBUG/INFO messages") for key, value in config.items(): logging.info(f"{key} = {value}") # preprocess train_features = pd.read_csv("../input/lish-moa/train_features.csv") train_targets = pd.read_csv("../input/lish-moa/train_targets_scored.csv") # train_nontargets = pd.read_csv("../input/lish-moa/train_targets_nonscored.csv") test_features = pd.read_csv("../input/lish-moa/test_features.csv") logging.info("Successfully load input files.") # dpgmm_dir = args.outdir train, test = preprocess_pipeline( train_features, test_features, config, path=args.dpgmmdir, is_concat=config.get("is_concat", False), ) logging.info(f"{train.shape}\n{train.head()}") logging.info(f"{test.shape}\n{test.head()}") drop_cols = train.columns[train.std() < 0.2] train.drop(columns=drop_cols, inplace=True) test.drop(columns=drop_cols, inplace=True) top_feats = np.arange(train.shape[1]) drop_idx = train["cp_type"] == 0 train = train.loc[drop_idx].reset_index(drop=True) del train_targets["sig_id"] # from IPython import embed # embed() train_targets = train_targets.loc[drop_idx].reset_index(drop=True) train = train.values test = test.values train_targets = train_targets.values logging.info("Successfully preprocessed.") resumes = [""] resumes += [ f"{config['outdir']}/best/best_loss{fold}fold.pkl" for fold in range(config["n_fold"] - 1) ] logging.info(f"resumes: {resumes}") if config.get("loss_type", "BCELoss") == "SmoothBCEwLogits": loss_class = SmoothBCEwLogits else: loss_class = getattr( torch.nn, # keep compatibility config.get("loss_type", "BCELoss"), ) criterion = loss_class(**config["loss_params"]).to(device) model_class = getattr( node, # keep compatibility config.get("model_type", "NODE"), ) # for GPU/CPU kfold = MultilabelStratifiedKFold(n_splits=config["n_fold"], random_state=config["seed"], shuffle=True) for n, (tr, te) in enumerate(kfold.split(train_targets, train_targets)): logging.info(f"Start to train fold {n}.") xtrain, xval = train[tr], train[te] ytrain, yval = train_targets[tr], train_targets[te] train_set = MoaDataset(xtrain, ytrain, top_feats) val_set = MoaDataset(xval, yval, top_feats) logging.info( f"train_set:{train_set[0]['X'].shape}, val_set:{val_set[0]['X'].shape}" ) 
data_loader = { "train": DataLoader( train_set, batch_size=config["batch_size"], num_workers=config["num_workers"], pin_memory=config["pin_memory"], shuffle=True, ), "dev": DataLoader( val_set, batch_size=config["batch_size"], num_workers=config["num_workers"], pin_memory=config["pin_memory"], shuffle=False, ), } model = model_class( input_dim=len(top_feats), out_dim=config["out_dim"], **config["model_params"], ).to(device) if config["optimizer_type"] == "QHAdam": optimizer_class = QHAdam else: optimizer_class = getattr( torch.optim, # keep compatibility config.get("optimizer_type", "Adam"), ) optimizer = optimizer_class(params=model.parameters(), **config["optimizer_params"]) scheduler_class = getattr( torch.optim.lr_scheduler, # keep compatibility config.get("scheduler_type", "StepLR"), ) scheduler = scheduler_class(optimizer=optimizer, **config["scheduler_params"]) trainer = TabTrainer( steps=0, epochs=0, data_loader=data_loader, model=model.to(device), criterion=criterion, optimizer=optimizer, scheduler=scheduler, config=config, device=device, add_name=f"{n}fold", train=True, ) # resume from checkpoint if len(resumes[n]) != 0: trainer.load_checkpoint(resumes[n]) trainer.steps = 0 logging.info(f"Successfully resumed from {resumes[n]}.") # run training loop try: logging.info("Start training!") trainer.run() except KeyboardInterrupt: trainer.save_checkpoint( os.path.join(config["outdir"], f"checkpoint-{trainer.steps}steps.pkl")) logging.info( f"Successfully saved checkpoint @ {trainer.steps}steps.")
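# Not part of the original scripts: an illustrative sketch of the seed_everything()
# helper that both training scripts call; the exact implementation in the source may
# differ (e.g. deterministic cuDNN flags).
import os
import random

import numpy as np
import torch


def seed_everything(seed: int) -> None:
    """Seed Python, NumPy and PyTorch RNGs for reproducibility."""
    random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)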