# Imports inferred from the snippet below; project helpers (set_args,
# data_label_split, use_nuclei_feature, use_nuclei_gran_feature,
# normalize_by_group, generate_data_set, kfold_train,
# multi_mini_noise_signal_cv) are defined elsewhere in the project.
import datetime
import os

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from tqdm import tqdm


def main():
    args = set_args()
    time = datetime.datetime.now()
    args.output_dir += time.strftime("%b%d/")
    os.makedirs(args.output_dir, exist_ok=True)
    drop_NA_data = pd.read_csv(args.data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    if args.use_nuclei:
        X = use_nuclei_feature(X)
    elif args.use_nuclei_gran:
        X = use_nuclei_gran_feature(X)
    X['Metadata_PlateID_Nuclei'] = drop_NA_data[
        'Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()

    models = [
        KNeighborsClassifier(30),
        LogisticRegression(max_iter=1000, solver="saga", n_jobs=-1),
        RandomForestClassifier(min_samples_split=50, random_state=0),
        MLPClassifier(solver="adam", max_iter=100)
    ]

    envs = os.environ
    if "SLURM_ARRAY_TASK_ID" in envs:
        model = models[int(envs['SLURM_ARRAY_TASK_ID'])]
    else:
        model = models[1]
    print('using model %s, data %s' %
          (str(model).split("(")[0], args.data_path))
    train(args, X, model, 0)


def train(args, data, model, verbose, parallel=True, bag_perc=0.5):
    if parallel:
        results = multi_mini_noise_signal_cv(args, data, "taxol", "DMSO",
                                             model, verbose, bag_perc)
    else:
        # args.splits supplies the fold count expected by
        # mini_noise_signal_cv's cv parameter
        results = mini_noise_signal_cv(args.bagsize, data, "taxol", "DMSO",
                                       model, args.splits, verbose, bag_perc)

    results = pd.DataFrame.from_dict(results, orient="index")

    results.columns = [
        "mean_accuracy",
        "std_accuracy",
        "mean_pred_score_control",
        "std_pred_score_control",
        "mean_pred_score_treatment",
        "std_pred_score_treatment",
    ]

    model_name = str(model).split("(")[0]
    feature_size = len(data_label_split(data)[0].columns)
    result_path = os.path.join(
        args.output_dir,
        "%s_sample%s_feature%s.csv" % (model_name, args.bagsize, feature_size))
    if os.path.exists(result_path):
        results.to_csv(
            result_path,
            mode="a",
            header=False,
        )
    else:
        results.to_csv(result_path)


def mini_noise_signal_cv(
    size: int,
    data: pd.DataFrame,
    treatment: str,
    control: str,
    model,
    cv: int,
    verbose: int,
    bag_perc: float = 0.5,
) -> dict:
    # Key each row by treatment percentage so train() can build a
    # DataFrame via pd.DataFrame.from_dict(results, orient="index")
    results = {}
    for i in tqdm(range(5, 96, 5)):
        mini_batch = generate_data_set(size, i / 100, data, treatment, control,
                                       bag_perc)
        X, y = data_label_split(mini_batch)

        # keep the string class labels ("DMSO"/"taxol"); kfold_train
        # encodes them internally
        y = y["compound"]

        mean_accuracy, pred_score_control, pred_score_treatment = kfold_train(
            cv, X, y, model, "DMSO", "taxol", verbose=verbose)
        results[i / 100] = [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    return results
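Note: normalize_by_group is project code that is not shown in these
snippets. Judging by the per-plate grouping column and the
dropna(axis='columns') call that follows it, it plausibly z-scores each
feature within each plate; a minimal sketch under that assumption (the
name normalize_by_group_sketch is hypothetical):

import pandas as pd

def normalize_by_group_sketch(df: pd.DataFrame, group_col: str) -> pd.DataFrame:
    # Hypothetical stand-in: z-score each feature within each group.
    # Features that are constant within a group get std == 0 and become
    # NaN, which a following dropna(axis='columns') would then drop.
    features = df.drop(columns=[group_col])
    return features.groupby(df[group_col]).transform(
        lambda col: (col - col.mean()) / col.std())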
Example 4

# Imports inferred from the snippet; project helpers (set_args,
# data_label_split, use_nuclei_feature, use_nuclei_gran_feature,
# normalize_by_group, generate_data_set, data_standardization,
# profile_AttSet, FullDeepSet, dmso_taxol_ProfileBag, test_bag_model,
# and the mini_noise_signal_cv variant shown in Example 7) are defined
# elsewhere in the project.
import datetime
import os

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.utils.data as D
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm


def main():
    args = set_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        print("GPU is ON! with GPU %s" % torch.cuda.current_device())

    time = datetime.datetime.now()
    args.output_dir += time.strftime("%b%d/")
    os.makedirs(args.output_dir, exist_ok=True)
    drop_NA_data = pd.read_csv(args.data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    if args.use_nuclei:
        X = use_nuclei_feature(X)
    elif args.use_nuclei_gran:
        X = use_nuclei_gran_feature(X)
    X['Metadata_PlateID_Nuclei'] = drop_NA_data[
        'Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()
    data = X
    feature_size = len(data_label_split(data)[0].columns)

    pools = ['att', 'mean', 'max', 'min']
    if "SLURM_ARRAY_TASK_ID" in os.environ:
        args.pool = pools[int(os.environ['SLURM_ARRAY_TASK_ID'])]

    for i in range(args.start, args.end, 5):
        # define model
        if args.pool == 'att':
            model = profile_AttSet(feature_size, args.thres)
        else:
            model = FullDeepSet(feature_size, args.pool, args.thres)
        if args.cuda:
            model.cuda()

        # Unused here: mini_noise_signal_cv (Example 7) re-creates the
        # model and optimizer inside each CV fold.
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(0.9, 0.999),
                               weight_decay=args.reg)

        results = mini_noise_signal_cv(i, i + 1, data, "taxol", "DMSO", model,
                                       args)

        results = pd.DataFrame.from_dict(results, orient="index")

        results.columns = [
            "mean_accuracy",
            "std_accuracy",
            "mean_control_accuracy",
            "std_control_accuracy",
            "mean_treat_accuracy",
            "std_treat_accuracy",
            "mean_pred_score_control",
            "std_pred_score_control",
            "mean_pred_score_treatment",
            "std_pred_score_treatment",
        ]

        res_path = os.path.join(
            args.output_dir,
            "%s_deepset_thres%.1f_bags%d*%d_bagsize%d_feature%d.csv" % (
                args.pool,
                args.thres,
                args.num_bags_train,
                args.batch_size,
                args.mean_bag_length,
                feature_size,
            ))

        if os.path.exists(res_path):
            results.to_csv(
                res_path,
                mode="a",
                header=False,
            )
        else:
            results.to_csv(res_path)


def multi_kfold_train_bag(perc,
                          args,
                          data,
                          model,
                          control=0,
                          treatment=1,
                          verbose=0):
    X, y = data_label_split(data)
    y = y["compound"]
    skf = StratifiedKFold(n_splits=args.splits)
    pred_score_control = np.array([])
    pred_score_treatment = np.array([])
    mean_accuracy = []
    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        print('start perc %s, split %s' % (perc, i))
        X_train, X_test = (
            data_standardization(X.iloc[train_index]),
            data_standardization(X.iloc[test_index]),
        )
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        X_train = pd.concat([X_train, y_train], axis=1, sort=False)
        X_train, y_train = data_label_split(
            generate_data_set(args.bagsize, perc, X_train, treatment, control))
        y_train = y_train['compound']
        X_test = pd.concat([X_test, y_test], axis=1, sort=False)
        valida_dataset = dmso_taxol_ProfileBag(
            X_test,
            int(args.num_bags_train / args.splits),
            args.mean_bag_length,
            args.var_bag_length,
            perc,
            treatment,
            control,
            args.batch_size,
            0.5,
        )
        valida_loader = D.DataLoader(valida_dataset,
                                     batch_size=1,
                                     shuffle=True)
        lgs = model.fit(X_train, y_train)
        acc_control, acc_treat, pred_score_cont, pred_score_treat = test_bag_model(
            lgs, valida_loader)
        pred_score_control = np.append(pred_score_control, pred_score_cont)
        pred_score_treatment = np.append(pred_score_treatment,
                                         pred_score_treat)

        mean_accuracy.append(np.mean(acc_control + acc_treat))
    if args.save_score:
        with open('%s_%f.txt' % (str(model).split("(")[0], perc), 'w') as f:
            f.write(','.join(["%.4f" % i
                              for i in pred_score_control.tolist()]) + '\n')
            f.write(','.join(
                ["%.4f" % i for i in pred_score_treatment.tolist()]))

    return {
        perc: [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    }
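The save_score branch above dumps two comma-separated lines per
(model, percentage) pair: control scores first, then treatment scores.
A minimal reader sketch for downstream analysis (load_scores is a
hypothetical helper, not part of the project):

import numpy as np

def load_scores(path):
    # Line 1 = control prediction scores, line 2 = treatment scores,
    # both written as comma-separated "%.4f" values.
    with open(path) as f:
        control = np.array([float(v) for v in f.readline().split(",")])
        treatment = np.array([float(v) for v in f.readline().split(",")])
    return control, treatment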


def multi_kfold_train(perc,
                      args,
                      data,
                      model,
                      control=0,
                      treatment=1,
                      verbose=0):

    mini_batch = generate_data_set(args.bagsize, perc, data, treatment,
                                   control)
    X, y = data_label_split(mini_batch)
    y = y["compound"]

    skf = StratifiedKFold(n_splits=args.splits)
    pred_score_control = np.array([])
    pred_score_treatment = np.array([])
    mean_accuracy = []
    if isinstance(X, np.ndarray):
        for i, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)),
                                                 desc="K fold CV"):
            if verbose != 0:
                print("Fold %d" % i, "TRAIN:", train_index, "TEST:",
                      test_index)
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            lgs = model.fit(X_train, y_train)
            pred_score_control = np.append(
                pred_score_control,
                lgs.predict_proba(X_test[y_test == "DMSO"])[:, 0])
            pred_score_treatment = np.append(
                pred_score_treatment,
                lgs.predict_proba(X_test[y_test == "taxol"])[:, 1])
            mean_accuracy.append(lgs.score(X_test, y_test))
    elif isinstance(X, pd.DataFrame):
        for i, (train_index, test_index) in tqdm(enumerate(skf.split(X, y)),
                                                 desc="K fold CV"):
            if verbose != 0:
                print("Fold %d" % i, "TRAIN:", train_index, "TEST:",
                      test_index)
            X_train, X_test = (
                data_standardization(X.iloc[train_index]),
                data_standardization(X.iloc[test_index]),
            )
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]

            lgs = model.fit(X_train, y_train)
            # predict_proba column 0 is P("DMSO") and column 1 is
            # P("taxol") (scikit-learn sorts class labels), so score
            # control samples with column 0, as in the ndarray branch
            pred_score_control = np.append(
                pred_score_control,
                lgs.predict_proba(X_test[y_test == "DMSO"])[:, 0])
            pred_score_treatment = np.append(
                pred_score_treatment,
                lgs.predict_proba(X_test[y_test == "taxol"])[:, 1])
            mean_accuracy.append(lgs.score(X_test, y_test))
    return {
        perc: [
            np.mean(mean_accuracy),
            np.std(mean_accuracy),
            np.mean(pred_score_control),
            np.std(pred_score_control),
            np.mean(pred_score_treatment),
            np.std(pred_score_treatment),
        ]
    }
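Each multi_kfold_train* call returns a single-key dict, so the
percentages can be fanned out to parallel workers and merged afterwards.
A minimal sketch of that merge, matching the columns set in train()
above (merge_cv_results is a hypothetical helper):

import pandas as pd

def merge_cv_results(per_perc_dicts):
    # Combine a list of {perc: [6 metrics]} dicts into one DataFrame
    # with one row per percentage, as written out by train().
    merged = {}
    for d in per_perc_dicts:
        merged.update(d)
    results = pd.DataFrame.from_dict(merged, orient="index")
    results.columns = [
        "mean_accuracy",
        "std_accuracy",
        "mean_pred_score_control",
        "std_pred_score_control",
        "mean_pred_score_treatment",
        "std_pred_score_treatment",
    ]
    return results.sort_index()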
Example 7

# Imports inferred from the snippet; project helpers (set_args,
# data_label_split, normalize_by_group, data_standardization,
# profile_AttSet, dmso_taxol_ProfileBag, and the epoch-level train/test
# routines) are defined elsewhere in the project.
import os

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
import torch.utils.data as D
from sklearn.model_selection import StratifiedKFold


def main():
    args = set_args()  # parse CLI arguments, as in the other examples
    args.cuda = not args.no_cuda and torch.cuda.is_available()
    print(args)

    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)
        print("GPU is ON! with GPU %s" % torch.cuda.current_device())

    data_path = 'moa_data_drop_NA.csv'
    drop_NA_data = pd.read_csv(data_path, index_col=0)
    X, y = data_label_split(drop_NA_data)
    X['Metadata_PlateID_Nuclei'] = drop_NA_data[
        'Metadata_PlateID_Nuclei'].tolist()
    X = normalize_by_group(X, 'Metadata_PlateID_Nuclei')
    X.dropna(axis='columns', inplace=True)
    X['compound'] = drop_NA_data['compound'].tolist()
    data = X

    feature_size = len(data_label_split(data)[0].columns)
    for i in range(args.start, args.end, 5):
        # define model
        model = profile_AttSet(feature_size, "att", args.thres)
        if args.cuda:
            model.cuda()

        # Unused here: mini_noise_signal_cv re-creates the model and
        # optimizer inside each CV fold.
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               betas=(0.9, 0.999),
                               weight_decay=args.reg)

        results = mini_noise_signal_cv(i, i + 1, data, "taxol", "DMSO", model,
                                       args)

        results = pd.DataFrame.from_dict(results, orient="index")

        results.columns = [
            "mean_accuracy",
            "std_accuracy",
            "mean_control_accuracy",
            "std_control_accuracy",
            "mean_treat_accuracy",
            "std_treat_accuracy",
            "mean_pred_score_control",
            "std_pred_score_control",
            "mean_pred_score_treatment",
            "std_pred_score_treatment",
        ]

        res_path = "deepset_att%.1f_bags%d*%d_bagsize%d_feature%d.csv" % (
            args.thres,
            args.num_bags_train,
            args.batch_size,
            args.mean_bag_length,
            feature_size,
        )
        if os.path.exists(res_path):
            results.to_csv(res_path, mode="a", header=False)
        else:
            results.to_csv(res_path)


def mini_noise_signal_cv(start, end, data, treatment, control, model, args):
    dic = {}
    # Set different percentage of treatment v.s. control
    for j in range(start, end, 5):
        X, y = data_label_split(data)
        y = y["compound"]

        acc_control_list = []
        acc_treat_list = []
        pred_score_control_list = []
        pred_score_treat_list = []
        # Stratified K fold
        skf = StratifiedKFold(n_splits=args.splits)
        for i, (train_index, test_index) in enumerate(skf.split(X, y)):
            X_train, X_test = (
                data_standardization(X.iloc[train_index]),
                data_standardization(X.iloc[test_index]),
            )
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            X_train = pd.concat([X_train, y_train], axis=1, sort=False)
            X_test = pd.concat([X_test, y_test], axis=1, sort=False)

            # Redefine dataloader and train model at each fold
            train_dataset = dmso_taxol_ProfileBag(
                X_train,
                int(args.num_bags_train * (args.splits - 1) / args.splits),
                args.mean_bag_length,
                args.var_bag_length,
                j / 100,
                treatment,
                control,
                args.batch_size,
                0.5,
                True,
            )
            valida_dataset = dmso_taxol_ProfileBag(
                X_test,
                int(args.num_bags_train / args.splits),
                args.mean_bag_length,
                args.var_bag_length,
                j / 100,
                treatment,
                control,
                args.batch_size,
                0.5,
            )
            train_loader = D.DataLoader(train_dataset,
                                        batch_size=1,
                                        shuffle=True)
            valida_loader = D.DataLoader(valida_dataset,
                                         batch_size=1,
                                         shuffle=True)
            # Start training: re-run the constructor to reset the model's
            # weights for this fold
            model.__init__(model.input_feature, model.pool, model.thres)
            if args.cuda:
                model.cuda()
            optimizer = optim.Adam(
                model.parameters(),
                lr=args.lr,
                betas=(0.9, 0.999),
                weight_decay=args.reg,
            )

            minimum_error = float("inf")
            early_stop = []
            for epoch in range(args.epochs):
                epoch_result = []
                print("Train, Percent:%d, Fold: %d, " % (j, i), end="")
                train_loss, train_error = train(args, epoch, train_loader,
                                                model, optimizer, 1)
                epoch_result.append(train_loss)
                epoch_result.append(train_error)
                # Conduct testing
                print("Test, Percent:%d, Fold:%d, " % (j, i), end="")
                acc_control, acc_treat, pred_score_control, pred_score_treat = test(
                    args, model, valida_loader)
                if 1 - np.mean(acc_control + acc_treat) < minimum_error:
                    minimum_error = 1 - np.mean(acc_control + acc_treat)
                    best_result = (
                        acc_control,
                        acc_treat,
                        pred_score_control,
                        pred_score_treat,
                    )

                epoch_result.append(1 - np.mean(acc_control))
                epoch_result.append(1 - np.mean(acc_treat))
                early_stop.append(epoch_result)
                if len(early_stop) > 5:
                    early_stop.pop(0)
                # Stop once loss and train/test error have stayed ~0 for
                # 5 consecutive epochs
                if len(early_stop) == 5 and np.mean(early_stop) <= 1e-6:
                    break
            acc_control_list += best_result[0]
            acc_treat_list += best_result[1]
            pred_score_control_list += best_result[2]
            pred_score_treat_list += best_result[3]
            print(np.mean(best_result[0] + best_result[1]))
        dic[j / 100] = [
            np.mean(acc_control_list + acc_treat_list),
            np.std(acc_control_list + acc_treat_list),
            np.mean(acc_control_list),
            np.std(acc_control_list),
            np.mean(acc_treat_list),
            np.std(acc_treat_list),
            np.mean(pred_score_control_list),
            np.std(pred_score_control_list),
            np.mean(pred_score_treat_list),
            np.std(pred_score_treat_list),
        ]
    return dic