Example #1
def knn_NCA(X_train, Y_train, X_test, K=1) -> list:
    """
    Reduce the dimensionality of the dataset using the NCA method.
    This is slower than using PCA or not using anything at all,
    but yields better results for now.

    If the dataset sample is too large this takes really long to run
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Reduce the dimensionality of the data using NCA
    nca = NeighborhoodComponentsAnalysis(n_components=2).fit(X_train, Y_train)
    X_train_nca = nca.transform(X_train)
    X_test_nca = nca.transform(X_test)

    X_train_nca = pd.DataFrame(X_train_nca)
    X_test_nca = pd.DataFrame(X_test_nca)

    # Classify using a KNN classifier
    clf = KNeighborsClassifier(n_neighbors=K, leaf_size=2)
    clf.fit(X_train_nca, Y_train)
    # Return the predicted results
    return clf.predict(X_test_nca)
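A minimal usage sketch for knn_NCA, assuming the scikit-learn and pandas imports used above are in scope; the iris data, split, and K value here are illustrative only, not the project's setup.

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Illustrative data: any numeric feature matrix with labels works the same way
X, y = load_iris(return_X_y=True, as_frame=True)
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3, random_state=0)
preds = knn_NCA(X_train, Y_train, X_test, K=3)
print("accuracy:", (preds == Y_test.to_numpy()).mean())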
Example #2
def knnGridSearch(X_train, Y_train, X_test, Y_test) -> list:
    """
    Runs a grid search to find the best parameters for later use.
    Only runs if the -grid argument is provided.
    """
    # Params used for the grid search
    grid_params = {
        'n_neighbors': [1, 3, 5],
    }
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Reduce the dimensionality of the data using NCA
    nca = NeighborhoodComponentsAnalysis(n_components=2).fit(X_train, Y_train)
    X_train_nca = nca.transform(X_train)
    X_test_nca = nca.transform(X_test)
    # Run the Grid search and print out the best params
    classifier = KNeighborsClassifier()
    gs = GridSearchCV(classifier, grid_params, verbose=1, cv=3, n_jobs=-1)
    gs.fit(X_train_nca, Y_train)
    print(gs.best_params_)
    # Score the best found params using a confusion matrix
    Y_pred = gs.predict(X_test_nca)
    print(confusion_matrix(Y_test, Y_pred))
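The grid above only varies n_neighbors; a broader, purely illustrative search space for the same GridSearchCV call could also sweep standard KNeighborsClassifier options such as weights and metric.

# Hypothetical, wider grid for the same search; every key is a standard KNeighborsClassifier parameter
grid_params = {
    'n_neighbors': [1, 3, 5, 7, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan'],
}
# After gs.fit(...), gs.best_params_, gs.best_score_ and gs.cv_results_ summarize the search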
Example #3
def dim_reduc(X_train, Y_train, X_test, Y_test, K=1) -> None:
    """
    Compare PCA, kernel PCA, and NCA dimensionality reduction.
    Slightly modified version of this code:
    https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html
    Only runs if the -dim argument is provided.
    KernelPCA and standard PCA give essentially the same results,
    while NCA seems to have a slight edge.
    """
    X = pd.concat([X_train, X_test])
    Y = Y_train + Y_test
    random_state = 0
    # Reduce dimension to 2 with PCA
    pca = make_pipeline(StandardScaler(),
                        PCA(n_components=2, random_state=random_state))

    # Reduce dimension to 2 with NeighborhoodComponentsAnalysis
    nca = make_pipeline(
        StandardScaler(),
        NeighborhoodComponentsAnalysis(n_components=2,
                                       random_state=random_state))
    # Reduce the dimensionality of the data using kernel PCA
    kernel_pca = make_pipeline(StandardScaler(),
                               KernelPCA(n_components=2, random_state=random_state))

    # Use a nearest neighbor classifier to evaluate the methods
    knn = KNeighborsClassifier(n_neighbors=K)

    # Make a list of the methods to be compared
    dim_reduction_methods = [('PCA', pca), ('NCA', nca),
                             ('KernelPCA', kernel_pca)]

    # plt.figure()
    for i, (name, model) in enumerate(dim_reduction_methods):
        plt.figure()
        # plt.subplot(1, 3, i + 1, aspect=1)
        # Fit the method's model
        model.fit(X_train, Y_train)
        # Fit a nearest neighbor classifier on the embedded training set
        knn.fit(model.transform(X_train), Y_train)
        # Compute the nearest neighbor accuracy on the embedded test set
        acc_knn = knn.score(model.transform(X_test), Y_test)
        print(name, acc_knn)
        # Embed the data set in 2 dimensions using the fitted model
        X_embedded = model.transform(X)
        # Plot the projected points and show the evaluation score
        plt.scatter(
            X_embedded[:, 0],
            X_embedded[:, 1],
            c=Y,
            s=30,
            cmap='Set1',
        )
        plt.title("KNN with {}\np={}".format(name, round(acc_knn, 3)))
        plt.savefig("figs/KNN_{}.png".format(name))

    plt.show()
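The commented-out subplot call above suggests the three projections were once meant to share one figure; a sketch of that variant using matplotlib's subplots API, reusing the names defined inside dim_reduc (illustrative, not the project's code).

# Sketch: one figure with three panels instead of three separate figures
fig, axes = plt.subplots(1, 3, figsize=(15, 5))
for ax, (name, model) in zip(axes, dim_reduction_methods):
    emb = model.fit(X_train, Y_train).transform(X)  # Pipeline.fit returns the pipeline itself
    ax.scatter(emb[:, 0], emb[:, 1], c=Y, s=30, cmap='Set1')
    ax.set_title(name)
fig.savefig("figs/KNN_all_methods.png")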
Example #4
def runKNN(X_train, Y_train, X_test, K=1) -> list:
    """
    Trains and tests a KNN algorithm on the supplied data and returns the predictions.
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X=X_train)
    X_test = scaler.transform(X_test)

    classifier = KNeighborsClassifier(n_neighbors=K)
    classifier.fit(X_train, Y_train)
    # Return the predicted results
    return classifier.predict(X_test)
Example #5
def normalize_features(scaler: StandardScaler = None,
                       replace_nan_token: int = 0,
                       data: list = None) -> StandardScaler:
    if len(data) == 0 or data[0].features is None:
        return None
    if scaler is None:
        features = np.vstack([d.features for d in data])
        scaler = StandardScaler(replace_nan_token=replace_nan_token)
        scaler.fit(features)
    for d in data:
        d.set_features(scaler.transform(d.features.reshape(1, -1))[0])
    return scaler
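normalize_features assumes each element of data exposes a features array and a set_features setter, and that StandardScaler here is the project's own wrapper that accepts replace_nan_token (not scikit-learn's). A minimal, hypothetical datapoint container with that interface:

import numpy as np

class FeatureDatapoint:
    # Illustrative stand-in for whatever datapoint class the project actually uses
    def __init__(self, features):
        self.features = np.array(features, dtype=float)

    def set_features(self, features):
        self.features = features

# e.g. normalize_features(data=[FeatureDatapoint([1.0, 2.0]), FeatureDatapoint([3.0, 4.0])])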
Example #6
def knn_PCA(X_train, Y_train, X_test, K=1) -> list:
    """
    Although PCA performs worse in most cases in our testing, it is useful because it is much faster than NCA.
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    # Fit PCA on the training data only, then apply the same projection to the test data
    pca = PCA(n_components=2)
    X_train = pd.DataFrame(pca.fit_transform(X_train))
    X_test = pd.DataFrame(pca.transform(X_test))
    classifier = KNeighborsClassifier(n_neighbors=K)
    classifier.fit(X_train, Y_train)
    # Return the predicted results
    return classifier.predict(X_test)
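When picking n_components for this helper, the variance retained by the projection can be checked directly with scikit-learn's PCA; a small self-contained sketch using random data as a stand-in for the scaled training matrix.

import numpy as np
from sklearn.decomposition import PCA

X_demo = np.random.RandomState(0).rand(200, 8)  # stand-in for the scaled training features
pca_demo = PCA(n_components=2).fit(X_demo)
print("variance retained by 2 components:", pca_demo.explained_variance_ratio_.sum())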
Example #7
def load_scalers(path: str) -> Tuple[StandardScaler, StandardScaler]:
    """
    Loads the scalers a model was trained with.

    :param path: Path where model checkpoint is saved.
    :return: A tuple with the data scaler and the features scaler.
    """
    state = torch.load(path, map_location=lambda storage, loc: storage)

    scaler = StandardScaler(
        state['data_scaler']['means'],
        state['data_scaler']['stds']) if state['data_scaler'] is not None else None
    features_scaler = StandardScaler(
        state['features_scaler']['means'],
        state['features_scaler']['stds'],
        replace_nan_token=0) if state['features_scaler'] is not None else None

    return scaler, features_scaler
Example #8
def load_dataset(config, used_days=4, used_weeks=4, days=4, weeks=4):
    base_dir = config['data'].get('dataset_dir')
    dataset_dir = config['data'].get('dataset_dir')
    static_dim = config['data'].get('static_dim')
    dynamic_dim = config['data'].get('dynamic_dim')
    method = config['model'].get('method')
    per_period = config['data'].get('per_period', 5)
    seq_len = 1 if method == 'baseline' else config['model'].get('seq_len')
    data = {}  # to return
    tprint("Loading Dataset: " + dataset_dir)
    # loading node features
    info = np.load(os.path.join(base_dir, 'link_info.npz'))['link_info']
    data['link_length'] = info[:, 0] * 1000  # the length of road segments: km -> m
    # static feature normalization
    scaler0 = StaticFeatureScaler()
    static_fes = scaler0.transform(info)[:, 0:static_dim]
    print("static_fes.shape=", static_fes.shape)

    dynamic_fes = np_load(os.path.join(dataset_dir, 'dynamic_fes.npz'))
    eta_label = np_load(os.path.join(dataset_dir, 'eta_label.npz'))  # each row is a list, including list(link_idxs, link_move, timespent)
    # dynamic feature normalization
    fes, fe_periods = dynamic_fes['fes'], dynamic_fes['periods']
    scale_fes = fes[fe_periods < min(eta_label['valid_periods'])] # All samples before the valid periods can be used to scale
    scaler1 = StandardScaler(mean=scale_fes.mean(), std=scale_fes.std())
    scaler1.save(os.path.join(dataset_dir, 'scaler1.npz'))
    if method == 'baseline':
        dynamic_fes0 = fes
    else:
        dynamic_fes0 = scaler1.transform(fes)
    for prefix in ('train', 'valid', 'test'):
        tprint("(%s)dynamic_fes.shape=%s" % (prefix, str(dynamic_fes0.shape)))
        assert (dynamic_fes0.shape[-1] == dynamic_dim)
        data['%s_loader' % prefix] = DataLoader(dynamic_fes=dynamic_fes0,
                                                static_fes=static_fes,
                                                links_time_list=eta_label[prefix].tolist(),
                                                seq_len=seq_len,
                                                days=used_days,
                                                weeks=used_weeks,
                                                fe_periods=dynamic_fes['periods'],
                                                eta_periods=eta_label['%s_periods' % prefix],
                                                shuffle=('train' == prefix),
                                                per_period=per_period)
    tprint('Dataset loaded successfully.')
    return data
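load_dataset reads a nested config dict; the minimal shape implied by the keys accessed above looks roughly like the sketch below. All values are placeholders for illustration, not defaults taken from the project.

config = {
    'data': {
        'dataset_dir': 'data/my_dataset',  # also read as base_dir above
        'static_dim': 8,
        'dynamic_dim': 4,
        'per_period': 5,
    },
    'model': {
        'method': 'baseline',  # any other value makes 'seq_len' required
        'seq_len': 12,
    },
}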
Example #9
def cross_val_regression(X, y, svr, k=5, mth='MSE'):
    """
    SVR(回帰)について交差検証を行う
    StandardScaler 処理も含まれている
    """
    # Shuffle X and y before cross-validation
    random_mask = np.arange(len(y))
    np.random.shuffle(random_mask)
    X = X[random_mask]
    y = y[random_mask]

    part = len(y) // k
    scores = []
    for i in range(k):
        print("{} th cross validation...".format(i + 1))
        test_X = X[part * i : part * (i+1)]
        test_y = y[part * i : part * (i+1)]
        train_X = np.concatenate((X[0:part*i], X[part*(i+1):]), axis=0)
        train_y = np.concatenate((y[0:part*i], y[part*(i+1):]), axis=0)

        sc = StandardScaler()
        train_X_std = sc.fit_transform(train_X)
        test_X_std = sc.transform(test_X)

        svr.fit(train_X_std, train_y)
        score = svr.score(test_X_std, test_y, mth=mth)

        if math.isnan(score):
            print("nan detected: cross_val_regression terminated!!")
            return np.nan
        
        print("{} score: ".format(mth), score)
        scores.append(score)
    
    res = np.mean(np.array(scores))
    print("{}-fold average score: ".format(k), res)
    return res
Example #10
import numpy as np
import sys

data_num = 2000
hf = data_num // 2

X, y = load_sanfrancisco(data_num)

# Use the first half as training data and the second half as test data
# (load_sanfrancisco already returns the data in random order)
X_train = X[:hf]
y_train = y[:hf]
X_test = X[hf:]
y_test = y[hf:]

# Standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


def SVRAgent(ker_type='-g', p=9.5, c=27000, eps=0.09):
    svr = SVRegressor(ker_type, p, c, eps)

    # cross_val_regression standardizes the data itself before cross-validating
    score = cross_val_regression(X_train, y_train, svr, mth='R2')
    print('cross val score: ', score)

    print('start SVR fit...')
    svr.fit(X_train_std, y_train)

    print('start SVR predict...')
Example #11
def visualize_attention(args: Namespace):
    """Visualizes attention weights."""
    print(f'Loading model from "{args.checkpoint_path}"')
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    mpn = model.encoder
    print(f'mpn:-->{type(mpn)}')
    print(f'MPNencoder attributes:{mpn.encoder.__dict__}')
    print('Loading data')
    if os.path.exists(args.data_path) and os.path.getsize(args.data_path) > 0:
        DGLtest = args.data_path
        print(f'Loading data -->{DGLtest}')
    else:
        direct = 'data_RE2/tmp/'
        DGLtest = direct + 'viz.csv'
        print(f'Loading data -->{DGLtest}')

    viz_data = DGLDataset(DGLtest, training=False)
    viz_dataloader = DataLoader(viz_data,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=DGLCollator(training=False),
                                drop_last=False,
                                worker_init_fn=worker_init_fn)
    metric_func = get_metric_func(metric=args.metric)

    for it, result_batch in enumerate(tqdm(viz_dataloader)):

        batch_sm = result_batch['sm']
        label_batch = result_batch['labels']
        if args.dataset_type == 'regression':
            if args.scale == "standardization":
                print('Fitting scaler(Z-score standardization)')
                scaler = StandardScaler().fit(label_batch)
                y_train_scaled = scaler.transform(label_batch)
                print(f'train data mean:{scaler.means}\nstd:{scaler.stds}\n')
            if args.scale == "normalization":
                print('Fitting scaler( Min-Max normalization )')
                scaler = minmaxScaler().fit(label_batch)
                y_train_scaled = scaler.transform(label_batch)
                print(
                    f'train data min:{scaler.mins}\ntrain data max:{scaler.maxs}\n'
                )
            if args.scale != 'standardization' and args.scale != 'normalization':
                raise ValueError(
                    "scaler not implemented; use one of [standardization, normalization]"
                )
        else:
            scaler = None
        mpn.viz_attention(batch_sm, viz_dir=args.viz_dir)
        test_targets, test_preds, test_scores = evaluate_batch(
            args,
            model=model,
            data=viz_dataloader,
            num_tasks=args.num_tasks,
            metric_func=metric_func,
            dataset_type=args.dataset_type,
            scaler=scaler,
            logger=None,
            Foldth=0,
            predsLog=args.save_dir)
        print(f'running viz: {args.viz_dir}')
Example #12
def evaluate_batch(args: Namespace,
                   model: nn.Module,
                   data: DataLoader,
                   num_tasks: int,
                   metric_func: Callable,
                   dataset_type: str,
                   Foldth: int,
                   scaler: StandardScaler = None,
                   logger: logging.Logger = None,
                   predsLog=None) -> List[float]:

    info = logger.info if logger is not None else print
    model.eval()
    preds = []
    targets = []
    smiles = []
    targets_sca = []
    predsBack = []
    if predsLog is not None:
        predsLog = Path(
            predsLog) / f'predsLogFoldth{Foldth}_{args.data_filename}'

    for it, result_batch in enumerate(tqdm(data)):

        model.zero_grad()
        batch_labels = result_batch['labels']
        batch_sm = result_batch['sm']
        with torch.no_grad():
            batch_preds = model(batch_sm)

        batch_preds = batch_preds.data.cpu().numpy()

        if scaler is not None:
            batch_preds_Sback = scaler.inverse_transform(batch_preds)
            batch_preds_Sback = batch_preds_Sback.tolist()
            predsBack.extend(batch_preds_Sback)

            batch_labels_sca = scaler.transform(batch_labels)
            targets_sca.extend(batch_labels_sca)

        preds.extend(batch_preds.tolist())
        targets.extend(batch_labels)
        smiles.extend(batch_sm)
    if predsLog is not None:
        with open(predsLog, 'w+') as pf:
            if scaler is not None:
                pf.write(f'smiles,labels,predictions,ORI_label,SB_pred,diff\n')
                for i, sm in enumerate(smiles):
                    lab_S = targets_sca[i]
                    pred = preds[i]
                    pred_SB = predsBack[i]
                    lab_noscle = targets[i]
                    diff = np.array(lab_noscle, dtype=np.float32) - np.array(
                        pred_SB, dtype=np.float32)
                    pf.write(
                        f'{sm},{lab_S},{pred},{lab_noscle},{pred_SB},{diff},<{i}>\n'
                    )
            else:
                pf.write('smiles,labels,predictions\n')
                for i, sm in enumerate(smiles):
                    pred_noScale = preds[i]
                    lab_noscle = targets[i]
                    # write in the same order as the header: label first, then prediction
                    pf.write(f'{sm},{lab_noscle},{pred_noScale}\n')

    if len(preds) == 0:
        return [float('nan')] * num_tasks
    valid_preds = [[] for _ in range(num_tasks)]
    valid_targets = [[] for _ in range(num_tasks)]

    if args.dataset_type == 'regression' and scaler is not None:
        # score against predictions mapped back to the original scale
        preds = predsBack

    for i in range(num_tasks):
        for j in range(len(preds)):
            if targets[j][i] is not None:
                valid_preds[i].append(preds[j][i])
                valid_targets[i].append(targets[j][i])

    results = []
    for i in range(num_tasks):

        if dataset_type == 'classification':
            nan = False
            if all(target == 0 for target in valid_targets[i]) or all(
                    target == 1 for target in valid_targets[i]):
                nan = True
                info(
                    f'Warning: {args.split_type} split found a task whose targets are all 0s or all 1s; try a random split to avoid this'
                )
            if all(pred == 0 for pred in valid_preds[i]) or all(
                    pred == 1 for pred in valid_preds[i]):
                nan = True
                info('Warning: Found a task with predictions all 0s or all 1s')
            if nan:
                results.append(float('nan'))
                continue
        if len(valid_targets[i]) == 0:
            continue
        results.append(metric_func(valid_targets[i], valid_preds[i]))

    return targets, preds, results
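evaluate_batch only assumes that metric_func takes (targets, predictions) and returns a float, so any scikit-learn metric with that signature fits. The assignments below are illustrative, not necessarily what get_metric_func returns.

from sklearn.metrics import mean_squared_error, roc_auc_score

metric_func = mean_squared_error  # regression-style metric: metric_func(y_true, y_pred)
# metric_func = roc_auc_score     # classification-style metric over scores/probabilities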
Example #13
def read_split_scale_write(args,
                           data_path=None,
                           tmp_data_dir=None,
                           cols_to_read=None):
    if args.data_path is not None:
        data_path = args.data_path
    if args.tmp_data_dir is not None:
        tmp_data_dir = args.tmp_data_dir

    data = read_smiles_property_file(args,
                                     args.data_path,
                                     args.cols_to_read,
                                     keep_header=False)

    smiles = data[0]
    if len(data) > 1:
        labels = np.array(data[1:], dtype='float')
        labels = labels.T
        print(f'labels looks like{labels}{labels.shape}')
        args.n_task = n_task = len(data) - 1
    else:
        labels = None
        n_task = None
    # create the temporary data directory if it does not already exist
    os.makedirs(tmp_data_dir, exist_ok=True)

    cross_validation_split = KFold(n_splits=10,
                                   shuffle=True,
                                   random_state=args.seed)
    data = list(cross_validation_split.split(smiles, labels))
    i = 0
    sizes = (0.8, 0.1, 0.1)
    scalers = []
    train_steps = []
    args.class_weights = []
    for split in data:
        if args.split_type == 'random':
            print('Cross validation with random split, fold number ' + str(i) +
                  ' in progress...')
            train_val, test = split
            train, val = train_test_split(train_val,
                                          test_size=0.11111111111,
                                          random_state=args.seed)
        if args.split_type == 'scaffold_sort':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles,
                                              sizes,
                                              scaf_seed,
                                              balanced=False,
                                              use_indices=True,
                                              big_small=2)
            print('using scaffold split')
        if args.split_type == 'scaffold_balanced':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles,
                                              sizes,
                                              scaf_seed,
                                              balanced=True,
                                              use_indices=True,
                                              big_small=2)
        if args.split_type == 'scaffold_random':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles,
                                              sizes,
                                              scaf_seed,
                                              scaff_random=True,
                                              use_indices=True,
                                              big_small=2)
        X_train = smiles[train]
        train_steps.append(len(X_train) // args.batch_size)
        y_train = labels[train]

        X_val = smiles[val]
        y_val = labels[val]
        X_test = smiles[test]
        y_test = labels[test]
        args.train_size = len(X_train)
        args.val_size = len(X_val)
        args.test_size = len(X_test)

        if args.dataset_type == 'regression':
            if args.scale == "standardization":
                print('Fitting scaler(Z-score standardization)')
                scaler = StandardScaler().fit(y_train)
                y_train_scaled = scaler.transform(y_train)
                print(f'train data mean:{scaler.means}\nstd:{scaler.stds}\n')
            if args.scale == "normalization":
                print('Fitting scaler( Min-Max normalization )')
                scaler = minmaxScaler().fit(y_train)
                y_train_scaled = scaler.transform(y_train)
                print(
                    f'train data min:{scaler.mins}\ntrain data max:{scaler.maxs}\n'
                )
            if args.scale != 'standardization' and args.scale != 'normalization':
                raise ValueError(
                    "scaler not implemented; use one of [standardization, normalization]"
                )
        else:
            scaler = None
        scalers.append(scaler)
        assert n_task is not None
        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_train',
            X_train, y_train.reshape(-1, n_task))
        if args.dataset_type == 'classification':
            if args.class_balance:
                train_labels = y_train.reshape(-1, n_task).tolist()

                valid_targets = [[] for _ in range(args.n_task)]
                for ij in range(len(train_labels)):
                    for task_num in range(args.n_task):
                        if not math.isnan(train_labels[ij][task_num]):
                            valid_targets[task_num].append(
                                train_labels[ij][task_num])
                train_class_sizes = []

                for task_targets in valid_targets:

                    assert set(np.unique(task_targets)) <= {0, 1}
                    try:
                        ones = np.count_nonzero(task_targets) / len(
                            task_targets)
                    except ZeroDivisionError:
                        ones = float('nan')
                        print('Warning: class has no targets')
                    train_class_sizes.append([1 - ones, ones])
                class_batch_counts = torch.Tensor(
                    train_class_sizes) * args.batch_size

                args.class_weights.append(1 / torch.Tensor(class_batch_counts))

        if args.dataset_type == 'regression':
            save_smiles_property_file(
                f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_trainScaled',
                X_train, y_train_scaled.reshape(-1, n_task))

        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_test', X_test,
            y_test.reshape(-1, n_task))
        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_val', X_val,
            y_val.reshape(-1, n_task))
        i += 1
    print(f'train_steps:{train_steps}')
    return scalers, train_steps
                    d + 1e-5)  # add small value to avoid division by zero.

            predictions.append(np.argmax(bins))
        return predictions


# MAIN PROGRAM

train_X, train_Y, test_X, test_Y = get_dataset(
    sample=undersampling,
    pollution=pollution,  # How much of the outliers to put in the training set
    train_size=train_size  # How much of the inliers to put in the training set
)

# Preprocess the data
pipeline = make_pipeline(StandardScaler(),
                         PCA(n_components=n_components)).fit(train_X)

train_X = pd.DataFrame(pipeline.transform(train_X))

# Drop duplicates after having transformed the training data.
# We have to include train_Y so the indices stay aligned.
cleaned = pd.concat([train_X, pd.DataFrame(train_Y)], axis=1).drop_duplicates()
train_X = cleaned.iloc[:, :-1]
train_Y = cleaned.iloc[:, -1]

test_X = np.asarray(pipeline.transform(test_X))

knn = KNN(k,
          train_X,
          train_Y,