def knn_NCA(X_train, Y_train, X_test, K=1) -> list:
    """
    Reduce the dimensionality of the dataset using the NCA method.

    This is slower than using PCA or no reduction at all, but currently
    yields better results. If the dataset sample is too large this takes
    very long to run.
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reduce the dimensionality of the data using NCA
    nca = NeighborhoodComponentsAnalysis(n_components=2).fit(X_train, Y_train)
    X_train_nca = pd.DataFrame(nca.transform(X_train))
    X_test_nca = pd.DataFrame(nca.transform(X_test))

    # Classify using a KNN classifier
    clf = KNeighborsClassifier(n_neighbors=K, leaf_size=2)
    clf.fit(X_train_nca, Y_train)

    # Return the predicted results
    return clf.predict(X_test_nca)

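# Hedged usage sketch (not part of the original code): exercises knn_NCA() on a small
# sklearn toy dataset. load_iris, train_test_split, and accuracy_score are standard
# sklearn utilities assumed here for illustration only; the rest of this file does not
# depend on them.
def example_knn_NCA_usage():
    from sklearn.datasets import load_iris
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    X, y = load_iris(return_X_y=True)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)
    preds = knn_NCA(X_tr, y_tr, X_te, K=3)
    print('NCA + KNN accuracy:', accuracy_score(y_te, preds))
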
def knnGridSearch(X_train, Y_train, X_test, Y_test) -> None:
    """
    Runs a grid search to find the best params for later use.
    Only runs if the -grid argument is provided.
    """
    # Params used for the grid search
    grid_params = {
        'n_neighbors': [1, 3, 5],
    }

    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reduce the dimensionality of the data using NCA
    nca = NeighborhoodComponentsAnalysis(n_components=2).fit(X_train, Y_train)
    X_train_nca = nca.transform(X_train)
    X_test_nca = nca.transform(X_test)

    # Run the grid search and print out the best params
    classifier = KNeighborsClassifier()
    gs = GridSearchCV(classifier, grid_params, verbose=1, cv=3, n_jobs=-1)
    gs.fit(X_train_nca, Y_train)
    print(gs.best_params_)

    # Score the best found params using a confusion matrix
    Y_pred = gs.predict(X_test_nca)
    print(confusion_matrix(Y_test, Y_pred))

def dim_reduc(X_train, Y_train, X_test, Y_test, K=1) -> None:
    """
    Compare PCA, kernel PCA, and NCA dimensionality reduction.

    Slightly modified version of this code:
    https://scikit-learn.org/stable/auto_examples/neighbors/plot_nca_dim_reduction.html

    Only runs if the -dim argument is provided.
    KernelPCA and standard PCA give the same results,
    while NCA seems to have a slight edge.
    """
    X = pd.concat([X_train, X_test])
    Y = Y_train + Y_test
    random_state = 0

    # Reduce dimension to 2 with PCA
    pca = make_pipeline(StandardScaler(),
                        PCA(n_components=2, random_state=random_state))

    # Reduce dimension to 2 with NeighborhoodComponentsAnalysis
    nca = make_pipeline(
        StandardScaler(),
        NeighborhoodComponentsAnalysis(n_components=2, random_state=random_state))

    # Reduce dimension to 2 with Kernel PCA
    kernel_pca = make_pipeline(StandardScaler(),
                               KernelPCA(n_components=2, random_state=random_state))

    # Use a nearest neighbor classifier to evaluate the methods
    knn = KNeighborsClassifier(n_neighbors=K)

    # Make a list of the methods to be compared
    dim_reduction_methods = [('PCA', pca), ('NCA', nca), ('KernelPCA', kernel_pca)]

    for name, model in dim_reduction_methods:
        plt.figure()

        # Fit the method's model
        model.fit(X_train, Y_train)

        # Fit a nearest neighbor classifier on the embedded training set
        knn.fit(model.transform(X_train), Y_train)

        # Compute the nearest neighbor accuracy on the embedded test set
        acc_knn = knn.score(model.transform(X_test), Y_test)
        print(name, acc_knn)

        # Embed the full data set in 2 dimensions using the fitted model
        X_embedded = model.transform(X)

        # Plot the projected points and show the evaluation score
        plt.scatter(X_embedded[:, 0], X_embedded[:, 1], c=Y, s=30, cmap='Set1')
        plt.title("KNN with {}\np={}".format(name, round(acc_knn, 3)))
        plt.savefig("figs/KNN_{}.png".format(name))
        plt.show()

def runKNN(X_train, Y_train, X_test, K=1) -> list:
    """
    Trains and tests a KNN classifier on the supplied data
    and returns the predictions.
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    classifier = KNeighborsClassifier(n_neighbors=K)
    classifier.fit(X_train, Y_train)

    # Return the predicted results
    return classifier.predict(X_test)

def normalize_features(scaler: StandardScaler = None,
                       replace_nan_token: int = 0,
                       data: list = None) -> StandardScaler:
    """
    Normalizes the features of every datapoint in `data` in place and returns
    the fitted scaler. If no scaler is supplied, one is fit on the data.
    """
    if len(data) == 0 or data[0].features is None:
        return None

    # Fit a new scaler only when one is not supplied
    if scaler is None:
        features = np.vstack([d.features for d in data])
        scaler = StandardScaler(replace_nan_token=replace_nan_token)
        scaler.fit(features)

    # Transform each datapoint's feature vector with the fitted scaler
    for d in data:
        d.set_features(scaler.transform(d.features.reshape(1, -1))[0])

    return scaler

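# Hedged usage sketch (not part of the original code): normalize_features() only assumes
# each datapoint exposes a `features` array and a `set_features()` setter, so the
# hypothetical ToyDatapoint below is enough to exercise it. It also assumes numpy is
# imported as np, as elsewhere in this file.
class ToyDatapoint:
    def __init__(self, features):
        self.features = features

    def set_features(self, features):
        self.features = features


def example_normalize_features_usage():
    train_points = [ToyDatapoint(np.array([1.0, 2.0])),
                    ToyDatapoint(np.array([3.0, 4.0]))]
    val_points = [ToyDatapoint(np.array([2.0, 3.0]))]

    # Fit a scaler on the training datapoints and normalize them in place
    fitted_scaler = normalize_features(replace_nan_token=0, data=train_points)
    # Re-use the fitted scaler on another split so both share the same statistics
    normalize_features(scaler=fitted_scaler, data=val_points)
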
def knn_PCA(X_train, Y_train, X_test, K=1) -> list:
    """
    Although PCA performs worse in most cases in our testing,
    it is useful because it is much faster than NCA.
    """
    # Scale the features using a standard scaler
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reduce the dimensionality of the data using PCA
    # (fit on the training data only, then transform both sets)
    pca = PCA(n_components=2)
    X_train = pd.DataFrame(pca.fit_transform(X_train))
    X_test = pd.DataFrame(pca.transform(X_test))

    classifier = KNeighborsClassifier(n_neighbors=K)
    classifier.fit(X_train, Y_train)

    # Return the predicted results
    return classifier.predict(X_test)

def load_scalers(path: str) -> Tuple[StandardScaler, StandardScaler]:
    """
    Loads the scalers a model was trained with.

    :param path: Path where the model checkpoint is saved.
    :return: A tuple with the data scaler and the features scaler.
    """
    state = torch.load(path, map_location=lambda storage, loc: storage)

    scaler = StandardScaler(state['data_scaler']['means'],
                            state['data_scaler']['stds']) \
        if state['data_scaler'] is not None else None
    features_scaler = StandardScaler(state['features_scaler']['means'],
                                     state['features_scaler']['stds'],
                                     replace_nan_token=0) \
        if state['features_scaler'] is not None else None

    return scaler, features_scaler

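# Hedged usage sketch (not part of the original code): the checkpoint path and the
# function name below are hypothetical. The returned data scaler is the custom
# StandardScaler rebuilt from the saved means/stds, and evaluate_batch() further down
# uses its inverse_transform() in the same way to map scaled predictions back to the
# original label space.
def example_unscale_predictions(checkpoint_path, batch_preds):
    """Map a model's scaled predictions back to the original label space."""
    data_scaler, features_scaler = load_scalers(checkpoint_path)
    if data_scaler is not None:
        batch_preds = data_scaler.inverse_transform(batch_preds)
    return batch_preds
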
def load_dataset(config, used_days=4, used_weeks=4, days=4, weeks=4):
    base_dir = config['data'].get('dataset_dir')
    dataset_dir = config['data'].get('dataset_dir')
    static_dim = config['data'].get('static_dim')
    dynamic_dim = config['data'].get('dynamic_dim')
    method = config['model'].get('method')
    per_period = config['data'].get('per_period', 5)
    seq_len = 1 if method == 'baseline' else config['model'].get('seq_len')

    data = {}  # to return
    tprint("Loading Dataset: " + dataset_dir)

    # Loading node features
    info = np.load(os.path.join(base_dir, 'link_info.npz'))['link_info']
    data['link_length'] = info[:, 0] * 1000  # length of road segments: km -> m

    # Static feature normalization
    scaler0 = StaticFeatureScaler()
    static_fes = scaler0.transform(info)[:, 0:static_dim]
    print("static_fes.shape=", static_fes.shape)

    dynamic_fes = np_load(os.path.join(dataset_dir, 'dynamic_fes.npz'))
    # Each row is a list of (link_idxs, link_move, timespent)
    eta_label = np_load(os.path.join(dataset_dir, 'eta_label.npz'))

    # Dynamic feature normalization
    fes, fe_periods = dynamic_fes['fes'], dynamic_fes['periods']
    # All samples before the valid periods can be used to fit the scaler
    scale_fes = fes[fe_periods < min(eta_label['valid_periods'])]
    scaler1 = StandardScaler(mean=scale_fes.mean(), std=scale_fes.std())
    scaler1.save(os.path.join(dataset_dir, 'scaler1.npz'))

    if method == 'baseline':
        dynamic_fes0 = fes
    else:
        dynamic_fes0 = scaler1.transform(fes)

    for prefix in ('train', 'valid', 'test'):
        tprint("(%s)dynamic_fes.shape=%s" % (prefix, str(dynamic_fes0.shape)))
        assert dynamic_fes0.shape[-1] == dynamic_dim
        data['%s_loader' % prefix] = DataLoader(
            dynamic_fes=dynamic_fes0,
            static_fes=static_fes,
            links_time_list=eta_label[prefix].tolist(),
            seq_len=seq_len,
            days=used_days,
            weeks=used_weeks,
            fe_periods=dynamic_fes['periods'],
            eta_periods=eta_label['%s_periods' % prefix],
            shuffle=('train' == prefix),
            per_period=per_period)

    tprint('Dataset loaded successfully.')
    return data

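# Hedged configuration sketch (not part of the original code): the values below are
# illustrative placeholders; only the key names are taken from what load_dataset()
# actually reads. 'seq2seq' is a hypothetical method name; anything other than
# 'baseline' enables dynamic-feature scaling and uses model.seq_len.
example_config = {
    'data': {
        'dataset_dir': 'data/my_dataset',  # hypothetical path
        'static_dim': 8,
        'dynamic_dim': 4,
        'per_period': 5,
    },
    'model': {
        'method': 'seq2seq',
        'seq_len': 12,
    },
}
# data = load_dataset(example_config)
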
def cross_val_regression(X, y, svr, k=5, mth='MSE'):
    """
    Performs k-fold cross-validation for SVR (regression).
    StandardScaler preprocessing is included.
    """
    # Shuffle X and y before cross-validation
    random_mask = np.arange(len(y))
    np.random.shuffle(random_mask)
    X = X[random_mask]
    y = y[random_mask]

    part = len(y) // k
    scores = []
    for i in range(k):
        print("{} th cross validation...".format(i + 1))
        test_X = X[part * i:part * (i + 1)]
        test_y = y[part * i:part * (i + 1)]
        train_X = np.concatenate((X[0:part * i], X[part * (i + 1):]), axis=0)
        train_y = np.concatenate((y[0:part * i], y[part * (i + 1):]), axis=0)

        # Standardize features using statistics from the training fold only
        sc = StandardScaler()
        train_X_std = sc.fit_transform(train_X)
        test_X_std = sc.transform(test_X)

        svr.fit(train_X_std, train_y)
        score = svr.score(test_X_std, test_y, mth=mth)
        if math.isnan(score):
            print("nan detected: cross_val_regression terminated!!")
            return np.nan
        print("{} score: ".format(mth), score)
        scores.append(score)

    res = np.mean(np.array(scores))
    print("{}-fold average score: ".format(k), res)
    return res

import numpy as np
import sys

data_num = 2000
hf = data_num // 2
X, y = load_sanfrancisco(data_num)

# Use the first half as training data and the second half as test data
# (the data is already randomly sampled inside load_sanfrancisco)
X_train = X[:hf]
y_train = y[:hf]
X_test = X[hf:]
y_test = y[hf:]

# Standardize the features
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)
X_test_std = sc.transform(X_test)


def SVRAgent(ker_type='-g', p=9.5, c=27000, eps=0.09):
    svr = SVRegressor(ker_type, p, c, eps)

    # cross_val_regression standardizes the data itself before cross-validating
    score = cross_val_regression(X_train, y_train, svr, mth='R2')
    print('cross val score: ', score)

    print('start SVR fit...')
    svr.fit(X_train_std, y_train)
    print('start SVR predict...')

def visualize_attention(args: Namespace):
    """Visualizes attention weights."""
    print(f'Loading model from "{args.checkpoint_path}"')
    model = load_checkpoint(args.checkpoint_path, cuda=args.cuda)
    mpn = model.encoder
    print(f'mpn:-->{type(mpn)}')
    print(f'MPNencoder attributes:{mpn.encoder.__dict__}')

    print('Loading data')
    if os.path.exists(args.data_path) and os.path.getsize(args.data_path) > 0:
        DGLtest = args.data_path
        print(f'Loading data -->{DGLtest}')
    else:
        direct = 'data_RE2/tmp/'
        DGLtest = direct + 'viz.csv'
        print(f'Loading data -->{DGLtest}')

    viz_data = DGLDataset(DGLtest, training=False)
    viz_dataloader = DataLoader(viz_data,
                                batch_size=args.batch_size,
                                shuffle=False,
                                num_workers=0,
                                collate_fn=DGLCollator(training=False),
                                drop_last=False,
                                worker_init_fn=worker_init_fn)

    metric_func = get_metric_func(metric=args.metric)
    for it, result_batch in enumerate(tqdm(viz_dataloader)):
        batch_sm = result_batch['sm']
        label_batch = result_batch['labels']

        if args.dataset_type == 'regression':
            if args.scale == "standardization":
                print('Fitting scaler (Z-score standardization)')
                scaler = StandardScaler().fit(label_batch)
                y_train_scaled = scaler.transform(label_batch)
                print(f'train data mean:{scaler.means}\nstd:{scaler.stds}\n')
            if args.scale == "normalization":
                print('Fitting scaler (Min-Max normalization)')
                scaler = minmaxScaler().fit(label_batch)
                y_train_scaled = scaler.transform(label_batch)
                print(f'train data min:{scaler.mins}\ntrain data max:{scaler.maxs}\n')
            if args.scale != 'standardization' and args.scale != 'normalization':
                raise ValueError(
                    "not implemented scaler, use one of [standardization, normalization]")
        else:
            scaler = None

        mpn.viz_attention(batch_sm, viz_dir=args.viz_dir)

    test_targets, test_preds, test_scores = evaluate_batch(
        args,
        model=model,
        data=viz_dataloader,
        num_tasks=args.num_tasks,
        metric_func=metric_func,
        dataset_type=args.dataset_type,
        scaler=scaler,
        logger=None,
        Foldth=0,
        predsLog=args.save_dir)
    print(f'viz results saved to {args.viz_dir}')

def evaluate_batch(args: Namespace,
                   model: nn.Module,
                   data: DataLoader,
                   num_tasks: int,
                   metric_func: Callable,
                   dataset_type: str,
                   Foldth: int,
                   scaler: StandardScaler = None,
                   logger: logging.Logger = None,
                   predsLog=None) -> List[float]:
    info = logger.info if logger is not None else print
    model.eval()

    preds = []
    targets = []
    smiles = []
    targets_sca = []
    predsBack = []

    if predsLog is not None:
        predsLog = Path(predsLog) / f'predsLogFoldth{Foldth}_{args.data_filename}'

    for it, result_batch in enumerate(tqdm(data)):
        model.zero_grad()
        batch_labels = result_batch['labels']
        batch_sm = result_batch['sm']

        with torch.no_grad():
            batch_preds = model(batch_sm)
        batch_preds = batch_preds.data.cpu().numpy()

        if scaler is not None:
            # Map scaled predictions back to the original label space
            batch_preds_Sback = scaler.inverse_transform(batch_preds)
            predsBack.extend(batch_preds_Sback.tolist())
            batch_labels_sca = scaler.transform(batch_labels)
            targets_sca.extend(batch_labels_sca)

        preds.extend(batch_preds.tolist())
        targets.extend(batch_labels)
        smiles.extend(batch_sm)

    if predsLog is not None:
        with open(predsLog, 'w+') as pf:
            if scaler is not None:
                pf.write('smiles,labels,predictions,ORI_label,SB_pred,diff\n')
                for i, sm in enumerate(smiles):
                    lab_S = targets_sca[i]
                    pred = preds[i]
                    pred_SB = predsBack[i]
                    lab_noscle = targets[i]
                    diff = np.array(lab_noscle, dtype=np.float32) - np.array(
                        pred_SB, dtype=np.float32)
                    pf.write(f'{sm},{lab_S},{pred},{lab_noscle},{pred_SB},{diff},<{i}>\n')
            else:
                pf.write('smiles,labels,predictions\n')
                for i, sm in enumerate(smiles):
                    pred_noScale = preds[i]
                    lab_noscle = targets[i]
                    pf.write(f'{sm},{pred_noScale},{lab_noscle}\n')

    if len(preds) == 0:
        return [float('nan')] * num_tasks

    valid_preds = [[] for _ in range(num_tasks)]
    valid_targets = [[] for _ in range(num_tasks)]

    # For regression with a scaler, evaluate the un-scaled predictions
    if args.dataset_type == 'regression' and scaler is not None:
        preds = predsBack

    for i in range(num_tasks):
        for j in range(len(preds)):
            if targets[j][i] is not None:
                valid_preds[i].append(preds[j][i])
                valid_targets[i].append(targets[j][i])

    results = []
    for i in range(num_tasks):
        if dataset_type == 'classification':
            nan = False
            if all(target == 0 for target in valid_targets[i]) or all(
                    target == 1 for target in valid_targets[i]):
                nan = True
                info(f'Warning: {args.split_type} found a task with targets all 0s '
                     f'or all 1s, try a random split to avoid all 0s or 1s')
            if all(pred == 0 for pred in valid_preds[i]) or all(
                    pred == 1 for pred in valid_preds[i]):
                nan = True
                info('Warning: Found a task with predictions all 0s or all 1s')
            if nan:
                results.append(float('nan'))
                continue

        if len(valid_targets[i]) == 0:
            continue
        results.append(metric_func(valid_targets[i], valid_preds[i]))

    return targets, preds, results

def read_split_scale_write(args, data_path=None, tmp_data_dir=None, cols_to_read=None):
    if args.data_path is not None:
        data_path = args.data_path
    if args.tmp_data_dir is not None:
        tmp_data_dir = args.tmp_data_dir

    data = read_smiles_property_file(args,
                                     args.data_path,
                                     args.cols_to_read,
                                     keep_header=False)
    smiles = data[0]
    if len(data) > 1:
        labels = np.array(data[1:], dtype='float')
        labels = labels.T
        print(f'labels looks like {labels}{labels.shape}')
        args.n_task = n_task = len(data) - 1
    else:
        labels = None
        n_task = None

    # Make sure the temporary data directory exists
    try:
        os.stat(tmp_data_dir)
    except OSError:
        os.mkdir(tmp_data_dir)

    cross_validation_split = KFold(n_splits=10, shuffle=True, random_state=args.seed)
    data = list(cross_validation_split.split(smiles, labels))

    i = 0
    sizes = (0.8, 0.1, 0.1)
    scalers = []
    train_steps = []
    args.class_weights = []
    for split in data:
        if args.split_type == 'random':
            print('Cross validation with random split, fold number ' + str(i) +
                  ' in progress...')
            train_val, test = split
            train, val = train_test_split(train_val,
                                          test_size=0.11111111111,
                                          random_state=args.seed)
        if args.split_type == 'scaffold_sort':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles, sizes, scaf_seed,
                                              balanced=False,
                                              use_indices=True,
                                              big_small=2)
            print('using scaffold split')
        if args.split_type == 'scaffold_balanced':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles, sizes, scaf_seed,
                                              balanced=True,
                                              use_indices=True,
                                              big_small=2)
        if args.split_type == 'scaffold_random':
            scaf_seed = args.seed + i
            train, val, test = scaffold_split(smiles, sizes, scaf_seed,
                                              scaff_random=True,
                                              use_indices=True,
                                              big_small=2)

        X_train = smiles[train]
        train_steps.append(len(X_train) // args.batch_size)
        y_train = labels[train]
        X_val = smiles[val]
        y_val = labels[val]
        X_test = smiles[test]
        y_test = labels[test]
        args.train_size = len(X_train)
        args.val_size = len(X_val)
        args.test_size = len(X_test)

        if args.dataset_type == 'regression':
            if args.scale == "standardization":
                print('Fitting scaler (Z-score standardization)')
                scaler = StandardScaler().fit(y_train)
                y_train_scaled = scaler.transform(y_train)
                print(f'train data mean:{scaler.means}\nstd:{scaler.stds}\n')
            if args.scale == "normalization":
                print('Fitting scaler (Min-Max normalization)')
                scaler = minmaxScaler().fit(y_train)
                y_train_scaled = scaler.transform(y_train)
                print(f'train data min:{scaler.mins}\ntrain data max:{scaler.maxs}\n')
            if args.scale != 'standardization' and args.scale != 'normalization':
                raise ValueError(
                    "not implemented scaler, use one of [standardization, normalization]")
        else:
            scaler = None
        scalers.append(scaler)

        assert n_task is not None
        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_train', X_train,
            y_train.reshape(-1, n_task))

        if args.dataset_type == 'classification':
            if args.class_balance:
                train_labels = y_train.reshape(-1, n_task).tolist()
                valid_targets = [[] for _ in range(args.n_task)]
                for ij in range(len(train_labels)):
                    for task_num in range(args.n_task):
                        if not math.isnan(train_labels[ij][task_num]):
                            valid_targets[task_num].append(train_labels[ij][task_num])
                train_class_sizes = []
                for task_targets in valid_targets:
                    assert set(np.unique(task_targets)) <= {0, 1}
                    try:
                        ones = np.count_nonzero(task_targets) / len(task_targets)
                    except ZeroDivisionError:
                        ones = float('nan')
                        print('Warning: class has no targets')
                    train_class_sizes.append([1 - ones, ones])
                class_batch_counts = torch.Tensor(train_class_sizes) * args.batch_size
                args.class_weights.append(1 / torch.Tensor(class_batch_counts))

        if args.dataset_type == 'regression':
            save_smiles_property_file(
                f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_trainScaled',
                X_train, y_train_scaled.reshape(-1, n_task))

        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_test', X_test,
            y_test.reshape(-1, n_task))
        save_smiles_property_file(
            f'{tmp_data_dir}{args.seed}{args.data_filename}_{i}_val', X_val,
            y_val.reshape(-1, n_task))
        i += 1

    print(f'train_steps:{train_steps}')
    return scalers, train_steps

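# Hedged configuration sketch (not part of the original code): the values are
# placeholders and the file names are hypothetical; only the attribute names are taken
# from what read_split_scale_write() actually reads off `args`.
from argparse import Namespace

example_args = Namespace(
    data_path='data/esol.csv',       # hypothetical input file
    tmp_data_dir='data_RE2/tmp/',    # where the per-fold CSVs are written
    cols_to_read=[0, 1],             # SMILES column plus one target column
    data_filename='esol',
    seed=42,
    split_type='random',             # or scaffold_sort / scaffold_balanced / scaffold_random
    dataset_type='regression',
    scale='standardization',         # or 'normalization'
    class_balance=False,             # only used for classification
    batch_size=64,
)
# scalers, train_steps = read_split_scale_write(example_args)
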
                    d + 1e-5)  # add a small value to avoid division by zero
        predictions.append(np.argmax(bins))
    return predictions


# MAIN PROGRAM
train_X, train_Y, test_X, test_Y = get_dataset(
    sample=undersampling,
    pollution=pollution,      # how much of the outliers to put in the training set
    train_size=train_size     # how much of the inliers to put in the training set
)

# Preprocess the data
pipeline = make_pipeline(StandardScaler(),
                         PCA(n_components=n_components)).fit(train_X)
train_X = pd.DataFrame(pipeline.transform(train_X))

# Drop duplicates after having transformed the training data.
# We have to add train_Y in order to not mess up the indices.
cleaned = pd.concat([train_X, pd.DataFrame(train_Y)], axis=1).drop_duplicates()
train_X = cleaned.iloc[:, :-1]
train_Y = cleaned.iloc[:, -1]

test_X = np.asarray(pipeline.transform(test_X))

knn = KNN(k, train_X, train_Y,