def test_iterative_svd_with_low_rank_random_matrix():
    solver = IterativeSVD(rank=3)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="IterativeSVD")
    assert missing_mae < 0.1, "Error too high!"
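# The fixtures XY, XY_incomplete, missing_mask and the reconstruction_error helper are
# defined elsewhere in that test suite. A minimal, self-contained sketch of equivalent
# fixtures (names and sizes are illustrative, not the project's actual setup):
import numpy as np
from fancyimpute import IterativeSVD

rng = np.random.RandomState(0)
XY = rng.randn(200, 3) @ rng.randn(3, 50)          # exactly rank 3
missing_mask = rng.rand(*XY.shape) < 0.1           # hide ~10% of entries
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

XY_completed = IterativeSVD(rank=3).fit_transform(XY_incomplete)
missing_mae = np.mean(np.abs(XY_completed[missing_mask] - XY[missing_mask]))
print("MAE on held-out entries:", missing_mae)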
def fancyimpute_matrix_completion(function, gram_drop, seqs=None, sigma=None,
                                  triangular=None, num_process=4,
                                  drop_flag_matrix=None):
    gram_partially_completed_by_gak = gak.gram_gak(seqs, sigma=sigma,
                                                   triangular=triangular,
                                                   num_process=num_process,
                                                   drop_flag_matrix=drop_flag_matrix)
    for i in range(len(gram_drop)):
        gram_drop[i, i] = 1
        for j in range(len(gram_drop[0])):
            if np.isnan(gram_partially_completed_by_gak[i, j]):
                continue
            # Only entries that were dropped should be filled in from the GAK pass
            assert np.isnan(gram_drop[i, j])
            gram_drop[i, j] = gram_partially_completed_by_gak[i, j]
    if function == "SoftImpute":
        gram_completed = SoftImpute().complete(gram_drop)
    elif function == "KNN":
        gram_completed = KNN().complete(gram_drop)
    elif function == "IterativeSVD":
        gram_completed = IterativeSVD().complete(gram_drop)
    else:
        raise ValueError("unsupported fancyimpute function: %s" % function)
    return gram_completed
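# The .complete() calls above come from older fancyimpute releases; newer releases
# expose a scikit-learn style fit_transform() instead (exact availability depends on
# the installed version). A small example of the same call in both styles, on an
# illustrative NaN-coded matrix:
import numpy as np
from fancyimpute import IterativeSVD

gram_drop = np.random.rand(10, 10)
gram_drop[2, 7] = np.nan  # mark one entry as missing

# Older fancyimpute releases:
#   gram_completed = IterativeSVD(rank=3).complete(gram_drop)
# Newer releases:
gram_completed = IterativeSVD(rank=3).fit_transform(gram_drop)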
def baseline_inpute(X_incomplete, method='mean', level=0):
    if method == 'mean':
        X_filled_mean = SimpleFill().fit_transform(X_incomplete)
        return X_filled_mean
    elif method == 'knn':
        k = [3, 10, 50][level]
        X_filled_knn = KNN(k=k, verbose=False).fit_transform(X_incomplete)
        return X_filled_knn
    elif method == 'svd':
        rank = [
            np.ceil((X_incomplete.shape[1] - 1) / 10),
            np.ceil((X_incomplete.shape[1] - 1) / 5),
            X_incomplete.shape[1] - 1
        ][level]
        X_filled_svd = IterativeSVD(rank=int(rank), verbose=False).fit_transform(X_incomplete)
        return X_filled_svd
    elif method == 'mice':
        max_iter = [3, 10, 50][level]
        X_filled_mice = IterativeImputer(max_iter=max_iter).fit_transform(X_incomplete)
        return X_filled_mice
    elif method == 'spectral':
        # The default sparsity level is defined relative to the maximum singular value;
        # here it is set heuristically per level.
        sparsity = [0.5, None, 3][level]
        X_filled_spectral = SoftImpute(shrinkage_value=sparsity).fit_transform(X_incomplete)
        return X_filled_spectral
    else:
        raise NotImplementedError
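# A short usage sketch for baseline_inpute on an illustrative NaN-coded array
# (the `level` argument indexes the light/medium/heavy settings above):
import numpy as np

rng = np.random.RandomState(0)
X_incomplete = rng.randn(100, 8)
X_incomplete[rng.rand(100, 8) < 0.2] = np.nan       # ~20% missing

X_mean = baseline_inpute(X_incomplete, method='mean')
X_knn = baseline_inpute(X_incomplete, method='knn', level=1)   # k=10
X_svd = baseline_inpute(X_incomplete, method='svd', level=0)   # rank=ceil((d-1)/10)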
def imputeData(X_incomplete):
    # X_incomplete has missing data represented with NaN values
    X_filled = IterativeSVD().complete(X_incomplete)
    return X_filled
def complete(self, data: pd.DataFrame):
    df = data.copy()
    cols = list(df)
    # The rank of the truncated SVD cannot exceed the number of columns
    if len(cols) < self.rank:
        self.rank = len(cols)
    df = pd.DataFrame(IterativeSVD(rank=self.rank, verbose=False).fit_transform(df))
    df.columns = cols
    return df
def test_estimators(X, y, dum_enc, classification=True):
    ModeMeanImputer = create_mode_mean_imputer(X, dum_enc)

    # List of all imputation algorithms to test, as tuples of (name, estimator object, inductive)
    impute_estimators = [
        ("ModeMeanImputer", ModeMeanImputer, True),
        ("KNNImputer", KNNImputer(), True),
        ("Iter_BayesianRidge",
         IterativeImputer(estimator=BayesianRidge(), random_state=0), True),
        ("Iter_DecisionTree",
         IterativeImputer(estimator=DecisionTreeRegressor(max_features='sqrt', random_state=0),
                          random_state=0), True),
        ("Iter_RF",
         IterativeImputer(estimator=RandomForestRegressor(n_estimators=100, random_state=0),
                          random_state=0), True),
        ("Iter_ExtraTrees",
         IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=100, random_state=0),
                          random_state=0), True),
        ("Iter_KNRegr",
         IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=15), random_state=0), True),
        ("Iter_SVD", IterativeSVD(rank=min(min(X.shape) - 1, 10), verbose=False), False),
        ("SoftImpute", SoftImpute(verbose=False), False)
    ]

    # The classification and regression branches only differ in the scoring helper used
    score_fn = imputation_score_classification if classification else imputation_score_regression

    imp_scores = {}
    times = {}
    for estimator_name, impute_estimator, inductive in impute_estimators:
        time1 = time.time()
        imp_scores[estimator_name] = score_fn(X, y, estimator_name, impute_estimator, inductive)
        time2 = time.time()
        times[estimator_name] = time2 - time1
        # print(estimator_name + " finished, took " + str(round(time2 - time1, 1)) + " seconds")

    imputer_dict = {}
    for estimator_name, impute_estimator, inductive in impute_estimators:
        imputer_dict[estimator_name] = impute_estimator

    return imp_scores, times, imputer_dict
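# imputation_score_classification / imputation_score_regression and
# create_mode_mean_imputer are project-specific helpers. Purely as an illustration
# of what such a scoring helper might do (impute, then cross-validate a downstream
# model; the RandomForestClassifier choice is an assumption, not the project's code):
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

def imputation_score_classification_sketch(X, y, name, impute_estimator, inductive):
    """Illustrative stand-in: impute the matrix, then score a downstream classifier."""
    if inductive:
        # scikit-learn style imputers expose fit/transform
        X_filled = impute_estimator.fit(X).transform(X)
    else:
        # fancyimpute solvers (IterativeSVD, SoftImpute) only offer fit_transform
        X_filled = impute_estimator.fit_transform(X)
    clf = RandomForestClassifier(n_estimators=100, random_state=0)
    return np.mean(cross_val_score(clf, X_filled, y, cv=5))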
def fancyImputeAttempts(data, dataframe):
    data = np.array(data, dtype=float)

    # Use the fancyimpute package (older releases exposed .complete();
    # newer releases use .fit_transform())
    filled_knn = KNN(k=3, verbose=False).fit_transform(data)
    filled_softimpute = SoftImpute(verbose=False).fit_transform(data)
    filled_svd = IterativeSVD(verbose=False).fit_transform(data)

    print("\nKNN computations\n")
    doiteration(filled_knn, dataframe)
    print("\n SOFTIMPUTE computations\n")
    doiteration(filled_softimpute, dataframe)
    print("\n SVD computations\n")
    doiteration(filled_svd, dataframe)
def impute_svd(df, rank=10, convergence_threshold=0.00001, max_iters=200):
    """
    Imputes the missing values by using SVD decomposition.

    Based on the following publication:
    'Missing value estimation methods for DNA microarrays' by Troyanskaya et al.

    :param df: The input dataframe that contains missing values
    :param rank: Rank value of the truncated SVD
    :param convergence_threshold: The threshold to stop the iterations
    :param max_iters: Max number of iterations
    :return: the imputed dataframe
    """
    imputed_matrix = IterativeSVD(rank, convergence_threshold, max_iters).complete(df.values)
    imputed_df = pd.DataFrame(imputed_matrix, df.index, df.columns)
    return imputed_df
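# A brief usage sketch for impute_svd on a toy DataFrame (values are illustrative;
# it assumes a fancyimpute version that still provides the .complete() call above):
import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(50, 6),
                  columns=["gene_%d" % i for i in range(6)])
df.iloc[3, 2] = np.nan
df.iloc[10, 5] = np.nan

imputed_df = impute_svd(df, rank=2)
assert imputed_df.shape == df.shape and not imputed_df.isnull().any().any()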
def complex_imputation(df, method='mice', neighbors=3):
    """
    Inputs:
    df -- dataframe of incomplete data
    method -- method of imputation
        - 'knn': Imputes using K Nearest Neighbors of completed rows
        - 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
        - 'mice': Imputes using Multiple Imputation by Chained Equations method
        - 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
        - 'matrix_factorization': Imputes by factorization of matrix into low-rank U and V,
          with L1 sparsity on U elements and L2 sparsity on V elements
        - 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
    neighbors -- parameter for KNN imputation

    Output: Completed matrix
    """
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance); keep the fitted scaler
    # so the same instance can invert the transform later
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)
    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)
    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
def run_impute(self, X, state='train'):
    if state == 'train':
        self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
        for imp_method in self.impute_method:
            if imp_method == 'mean':
                imp_ope = SimpleFill()
            elif imp_method == 'KNN':
                imp_ope = KNN()
            elif imp_method == 'IterativeSVD':
                imp_ope = IterativeSVD()
            elif imp_method == 'MatrixFactorization':
                imp_ope = MatrixFactorization()
            X_filled = imp_ope.fit_transform(X)
            self.train_data[imp_method] = X_filled
            self.impute_operator[imp_method] = imp_ope
            self.train_data['ave'] += X_filled
        self.train_data['ave'] /= len(self.impute_method)
    return 0
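# The ensemble average built above can be reproduced standalone; a minimal sketch
# (the method list, data, and rank are illustrative, not the class's configuration):
import numpy as np
from fancyimpute import SimpleFill, KNN, IterativeSVD

rng = np.random.RandomState(0)
X = rng.randn(60, 5)
X[rng.rand(60, 5) < 0.15] = np.nan

solvers = {'mean': SimpleFill(), 'KNN': KNN(), 'IterativeSVD': IterativeSVD(rank=3)}
filled = {name: solver.fit_transform(X) for name, solver in solvers.items()}
filled['ave'] = sum(filled.values()) / len(solvers)   # element-wise ensemble average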
def determine_impute(df):
    """Iterates various imputation methods to find the lowest MSE."""
    algorithms = [
        SimpleFill(),
        KNN(1), KNN(2), KNN(3), KNN(4), KNN(5),
        IterativeSVD(),
        MatrixFactorization()
    ]
    MSE = {}
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for i, alg in enumerate(algorithms):
        print(alg)
        X_complete = impute_df(df_incomplete, alg)
        alg_mse = ((df - X_complete) ** 2).sum().mean()
        print(str(i) + alg.__class__.__name__, alg_mse)
        MSE[str(i) + alg.__class__.__name__] = alg_mse
    return MSE
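# create_test_df, impute_df, and T40_dict are project-specific helpers. A
# self-contained sketch of the same comparison on synthetically masked data
# (all names and settings here are illustrative):
import numpy as np
import pandas as pd
from fancyimpute import SimpleFill, KNN, IterativeSVD

rng = np.random.RandomState(0)
df = pd.DataFrame(rng.randn(120, 6))
df_incomplete = df.mask(rng.rand(*df.shape) < 0.2)   # hide ~20% of the entries

MSE = {}
for alg in [SimpleFill(), KNN(3), IterativeSVD(rank=3)]:
    X_complete = alg.fit_transform(df_incomplete.values)
    MSE[alg.__class__.__name__] = ((df.values - X_complete) ** 2).mean()
print(min(MSE, key=MSE.get), MSE)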
def __init__(self, data, predict):
    self.df = data
    self.predict = predict
    self.X = None
    self.y = None
    self.X_scale = None
    self.X_train = None
    self.X_test = None
    self.y_train = None
    self.y_test = None
    self.incomplete_data = None
    self.clean_data = None
    self.methods = [
        SimpleFill(),
        KNN(1), KNN(2), KNN(3), KNN(4), KNN(5),
        IterativeSVD(),
        MatrixFactorization()
    ]
name="MICE_%d" % negative_log_regularization_weight) for fill_method in ["mean", "median"]: table.add_entry( solver=SimpleFill(fill_method=fill_method), name="SimpleFill_%s" % fill_method) for k in [1, 5, 17]: table.add_entry( solver=DenseKNN( k=k, orientation="rows"), name="DenseKNN_k%d" % (k,)) for shrinkage_value in [50, 200, 800]: # SoftImpute without rank constraints table.add_entry( solver=SoftImpute( shrinkage_value=shrinkage_value), name="SoftImpute_lambda%d" % (shrinkage_value,)) for rank in [10, 40, 160]: table.add_entry( solver=IterativeSVD( rank=rank, init_fill_method="zero"), name="IterativeSVD_rank%d" % (rank,)) table.save_html_table() table.print_sorted_errors()
def impute(data, method='mean', value=None, nan_value=np.nan): """ Impute missing values on a numpy ndarray in a column-wise manner. ANTsR function: `antsrimpute` Arguments --------- data : numpy.ndarray data to impute method : string or float type of imputation method to use Options: mean median constant KNN BiScaler NuclearNormMinimization SoftImpute IterativeSVD value : scalar (optional) optional arguments for different methods if method == 'constant' constant value if method == 'KNN' number of nearest neighbors to use nan_value : scalar value which is interpreted as a missing value Returns ------- ndarray if ndarray was given OR pd.DataFrame if pd.DataFrame was given Example ------- >>> import ants >>> import numpy as np >>> data = np.random.randn(4,10) >>> data[2,3] = np.nan >>> data[3,5] = np.nan >>> data_imputed = ants.impute(data, 'mean') Details ------- KNN: Nearest neighbor imputations which weights samples using the mean squared difference on features for which two rows both have observed data. SoftImpute: Matrix completion by iterative soft thresholding of SVD decompositions. Inspired by the softImpute package for R, which is based on Spectral Regularization Algorithms for Learning Large Incomplete Matrices by Mazumder et. al. IterativeSVD: Matrix completion by iterative low-rank SVD decomposition. Should be similar to SVDimpute from Missing value estimation methods for DNA microarrays by Troyanskaya et. al. MICE: Reimplementation of Multiple Imputation by Chained Equations. MatrixFactorization: Direct factorization of the incomplete matrix into low-rank U and V, with an L1 sparsity penalty on the elements of U and an L2 penalty on the elements of V. Solved by gradient descent. NuclearNormMinimization: Simple implementation of Exact Matrix Completion via Convex Optimization by Emmanuel Candes and Benjamin Recht using cvxpy. Too slow for large matrices. BiScaler: Iterative estimation of row/column means and standard deviations to get doubly normalized matrix. Not guaranteed to converge but works well in practice. Taken from Matrix Completion and Low-Rank SVD via Fast Alternating Least Squares. """ _fancyimpute_options = { 'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute', 'IterativeSVD' } if (not has_fancyimpute) and (method in _fancyimpute_options): raise ValueError( 'You must install `fancyimpute` (pip install fancyimpute) to use this method' ) _base_options = {'mean', 'median', 'constant'} if (method not in _base_options) and ( method not in _fancyimpute_options) and (not isinstance( method, (int, float))): raise ValueError( 'method not understood.. 
Use `mean`, `median`, a scalar, or an option from `fancyimpute`')

    X_incomplete = data.copy()

    if method == 'KNN':
        if value is None:
            value = 3
        X_filled = KNN(k=value, verbose=False).complete(X_incomplete)
    elif method == 'BiScaler':
        X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)
    elif method == 'SoftImpute':
        X_filled = SoftImpute(verbose=False).complete(X_incomplete)
    elif method == 'IterativeSVD':
        if value is None:
            rank = min(10, X_incomplete.shape[0] - 2)
        else:
            rank = value
        X_filled = IterativeSVD(rank=rank, verbose=False).complete(X_incomplete)
    elif method == 'mean':
        col_means = np.nanmean(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_means[i]
        X_filled = X_incomplete
    elif method == 'median':
        # Use the column medians here (the original code reused nanmean by mistake)
        col_medians = np.nanmedian(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_medians[i]
        X_filled = X_incomplete
    elif method == 'constant':
        if value is None:
            raise ValueError('Must give `value` argument if method == constant')
        X_incomplete[np.isnan(X_incomplete)] = value
        X_filled = X_incomplete

    return X_filled
reader = Reader(rating_scale=(limits[0], limits[1]))
data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)
df = pd.DataFrame(ratings_dict)
# reader = Reader(line_format='user item rating', sep='\t')
# A reader is still needed but only the rating_scale param is required.
data.split(n_folds=10)
# data can now be used normally
data_full = data.build_full_trainset()

obj = IterativeSVD(rank=20, max_iters=700, min_value=limits[0],
                   max_value=limits[1], verbose=True)
datamat_filled_SVD_fancy = obj.complete(datamat_missing)

obj = SoftImpute(shrinkage_value=None, max_iters=700, max_rank=20,
                 n_power_iterations=1, init_fill_method="zero",
                 min_value=limits[0], max_value=limits[1],
                 normalizer=None, verbose=True)
def run(folder, name, patients, run_all, save_imputed):
    random_seed = 123
    np.random.seed(seed=random_seed)

    X_corrupt = load_file(folder, name)
    name = name.split('.csv')[0]
    print(name)
    end = X_corrupt.shape[0]
    print(end)
    X = np.genfromtxt('./data/completeCasesBoxCox.csv',
                      delimiter=',', skip_header=1)[:end, 1:]

    scores = {}
    completed = {}  # completed matrices keyed by score name, reused when saving

    # Simple fills (mean, median, random)
    for key, fill_method in [('simple_mean', 'mean'),
                             ('simple_median', 'median'),
                             ('random', 'random')]:
        completed[key] = SimpleFill(fill_method=fill_method).complete(X_corrupt)
        scores[key] = evaluate(completed[key], X, X_corrupt)

    # IterativeSVD over a sweep of ranks
    for rank in range(1, 25):
        key = 'svd_%d' % rank
        completed[key] = IterativeSVD(rank=rank).complete(X_corrupt)
        scores[key] = evaluate(completed[key], X, X_corrupt)

    # SoftImpute with the default and a sweep of explicit shrinkage values
    completed['si'] = SoftImpute().complete(X_corrupt)
    scores['si'] = evaluate(completed['si'], X, X_corrupt)
    shrinkage_labels = [(0.5, 'si_s_half'), (1, 'si_s_1'), (2, 'si_s_2'),
                        (4, 'si_s_4'), (8, 'si_s_8'), (16, 'si_s_16'),
                        (32, 'si_s_32'), (64, 'si_s_64'), (128, 'si_s_128')]
    for shrinkage_value, key in shrinkage_labels:
        completed[key] = SoftImpute(shrinkage_value=shrinkage_value).complete(X_corrupt)
        scores[key] = evaluate(completed[key], X, X_corrupt)

    if save_imputed:
        # File suffixes match the score keys, except 'random' -> 'simple_random'
        for key, matrix in completed.items():
            suffix = 'simple_random' if key == 'random' else key
            np.savetxt('./output/sweeps/' + name + '_' + suffix + '.csv',
                       matrix, delimiter=',', newline='\n')

    if run_all:
        completed_all = {}

        # MICE with the default model and a sweep of ridge regularization weights,
        # first with column imputation, then with predictive mean matching (pmm)
        lambda_labels = [(0.25, '25'), (0.1, '10'), (0.01, '1'),
                         (0.001, '01'), (0.0001, '001')]

        completed_all['MICE'] = MICE().complete(X_corrupt)
        for lambda_reg, label in lambda_labels:
            completed_all['MICE_col_lambda_reg_' + label] = MICE(
                model=BayesianRidgeRegression(lambda_reg=lambda_reg)).complete(X_corrupt)
        completed_all['MICE_pmm'] = MICE(impute_type='pmm').complete(X_corrupt)
        for lambda_reg, label in lambda_labels:
            completed_all['MICE_pmm_lambda_reg_' + label] = MICE(
                impute_type='pmm',
                model=BayesianRidgeRegression(lambda_reg=lambda_reg)).complete(X_corrupt)

        # KNN over a sweep of neighborhood sizes
        for k in [1, 3, 9, 15, 30, 81, 243, 751, 2000, 6000]:
            completed_all['knn_%d' % k] = KNN(k=k).complete(X_corrupt)

        for key, matrix in completed_all.items():
            scores[key] = evaluate(matrix, X, X_corrupt)

        if save_imputed:
            # File suffixes: '_MICE', '_mice_col_lambda_reg_*', '_mice_pmm_X',
            # '_mice_pmm_lambda_reg_*', and '_knn_*'
            for key, matrix in completed_all.items():
                if key == 'MICE_pmm':
                    suffix = 'mice_pmm_X'
                elif key.startswith('MICE_col') or key.startswith('MICE_pmm'):
                    suffix = 'mice' + key[len('MICE'):]
                else:
                    suffix = key
                np.savetxt('./output/sweeps/' + name + '_' + suffix + '.csv',
                           matrix, delimiter=',', newline='\n')

    print(scores)
    scores_df = pd.DataFrame().from_dict(scores.items())
    scores_df.columns = ['Method', 'Score']
    scores_df.set_index('Method')
    scores_df.to_csv('./output/scores/' + folder + '/' + name + '.csv')
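# evaluate() is defined elsewhere in that project. Purely as an illustration of what
# such a helper might compute (scoring only on the entries that were corrupted;
# the name and the RMSE choice are assumptions, not the project's code):
import numpy as np

def evaluate_sketch(X_imputed, X_true, X_corrupt):
    """Illustrative stand-in for evaluate(): RMSE on the corrupted cells only."""
    missing_mask = np.isnan(X_corrupt)
    diff = X_imputed[missing_mask] - X_true[missing_mask]
    return np.sqrt(np.mean(diff ** 2))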
RNA_txt.to_csv(datadir + '/filled_data/Mean_' + cancertype +
               str(missing_perc * 100) + '_' + str(sample_count) + '.csv')
nz = test_data[:, :RNA_size].size
nnm_mse = np.sqrt((np.linalg.norm(
    X_filled[:test_data.shape[0], :RNA_size] - test_data[:, :RNA_size]) ** 2) / nz)
print("Mean method, RMSE: %f" % nnm_mse)
loss_list_Mean[cancer_c, perc, sample_count - 1] = nnm_mse

# ############## SVD
rank = 10
X_filled = IterativeSVD(rank,
                        init_fill_method="mean",
                        verbose=False,
                        convergence_threshold=0.0000001).fit_transform(df_combine)
RNA_txt = pd.DataFrame(X_filled[:, :RNA_size],
                       index=shuffle_cancer.index,
                       columns=shuffle_cancer.columns[:RNA_size])
RNA_txt.to_csv(datadir + '/filled_data/SVD_' + cancertype +
               str(missing_perc * 100) + '_' + str(sample_count) + '.csv')
nz = test_data[:, :RNA_size].size
nnm_mse = np.sqrt((np.linalg.norm(
    X_filled[:test_data.shape[0], :RNA_size] - test_data[:, :RNA_size]) ** 2) / nz)
print("SVD, RMSE: %f" % nnm_mse)
loss_list_SVD[cancer_c, perc, sample_count - 1] = nnm_mse
def _perform_imputation(job_context: Dict) -> Dict: """ Take the inputs and perform the primary imputation. Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283: - Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame) - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix - Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums) - Calculate the 10th percentile of rnaseq_row_sums - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are - Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix - Remove genes (rows) with >30% missing values in combined_matrix - Remove samples (columns) with >50% missing values in combined_matrix - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix - Transpose combined_matrix; transposed_matrix - Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix - Untranspose imputed_matrix (genes are now rows, samples are now columns) - Quantile normalize imputed_matrix where genes are rows and samples are columns """ job_context['time_start'] = timezone.now() # Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame) microarray_expression_matrix = job_context['microarray_inputs'] # Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix rnaseq_expression_matrix = job_context['rnaseq_inputs'] # Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums) rnaseq_row_sums = np.sum(rnaseq_expression_matrix, axis=1) # Calculate the 10th percentile of rnaseq_row_sums rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10) # Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix # TODO: This is probably a better way to do this with `np.where` rows_to_filter = [] for (x, sum_val) in rnaseq_row_sums.items(): if sum_val < rnaseq_tenth_percentile: rows_to_filter.append(x) filtered_rnaseq_matrix = rnaseq_expression_matrix.drop(rows_to_filter) # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1 log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one) # Cache our RNA-Seq zero values cached_zeroes = {} for column in log2_rnaseq_matrix.columns: cached_zeroes[column] = np.where(log2_rnaseq_matrix[column] == 0) # Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are log2_rnaseq_matrix[log2_rnaseq_matrix==0]=np.nan # Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix combined_matrix = microarray_expression_matrix.merge(log2_rnaseq_matrix, how='outer', left_index=True, right_index=True) # Remove genes (rows) with <=70% present values in combined_matrix thresh = combined_matrix.shape[1] * .7 # (Rows, Columns) row_filtered_combined_matrix = 
combined_matrix.dropna(axis='index', thresh=thresh) # Everything below `thresh` is dropped # Remove samples (columns) with <50% present values in combined_matrix # XXX: Find better test data for this! col_thresh = row_filtered_combined_matrix.shape[0] * .5 row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(axis='columns', thresh=col_thresh) # "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix for column in cached_zeroes.keys(): zeroes = cached_zeroes[column] # Skip purged columns if column not in row_col_filtered_combined_matrix_samples: continue # Place the zero try: np.put(row_col_filtered_combined_matrix_samples[column], zeroes, 0.0) except Exception as e: logger.exception("Error when replacing zero") continue # Label our new replaced data combined_matrix_zero = row_col_filtered_combined_matrix_samples # Transpose combined_matrix; transposed_matrix transposed_matrix = combined_matrix_zero.transpose() # row_col_filtered_combined_matrix_samples.transpose() # Remove -inf and inf # This should never happen, but make sure it doesn't! transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan) # Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix) # Untranspose imputed_matrix (genes are now rows, samples are now columns) untransposed_imputed_matrix = imputed_matrix.transpose() # Convert back to Pandas untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix) untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples.index untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples.columns # Quantile normalize imputed_matrix where genes are rows and samples are columns # XXX: Refactor QN target acquisition and application before doing this job_context['organism'] = Organism.get_object_for_name(list(job_context['input_files'].keys())[0]) job_context['merged_no_qn'] = untransposed_imputed_matrix_df # Perform the Quantile Normalization job_context = smasher._quantile_normalize(job_context, ks_check=False) job_context['time_end'] = timezone.now() job_context['formatted_command'] = "create_compendia.py" return job_context
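# Stripped of the pipeline bookkeeping, the imputation step above boils down to
# transposing so samples become rows, running IterativeSVD at rank 10, and restoring
# the original orientation and labels. A minimal sketch, assuming a genes x samples
# DataFrame with NaNs (the function name is illustrative, not the project's API):
import pandas as pd
from fancyimpute import IterativeSVD

def impute_compendium(combined_matrix: pd.DataFrame, rank: int = 10) -> pd.DataFrame:
    """Minimal sketch of the imputation core, not the full refine.bio pipeline."""
    transposed = combined_matrix.transpose()                      # samples x genes
    imputed = IterativeSVD(rank=rank).fit_transform(transposed)   # returns a NumPy array
    result = pd.DataFrame(imputed.transpose())                    # back to genes x samples
    result.index = combined_matrix.index
    result.columns = combined_matrix.columns
    return result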
while len(rows_to_impute) > 0:
    try:
        impute_me = set(
            random.sample(rows_to_impute, int(len(all_rows) * iteration_percent)))
    except Exception:
        # Population larger than sample
        impute_me = rows_to_impute

    rows_to_impute = rows_to_impute - impute_me
    df['SYNTHETIC'][impute_me] = np.nan

    needs_imputation_transposed = df.transpose()
    print("Imputing step!")
    imputed_matrix = IterativeSVD(rank=10).fit_transform(needs_imputation_transposed)
    imputed_matrix_transposed = imputed_matrix.transpose()
    print("Imputed!")

    # Convert back to Pandas
    df = df.transpose()
    df_imputed_matrix_transposed = pd.DataFrame.from_records(imputed_matrix_transposed)
    df_imputed_matrix_transposed.index = all_rows
    df_imputed_matrix_transposed.columns = all_cols
    df = df_imputed_matrix_transposed

    df.to_csv('synthetic_' + colname + "_" + str(iteration_percent) + '.tsv',
              sep='\t', encoding='utf-8')
def _perform_imputation(job_context: Dict) -> Dict: """ Take the inputs and perform the primary imputation. Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283: - Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame). - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix. - Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums). - Calculate the 10th percentile of rnaseq_row_sums - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are - Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix - Remove genes (rows) with >30% missing values in combined_matrix - Remove samples (columns) with >50% missing values in combined_matrix - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix - Transpose combined_matrix; transposed_matrix - Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix -- with specified svd algorithm or skip - Untranspose imputed_matrix (genes are now rows, samples are now columns) - Quantile normalize imputed_matrix where genes are rows and samples are columns """ imputation_start = log_state("start perform imputation", job_context["job"].id) job_context["time_start"] = timezone.now() rnaseq_row_sums_start = log_state("start rnaseq row sums", job_context["job"].id) # We potentially can have a microarray-only compendia but not a RNASeq-only compendia log2_rnaseq_matrix = None if job_context["rnaseq_matrix"] is not None: # Drop any genes that are entirely NULL in the RNA-Seq matrix job_context["rnaseq_matrix"] = job_context["rnaseq_matrix"].dropna( axis="columns", how="all") # Calculate the sum of the lengthScaledTPM values for each row # (gene) of the rnaseq_matrix (rnaseq_row_sums) rnaseq_row_sums = np.sum(job_context["rnaseq_matrix"], axis=1) log_state("end rnaseq row sums", job_context["job"].id, rnaseq_row_sums_start) rnaseq_decile_start = log_state("start rnaseq decile", job_context["job"].id) # Calculate the 10th percentile of rnaseq_row_sums rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10) log_state("end rnaseq decile", job_context["job"].id, rnaseq_decile_start) drop_start = log_state("drop all rows", job_context["job"].id) # Drop all rows in rnaseq_matrix with a row sum < 10th # percentile of rnaseq_row_sums; this is now # filtered_rnaseq_matrix # TODO: This is probably a better way to do this with `np.where` rows_to_filter = [] for (x, sum_val) in rnaseq_row_sums.items(): if sum_val < rnaseq_tenth_percentile: rows_to_filter.append(x) del rnaseq_row_sums log_state("actually calling drop()", job_context["job"].id) filtered_rnaseq_matrix = job_context.pop("rnaseq_matrix").drop( rows_to_filter) del rows_to_filter log_state("end drop all rows", job_context["job"].id, drop_start) log2_start = log_state("start log2", job_context["job"].id) # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1 log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one) del 
filtered_rnaseq_matrix_plus_one del filtered_rnaseq_matrix log_state("end log2", job_context["job"].id, log2_start) cache_start = log_state("start caching zeroes", job_context["job"].id) # Cache our RNA-Seq zero values cached_zeroes = {} for column in log2_rnaseq_matrix.columns: cached_zeroes[column] = log2_rnaseq_matrix.index[np.where( log2_rnaseq_matrix[column] == 0)] # Set all zero values in log2_rnaseq_matrix to NA, but make sure # to keep track of where these zeroes are log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan log_state("end caching zeroes", job_context["job"].id, cache_start) outer_merge_start = log_state("start outer merge", job_context["job"].id) # Perform a full outer join of microarray_matrix and # log2_rnaseq_matrix; combined_matrix if log2_rnaseq_matrix is not None: combined_matrix = job_context.pop("microarray_matrix").merge( log2_rnaseq_matrix, how="outer", left_index=True, right_index=True) else: logger.info("Building compendia with only microarray data.", job_id=job_context["job"].id) combined_matrix = job_context.pop("microarray_matrix") log_state("ran outer merge, now deleteing log2_rnaseq_matrix", job_context["job"].id) del log2_rnaseq_matrix log_state("end outer merge", job_context["job"].id, outer_merge_start) drop_na_genes_start = log_state("start drop NA genes", job_context["job"].id) # # Visualize Prefiltered # output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png" # visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path) # Remove genes (rows) with <=70% present values in combined_matrix thresh = combined_matrix.shape[1] * 0.7 # (Rows, Columns) # Everything below `thresh` is dropped row_filtered_matrix = combined_matrix.dropna(axis="index", thresh=thresh) del combined_matrix del thresh log_state("end drop NA genes", job_context["job"].id, drop_na_genes_start) drop_na_samples_start = log_state("start drop NA samples", job_context["job"].id) # # Visualize Row Filtered # output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png" # visualized_rowfilter = visualize.visualize(row_filtered_matrix.copy(), output_path) # Remove samples (columns) with <50% present values in combined_matrix # XXX: Find better test data for this! 
col_thresh = row_filtered_matrix.shape[0] * 0.5 row_col_filtered_matrix_samples = row_filtered_matrix.dropna( axis="columns", thresh=col_thresh) row_col_filtered_matrix_samples_index = row_col_filtered_matrix_samples.index row_col_filtered_matrix_samples_columns = row_col_filtered_matrix_samples.columns log_state("end drop NA genes", job_context["job"].id, drop_na_samples_start) replace_zeroes_start = log_state("start replace zeroes", job_context["job"].id) for sample_accession_code in row_filtered_matrix.columns: if sample_accession_code not in row_col_filtered_matrix_samples_columns: sample = Sample.objects.get(accession_code=sample_accession_code) sample_metadata = sample.to_metadata_dict() job_context["filtered_samples"][sample_accession_code] = { **sample_metadata, "reason": "Sample was dropped because it had less than 50% present values.", "experiment_accession_code": smashing_utils.get_experiment_accession( sample.accession_code, job_context["dataset"].data), } del row_filtered_matrix # # Visualize Row and Column Filtered # output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png" # visualized_rowcolfilter = visualize.visualize(row_col_filtered_matrix_samples.copy(), # output_path) # "Reset" zero values that were set to NA in RNA-seq samples # (i.e., make these zero again) in combined_matrix for column in cached_zeroes.keys(): zeroes = cached_zeroes[column] # Skip purged columns if column not in row_col_filtered_matrix_samples: continue # Place the zero try: # This generates a warning, so use loc[] instead # row_col_filtered_matrix_samples[column].replace(zeroes, 0.0, inplace=True) zeroes_list = zeroes.tolist() new_index_list = row_col_filtered_matrix_samples_index.tolist() new_zeroes = list(set(new_index_list) & set(zeroes_list)) row_col_filtered_matrix_samples[column].loc[new_zeroes] = 0.0 except Exception: logger.warn("Error when replacing zero") continue log_state("end replace zeroes", job_context["job"].id, replace_zeroes_start) transposed_zeroes_start = log_state("start replacing transposed zeroes", job_context["job"].id) # Label our new replaced data combined_matrix_zero = row_col_filtered_matrix_samples del row_col_filtered_matrix_samples transposed_matrix_with_zeros = combined_matrix_zero.T del combined_matrix_zero # Remove -inf and inf # This should never happen, but make sure it doesn't! 
transposed_matrix = transposed_matrix_with_zeros.replace([np.inf, -np.inf], np.nan) del transposed_matrix_with_zeros log_state("end replacing transposed zeroes", job_context["job"].id, transposed_zeroes_start) # Store the absolute/percentages of imputed values matrix_sum = transposed_matrix.isnull().sum() percent = (matrix_sum / transposed_matrix.isnull().count()).sort_values(ascending=False) total_percent_imputed = sum(percent) / len(transposed_matrix.count()) job_context["total_percent_imputed"] = total_percent_imputed logger.info("Total percentage of data to impute!", total_percent_imputed=total_percent_imputed) # Perform imputation of missing values with IterativeSVD (rank=10) on the # transposed_matrix; imputed_matrix svd_algorithm = job_context["dataset"].svd_algorithm if svd_algorithm != "NONE": svd_start = log_state("start SVD", job_context["job"].id) logger.info("IterativeSVD algorithm: %s" % svd_algorithm) svd_algorithm = str.lower(svd_algorithm) imputed_matrix = IterativeSVD( rank=10, svd_algorithm=svd_algorithm).fit_transform(transposed_matrix) svd_start = log_state("end SVD", job_context["job"].id, svd_start) else: imputed_matrix = transposed_matrix logger.info("Skipping IterativeSVD") del transposed_matrix untranspose_start = log_state("start untranspose", job_context["job"].id) # Untranspose imputed_matrix (genes are now rows, samples are now columns) untransposed_imputed_matrix = imputed_matrix.T del imputed_matrix # Convert back to Pandas untransposed_imputed_matrix_df = pd.DataFrame.from_records( untransposed_imputed_matrix) untransposed_imputed_matrix_df.index = row_col_filtered_matrix_samples_index untransposed_imputed_matrix_df.columns = row_col_filtered_matrix_samples_columns del untransposed_imputed_matrix del row_col_filtered_matrix_samples_index del row_col_filtered_matrix_samples_columns # Quantile normalize imputed_matrix where genes are rows and samples are columns job_context["organism"] = Organism.get_object_for_name( job_context["organism_name"]) job_context["merged_no_qn"] = untransposed_imputed_matrix_df # output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png" # visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(), # output_path) log_state("end untranspose", job_context["job"].id, untranspose_start) quantile_start = log_state("start quantile normalize", job_context["job"].id) # Perform the Quantile Normalization job_context = smashing_utils.quantile_normalize(job_context, ks_check=False) log_state("end quantile normalize", job_context["job"].id, quantile_start) # Visualize Final Compendia # output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png" # visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path) job_context["time_end"] = timezone.now() job_context["formatted_command"] = ["create_compendia.py"] log_state("end prepare imputation", job_context["job"].id, imputation_start) return job_context
    uncorrelated=correlation_proportions[2],
    missing_portion=0.0,
    fill_na=np.nan)
X, _, y = generator.generate_data_logistic(1024, min_mult=0.0, max_mult=1.0)

# X_incomplete has the same values as X except a subset have been replaced with NaN
X_incomplete, missing_mask = generator.generate_missing(X, 0.1, np.nan)

# Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=3).fit_transform(X_incomplete)
# Matrix completion using MICE
X_filled_mice = IterativeImputer().fit_transform(X_incomplete)
# Matrix completion using iterative SVD
X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete)
# Matrix completion using matrix factorization
X_filled_mf = MatrixFactorization(learning_rate=0.01, rank=3, l2_penalty=0,
                                  min_improvement=1e-6).fit_transform(X_incomplete)
# Matrix completion using mean fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)
# Matrix completion using median fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)
# Matrix completion using zero fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)
# Matrix completion using min fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)
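# Since missing_mask marks which entries were removed, the filled matrices above can
# be compared directly; a short follow-up sketch reusing the names from the snippet above:
import numpy as np

filled = {
    'knn': X_filled_knn,
    'mice': X_filled_mice,
    'svd': X_filled_svd,
    'mf': X_filled_mf,
    'mean': X_filled_meanfill,
    'median': X_filled_medianfill,
    'zero': X_filled_zerofill,
    'min': X_filled_minfill,
}
for label, X_filled in filled.items():
    mae = np.mean(np.abs(X_filled[missing_mask] - X[missing_mask]))
    print("%s fill MAE on masked entries: %.4f" % (label, mae))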
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from fancyimpute import IterativeSVD

print("reading data...")
# Drop the first column (ids/labels) and the last column
# (.as_matrix() is deprecated; .to_numpy() is the current equivalent)
X = pd.read_csv("Data/train.csv").iloc[:, 1:-1].to_numpy()
test = pd.read_csv("Data/test.csv").iloc[:, 1:].to_numpy()
# ind = np.genfromtxt('Class_change_ind.csv', delimiter=',', dtype='int32')
# test_incomplete = test.copy()
print(X.shape)

print("setting svd object...")
svd = IterativeSVD(rank=1000, convergence_threshold=0.0001)
X_svd = svd.complete(X)
# print(X_svd[:, 0])

print("saving data...")
np.savetxt("Data/train_isvdimp.csv", X_svd, delimiter=",")

print("imputing test...")
test_svd = svd.complete(test)
np.savetxt("Data/test_isvdimp.csv", test_svd, delimiter=",")
print('\a')
for negative_log_regularization_weight in [2, 3, 4]:
    regularization_weight = 10.0 ** -negative_log_regularization_weight
    table.add_entry(
        solver=IterativeImputer(
            n_nearest_columns=80,
            n_iter=50,
            n_burn_in=5,
        ),
        name="IterativeImputer_%d" % negative_log_regularization_weight)

for fill_method in ["mean", "median"]:
    table.add_entry(solver=SimpleFill(fill_method=fill_method),
                    name="SimpleFill_%s" % fill_method)

for k in [1, 3, 7]:
    table.add_entry(solver=KNN(k=k, orientation="rows"),
                    name="KNN_k%d" % (k,))

for shrinkage_value in [25, 50, 100]:
    # SoftImpute without rank constraints
    table.add_entry(solver=SoftImpute(shrinkage_value=shrinkage_value),
                    name="SoftImpute_lambda%d" % (shrinkage_value,))

for rank in [10, 20, 40]:
    table.add_entry(solver=IterativeSVD(rank=rank, init_fill_method="zero"),
                    name="IterativeSVD_rank%d" % (rank,))

table.save_html_table()
table.print_sorted_errors()
def main(): #Creating matrix e1 sig_matrix = np.loadtxt('Clear_Cell_Cycle.txt') print("Dimension of raw e1: ") print(sig_matrix.shape) print('\n') #--------------------------------------------------- #Filling missing data in e1 sig_matrix[sig_matrix == 0] = np.NaN X_incomplete = sig_matrix #imputer = Imputer() #transformed_sig_matrix = imputer.fit_transform(sig_matrix) #Count the number of NaN values in each column #print(np.isnan(transformed_sig_matrix).sum()) #sig_matrix = transformed_sig_matrix # Use SVD X_filled = IterativeSVD().complete(X_incomplete) # Use 3 nearest rows which have a feature to fill in each row's missing features X_filled_knn = KNN(k=5).complete(X_incomplete) # svd_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() # print("IterativeSVD MSE: %f" % svd_mse) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! #X_filled_nnm = NuclearNormMinimization().complete(X_incomplete) # print mean squared error for the three imputation methods above #nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() #print("Nuclear norm minimization MSE: %f" % nnm_mse) # knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() # print("knnImpute MSE: %f" % knn_mse) sig_matrix = X_filled_knn e1 = sig_matrix # print("\n Tesnor 1 before decomposition\n") #with open("Tensor_Before_Decomposition", 'w') as fout: #fout.writelines(sess.run(T)) # print(sess.run(T)) Tfirst = T = TensorProj(e1) n = tf.norm(T) print("\n") print(sess.run(n)) T_new = tf.zeros([len(e1), len(e1), len(e1)], tf.float32) #scaling T # norm = tf.nn.l2_normalize(T, 0, epsilon = 1e-12, name = None) # norm = tf.norm(T) # T = norm # print(sess.run(T)) # fout = open("norm.txt", 'a') for i in range(200): decomp = decom(T, g) T_new += decomp T = decomp decomp = Tfirst - T_new init_op = tf.global_variables_initializer() sess = tf.Session() sess.run(init_op) n = tf.norm(decomp) print("\n") print(sess.run(n)) with open('norm.text', 'a') as fout: fout.write(str(sess.run(n)) + '\n') fout.close() #norm = tf.nn.l2_normalize(T_new, 0, epsilon = 1e-12, name = None) #print("\n sum of decomposition:\n") #init_op = tf.global_variables_initializer() #sess = tf.Session() #sess.run(init_op) #print(sess.run(T_new)) print("\n") Tfirst = T = TensorProj(e2) print("Tesnor 2 before decomposition\n") init_op = tf.global_variables_initializer() sess = tf.Session() sess.run(init_op) #print(sess.run(T)) print("\n") # Creating e2 b_pseoudoInv = np.linalg.pinv(b) project = np.dot(b, b_pseoudoInv) e2 = np.dot(project, e1) g = tf.Variable(tf.random_uniform([N])) #print('\n g is \n') init_op = tf.global_variables_initializer() sess = tf.Session() sess.run(init_op) print(sess.run(g)) T2 = T_new = tf.zeros([len(e2), len(e2), len(e2)], tf.float32) for i in range(0): T1 = decom(T, g) T_new += T - T1 T = T1 T2 = T_new - Tfirst init_op = tf.global_variables_initializer() sess = tf.Session() sess.run(init_op) n = tf.norm(T2) print(sess.run(n)) print("\n") norm = tf.nn.l2_normalize(T_new, 0, epsilon=1e-12, name=None) print("sum of decomposition:\n") init_op = tf.global_variables_initializer() sess = tf.Session() sess.run(init_op) print(sess.run(T_new)) print("\n")
def main(): #Enter the raw data file (main signal matrix e1) with open('Cell_Cycle_Expresion.txt') as infile: with open("Clear_Cell_Cycle.txt", "w") as outfile: for line in infile: Clear_Cell_Cycle(line, outfile) #Removing first row of the data file with open("Clear_Cell_Cycle.txt", 'r') as fin: data = fin.read().splitlines(True) with open("Clear_Cell_Cycle.txt", 'w') as fout: fout.writelines(data[1:]) #Keeping name of the genes and features of e1 (Cell_Cycle_Expresion) with open('Cell_Cycle_Expresion.txt', "r") as infile: with open("GeneNames_Cell_Cycle.txt", "w") as outfile: for line in infile: outfile.write("\t".join(line.split()[0]) + "\n") with open("GeneNames_Cell_Cycle.txt", 'r') as fin: data = fin.read().splitlines(True) with open("GeneNames_Cell_Cycle.txt", 'w') as fout: fout.writelines(data[1:]) #Creating list of signal genes f = open("GeneNames_Cell_Cycle.txt", 'r') sig_genes = f.readlines() print("Length of signal genes of e1: ") print(len(sig_genes)) print('\n') #Changing NULL to 0 with open("Clear_Cell_Cycle.txt", 'r') as fin2: data = fin2.read().splitlines(True) with open("Clear_Cell_Cycle.txt", 'w') as fout2: for line in data: fout2.write(line.replace('Null', '0')) #Creating matrix e1 sig_matrix = np.loadtxt('Clear_Cell_Cycle.txt') print("Dimension of raw e1: ") print(sig_matrix.shape) print('\n') #--------------------------------------------------- #Filling missing data in e1 sig_matrix[sig_matrix == 0] = np.NaN X_incomplete = sig_matrix #imputer = Imputer() #transformed_sig_matrix = imputer.fit_transform(sig_matrix) #Count the number of NaN values in each column #print(np.isnan(transformed_sig_matrix).sum()) #sig_matrix = transformed_sig_matrix # Use SVD X_filled = IterativeSVD().complete(X_incomplete) # Use 3 nearest rows which have a feature to fill in each row's missing features X_filled_knn = KNN(k=5).complete(X_incomplete) # svd_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() # print("IterativeSVD MSE: %f" % svd_mse) # matrix completion using convex optimization to find low-rank solution # that still matches observed values. Slow! 
#X_filled_nnm = NuclearNormMinimization().complete(X_incomplete) # print mean squared error for the three imputation methods above #nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean() #print("Nuclear norm minimization MSE: %f" % nnm_mse) # knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean() # print("knnImpute MSE: %f" % knn_mse) sig_matrix = X_filled_knn #Center the expressions of genes sig_npArray = np.array(sig_matrix) sig_mean = np.mean(sig_npArray, axis=0) print("Mean of e1 at it's time level:") print(sig_mean) for i in range(0, 18): sig_matrix[:, i] = sig_matrix[:, i] - sig_mean[i] print("\n") #-------------------------------------------------- #Calculating svd of e1 U: eigenarray, s: eigenexpresions, V: eigengenes eigenarrays1, eigenexpressions1, eigengenes1 = np.linalg.svd( sig_matrix, full_matrices=False) #Creating a1 by e1 a1 = np.dot(sig_matrix, sig_matrix.transpose()) print("Dimension of raw a1: ") print(a1.shape) print('\n') #Calculating Evd of a1:network1 eigenarrays1_trans = eigenarrays1.transpose() a1_sSquare = np.square(eigenexpressions1) #Calculating significance of subnetworks of a1 M1 = 4 a1Frac = fraction(eigenexpressions1, M1) print( "Expression correlations for 4 most significant subnetworks of a1(%)") print(a1Frac * 100) print('\n') a1_entropy = ent(a1Frac) print("Entropy of matrix a1") print(a1_entropy) print('\n') #--------------------------------------------------- #Enter the raw data files for basic signals b1, b2, and b3 #b1 with open('Cell_Cycle_Binding.txt') as infile2: with open("Clear_Cycle_Bin.txt", "w") as outfile2: for line in infile2: Clear_Bind(line, outfile2) #Removing first row of the data file with open("Clear_Cycle_Bin.txt", 'r') as fin2: data = fin2.read().splitlines(True) with open("Clear_Cycle_Bin.txt", 'w') as fout2: fout2.writelines(data[1:]) #Creating matrix basis signal b1 sig_basis1 = np.loadtxt('Clear_Cycle_Bin.txt') print("Dimension of b1(Cell Cycle Binding): ") print(sig_basis1.shape) print('\n') #Keeping name of the genes and features of b1 (Cell_Cycle_Binding) with open('Cell_Cycle_Binding.txt') as infile: with open("GeneNames_Cycle_Bin.txt", "w") as outfile: for line in infile: outfile.write("\t".join(line.split()[0]) + "\n") with open("GeneNames_Cycle_Bin.txt", 'r') as fin: data = fin.read().splitlines(True) with open("GeneNames_Cycle_Bin.txt", 'w') as fout: fout.writelines(data[1:]) #Creating list of signal genes of b1 f = open("GeneNames_Cycle_Bin.txt", 'r') b1_genes = f.readlines() print("Length of signal genes of b1: ") print(len(b1_genes)) print('\n') #Find intersection of e1 and b1 interSec = set(sig_genes).intersection(b1_genes) print("Length of intersection of e1 and b1") print(len(interSec)) print('\n') #Finding relevant data of intersection(e1, b1) in e1 M = 18 sig_matrix = relData(interSec, sig_matrix, sig_genes, M) #Finding relevant data of intersection(e1, b1) in b1 M = 12 sig_basis1 = relData(interSec, sig_basis1, b1_genes, M) #Devide signal matrix by mean to convert signals to DNA binding sig_npArray = np.array(sig_basis1) basis1_mean = np.mean(sig_npArray, axis=1) print("Mean of b1 for gean measurments:") print(basis1_mean) for i in range(0, 1588): sig_basis1[i, :] = sig_basis1[i, :] / basis1_mean[i] print("\n") #Calculating svd of b1 U: eigenarray, s: eigenexpresions, V: eigengenes eigenarrays2, eigenexprssions2, eigengenes2 = np.linalg.svd( sig_basis1, full_matrices=False) print("Eigenexpresions of partial b1") print(eigenexprssions2) print('\n') #Computing entropy of b1 M1 = 12 
b1Frac = fraction(eigenexprssions2, M1)
print("Expression correlations for most significant subnetworks of b1(%)")
print(b1Frac * 100)
print('\n')
b1_entropy = ent(b1Frac)
#print(b1Frac[1])
print("Entropy of partial b1")
print(b1_entropy)
print('\n')

# b2
with open('Develop_Binding.txt') as infile3:
    with open("Clear_Dev_Bin.txt", "w") as outfile3:
        for line in infile3:
            Clear_Bind(line, outfile3)

# Removing first row of the data file
with open("Clear_Dev_Bin.txt", 'r') as fin3:
    data = fin3.read().splitlines(True)
with open("Clear_Dev_Bin.txt", 'w') as fout3:
    fout3.writelines(data[1:])

# Creating matrix basis signal b2
sig_basis2 = np.loadtxt('Clear_Dev_Bin.txt')
print("Dimension of b2: ")
print(sig_basis2.shape)
print('\n')

# b3
with open('Biosynthesis_Binding.txt') as infile4:
    with open("Clear_Biosyn_Bin.txt", "w") as outfile4:
        for line in infile4:
            Clear_Bind(line, outfile4)

# Removing first row of the data file
with open("Clear_Biosyn_Bin.txt", 'r') as fin4:
    data = fin4.read().splitlines(True)
with open("Clear_Biosyn_Bin.txt", 'w') as fout4:
    fout4.writelines(data[1:])

# Creating matrix basis signal b3
sig_basis3 = np.loadtxt('Clear_Biosyn_Bin.txt')
print("Dimension of b3: ")
print(sig_basis3.shape)
print('\n')

# ----------------------------------------------------
# Pseudoinverse projection to create a2, a3, and a4
# a2
b1_pseoudoInv = np.linalg.pinv(sig_basis1)
project1 = np.dot(sig_basis1, b1_pseoudoInv)
sig2_matrix = np.dot(project1, sig_matrix)
print("Dimension of e2: ")
print(sig2_matrix.shape)
print('\n')

# Calculating SVD of e2; U: eigenarrays, s: eigenexpressions, V: eigengenes
eigenarrays2, eigenexpressions2, eigengenes2 = np.linalg.svd(
    sig2_matrix, full_matrices=False)
print("Eigenexpressions of e2")
print(eigenexpressions2)
print("\n")

# Creating a2 from e2
a2 = np.dot(sig2_matrix, sig2_matrix.transpose())

# Calculating EVD of a2: network2
eigenarrays2_trans = eigenarrays2.transpose()
a2_sSquare = np.square(eigenexpressions2)

# Calculating significance of subnetworks of a2
M2 = 2
a2Frac = fraction(eigenexpressions2, M2)
print("Expression correlations for most significant subnetworks of a2(%)")
print(a2Frac * 100)
print('\n')
a2_entropy = ent(a2Frac)
print("Entropy of matrix a2 (should be .49)")
print(a2_entropy)
print("fraction of first eigenvalue of a2")
print(a2Frac[0])
print('\n')

# a3
b2_pseoudoInv = np.linalg.pinv(sig_basis2)
project2 = np.dot(sig_basis2, b2_pseoudoInv)
sig3_matrix = np.dot(project2, sig_matrix[0:2120])
# Picking 2120 genes out of the matrix
sig3_matrix = sig3_matrix[0:2120, :]
print("dimension of e3: ")
print(sig3_matrix.shape)
print('\n')

# Calculating SVD of e3; U: eigenarrays, s: eigenexpressions, V: eigengenes
eigenarrays3, eigenexprssions3, eigengenes3 = np.linalg.svd(
    sig3_matrix, full_matrices=False)

# Creating a3 from e3
a3 = np.dot(sig3_matrix, sig3_matrix.transpose())

# Calculating EVD of a3: network3
eigenarrays3_trans = eigenarrays3.transpose()
a3_sSquare = np.square(eigenexprssions3)

# Calculating significance of subnetworks of a3
a3_fractions = a3_sSquare / np.sum(a3_sSquare)

# a4
b3_pseoudoInv = np.linalg.pinv(sig_basis3)
project3 = np.dot(sig_basis3, b3_pseoudoInv)
sig4_matrix = np.dot(project3, sig_matrix[0:2120])
# Picking 2120 genes out of the matrix
sig4_matrix = sig4_matrix[0:2120, :]
print("dimension of e4: ")
print(sig4_matrix.shape)
print('\n')

# Calculating SVD of e4 = UsV; U: eigenarrays, s: eigenexpressions, V: eigengenes
eigenarrays4, eigenexprssions4, eigengenes4 = np.linalg.svd(
    sig4_matrix, full_matrices=False)

# Creating a4 from e4
a4 = np.dot(sig4_matrix, sig4_matrix.transpose())

# Calculating EVD of a4: network4
eigenarrays4_trans = eigenarrays4.transpose()
a4_sSquare = np.square(eigenexprssions4)

# Calculating significance of subnetworks of a4
a4_fractions = a4_sSquare / np.sum(a4_sSquare)

# -------------------------------------------------
# Tensor decomposition
# Picking 1588 genes out of the matrix
# a_T = a1 + a2 + a3 + a4
#print(a_T.shape)

# Appending signal matrices e1 to e4
sig_appendTemp = np.concatenate(
    (sig_matrix.transpose(), sig2_matrix.transpose(),
     sig3_matrix.transpose(), sig4_matrix.transpose()),
    axis=0)
sig_append = sig_appendTemp.transpose()
print("dimension of appended e")
print(sig_append.shape)
print('\n')

# Calculating SVD of appended e = UsV; U: eigenarrays, s: eigenexpressions, V: eigengenes
eigenarrays, eigenexprssions, eigengenes = np.linalg.svd(
    sig_append, full_matrices=True)

# Calculating HOEVD of overall network a_T
a_T_sSquare = np.square(eigenexprssions)
a_T_append = np.dot(sig_append, sig_append.transpose())
#print(a_T_sSquare.shape)

# HOEVD for individual networks
M = 18 + 12 + 12 + 8
M_couple = M * (M - 1) / 2
M1 = 15
d = 1588
# Picking 1588 genes out of the matrix
a1 = a1[0:1588, 0:1588]
HOEVD(a1, a1_sSquare, eigenarrays, M1, d)
M2 = 12
d = 1588
HOEVD(a2, a2_sSquare, eigenarrays, M2, d)
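The script above leans on several project-specific helpers (Clear_Bind, relData, fraction, ent, HOEVD) whose definitions are not shown here. Judging only from how they are called, fraction appears to return the normalized squared eigenexpressions (the significance of each subnetwork) and ent their normalized Shannon entropy. Below is a minimal sketch under those assumptions; the real implementations may differ.

import numpy as np

# Hypothetical stand-ins for two of the helpers used above, inferred from call sites only.
def fraction(eigenexpressions, M):
    """Significance of the first M subnetworks: squared eigenexpressions
    normalized to sum to 1."""
    s_sq = np.square(np.asarray(eigenexpressions)[:M])
    return s_sq / np.sum(s_sq)

def ent(fractions):
    """Normalized Shannon entropy of the subnetwork fractions:
    0 when one subnetwork dominates, 1 when all contribute equally."""
    p = np.asarray(fractions)
    p = p[p > 0]
    return float(-np.sum(p * np.log(p)) / np.log(len(p)))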
def benchmark_complete(data, ending_density=.02, step=.01):
    '''
    Input: data array to benchmark on, the ending density at which to stop,
    and the step between tested densities.
    Output: dataframes of output density and of RMSE for each method with
    respect to each input density.
    '''
    # removes a minimum value greater than zero (then rechecks density) in each iteration,
    # chosen randomly, over the density range to run
    nonzeroscount = np.count_nonzero(data)
    sizel = data.shape
    totalentr = sizel[0] * sizel[1]
    end = ending_density  # final density to test
    begin = (nonzeroscount / totalentr)  # beginning density of the given matrix
    #step=.01 # step of density

    # initialize lists to store results
    density_in = []
    RMSE_empca_scores = []
    RMSE_wpca_scores = []
    RMSE_sfi_scores = []
    RMSE_siv_scores = []
    RMSE_sni_scores = []
    RMSE_smi_scores = []
    RMSE_szi_scores = []
    RMSE_wmiC_scores = []
    RMSE_wmiP_scores = []
    Density_empca = []
    Density_wpca = []
    Density_sfi = []
    Density_siv = []
    Density_sni = []
    Density_smi = []
    Density_szi = []
    Density_wmiC = []
    Density_wmiP = []

    # randomly remove values from the known matrix and try to impute them
    for d in reversed(np.arange(end, begin, step)):
        otum = data.T.copy()

        # begin density check
        nonzeroscount = np.count_nonzero(otum)
        sizel = otum.shape
        totalentr = sizel[0] * sizel[1]
        while np.float64((nonzeroscount / totalentr)) > d:
            # remove a min-frequency OTU and then check density
            j = np.random.randint(0, len(otum[:][:]) - 1)
            # make sure the row is not all zero (an all-zero row causes a singular matrix)
            if sum(list(otum[j][:])) < 1:
                continue
            m = min(i for i in list(otum[j][:]) if i > 0)
            # make sure removing the value will not result in a zero row
            if sum(list(otum[j][:])) == m:
                continue
            otum[j][list(otum[j][:]).index(m)] = 0
            # check density to break
            nonzeroscount = float(np.count_nonzero(otum))
            sizel = otum.shape
            totalentr = float(sizel[0]) * float(sizel[1])

        # coerce to float and report the new density
        print("Data table of %f generated" % d)
        otum = otum.T.astype(np.float64)

        # make zeros unknown for fancyimpute; avoid a singular matrix by taking the transpose
        otum2 = otum.T.copy()
        otum2 = otum2.astype(np.float64)
        otum2[otum2 == 0] = np.nan  # make unknown entries NaN

        # WPCA and EMPCA
        # build weight matrix: low weight for unknown entries, high for observed ones
        weight = otum.copy()
        for i in range(len(otum2.T)):
            for j in range(len(otum2.T[i])):
                if np.isnan(otum2.T[i][j]):
                    weight[i][j] = 1
                else:
                    weight[i][j] = 1000
        print("Running EMPCA")
        EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
        print("Running WPCA")
        WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

        # fancyimpute methods and zero fill
        print("Nuclear Norm")
        sni = NuclearNormMinimization(
            min_value=(np.amin(otum2)),
            max_value=(np.amax(otum2))).complete(otum2.copy())
        print("Running Soft Impute")
        sfi = SoftImpute(shrinkage_value=None,
                         convergence_threshold=0.00001,
                         max_iters=1000,
                         max_rank=min(otum2.shape),
                         n_power_iterations=1,
                         init_fill_method="zero",
                         min_value=(np.amin(otum2)),
                         max_value=(np.amax(otum2)),
                         normalizer=None,
                         verbose=False).complete(otum2.copy())
        print("Running Iterative SVD")
        siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                           convergence_threshold=0.00001,
                           max_iters=1000,
                           gradual_rank_increase=True,
                           svd_algorithm="arpack",
                           init_fill_method="zero",
                           min_value=(np.amin(otum2)),
                           max_value=(np.amax(otum2)),
                           verbose=False).complete(otum2.copy())
        print("Running Matrix Factorization")
        smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                  initializer=np.random.randn,
                                  learning_rate=0.01,
                                  patience=5,
                                  l1_penalty=0.05,
                                  l2_penalty=0.05,
                                  min_improvement=0.01,
                                  max_gradient_norm=5,
                                  optimization_algorithm="adam",
                                  min_value=(np.amin(otum2)),
                                  max_value=(np.amax(otum2)),
                                  verbose=False).complete(otum2.copy())
        print("Imputing by filling with zeros for base comparison")
        szi = base.zeros(otum2.copy())
        print("Weighted Mean Interpolation without phylo-distance")
        wmiC = base.wmi_wrapper(X=otum2.copy())
        print("Weighted Mean Interpolation with phylo-distance")
        phylo = pd.read_csv(
            'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv'
        )
        wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

        # save the results
        # density in (after removed values)
        density_in.append(error.get_density(otum))
        # density imputed
        Density_empca.append(error.get_density(EMPCAi))
        Density_wpca.append(error.get_density(WPCAi))
        Density_sfi.append(error.get_density(sfi))
        Density_siv.append(error.get_density(siv))
        Density_sni.append(error.get_density(sni))
        Density_smi.append(error.get_density(smi))
        Density_szi.append(error.get_density(szi))
        Density_wmiC.append(error.get_density(wmiC))
        Density_wmiP.append(error.get_density(wmiP))
        # RMSE of imputed values
        # masking so RMSE is only checked between imputed values and removed values
        missing_mask = np.isnan(otum2.T)
        RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
        RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
        RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
        RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
        RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
        RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
        RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
        RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
        RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

    RMSEmapping = pd.DataFrame({
        'Density': list(map(int, density_in)),
        'EMPCA': RMSE_empca_scores,
        'Matrix Factorization': RMSE_smi_scores,
        'WPCA': RMSE_wpca_scores,
        'Soft Impute': RMSE_sfi_scores,
        'Iterative SVD': RMSE_siv_scores,
        'Nuclear Norm Minimization': RMSE_sni_scores,
        'Zeros Replace Unknown': RMSE_szi_scores,
        'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
        'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
    })
    RMSEmapping.set_index(['Density'], inplace=True)
    Out_density = pd.DataFrame({
        'density': list(map(int, density_in)),
        'EMPCA': Density_empca,
        'Matrix Factorization': Density_smi,
        'WPCA': Density_wpca,
        'Soft Impute': Density_sfi,
        'Iterative SVD': Density_siv,
        'Nuclear Norm Minimization': Density_sni,
        'Zeros Replace Unknown': Density_szi,
        'Weighted-Mean Interpolation Correlation': Density_wmiC,
        'Weighted-Mean Interpolation Phylo': Density_wmiP
    })
    Out_density.set_index(['density'], inplace=True)

    return Out_density, RMSEmapping
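A hypothetical driver for benchmark_complete, sketched only to show the expected input and outputs. It assumes the supporting modules (base, error, the WPCA/EMPCA classes, and the fancyimpute imports) are importable and that the phylo CSV path hard-coded inside the function exists; the toy table below is made up.

import numpy as np

# Toy nonnegative count table (rows = features, columns = samples), invented for illustration.
rng = np.random.default_rng(0)
toy = rng.poisson(0.8, size=(20, 30)).astype(float)

# One row per tested input density; lower RMSE means better recovery of the removed values.
out_density, rmse_map = benchmark_complete(toy, ending_density=0.02, step=0.01)
print(rmse_map)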
imputed_knn_col = KNN(k=10, orientation="columns").fit_transform(scaled)
# inverse transformation -- we don't want the standard scores
inverse_knn_col = scaler.inverse_transform(imputed_knn_col)
# columns are samples
untransposed_knn_col = inverse_knn_col.transpose()
# write to file
knn_col_df = pd.DataFrame(untransposed_knn_col)
knn_col_df.index = data.index
knn_col_df.columns = data.columns.values
# not to be confused with the Sleipnir KNNImputer output
knn_col_outfile = outfile + "_KNN_fancyimpute_column.pcl"
knn_col_df.to_csv(knn_col_outfile, sep='\t')

print("IterativeSVD...")
# no transformation
imputed_svd = IterativeSVD(rank=10).fit_transform(transposed)
# columns are samples
untransposed_svd = imputed_svd.transpose()
# write to file
svd_df = pd.DataFrame(untransposed_svd)
svd_df.index = data.index
svd_df.columns = data.columns.values
# not to be confused with the Sleipnir KNNImputer output
svd_outfile = outfile + "_IterativeSVD.pcl"
svd_df.to_csv(svd_outfile, sep='\t')
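The fragment above uses data, transposed, scaled, scaler, and outfile without defining them; they are presumably created earlier in the original script. A hedged sketch of what that earlier setup might look like (the file name, separator, and scaler choice are assumptions, not taken from the original):

import pandas as pd
from sklearn.preprocessing import StandardScaler

outfile = "expression"                                        # assumed output prefix
data = pd.read_csv(outfile + ".pcl", sep="\t", index_col=0)   # rows = genes, columns = samples (assumed)

transposed = data.values.transpose()       # rows become samples so each gene is a feature
scaler = StandardScaler()
scaled = scaler.fit_transform(transposed)  # per-gene standard scores, NaNs left in place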
args = parser.parse_args()

with open(args.config) as f:
    config = json.load(f)

data_path = config["data_path"]  # ground truth data
corrupt_data_path = config["corrupt_data_path"]  # data containing missing values
rank = config["rank"]
trial_ind = config["trial_ind"]

# LOAD DATA
data = pd.read_csv(data_path).values
data_missing = pd.read_csv(corrupt_data_path).values
n_row = data_missing.shape[1]  # dimensionality of data space
non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
na_ind = np.where(np.isnan(data_missing))
na_count = len(na_ind[0])

data_impute_SVD = IterativeSVD(rank=rank,
                               convergence_threshold=0.0005,
                               max_iters=16).fit_transform(data_missing)

ReconstructionErrorSVD = sum(
    ((data_impute_SVD[na_ind] - data[na_ind])**2)**0.5) / na_count
print('Reconstruction error (IterativeSVD):')
print(ReconstructionErrorSVD)

np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_SVD.csv",
           data_impute_SVD,
           delimiter=",")
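The script reads four keys from a JSON config file. A hypothetical example of generating such a file; the paths and values below are placeholders, not taken from the original:

import json

config = {
    "data_path": "data/ground_truth.csv",          # complete reference table (placeholder path)
    "corrupt_data_path": "data/with_missing.csv",  # same table with NaNs inserted (placeholder path)
    "rank": 10,                                    # rank passed to IterativeSVD
    "trial_ind": 1,                                # used only to name the output CSV
}
with open("svd_config.json", "w") as f:
    json.dump(config, f, indent=2)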
def calculate_cumulative_ratings(owner_id, project_id):
    import numpy as np
    from fancyimpute import SoftImpute, IterativeSVD
    from sklearn.preprocessing import MinMaxScaler

    ROW_WISE = 1
    # COL_WISE = 0
    scaler_top = MinMaxScaler(feature_range=(2, 3))
    scaler_bottom = MinMaxScaler(feature_range=(1, 2))

    # input format: worker_id, task_id, accuracy
    # output format: pivot table of worker_id and task_id with accuracy values
    query = '''
        SELECT r.id, r.target_id AS worker_id, u.username username,
               r.task_id AS task_id, weight AS accuracy
        FROM crowdsourcing_rating r
        INNER JOIN crowdsourcing_task t ON t.id = r.task_id
        INNER JOIN crowdsourcing_project p ON p.id = t.project_id
        INNER JOIN crowdsourcing_taskworker tw
            ON t.id = tw.task_id AND tw.worker_id = r.target_id
        INNER JOIN auth_user u ON u.id = r.target_id
        WHERE p.group_id = (%(project_id)s) AND origin_type = (%(origin_type)s);
    '''
    cursor = connection.cursor()
    cursor.execute(
        query, {
            'project_id': project_id,
            'origin_type': Rating.RATING_REQUESTER,
            'origin_id': owner_id
        })
    worker_ratings_raw = cursor.fetchall()

    # 0 - rating id
    # 1 - worker_id
    # 2 - username
    # 3 - task_id
    # 4 - accuracy
    d = [{
        'worker_id': worker_rating[1],
        'task_id': worker_rating[3],
        'accuracy': worker_rating[4],
    } for worker_rating in worker_ratings_raw]

    usernames = {}
    for rating in worker_ratings_raw:
        usernames['%d' % rating[1]] = rating[2]

    df = DataFrame(d)
    pivoted = pivot_table(df,
                          values='accuracy',
                          index=['worker_id'],
                          columns=['task_id'])
    pivoted = pivoted.reset_index('worker_id')
    pivoted.index.name = None

    # COLUMNS = ["worker_id", "score", "accuracy", "attempted", "correct", "boomerang"]
    data = pivoted.copy(deep=True)
    matrix = data.iloc[:, 1:]  # without worker_id
    # data['accuracy'] = matrix.mean(axis=ROW_WISE) * 100
    # data['attempted'] = matrix.count(axis=ROW_WISE)
    # data['correct'] = matrix.sum(axis=ROW_WISE)
    # data = data[data["attempted"]>=MIN_TASKS]
    # turn incorrect to -1 as imputations will fill with 0
    # matrix[matrix <= 0] = -1

    try:
        mat = IterativeSVD(verbose=False, init_fill_method="mean").complete(matrix)
    except Exception:
        mat = SoftImpute(verbose=False, init_fill_method="mean").complete(matrix)
    data['score'] = mat.mean(axis=ROW_WISE)
    data = data.sort_values(by=['score'], ascending=[False])

    percentile = data['score'].quantile(settings.WORKER_SPLIT_PERCENTILE)
    # Top 25% = 3-2 and Bottom 75% = 2-1
    num_workers = len(data)
    num_workers_top_x = len(data[data['score'] >= percentile])
    top_x = data.head(num_workers_top_x)
    # add the extra worker at the inflexion point from the top set, as its 2.0 score will be duplicated
    bottom_y = data.tail(num_workers - num_workers_top_x + 1)
    # accuracy = sum(data['correct']) * 100 / sum(data['attempted'])

    top_x_score = scaler_top.fit_transform(
        np.array(top_x['score']).reshape((len(top_x['score']), 1)))
    bottom_y_score = scaler_bottom.fit_transform(
        np.array(bottom_y['score']).reshape((len(bottom_y['score']), 1)))
    # ignore the 1st value of the bottom list as it duplicates one from the top list
    boomerang_scores = np.append(top_x_score, bottom_y_score[1:])
    data['boomerang'] = boomerang_scores

    boomerang_ratings = data.to_dict('records')
    worker_ratings = [{
        "worker_id": r['worker_id'],
        "worker_username": usernames['%d' % r['worker_id']],
        "task_avg": r['boomerang'],
        "requester_avg": 0
    } for r in boomerang_ratings]
    return worker_ratings
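To see the core imputation step of calculate_cumulative_ratings outside Django, here is a hypothetical toy worker-by-task matrix run through the same old-style fancyimpute .complete() call used above (newer fancyimpute releases expose fit_transform instead). The ids and ratings are invented, and the rank is lowered to fit the tiny matrix.

import numpy as np
import pandas as pd
from fancyimpute import IterativeSVD

# Hypothetical worker x task accuracy table with gaps, mimicking the pivoted matrix above.
ratings = pd.DataFrame(
    [[1.0, np.nan, 0.0, 1.0],
     [np.nan, 1.0, 1.0, 0.0],
     [0.0, 1.0, np.nan, 1.0],
     [1.0, 0.0, 1.0, np.nan],
     [np.nan, 1.0, 0.0, 1.0]],
    index=[101, 102, 103, 104, 105],  # worker ids (made up)
    columns=[1, 2, 3, 4],             # task ids (made up)
)

mat = IterativeSVD(rank=2, verbose=False, init_fill_method="mean").complete(ratings.values)
scores = mat.mean(axis=1)  # per-worker mean over tasks, as in the function above
print(dict(zip(ratings.index, scores)))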
data = pd.read_csv("train_with_missing/1.csv", index_col=False)
data.head(5)

# In[6]:

data.isnull().sum(axis=0)

# In[7]:

values = data.values
values.shape

# In[9]:

X_filled_svd = IterativeSVD().fit_transform(values)

# In[12]:

X_filled_svd

# In[42]:

y = values[:, 13]
x = np.delete(X_filled_svd, 0, 1)

# In[43]:

xtr, xt, ytr, yt = train_test_split(x, y, test_size=0.1, random_state=42)

# In[48]:
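A possible follow-up cell (hypothetical, not part of the original notebook) is a quick sanity check that IterativeSVD filled every missing entry before the split is used for modelling:

# hypothetical follow-up cell: confirm the imputation left no NaNs and the split sizes line up
assert not np.isnan(X_filled_svd).any(), "IterativeSVD left unfilled entries"
print(xtr.shape, xt.shape, ytr.shape, yt.shape)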