Example #1
def test_iterative_svd_with_low_rank_random_matrix():
    solver = IterativeSVD(rank=3)
    XY_completed = solver.fit_transform(XY_incomplete)
    _, missing_mae = reconstruction_error(XY,
                                          XY_completed,
                                          missing_mask,
                                          name="IterativeSVD")
    assert missing_mae < 0.1, "Error too high!"
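Example #1 assumes module-level fixtures (XY, XY_incomplete, missing_mask) and a reconstruction_error helper that the listing does not show. A minimal sketch, assuming a rank-3 ground-truth matrix with roughly 25% of its entries hidden (all names and values below are illustrative, not from the original test module):

import numpy as np

np.random.seed(0)
n_rows, n_cols, true_rank = 500, 50, 3
XY = np.random.randn(n_rows, true_rank) @ np.random.randn(true_rank, n_cols)
missing_mask = np.random.rand(n_rows, n_cols) < 0.25
XY_incomplete = XY.copy()
XY_incomplete[missing_mask] = np.nan

def reconstruction_error(XY, XY_completed, missing_mask, name=""):
    # assumed behavior: report MAE over all entries and over the missing entries only
    all_mae = np.mean(np.abs(XY - XY_completed))
    missing_mae = np.mean(np.abs(XY[missing_mask] - XY_completed[missing_mask]))
    print("%s: MAE overall %.4f, MAE on missing entries %.4f" % (name, all_mae, missing_mae))
    return all_mae, missing_mae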
Example #2
def fancyimpute_matrix_completion(function, gram_drop,
                                  seqs=None, sigma=None, triangular=None,
                                  num_process=4,
                                  drop_flag_matrix=None):
    gram_partially_completed_by_gak = gak.gram_gak(seqs,
                                                   sigma=sigma,
                                                   triangular=triangular,
                                                   num_process=num_process,
                                                   drop_flag_matrix=drop_flag_matrix)
    for i in range(len(gram_drop)):
        gram_drop[i, i] = 1
        for j in range(len(gram_drop[0])):
            if np.isnan(gram_partially_completed_by_gak[i, j]):
                continue
            assert np.isnan(gram_drop[i, j])
            gram_drop[i, j] = gram_partially_completed_by_gak[i, j]
    if function == "SoftImpute":
        gram_completed = SoftImpute().complete(gram_drop)
    elif function == "KNN":
        gram_completed = KNN().complete(gram_drop)
    elif function == "IterativeSVD":
        gram_completed = IterativeSVD().complete(gram_drop)
    else:
        print("unsupported fancyimpute function")
        exit(-1)
    return gram_completed
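Example #2 (and several later examples) call the solvers' .complete() method, which older fancyimpute releases exposed; more recent releases expose fit_transform() instead. A hedged sketch of the same dispatch against a current install, assuming function and gram_drop as above:

from fancyimpute import SoftImpute, KNN, IterativeSVD

# same three options as above, using fit_transform() from recent fancyimpute releases
solvers = {"SoftImpute": SoftImpute, "KNN": KNN, "IterativeSVD": IterativeSVD}
gram_completed = solvers[function]().fit_transform(gram_drop)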
Example #3
def baseline_inpute(X_incomplete, method='mean', level=0):

    if method == 'mean':
        X_filled_mean = SimpleFill().fit_transform(X_incomplete)
        return X_filled_mean
    elif method == 'knn':
        k = [3, 10, 50][level]
        X_filled_knn = KNN(k=k, verbose=False).fit_transform(X_incomplete)
        return X_filled_knn
    elif method == 'svd':
        rank = [
            np.ceil((X_incomplete.shape[1] - 1) / 10),
            np.ceil((X_incomplete.shape[1] - 1) / 5), X_incomplete.shape[1] - 1
        ][level]
        X_filled_svd = IterativeSVD(rank=int(rank),
                                    verbose=False).fit_transform(X_incomplete)
        return X_filled_svd
    elif method == 'mice':
        max_iter = [3, 10, 50][level]
        X_filled_mice = IterativeImputer(
            max_iter=max_iter).fit_transform(X_incomplete)
        return X_filled_mice
    elif method == 'spectral':
        # The shrinkage (sparsity) level defaults relative to the maximum singular value;
        # here it is chosen heuristically per level.
        sparsity = [0.5, None, 3][level]
        X_filled_spectral = SoftImpute(
            shrinkage_value=sparsity).fit_transform(X_incomplete)
        return X_filled_spectral
    else:
        raise NotImplementedError
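A short usage sketch for baseline_inpute; the toy matrix below is illustrative, not from the source:

import numpy as np

X = np.random.randn(100, 8)
X[np.random.rand(*X.shape) < 0.2] = np.nan   # hide roughly 20% of the entries
X_mean = baseline_inpute(X, method='mean')
X_knn = baseline_inpute(X, method='knn', level=1)   # k = 10
X_svd = baseline_inpute(X, method='svd', level=0)   # rank = ceil((n_cols - 1) / 10)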
def imputeData(X_incomplete):

    # X_incomplete has missing data which is represented with NaN values
    X_filled = IterativeSVD().complete(X_incomplete)
    return X_filled
 def complete(self, data: pd.DataFrame):
     df = data.copy()
     cols = list(df)
     # cap the rank at the number of columns so the truncated SVD stays valid
     if len(cols) < self.rank:
         self.rank = len(cols)
     df = pd.DataFrame(IterativeSVD(rank=self.rank, verbose=False).fit_transform(df))
     df.columns = cols
     return df
Example #6
def test_estimators(X, y, dum_enc, classification=True):
    ModeMeanImputer = create_mode_mean_imputer(X, dum_enc)

    # List with all imputation algorithms to test, in tuples of (name, estimator object, inductive)
    impute_estimators = [
        ("ModeMeanImputer", ModeMeanImputer, True),
        ("KNNImputer", KNNImputer(), True),
        ("Iter_BayesianRidge",
         IterativeImputer(estimator=BayesianRidge(), random_state=0), True),
        ("Iter_DecisionTree",
         IterativeImputer(estimator=DecisionTreeRegressor(max_features='sqrt',
                                                          random_state=0),
                          random_state=0), True),
        ("Iter_RF",
         IterativeImputer(estimator=RandomForestRegressor(n_estimators=100,
                                                          random_state=0),
                          random_state=0), True),
        ("Iter_ExtraTrees",
         IterativeImputer(estimator=ExtraTreesRegressor(n_estimators=100,
                                                        random_state=0),
                          random_state=0), True),
        ("Iter_KNRegr",
         IterativeImputer(estimator=KNeighborsRegressor(n_neighbors=15),
                          random_state=0), True),
        ("Iter_SVD", IterativeSVD(rank=min(min(X.shape) - 1, 10),
                                  verbose=False), False),
        ("SoftImpute", SoftImpute(verbose=False), False)
    ]

    imp_scores = {}
    times = {}
    if not classification:
        for estimator_name, impute_estimator, inductive in impute_estimators:
            time1 = time.time()
            imp_scores[estimator_name] = imputation_score_regression(
                X, y, estimator_name, impute_estimator, inductive)
            time2 = time.time()
            times[estimator_name] = time2 - time1
            #print(estimator_name + " finished, took " + str(round(time2 - time1, 1)) + " seconds")

    if classification:
        for estimator_name, impute_estimator, inductive in impute_estimators:
            time1 = time.time()
            imp_scores[estimator_name] = imputation_score_classification(
                X, y, estimator_name, impute_estimator, inductive)
            time2 = time.time()
            times[estimator_name] = time2 - time1
            #print(estimator_name + " finished, took " + str(round(time2 - time1, 1)) + " seconds")

    imputer_dict = {}
    for estimator_name, impute_estimator, inductive in impute_estimators:
        imputer_dict[estimator_name] = impute_estimator

    return imp_scores, times, imputer_dict
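test_estimators relies on helpers that are not shown (create_mode_mean_imputer, imputation_score_regression, imputation_score_classification). A hypothetical regression scorer consistent with how it is called might look like this; the cross-validated downstream model is an assumption, not the original implementation:

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

def imputation_score_regression(X, y, estimator_name, impute_estimator, inductive):
    # hypothetical helper: impute X, then score a downstream regressor by cross-validated R^2
    # (the inductive flag is ignored in this sketch)
    X_imputed = impute_estimator.fit_transform(X)
    model = RandomForestRegressor(n_estimators=50, random_state=0)
    return cross_val_score(model, X_imputed, y, cv=3, scoring='r2').mean()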
def fancyImputeAttempts(data, dataframe):
    data = np.array(data, dtype=float)  # np.float was removed from NumPy; use the builtin float
    # use the fancyimpute package
    filled_knn = KNN(k=3, verbose=False).complete(data)
    filled_softimpute = SoftImpute(verbose=False).complete(data)
    filled_svd = IterativeSVD(verbose=False).complete(data)

    print("\nKNN computations\n")
    doiteration(filled_knn, dataframe)
    print("\n SOFTIMPUTE computations\n")
    doiteration(filled_softimpute, dataframe)
    print("\n SVD computations\n")
    doiteration(filled_svd, dataframe)
Example #8
    def impute_svd(df, rank=10, convergence_threshold=0.00001, max_iters=200):
        """
        Imputes the missing values by using SVD decomposition
        Based on the following publication: 'Missing value estimation methods for DNA microarrays' by Troyanskaya et al.

        :param df:The input dataframe that contains missing values
        :param rank: Rank value of the truncated SVD
        :param convergence_threshold: The threshold to stop the iterations
        :param max_iters: Max number of iterations
        :return: the imputed dataframe
        """
        imputed_matrix = IterativeSVD(rank, convergence_threshold,
                                      max_iters).complete(df.values)
        imputed_df = pd.DataFrame(imputed_matrix, df.index, df.columns)
        return imputed_df
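A usage sketch for impute_svd; it appears to be defined inside a class, so the call below treats it as a plain function for brevity, on an illustrative DataFrame:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(200, 20))
df.iloc[5, 3] = np.nan
df.iloc[17, 11] = np.nan
imputed_df = impute_svd(df, rank=5)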
Example #9
def complex_imputation(df, method='mice', neighbors=3):
    """
	Inputs:
	df -- dataframe of incomplete data
	method -- method of imputation
		- 'knn': Imputes using K Nearest Neighbors of completed rows
		- 'soft_impute': Imputes using iterative soft thresholding of SVD decompositions
		- 'mice': Imputes using Multiple Imputation by Chained Equations method
		- 'nuclear_nm': Imputation using Exact Matrix Completion via Convex Optimization method
		- 'matrix_factorization': Imputes by factorization of matrix in low-rank U and V
								  with L1 sparsity on U elements and L2 sparsity on V elements
		- 'iterative_svd': Imputes based on iterative low-rank SVD decomposition
	neighbors -- parameter for KNN imputation
	
	Output:
	Completed matrix
	"""
    # Create matrix of features
    X_incomplete = df.values
    # Normalize matrix by std and mean (0 mean, 1 variance)
    biscaler = BiScaler()
    X_incomplete_normalized = biscaler.fit_transform(X_incomplete)

    if method == 'knn':
        X_complete = KNN(neighbors).complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'soft_impute':
        X_complete_normalized = SoftImpute().complete(X_incomplete_normalized)
        # invert the normalization with the same fitted BiScaler instance
        X_complete = biscaler.inverse_transform(X_complete_normalized)
        return fill_values(df, X_complete)

    if method == 'mice':
        X_complete = MICE().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'nuclear_nm':
        X_complete = NuclearNormMinimization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'matrix_factorization':
        X_complete = MatrixFactorization().complete(X_incomplete)
        return fill_values(df, X_complete)

    if method == 'iterative_svd':
        X_complete = IterativeSVD().complete(X_incomplete)
        return fill_values(df, X_complete)
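A usage sketch for complex_imputation; it also depends on a fill_values helper that is not shown, and the MICE solver it references was removed from newer fancyimpute releases (sklearn's IterativeImputer is the usual replacement). The DataFrame below is illustrative:

import numpy as np
import pandas as pd

df = pd.DataFrame(np.random.randn(300, 10))
df.iloc[0, 0] = np.nan
df.iloc[42, 7] = np.nan
completed = complex_imputation(df, method='iterative_svd')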
 def run_impute(self, X, state='train'):
     if state == 'train':
         self.train_data['ave'] = np.zeros([X.shape[0], X.shape[1]])
         for imp_method in self.impute_method:
             if imp_method == 'mean':
                 imp_ope = SimpleFill()
             if imp_method == 'KNN':
                 imp_ope = KNN()
             if imp_method == 'IterativeSVD':
                 imp_ope = IterativeSVD()
             if imp_method == 'MatrixFactorization':
                 imp_ope = MatrixFactorization()
             X_filled = imp_ope.fit_transform(X)
             self.train_data[imp_method] = X_filled
             self.impute_operator[imp_method] = imp_ope
             self.train_data['ave'] += X_filled
         self.train_data['ave'] /= len(self.impute_method)
     return 0
Example #11
def determine_impute(df):
    """Iterates various imputation methods to find lower MSE"""
    algorithms = [
        SimpleFill(),
        KNN(1),
        KNN(2),
        KNN(3),
        KNN(4),
        KNN(5),
        IterativeSVD(),
        MatrixFactorization()
    ]
    MSE = {}
    df_incomplete = create_test_df(df, 0.7, list(T40_dict.keys()))
    for i, alg in enumerate(algorithms):
        print(alg)
        X_complete = impute_df(df_incomplete, alg)
        alg_mse = ((df - X_complete)**2).sum().mean()
        print(str(i) + alg.__class__.__name__, alg_mse)
        MSE[str(i) + alg.__class__.__name__] = alg_mse
    return MSE
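determine_impute depends on helpers that are not shown (create_test_df, impute_df, T40_dict). A hypothetical impute_df consistent with how it is called might be:

import pandas as pd

def impute_df(df_incomplete, solver):
    # hypothetical helper: run a fancyimpute solver and wrap the result back into a DataFrame
    filled = solver.fit_transform(df_incomplete.values)
    return pd.DataFrame(filled, index=df_incomplete.index, columns=df_incomplete.columns)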
Example #12
 def __init__(self, data, predict):
     self.df = data
     self.predict = predict
     self.X = None
     self.y = None
     self.X_scale = None
     self.X_train = None
     self.X_test = None
     self.y_train = None
     self.y_test = None
     self.incomplete_data = None
     self.clean_data = None
     self.methods = [
         SimpleFill(),
         KNN(1),
         KNN(2),
         KNN(3),
         KNN(4),
         KNN(5),
         IterativeSVD(),
         MatrixFactorization()
     ]
Example #13
            name="MICE_%d" % negative_log_regularization_weight)

    for fill_method in ["mean", "median"]:
        table.add_entry(
            solver=SimpleFill(fill_method=fill_method),
            name="SimpleFill_%s" % fill_method)

    for k in [1, 5, 17]:
        table.add_entry(
            solver=DenseKNN(
                k=k,
                orientation="rows"),
            name="DenseKNN_k%d" % (k,))

    for shrinkage_value in [50, 200, 800]:
        # SoftImpute without rank constraints
        table.add_entry(
            solver=SoftImpute(
                shrinkage_value=shrinkage_value),
            name="SoftImpute_lambda%d" % (shrinkage_value,))

    for rank in [10, 40, 160]:
        table.add_entry(
            solver=IterativeSVD(
                rank=rank,
                init_fill_method="zero"),
            name="IterativeSVD_rank%d" % (rank,))

    table.save_html_table()
    table.print_sorted_errors()
Example #14
def impute(data, method='mean', value=None, nan_value=np.nan):
    """
    Impute missing values on a numpy ndarray in a column-wise manner.
    
    ANTsR function: `antsrimpute`

    Arguments
    ---------
    data : numpy.ndarray
        data to impute

    method : string or float
        type of imputation method to use
        Options:
            mean
            median
            constant
            KNN
            BiScaler
            NuclearNormMinimization
            SoftImpute
            IterativeSVD

    value : scalar (optional)
        optional arguments for different methods
        if method == 'constant'
            constant value
        if method == 'KNN'
            number of nearest neighbors to use

    nan_value : scalar
        value which is interpreted as a missing value

    Returns
    -------
    ndarray if ndarray was given
    OR
    pd.DataFrame if pd.DataFrame was given

    Example
    -------
    >>> import ants
    >>> import numpy as np
    >>> data = np.random.randn(4,10)
    >>> data[2,3] = np.nan
    >>> data[3,5] = np.nan
    >>> data_imputed = ants.impute(data, 'mean')

    Details
    -------
    KNN: Nearest neighbor imputations which weights samples using the mean squared 
            difference on features for which two rows both have observed data.

    SoftImpute: Matrix completion by iterative soft thresholding of SVD 
                decompositions. Inspired by the softImpute package for R, which 
                is based on Spectral Regularization Algorithms for Learning 
                Large Incomplete Matrices by Mazumder et al.

    IterativeSVD: Matrix completion by iterative low-rank SVD decomposition.
                    Should be similar to SVDimpute from Missing value estimation 
                    methods for DNA microarrays by Troyanskaya et al.

    MICE: Reimplementation of Multiple Imputation by Chained Equations.

    MatrixFactorization: Direct factorization of the incomplete matrix into 
                        low-rank U and V, with an L1 sparsity penalty on the elements 
                        of U and an L2 penalty on the elements of V. 
                        Solved by gradient descent.

    NuclearNormMinimization: Simple implementation of Exact Matrix Completion 
                            via Convex Optimization by Emmanuel Candes and Benjamin 
                            Recht using cvxpy. Too slow for large matrices.

    BiScaler: Iterative estimation of row/column means and standard deviations 
                to get doubly normalized matrix. Not guaranteed to converge but 
                works well in practice. Taken from Matrix Completion and 
                Low-Rank SVD via Fast Alternating Least Squares.
    """
    _fancyimpute_options = {
        'KNN', 'BiScaler', 'NuclearNormMinimization', 'SoftImpute',
        'IterativeSVD'
    }
    if (not has_fancyimpute) and (method in _fancyimpute_options):
        raise ValueError(
            'You must install `fancyimpute` (pip install fancyimpute) to use this method'
        )

    _base_options = {'mean', 'median', 'constant'}
    if (method not in _base_options) and (
            method not in _fancyimpute_options) and (not isinstance(
                method, (int, float))):
        raise ValueError(
            'method not understood.. Use `mean`, `median`, a scalar, or an option from `fancyimpute`'
        )

    X_incomplete = data.copy()

    if method == 'KNN':
        if value is None:
            value = 3
        X_filled = KNN(k=value, verbose=False).complete(X_incomplete)

    elif method == 'BiScaler':
        X_filled = BiScaler(verbose=False).fit_transform(X_incomplete)

    elif method == 'SoftImpute':
        X_filled = SoftImpute(verbose=False).complete(X_incomplete)

    elif method == 'IterativeSVD':
        if value is None:
            rank = min(10, X_incomplete.shape[0] - 2)
        else:
            rank = value
        X_filled = IterativeSVD(rank=rank,
                                verbose=False).complete(X_incomplete)

    elif method == 'mean':
        col_means = np.nanmean(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_means[i]
        X_filled = X_incomplete

    elif method == 'median':
        col_medians = np.nanmedian(X_incomplete, axis=0)
        for i in range(X_incomplete.shape[1]):
            X_incomplete[:, i][np.isnan(X_incomplete[:, i])] = col_medians[i]
        X_filled = X_incomplete

    elif method == 'constant':
        if value is None:
            raise ValueError(
                'Must give `value` argument if method == constant')
        X_incomplete[np.isnan(X_incomplete)] = value
        X_filled = X_incomplete

    return X_filled
Example #15
reader = Reader(rating_scale=(limits[0], limits[1]))
data = Dataset.load_from_df(df[['user', 'item', 'rating']], reader)

df = pd.DataFrame(ratings_dict)
#reader = Reader(line_format='user item rating', sep='\t')

# A reader is still needed but only the rating_scale param is required.

data.split(n_folds=10)  # data can now be used normally

data_full = data.build_full_trainset()

#
obj = IterativeSVD(rank=20,
                   max_iters=700,
                   min_value=limits[0],
                   max_value=limits[1],
                   verbose=True)

datamat_filled_SVD_fancy = obj.complete(datamat_missing)

obj = SoftImpute(shrinkage_value=None,
                 max_iters=700,
                 max_rank=20,
                 n_power_iterations=1,
                 init_fill_method="zero",
                 min_value=limits[0],
                 max_value=limits[1],
                 normalizer=None,
                 verbose=True)
Example #16
def run(folder, name, patients, run_all, save_imputed):
    random_seed = 123
    np.random.seed(seed=random_seed)

    X_corrupt = load_file(folder, name)
    name = name.split('.csv')[0]
    print(name)

    end = X_corrupt.shape[0]
    print(end)
    X = np.genfromtxt('./data/completeCasesBoxCox.csv', delimiter=',',
                      skip_header=1)[:end, 1:]

    scores = {}
    simple_mean_X = SimpleFill(fill_method='mean').complete(X_corrupt)
    scores['simple_mean'] = evaluate(simple_mean_X, X, X_corrupt)

    simple_median_X = SimpleFill(fill_method='median').complete(X_corrupt)
    scores['simple_median'] = evaluate(simple_median_X, X, X_corrupt)

    random_X = SimpleFill(fill_method='random').complete(X_corrupt)
    scores['random'] = evaluate(random_X, X, X_corrupt)

    # SVD
    svd_1_X = IterativeSVD(rank=1).complete(X_corrupt)
    scores['svd_1'] = evaluate(svd_1_X, X, X_corrupt)

    svd_2_X = IterativeSVD(rank=2).complete(X_corrupt)
    scores['svd_2'] = evaluate(svd_2_X, X, X_corrupt)

    svd_3_X = IterativeSVD(rank=3).complete(X_corrupt)
    scores['svd_3'] = evaluate(svd_3_X, X, X_corrupt)

    svd_4_X = IterativeSVD(rank=4).complete(X_corrupt)
    scores['svd_4'] = evaluate(svd_4_X, X, X_corrupt)

    svd_5_X = IterativeSVD(rank=5).complete(X_corrupt)
    scores['svd_5'] = evaluate(svd_5_X, X, X_corrupt)

    svd_6_X = IterativeSVD(rank=6).complete(X_corrupt)
    scores['svd_6'] = evaluate(svd_6_X, X, X_corrupt)

    svd_7_X = IterativeSVD(rank=7).complete(X_corrupt)
    scores['svd_7'] = evaluate(svd_7_X, X, X_corrupt)

    svd_8_X = IterativeSVD(rank=8).complete(X_corrupt)
    scores['svd_8'] = evaluate(svd_8_X, X, X_corrupt)

    svd_9_X = IterativeSVD(rank=9).complete(X_corrupt)
    scores['svd_9'] = evaluate(svd_9_X, X, X_corrupt)

    svd_10_X = IterativeSVD(rank=10).complete(X_corrupt)
    scores['svd_10'] = evaluate(svd_10_X, X, X_corrupt)

    svd_11_X = IterativeSVD(rank=11).complete(X_corrupt)
    scores['svd_11'] = evaluate(svd_11_X, X, X_corrupt)

    svd_12_X = IterativeSVD(rank=12).complete(X_corrupt)
    scores['svd_12'] = evaluate(svd_12_X, X, X_corrupt)

    svd_13_X = IterativeSVD(rank=13).complete(X_corrupt)
    scores['svd_13'] = evaluate(svd_13_X, X, X_corrupt)

    svd_14_X = IterativeSVD(rank=14).complete(X_corrupt)
    scores['svd_14'] = evaluate(svd_14_X, X, X_corrupt)

    svd_15_X = IterativeSVD(rank=15).complete(X_corrupt)
    scores['svd_15'] = evaluate(svd_15_X, X, X_corrupt)

    svd_16_X = IterativeSVD(rank=16).complete(X_corrupt)
    scores['svd_16'] = evaluate(svd_16_X, X, X_corrupt)

    svd_17_X = IterativeSVD(rank=17).complete(X_corrupt)
    scores['svd_17'] = evaluate(svd_17_X, X, X_corrupt)

    svd_18_X = IterativeSVD(rank=18).complete(X_corrupt)
    scores['svd_18'] = evaluate(svd_18_X, X, X_corrupt)

    svd_19_X = IterativeSVD(rank=19).complete(X_corrupt)
    scores['svd_19'] = evaluate(svd_19_X, X, X_corrupt)

    svd_20_X = IterativeSVD(rank=20).complete(X_corrupt)
    scores['svd_20'] = evaluate(svd_20_X, X, X_corrupt)

    svd_21_X = IterativeSVD(rank=21).complete(X_corrupt)
    scores['svd_21'] = evaluate(svd_21_X, X, X_corrupt)

    svd_22_X = IterativeSVD(rank=22).complete(X_corrupt)
    scores['svd_22'] = evaluate(svd_22_X, X, X_corrupt)

    svd_23_X = IterativeSVD(rank=23).complete(X_corrupt)
    scores['svd_23'] = evaluate(svd_23_X, X, X_corrupt)

    svd_24_X = IterativeSVD(rank=24).complete(X_corrupt)
    scores['svd_24'] = evaluate(svd_24_X, X, X_corrupt)

    si_X = SoftImpute().complete(X_corrupt)
    scores['si'] = evaluate(si_X, X, X_corrupt)

    si_s_half_X = SoftImpute(shrinkage_value=0.5).complete(X_corrupt)
    scores['si_s_half'] = evaluate(si_s_half_X, X, X_corrupt)

    si_s_1_X = SoftImpute(shrinkage_value=1).complete(X_corrupt)
    scores['si_s_1'] = evaluate(si_s_1_X, X, X_corrupt)

    si_s_2_X = SoftImpute(shrinkage_value=2).complete(X_corrupt)
    scores['si_s_2'] = evaluate(si_s_2_X, X, X_corrupt)

    si_s_4_X = SoftImpute(shrinkage_value=4).complete(X_corrupt)
    scores['si_s_4'] = evaluate(si_s_4_X, X, X_corrupt)

    si_s_8_X = SoftImpute(shrinkage_value=8).complete(X_corrupt)
    scores['si_s_8'] = evaluate(si_s_8_X, X, X_corrupt)

    si_s_16_X = SoftImpute(shrinkage_value=16).complete(X_corrupt)
    scores['si_s_16'] = evaluate(si_s_16_X, X, X_corrupt)

    si_s_32_X = SoftImpute(shrinkage_value=32).complete(X_corrupt)
    scores['si_s_32'] = evaluate(si_s_32_X, X, X_corrupt)

    si_s_64_X = SoftImpute(shrinkage_value=64).complete(X_corrupt)
    scores['si_s_64'] = evaluate(si_s_64_X, X, X_corrupt)

    si_s_128_X = SoftImpute(shrinkage_value=128).complete(X_corrupt)
    scores['si_s_128'] = evaluate(si_s_128_X, X, X_corrupt)

    if save_imputed:
        np.savetxt('./output/sweeps/' + name + '_simple_mean.csv',
                   simple_mean_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_simple_median.csv',
                   simple_median_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_simple_random.csv',
                   random_X, delimiter=',', newline='\n'),
        np.savetxt('./output/sweeps/' + name + '_svd_1.csv',
                   svd_1_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_2.csv',
                   svd_2_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_3.csv',
                   svd_3_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_4.csv',
                   svd_4_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_5.csv',
                   svd_5_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_6.csv',
                   svd_6_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_7.csv',
                   svd_7_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_8.csv',
                   svd_8_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_9.csv',
                   svd_9_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_10.csv',
                   svd_10_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_11.csv',
                   svd_11_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_12.csv',
                   svd_12_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_13.csv',
                   svd_13_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_14.csv',
                   svd_14_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_15.csv',
                   svd_15_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_16.csv',
                   svd_16_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_17.csv',
                   svd_17_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_18.csv',
                   svd_18_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_19.csv',
                   svd_19_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_20.csv',
                   svd_20_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_21.csv',
                   svd_21_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_22.csv',
                   svd_22_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_23.csv',
                   svd_23_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_svd_24.csv',
                   svd_24_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si.csv',
                   si_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_half.csv',
                   si_s_half_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_1.csv',
                   si_s_1_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_2.csv',
                   si_s_2_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_4.csv',
                   si_s_4_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_8.csv',
                   si_s_8_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_16.csv',
                   si_s_16_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_32.csv',
                   si_s_32_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_64.csv',
                   si_s_64_X, delimiter=',', newline='\n')
        np.savetxt('./output/sweeps/' + name + '_si_s_128.csv',
                   si_s_128_X, delimiter=',', newline='\n')

    if run_all:
        mice_X = MICE().complete(X_corrupt)
        scores['MICE'] = evaluate(mice_X, X, X_corrupt)

        mice_col_lambda_reg_25 = MICE(
            model=BayesianRidgeRegression(lambda_reg=0.25)).complete(X_corrupt)
        scores['MICE_col_lambda_reg_25'] = evaluate(
            mice_col_lambda_reg_25, X, X_corrupt)

        mice_col_lambda_reg_10 = MICE(
            model=BayesianRidgeRegression(lambda_reg=0.1)).complete(X_corrupt)
        scores['MICE_col_lambda_reg_10'] = evaluate(
            mice_col_lambda_reg_10, X, X_corrupt)

        mice_col_lambda_reg_1 = MICE(
            model=BayesianRidgeRegression(lambda_reg=0.01)).complete(X_corrupt)
        scores['MICE_col_lambda_reg_1'] = evaluate(
            mice_col_lambda_reg_1, X, X_corrupt)

        mice_col_lambda_reg_01 = MICE(
            model=BayesianRidgeRegression(lambda_reg=0.001)).complete(X_corrupt)
        scores['MICE_col_lambda_reg_01'] = evaluate(
            mice_col_lambda_reg_01, X, X_corrupt)

        mice_col_lambda_reg_001 = MICE(
            model=BayesianRidgeRegression(lambda_reg=0.0001)).complete(X_corrupt)
        scores['MICE_col_lambda_reg_001'] = evaluate(
            mice_col_lambda_reg_001, X, X_corrupt)

        mice_pmm_X = MICE(impute_type='pmm').complete(X_corrupt)
        scores['MICE_pmm'] = evaluate(mice_pmm_X, X, X_corrupt)

        mice_pmm_lambda_reg_25 = MICE(
            impute_type='pmm',
            model=BayesianRidgeRegression(lambda_reg=0.25)).complete(X_corrupt)
        scores['MICE_pmm_lambda_reg_25'] = evaluate(
            mice_pmm_lambda_reg_25, X, X_corrupt)

        mice_pmm_lambda_reg_10 = MICE(
            impute_type='pmm',
            model=BayesianRidgeRegression(lambda_reg=0.1)).complete(X_corrupt)
        scores['MICE_pmm_lambda_reg_10'] = evaluate(
            mice_pmm_lambda_reg_10, X, X_corrupt)

        mice_pmm_lambda_reg_1 = MICE(
            impute_type='pmm',
             model=BayesianRidgeRegression(lambda_reg=0.01)).complete(X_corrupt)
        scores['MICE_pmm_lambda_reg_1'] = evaluate(mice_pmm_lambda_reg_1, X, X_corrupt)

        mice_pmm_lambda_reg_01 = MICE(
            impute_type='pmm',
            model=BayesianRidgeRegression(lambda_reg=0.001)).complete(X_corrupt)
        scores['MICE_pmm_lambda_reg_01'] = evaluate(mice_pmm_lambda_reg_01, X, X_corrupt)

        mice_pmm_lambda_reg_001 = MICE(
            impute_type='pmm',
            model=BayesianRidgeRegression(lambda_reg=0.0001)).complete(X_corrupt)
        scores['MICE_pmm_lambda_reg_001'] = evaluate(
            mice_pmm_lambda_reg_001, X, X_corrupt)

        knn_1_X = KNN(k=1).complete(X_corrupt)
        scores['knn_1'] = evaluate(knn_1_X, X, X_corrupt)

        knn_3_X = KNN(k=3).complete(X_corrupt)
        scores['knn_3'] = evaluate(knn_3_X, X, X_corrupt)

        knn_9_X = KNN(k=9).complete(X_corrupt)
        scores['knn_9'] = evaluate(knn_9_X, X, X_corrupt)

        knn_15_X = KNN(k=15).complete(X_corrupt)
        scores['knn_15'] = evaluate(knn_15_X, X, X_corrupt)

        knn_30_X = KNN(k=30).complete(X_corrupt)
        scores['knn_30'] = evaluate(knn_30_X, X, X_corrupt)

        knn_81_X = KNN(k=81).complete(X_corrupt)
        scores['knn_81'] = evaluate(knn_81_X, X, X_corrupt)

        knn_243_X = KNN(k=243).complete(X_corrupt)
        scores['knn_243'] = evaluate(knn_243_X, X, X_corrupt)

        knn_751_X = KNN(k=751).complete(X_corrupt)
        scores['knn_751'] = evaluate(knn_751_X, X, X_corrupt)

        knn_2000_X = KNN(k=2000).complete(X_corrupt)
        scores['knn_2000'] = evaluate(knn_2000_X, X, X_corrupt)

        knn_6000_X = KNN(k=6000).complete(X_corrupt)
        scores['knn_6000'] = evaluate(knn_6000_X, X, X_corrupt)

        if save_imputed:
            np.savetxt('./output/sweeps/' + name + '_MICE.csv',
                       mice_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name +
                       '_mice_col_lambda_reg_25.csv',
                       mice_col_lambda_reg_25, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_col_lambda_reg_10.csv',
                       mice_col_lambda_reg_10, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_col_lambda_reg_1.csv',
                       mice_col_lambda_reg_1, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_col_lambda_reg_01.csv',
                       mice_col_lambda_reg_01, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_col_lambda_reg_001.csv',
                       mice_col_lambda_reg_001, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_X.csv',
                       mice_pmm_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_lambda_reg_25.csv',
                       mice_pmm_lambda_reg_25, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_lambda_reg_10.csv',
                       mice_pmm_lambda_reg_10, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_lambda_reg_1.csv',
                       mice_pmm_lambda_reg_1, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_lambda_reg_01.csv',
                       mice_pmm_lambda_reg_01, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_mice_pmm_lambda_reg_001.csv',
                       mice_pmm_lambda_reg_001, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_1.csv',
                       knn_1_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_3.csv',
                       knn_3_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_9.csv',
                       knn_9_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_15.csv',
                       knn_15_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_30.csv',
                       knn_30_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_81.csv',
                       knn_81_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_243.csv',
                       knn_243_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_751.csv',
                       knn_751_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_2000.csv',
                       knn_2000_X, delimiter=',', newline='\n')
            np.savetxt('./output/sweeps/' + name + '_knn_6000.csv',
                       knn_6000_X, delimiter=',', newline='\n')
    print(scores)
    scores_df = pd.DataFrame().from_dict(scores.items())
    scores_df.columns = ['Method', 'Score']
    scores_df.set_index('Method', inplace=True)
    scores_df.to_csv('./output/scores/' + folder + '/' + name + '.csv')
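The rank and shrinkage sweeps above repeat one block per value; the same sweep can be written as a loop. A compact sketch using the names from the example above and keeping the .complete() call it already uses:

for rank in range(1, 25):
    svd_X = IterativeSVD(rank=rank).complete(X_corrupt)
    scores['svd_%d' % rank] = evaluate(svd_X, X, X_corrupt)
    if save_imputed:
        np.savetxt('./output/sweeps/%s_svd_%d.csv' % (name, rank),
                   svd_X, delimiter=',', newline='\n')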
        RNA_txt.to_csv(datadir + '/filled_data/Mean_' + cancertype +
                       str(missing_perc * 100) + '_' + str(sample_count) +
                       '.csv')

        nz = test_data[:, :RNA_size].size
        nnm_mse = np.sqrt((np.linalg.norm(
            (X_filled[:test_data.shape[0], :RNA_size] -
             test_data[:, :RNA_size]))**2) / nz)
        print("Mean method, RMSE: %f" % nnm_mse)
        loss_list_Mean[cancer_c, perc, sample_count - 1] = nnm_mse

        ##############SVD
        rank = 10
        X_filled = IterativeSVD(
            rank,
            init_fill_method="mean",
            verbose=False,
            convergence_threshold=0.0000001).fit_transform(df_combine)
        RNA_txt = pd.DataFrame(X_filled[:, :RNA_size],
                               index=shuffle_cancer.index,
                               columns=shuffle_cancer.columns[:RNA_size])
        RNA_txt.to_csv(datadir + '/filled_data/SVD_' + cancertype +
                       str(missing_perc * 100) + '_' + str(sample_count) +
                       '.csv')

        nz = test_data[:, :RNA_size].size
        nnm_mse = np.sqrt((np.linalg.norm(
            (X_filled[:test_data.shape[0], :RNA_size] -
             test_data[:, :RNA_size]))**2) / nz)
        print("SVD, RMSE: %f" % nnm_mse)
        loss_list_SVD[cancer_c, perc, sample_count - 1] = nnm_mse
def _perform_imputation(job_context: Dict) -> Dict:
    """

    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283: 
     - Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame)
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums)
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns

    """
    job_context['time_start'] = timezone.now()

    # Combine all microarray samples with a full join to form a microarray_expression_matrix (this may end up being a DataFrame)
    microarray_expression_matrix = job_context['microarray_inputs']

    # Combine all RNA-seq samples (lengthScaledTPM) with a full outer join to form a rnaseq_expression_matrix
    rnaseq_expression_matrix = job_context['rnaseq_inputs']

    # Calculate the sum of the lengthScaledTPM values for each row (gene) of the rnaseq_expression_matrix (rnaseq_row_sums)
    rnaseq_row_sums = np.sum(rnaseq_expression_matrix, axis=1)

    # Calculate the 10th percentile of rnaseq_row_sums
    rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

    # Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
    # TODO: There is probably a better way to do this with `np.where`
    rows_to_filter = []
    for (x, sum_val) in rnaseq_row_sums.items():
        if sum_val < rnaseq_tenth_percentile:
            rows_to_filter.append(x)

    filtered_rnaseq_matrix = rnaseq_expression_matrix.drop(rows_to_filter)

    # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
    filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)

    # Cache our RNA-Seq zero values
    cached_zeroes = {}
    for column in log2_rnaseq_matrix.columns:
        cached_zeroes[column] = np.where(log2_rnaseq_matrix[column] == 0)

    # Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of where these zeroes are
    log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan

    # Perform a full outer join of microarray_expression_matrix and log2_rnaseq_matrix; combined_matrix
    combined_matrix = microarray_expression_matrix.merge(log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

    # Remove genes (rows) with more than 30% missing values in combined_matrix
    thresh = combined_matrix.shape[1] * .7 # (Rows, Columns)
    row_filtered_combined_matrix = combined_matrix.dropna(axis='index', thresh=thresh) # Everything below `thresh` is dropped

    # Remove samples (columns) with more than 50% missing values in combined_matrix
    # XXX: Find better test data for this!
    col_thresh = row_filtered_combined_matrix.shape[0] * .5
    row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(axis='columns', thresh=col_thresh)

    # "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero again) in combined_matrix
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns
        if column not in row_col_filtered_combined_matrix_samples:
            continue
        
        # Place the zero
        try:
            np.put(row_col_filtered_combined_matrix_samples[column], zeroes, 0.0)
        except Exception as e:
            logger.exception("Error when replacing zero")
            continue

    # Label our new replaced data
    combined_matrix_zero = row_col_filtered_combined_matrix_samples

    # Transpose combined_matrix; transposed_matrix
    transposed_matrix = combined_matrix_zero.transpose() #  row_col_filtered_combined_matrix_samples.transpose()

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan)

    # Perform imputation of missing values with IterativeSVD (rank=10) on the transposed_matrix; imputed_matrix
    imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix)

    # Untranspose imputed_matrix (genes are now rows, samples are now columns)
    untransposed_imputed_matrix = imputed_matrix.transpose()

    # Convert back to Pandas
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples.index
    untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples.columns

    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    # XXX: Refactor QN target acquisition and application before doing this
    job_context['organism'] = Organism.get_object_for_name(list(job_context['input_files'].keys())[0])
    job_context['merged_no_qn'] = untransposed_imputed_matrix_df

    # Perform the Quantile Normalization
    job_context = smasher._quantile_normalize(job_context, ks_check=False)
    job_context['time_end'] = timezone.now()
    job_context['formatted_command'] = "create_compendia.py"

    return job_context
while len(rows_to_impute) > 0:
    try:
        impute_me = set(
            random.sample(list(rows_to_impute),
                          int(len(all_rows) * iteration_percent)))
    except Exception:
        # Population larger than sample
        impute_me = rows_to_impute
    rows_to_impute = rows_to_impute - impute_me

    df.loc[list(impute_me), 'SYNTHETIC'] = np.nan

    needs_imputation_transposed = df.transpose()
    print("Imputing step!")
    imputed_matrix = IterativeSVD(
        rank=10).fit_transform(needs_imputation_transposed)
    imputed_matrix_transposed = imputed_matrix.transpose()
    print("Imputed!")

    # Convert back to Pandas
    df = df.transpose()
    df_imputed_matrix_transposed = pd.DataFrame.from_records(
        imputed_matrix_transposed)
    df_imputed_matrix_transposed.index = all_rows
    df_imputed_matrix_transposed.columns = all_cols
    df = df_imputed_matrix_transposed

df.to_csv('synthetic_' + colname + "_" + str(iteration_percent) + '.tsv',
          sep='\t',
          encoding='utf-8')
Example #20
def _perform_imputation(job_context: Dict) -> Dict:
    """

    Take the inputs and perform the primary imputation.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Combine all microarray samples with a full join to form a
       microarray_expression_matrix (this may end up being a DataFrame).
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
       to form a rnaseq_expression_matrix.
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums).
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th percentile of
       rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep track of
       where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make these zero
       again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
        -- with specified svd algorithm or skip
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are columns

    """
    imputation_start = log_state("start perform imputation",
                                 job_context["job"].id)
    job_context["time_start"] = timezone.now()
    rnaseq_row_sums_start = log_state("start rnaseq row sums",
                                      job_context["job"].id)

    # We potentially can have a microarray-only compendia but not a RNASeq-only compendia
    log2_rnaseq_matrix = None
    if job_context["rnaseq_matrix"] is not None:
        # Drop any genes that are entirely NULL in the RNA-Seq matrix
        job_context["rnaseq_matrix"] = job_context["rnaseq_matrix"].dropna(
            axis="columns", how="all")

        # Calculate the sum of the lengthScaledTPM values for each row
        # (gene) of the rnaseq_matrix (rnaseq_row_sums)
        rnaseq_row_sums = np.sum(job_context["rnaseq_matrix"], axis=1)

        log_state("end rnaseq row sums", job_context["job"].id,
                  rnaseq_row_sums_start)
        rnaseq_decile_start = log_state("start rnaseq decile",
                                        job_context["job"].id)

        # Calculate the 10th percentile of rnaseq_row_sums
        rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

        log_state("end rnaseq decile", job_context["job"].id,
                  rnaseq_decile_start)
        drop_start = log_state("drop all rows", job_context["job"].id)
        # Drop all rows in rnaseq_matrix with a row sum < 10th
        # percentile of rnaseq_row_sums; this is now
        # filtered_rnaseq_matrix
        # TODO: There is probably a better way to do this with `np.where`
        rows_to_filter = []
        for (x, sum_val) in rnaseq_row_sums.items():
            if sum_val < rnaseq_tenth_percentile:
                rows_to_filter.append(x)

        del rnaseq_row_sums

        log_state("actually calling drop()", job_context["job"].id)

        filtered_rnaseq_matrix = job_context.pop("rnaseq_matrix").drop(
            rows_to_filter)

        del rows_to_filter

        log_state("end drop all rows", job_context["job"].id, drop_start)
        log2_start = log_state("start log2", job_context["job"].id)

        # log2(x + 1) transform filtered_rnaseq_matrix; this is now log2_rnaseq_matrix
        filtered_rnaseq_matrix_plus_one = filtered_rnaseq_matrix + 1
        log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix_plus_one)
        del filtered_rnaseq_matrix_plus_one
        del filtered_rnaseq_matrix

        log_state("end log2", job_context["job"].id, log2_start)
        cache_start = log_state("start caching zeroes", job_context["job"].id)

        # Cache our RNA-Seq zero values
        cached_zeroes = {}
        for column in log2_rnaseq_matrix.columns:
            cached_zeroes[column] = log2_rnaseq_matrix.index[np.where(
                log2_rnaseq_matrix[column] == 0)]

        # Set all zero values in log2_rnaseq_matrix to NA, but make sure
        # to keep track of where these zeroes are
        log2_rnaseq_matrix[log2_rnaseq_matrix == 0] = np.nan

        log_state("end caching zeroes", job_context["job"].id, cache_start)

    outer_merge_start = log_state("start outer merge", job_context["job"].id)

    # Perform a full outer join of microarray_matrix and
    # log2_rnaseq_matrix; combined_matrix
    if log2_rnaseq_matrix is not None:
        combined_matrix = job_context.pop("microarray_matrix").merge(
            log2_rnaseq_matrix, how="outer", left_index=True, right_index=True)
    else:
        logger.info("Building compendia with only microarray data.",
                    job_id=job_context["job"].id)
        combined_matrix = job_context.pop("microarray_matrix")

    log_state("ran outer merge, now deleting log2_rnaseq_matrix",
              job_context["job"].id)

    del log2_rnaseq_matrix

    log_state("end outer merge", job_context["job"].id, outer_merge_start)
    drop_na_genes_start = log_state("start drop NA genes",
                                    job_context["job"].id)

    # # Visualize Prefiltered
    # output_path = job_context['output_dir'] + "pre_filtered_" + str(time.time()) + ".png"
    # visualized_prefilter = visualize.visualize(combined_matrix.copy(), output_path)

    # Remove genes (rows) with more than 30% missing values in combined_matrix
    thresh = combined_matrix.shape[1] * 0.7  # (Rows, Columns)
    # Everything below `thresh` is dropped
    row_filtered_matrix = combined_matrix.dropna(axis="index", thresh=thresh)

    del combined_matrix
    del thresh

    log_state("end drop NA genes", job_context["job"].id, drop_na_genes_start)
    drop_na_samples_start = log_state("start drop NA samples",
                                      job_context["job"].id)

    # # Visualize Row Filtered
    # output_path = job_context['output_dir'] + "row_filtered_" + str(time.time()) + ".png"
    # visualized_rowfilter = visualize.visualize(row_filtered_matrix.copy(), output_path)

    # Remove samples (columns) with more than 50% missing values in combined_matrix
    # XXX: Find better test data for this!
    col_thresh = row_filtered_matrix.shape[0] * 0.5
    row_col_filtered_matrix_samples = row_filtered_matrix.dropna(
        axis="columns", thresh=col_thresh)
    row_col_filtered_matrix_samples_index = row_col_filtered_matrix_samples.index
    row_col_filtered_matrix_samples_columns = row_col_filtered_matrix_samples.columns

    log_state("end drop NA samples", job_context["job"].id,
              drop_na_samples_start)
    replace_zeroes_start = log_state("start replace zeroes",
                                     job_context["job"].id)

    for sample_accession_code in row_filtered_matrix.columns:
        if sample_accession_code not in row_col_filtered_matrix_samples_columns:
            sample = Sample.objects.get(accession_code=sample_accession_code)
            sample_metadata = sample.to_metadata_dict()
            job_context["filtered_samples"][sample_accession_code] = {
                **sample_metadata,
                "reason":
                "Sample was dropped because it had less than 50% present values.",
                "experiment_accession_code":
                smashing_utils.get_experiment_accession(
                    sample.accession_code, job_context["dataset"].data),
            }

    del row_filtered_matrix

    # # Visualize Row and Column Filtered
    # output_path = job_context['output_dir'] + "row_col_filtered_" + str(time.time()) + ".png"
    # visualized_rowcolfilter = visualize.visualize(row_col_filtered_matrix_samples.copy(),
    #                                               output_path)

    # "Reset" zero values that were set to NA in RNA-seq samples
    # (i.e., make these zero again) in combined_matrix
    for column in cached_zeroes.keys():
        zeroes = cached_zeroes[column]

        # Skip purged columns
        if column not in row_col_filtered_matrix_samples:
            continue

        # Place the zero
        try:
            # This generates a warning, so use loc[] instead
            # row_col_filtered_matrix_samples[column].replace(zeroes, 0.0, inplace=True)
            zeroes_list = zeroes.tolist()
            new_index_list = row_col_filtered_matrix_samples_index.tolist()
            new_zeroes = list(set(new_index_list) & set(zeroes_list))
            row_col_filtered_matrix_samples[column].loc[new_zeroes] = 0.0
        except Exception:
            logger.warn("Error when replacing zero")
            continue

    log_state("end replace zeroes", job_context["job"].id,
              replace_zeroes_start)
    transposed_zeroes_start = log_state("start replacing transposed zeroes",
                                        job_context["job"].id)

    # Label our new replaced data
    combined_matrix_zero = row_col_filtered_matrix_samples
    del row_col_filtered_matrix_samples

    transposed_matrix_with_zeros = combined_matrix_zero.T
    del combined_matrix_zero

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix_with_zeros.replace([np.inf, -np.inf],
                                                             np.nan)
    del transposed_matrix_with_zeros

    log_state("end replacing transposed zeroes", job_context["job"].id,
              transposed_zeroes_start)

    # Store the absolute/percentages of imputed values
    matrix_sum = transposed_matrix.isnull().sum()
    percent = (matrix_sum /
               transposed_matrix.isnull().count()).sort_values(ascending=False)
    total_percent_imputed = sum(percent) / len(transposed_matrix.count())
    job_context["total_percent_imputed"] = total_percent_imputed
    logger.info("Total percentage of data to impute!",
                total_percent_imputed=total_percent_imputed)

    # Perform imputation of missing values with IterativeSVD (rank=10) on the
    # transposed_matrix; imputed_matrix
    svd_algorithm = job_context["dataset"].svd_algorithm
    if svd_algorithm != "NONE":
        svd_start = log_state("start SVD", job_context["job"].id)

        logger.info("IterativeSVD algorithm: %s" % svd_algorithm)
        svd_algorithm = str.lower(svd_algorithm)
        imputed_matrix = IterativeSVD(
            rank=10,
            svd_algorithm=svd_algorithm).fit_transform(transposed_matrix)

        svd_start = log_state("end SVD", job_context["job"].id, svd_start)
    else:
        imputed_matrix = transposed_matrix
        logger.info("Skipping IterativeSVD")
    del transposed_matrix

    untranspose_start = log_state("start untranspose", job_context["job"].id)

    # Untranspose imputed_matrix (genes are now rows, samples are now columns)
    untransposed_imputed_matrix = imputed_matrix.T
    del imputed_matrix

    # Convert back to Pandas
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(
        untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_matrix_samples_index
    untransposed_imputed_matrix_df.columns = row_col_filtered_matrix_samples_columns
    del untransposed_imputed_matrix
    del row_col_filtered_matrix_samples_index
    del row_col_filtered_matrix_samples_columns
    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    job_context["organism"] = Organism.get_object_for_name(
        job_context["organism_name"])
    job_context["merged_no_qn"] = untransposed_imputed_matrix_df
    # output_path = job_context['output_dir'] + "compendia_no_qn_" + str(time.time()) + ".png"
    # visualized_merged_no_qn = visualize.visualize(untransposed_imputed_matrix_df.copy(),
    #                                               output_path)

    log_state("end untranspose", job_context["job"].id, untranspose_start)
    quantile_start = log_state("start quantile normalize",
                               job_context["job"].id)

    # Perform the Quantile Normalization
    job_context = smashing_utils.quantile_normalize(job_context,
                                                    ks_check=False)

    log_state("end quantile normalize", job_context["job"].id, quantile_start)

    # Visualize Final Compendia
    # output_path = job_context['output_dir'] + "compendia_with_qn_" + str(time.time()) + ".png"
    # visualized_merged_qn = visualize.visualize(job_context['merged_qn'].copy(), output_path)

    job_context["time_end"] = timezone.now()
    job_context["formatted_command"] = ["create_compendia.py"]
    log_state("end prepare imputation", job_context["job"].id,
              imputation_start)
    return job_context
Example #21
                                   uncorrelated=correlation_proportions[2],
                                   missing_portion=0.0,
                                   fill_na=np.nan)

X, _, y = generator.generate_data_logistic(1024, min_mult=0.0, max_mult=1.0)
# X_incomplete has the same values as X except a subset have been replaced with NaN
X_incomplete, missing_mask = generator.generate_missing(X, 0.1, np.nan)

# Use 3 nearest rows which have a feature to fill in each row's missing features
X_filled_knn = KNN(k=3).fit_transform(X_incomplete)

# matrix completion using MICE
X_filled_mice = IterativeImputer().fit_transform(X_incomplete)

# matrix completion using Iterative SVD
X_filled_svd = IterativeSVD(rank=3).fit_transform(X_incomplete)
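# IterativeSVD alternates a truncated rank-3 SVD reconstruction with re-filling only
# the missing entries until convergence (roughly the SVDimpute approach).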

# matrix completion using Matrix Factorization
X_filled_mf = MatrixFactorization(learning_rate=0.01,
                                  rank=3,
                                  l2_penalty=0,
                                  min_improvement=1e-6).fit_transform(X_incomplete)

# matrix completion using Mean Fill
X_filled_meanfill = SimpleFill(fill_method='mean').fit_transform(X_incomplete)
# matrix completion using Median Fill
X_filled_medianfill = SimpleFill(fill_method='median').fit_transform(X_incomplete)
# matrix completion using Zero Fill
X_filled_zerofill = SimpleFill(fill_method='zero').fit_transform(X_incomplete)
# matrix completion using Min Fill
X_filled_minfill = SimpleFill(fill_method='min').fit_transform(X_incomplete)
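
# A quick sanity check for the fills above: mean absolute error restricted to the
# held-out entries (a minimal sketch; it assumes missing_mask is a boolean array with
# the same shape as X, as returned by generate_missing above).
for name, filled in [("KNN", X_filled_knn), ("MICE", X_filled_mice),
                     ("IterativeSVD", X_filled_svd),
                     ("MatrixFactorization", X_filled_mf),
                     ("mean fill", X_filled_meanfill)]:
    mae = np.abs(filled[missing_mask] - X[missing_mask]).mean()
    print("%s MAE on held-out entries: %.4f" % (name, mae))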
Beispiel #22
0
import numpy as np
import sklearn
import pandas as pd
import matplotlib.pyplot as plt
from fancyimpute import IterativeSVD

print("reading data...")

X = pd.read_csv("Data/train.csv").iloc[:, 1:-1].to_numpy(
)  # header row is consumed by read_csv; drop the first and last columns
test = pd.read_csv("Data/test.csv").iloc[:, 1:].to_numpy()
# ind = np.genfromtxt('Class_change_ind.csv', delimiter = ',', dtype = 'int32')

# test_incomplete = test.copy()

print(X.shape)

print("setting svd object...")
svd = IterativeSVD(rank=1000, convergence_threshold=0.0001)

X_svd = svd.complete(X)
# print X_svd[:,0]

print("saving data...")
np.savetxt("Data/train_isvdimp.csv", X_svd, delimiter=",")

print("imputing test...")
test_svd = svd.complete(test)

np.savetxt("Data/test_isvdimp.csv", test_svd, delimiter=",")
print('\a')
    for negative_log_regularization_weight in [2, 3, 4]:
        regularization_weight = 10.0**-negative_log_regularization_weight
        table.add_entry(solver=IterativeImputer(
            n_nearest_columns=80,
            n_iter=50,
            n_burn_in=5,
        ),
                        name="IterativeImputer_%d" %
                        negative_log_regularization_weight)

    for fill_method in ["mean", "median"]:
        table.add_entry(solver=SimpleFill(fill_method=fill_method),
                        name="SimpleFill_%s" % fill_method)

    for k in [1, 3, 7]:
        table.add_entry(solver=KNN(k=k, orientation="rows"),
                        name="KNN_k%d" % (k, ))

    for shrinkage_value in [25, 50, 100]:
        # SoftImpute without rank constraints
        table.add_entry(solver=SoftImpute(shrinkage_value=shrinkage_value),
                        name="SoftImpute_lambda%d" % (shrinkage_value, ))

    for rank in [10, 20, 40]:
        table.add_entry(solver=IterativeSVD(rank=rank,
                                            init_fill_method="zero"),
                        name="IterativeSVD_rank%d" % (rank, ))

    table.save_html_table()
    table.print_sorted_errors()
def main():

    #Creating matrix e1

    sig_matrix = np.loadtxt('Clear_Cell_Cycle.txt')
    print("Dimension of raw e1: ")
    print(sig_matrix.shape)
    print('\n')

    #---------------------------------------------------

    #Filling missing data in e1

    sig_matrix[sig_matrix == 0] = np.NaN
    X_incomplete = sig_matrix
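    # fancyimpute treats NaN as the missing-value marker, which is why the zero
    # placeholders are converted to NaN before running the completion below.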
    #imputer = Imputer()
    #transformed_sig_matrix = imputer.fit_transform(sig_matrix)
    #Count the number of NaN values in each column
    #print(np.isnan(transformed_sig_matrix).sum())
    #sig_matrix = transformed_sig_matrix

    # Use SVD
    X_filled = IterativeSVD().complete(X_incomplete)
    # Use 3 nearest rows which have a feature to fill in each row's missing features
    X_filled_knn = KNN(k=5).complete(X_incomplete)
    #   svd_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
    #   print("IterativeSVD MSE: %f" % svd_mse)

    # matrix completion using convex optimization to find low-rank solution
    # that still matches observed values. Slow!
    #X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

    # print mean squared error for the three imputation methods above
    #nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
    #print("Nuclear norm minimization MSE: %f" % nnm_mse)

    #   knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
    #   print("knnImpute MSE: %f" % knn_mse)

    sig_matrix = X_filled_knn
    e1 = sig_matrix

    #   print("\n Tesnor 1 before decomposition\n")
    #with open("Tensor_Before_Decomposition", 'w') as fout:
    #fout.writelines(sess.run(T))
    # print(sess.run(T))
    Tfirst = T = TensorProj(e1)
    n = tf.norm(T)
    print("\n")
    print(sess.run(n))
    T_new = tf.zeros([len(e1), len(e1), len(e1)], tf.float32)

    #scaling T
    #   norm = tf.nn.l2_normalize(T, 0, epsilon = 1e-12, name = None)
    #   norm = tf.norm(T)
    #   T = norm
    #    print(sess.run(T))

    #   fout = open("norm.txt", 'a')
    for i in range(200):

        decomp = decom(T, g)
        T_new += decomp
        T = decomp
        decomp = Tfirst - T_new
        init_op = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init_op)
        n = tf.norm(decomp)
        print("\n")
        print(sess.run(n))
        with open('norm.txt', 'a') as fout:
            fout.write(str(sess.run(n)) + '\n')

    #norm = tf.nn.l2_normalize(T_new, 0, epsilon = 1e-12, name = None)
    #print("\n sum of decomposition:\n")
    #init_op = tf.global_variables_initializer()
    #sess = tf.Session()
    #sess.run(init_op)
    #print(sess.run(T_new))
    print("\n")

    # Creating e2: project e1 onto the column space of b
    # (b is presumably the basis/binding matrix loaded elsewhere in this script)
    b_pseoudoInv = np.linalg.pinv(b)
    project = np.dot(b, b_pseoudoInv)
    e2 = np.dot(project, e1)

    Tfirst = T = TensorProj(e2)

    print("Tensor 2 before decomposition\n")
    init_op = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init_op)
    #print(sess.run(T))
    print("\n")

    g = tf.Variable(tf.random_uniform([N]))
    #print('\n g is \n')
    init_op = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init_op)
    print(sess.run(g))

    T2 = T_new = tf.zeros([len(e2), len(e2), len(e2)], tf.float32)
    for i in range(0):
        T1 = decom(T, g)
        T_new += T - T1
        T = T1
        T2 = T_new - Tfirst
        init_op = tf.global_variables_initializer()
        sess = tf.Session()
        sess.run(init_op)
        n = tf.norm(T2)
        print(sess.run(n))
        print("\n")

    norm = tf.nn.l2_normalize(T_new, 0, epsilon=1e-12, name=None)
    print("sum of decomposition:\n")
    init_op = tf.global_variables_initializer()
    sess = tf.Session()
    sess.run(init_op)
    print(sess.run(T_new))
    print("\n")
def main():

    #Enter the raw data file (main signal matrix e1)

    with open('Cell_Cycle_Expresion.txt') as infile:
        with open("Clear_Cell_Cycle.txt", "w") as outfile:
            for line in infile:
                Clear_Cell_Cycle(line, outfile)

    #Removing first row of the data file

    with open("Clear_Cell_Cycle.txt", 'r') as fin:
        data = fin.read().splitlines(True)
    with open("Clear_Cell_Cycle.txt", 'w') as fout:
        fout.writelines(data[1:])

    #Keeping name of the genes and features of e1 (Cell_Cycle_Expresion)

    with open('Cell_Cycle_Expresion.txt', "r") as infile:
        with open("GeneNames_Cell_Cycle.txt", "w") as outfile:
            for line in infile:
                outfile.write("\t".join(line.split()[0]) + "\n")

    with open("GeneNames_Cell_Cycle.txt", 'r') as fin:
        data = fin.read().splitlines(True)
    with open("GeneNames_Cell_Cycle.txt", 'w') as fout:
        fout.writelines(data[1:])

    #Creating list of signal genes

    f = open("GeneNames_Cell_Cycle.txt", 'r')
    sig_genes = f.readlines()
    print("Length of signal genes of e1: ")
    print(len(sig_genes))
    print('\n')

    #Changing NULL to 0

    with open("Clear_Cell_Cycle.txt", 'r') as fin2:
        data = fin2.read().splitlines(True)
    with open("Clear_Cell_Cycle.txt", 'w') as fout2:
        for line in data:
            fout2.write(line.replace('Null', '0'))

    #Creating matrix e1

    sig_matrix = np.loadtxt('Clear_Cell_Cycle.txt')
    print("Dimension of raw e1: ")
    print(sig_matrix.shape)
    print('\n')

    #---------------------------------------------------

    #Filling missing data in e1

    sig_matrix[sig_matrix == 0] = np.NaN
    X_incomplete = sig_matrix
    #imputer = Imputer()
    #transformed_sig_matrix = imputer.fit_transform(sig_matrix)
    #Count the number of NaN values in each column
    #print(np.isnan(transformed_sig_matrix).sum())
    #sig_matrix = transformed_sig_matrix

    # Use SVD
    X_filled = IterativeSVD().complete(X_incomplete)
    # Use 3 nearest rows which have a feature to fill in each row's missing features
    X_filled_knn = KNN(k=5).complete(X_incomplete)
    #   svd_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
    #   print("IterativeSVD MSE: %f" % svd_mse)

    # matrix completion using convex optimization to find low-rank solution
    # that still matches observed values. Slow!
    #X_filled_nnm = NuclearNormMinimization().complete(X_incomplete)

    # print mean squared error for the three imputation methods above
    #nnm_mse = ((X_filled_nnm[missing_mask] - X[missing_mask]) ** 2).mean()
    #print("Nuclear norm minimization MSE: %f" % nnm_mse)

    #   knn_mse = ((X_filled_knn[missing_mask] - X[missing_mask]) ** 2).mean()
    #   print("knnImpute MSE: %f" % knn_mse)

    sig_matrix = X_filled_knn

    #Center the expressions of genes
    sig_npArray = np.array(sig_matrix)
    sig_mean = np.mean(sig_npArray, axis=0)
    print("Mean of e1 at it's time level:")
    print(sig_mean)
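    # Mean-center each time point (column) so the SVD below captures variation
    # around the average expression profile rather than the overall signal level.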
    for i in range(0, 18):
        sig_matrix[:, i] = sig_matrix[:, i] - sig_mean[i]
    print("\n")

    #--------------------------------------------------

    #Calculating svd of e1   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays1, eigenexpressions1, eigengenes1 = np.linalg.svd(
        sig_matrix, full_matrices=False)

    #Creating a1 by e1

    a1 = np.dot(sig_matrix, sig_matrix.transpose())
    print("Dimension of raw a1: ")
    print(a1.shape)
    print('\n')

    #Calculating Evd of a1:network1

    eigenarrays1_trans = eigenarrays1.transpose()
    a1_sSquare = np.square(eigenexpressions1)

    #Calculating significance of subnetworks of a1

    M1 = 4
    a1Frac = fraction(eigenexpressions1, M1)
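    # fraction() presumably normalizes the squared eigenexpressions so the top M1
    # values sum to 1; ent() then reduces that distribution to a Shannon-entropy-style
    # measure (low = one dominant subnetwork, high = signal spread evenly).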
    print(
        "Expression correlations for 4 most significant subnetworks of a1(%)")
    print(a1Frac * 100)
    print('\n')
    a1_entropy = ent(a1Frac)
    print("Entropy of matrix a1")
    print(a1_entropy)
    print('\n')

    #---------------------------------------------------
    #Enter the raw data files for basic signals b1, b2, and b3

    #b1
    with open('Cell_Cycle_Binding.txt') as infile2:
        with open("Clear_Cycle_Bin.txt", "w") as outfile2:
            for line in infile2:
                Clear_Bind(line, outfile2)

    #Removing first row of the data file

    with open("Clear_Cycle_Bin.txt", 'r') as fin2:
        data = fin2.read().splitlines(True)
    with open("Clear_Cycle_Bin.txt", 'w') as fout2:
        fout2.writelines(data[1:])

    #Creating matrix basis signal b1

    sig_basis1 = np.loadtxt('Clear_Cycle_Bin.txt')
    print("Dimension of b1(Cell Cycle Binding): ")
    print(sig_basis1.shape)
    print('\n')

    #Keeping name of the genes and features of b1 (Cell_Cycle_Binding)

    with open('Cell_Cycle_Binding.txt') as infile:
        with open("GeneNames_Cycle_Bin.txt", "w") as outfile:
            for line in infile:
                outfile.write("\t".join(line.split()[0]) + "\n")

    with open("GeneNames_Cycle_Bin.txt", 'r') as fin:
        data = fin.read().splitlines(True)
    with open("GeneNames_Cycle_Bin.txt", 'w') as fout:
        fout.writelines(data[1:])

    #Creating list of signal genes of b1

    f = open("GeneNames_Cycle_Bin.txt", 'r')
    b1_genes = f.readlines()
    print("Length of signal genes of b1: ")
    print(len(b1_genes))
    print('\n')

    #Find intersection of e1 and b1

    interSec = set(sig_genes).intersection(b1_genes)
    print("Length of intersection of e1 and b1")
    print(len(interSec))
    print('\n')

    #Finding relevant data of intersection(e1, b1) in e1
    M = 18
    sig_matrix = relData(interSec, sig_matrix, sig_genes, M)

    #Finding relevant data of intersection(e1, b1) in b1
    M = 12
    sig_basis1 = relData(interSec, sig_basis1, b1_genes, M)

    #Divide the signal matrix by its row means to convert signals to relative DNA binding

    sig_npArray = np.array(sig_basis1)
    basis1_mean = np.mean(sig_npArray, axis=1)
    print("Mean of b1 over gene measurements:")
    print(basis1_mean)
    for i in range(0, 1588):
        sig_basis1[i, :] = sig_basis1[i, :] / basis1_mean[i]
    print("\n")

    #Calculating svd of b1   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays2, eigenexprssions2, eigengenes2 = np.linalg.svd(
        sig_basis1, full_matrices=False)
    print("Eigenexpressions of partial b1")
    print(eigenexprssions2)
    print('\n')

    #Computing entropy of b1

    M1 = 12
    b1Frac = fraction(eigenexprssions2, M1)
    print("Expression correlations for most significant subnetworks of b1(%)")
    print(a1Frac * 100)
    print('\n')
    b1_entropy = ent(b1Frac)
    #print(b1Frac[1])
    print("Entropy of partial b1")
    print(b1_entropy)
    print('\n')

    #b2

    with open('Develop_Binding.txt') as infile3:
        with open("Clear_Dev_Bin.txt", "w") as outfile3:
            for line in infile3:
                Clear_Bind(line, outfile3)

    #Removing first row of the data file

    with open("Clear_Dev_Bin.txt", 'r') as fin3:
        data = fin3.read().splitlines(True)
    with open("Clear_Dev_Bin.txt", 'w') as fout3:
        fout3.writelines(data[1:])

    #Creating matrix basis signal b2

    sig_basis2 = np.loadtxt('Clear_Dev_Bin.txt')
    print("Dimension of b2: ")
    print(sig_basis2.shape)
    print('\n')

    #b3

    with open('Biosynthesis_Binding.txt') as infile4:
        with open("Clear_Biosyn_Bin.txt", "w") as outfile4:
            for line in infile4:
                Clear_Bind(line, outfile4)

    #Removing first row of the data file

    with open("Clear_Biosyn_Bin.txt", 'r') as fin4:
        data = fin4.read().splitlines(True)
    with open("Clear_Biosyn_Bin.txt", 'w') as fout4:
        fout4.writelines(data[1:])

    #Creating matrix basis signal b3

    sig_basis3 = np.loadtxt('Clear_Biosyn_Bin.txt')
    print("Dimension of b3: ")
    print(sig_basis3.shape)
    print('\n')
    #----------------------------------------------------

    #pseudoInverse projection to create a2, a3, and a4
    #a2
    b1_pseoudoInv = np.linalg.pinv(sig_basis1)
    project1 = np.dot(sig_basis1, b1_pseoudoInv)
    sig2_matrix = np.dot(project1, sig_matrix)
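    # sig_basis1 @ pinv(sig_basis1) is the orthogonal projector onto the column space
    # of the binding data, so e2 is the component of e1 explained by b1.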
    print("Dimension of e2: ")
    print(sig2_matrix.shape)
    print('\n')

    #Calculating svd of e2   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays2, eigenexpressions2, eigengenes2 = np.linalg.svd(
        sig2_matrix, full_matrices=False)
    print("Eigenexpressions of e2")
    print(eigenexpressions2)
    print("\n")

    #Creating a2 by e2

    a2 = np.dot(sig2_matrix, sig2_matrix.transpose())

    #Calculating Evd of a2:network2

    eigenarrays2_trans = eigenarrays2.transpose()
    a2_sSquare = np.square(eigenexpressions2)

    #Calculating significance of subnetworks of a2

    M2 = 2
    a2Frac = fraction(eigenexpressions2, M2)
    print("Expression correlations for most significant subnetworks of a2(%)")
    print(a2Frac * 100)
    print('\n')
    a2_entropy = ent(a2Frac)
    print("Entropy of matrix a2(should be .49)")
    print(a1_entropy)
    print("fraction of first eigenvalue of a2")
    print(a2Frac[0])
    print('\n')

    #a3
    b2_pseoudoInv = np.linalg.pinv(sig_basis2)
    project2 = np.dot(sig_basis2, b2_pseoudoInv)
    sig3_matrix = np.dot(project2, sig_matrix[0:2120])

    #Picking 2120 genes out of the matrix

    sig3_matrix = sig3_matrix[0:2120, :]
    print("dimension of e3: ")
    print(sig3_matrix.shape)
    print('\n')

    #Calculating svd of e3   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays3, eigenexprssions3, eigengenes3 = np.linalg.svd(
        sig3_matrix, full_matrices=False)

    #Creating a3 by e3

    a3 = np.dot(sig3_matrix, sig3_matrix.transpose())

    #Calculating Evd of a3:network3

    eigenarrays3_trans = eigenarrays3.transpose()
    a3_sSquare = np.square(eigenexprssions3)

    #Calculating significance of subnetworks of a3

    a3_fractions = a3_sSquare / np.sum(a3_sSquare)

    #a4

    b3_pseoudoInv = np.linalg.pinv(sig_basis3)
    project3 = np.dot(sig_basis3, b3_pseoudoInv)
    sig4_matrix = np.dot(project3, sig_matrix[0:2120])

    #Picking 2120 genes out of the matrix

    sig4_matrix = sig4_matrix[0:2120, :]
    print("dimension of e4: ")
    print(sig4_matrix.shape)
    print('\n')
    #Calculating svd of e4 = UsV   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays4, eigenexprssions4, eigengenes4 = np.linalg.svd(
        sig4_matrix, full_matrices=False)

    #Creating a4 by e4

    a4 = np.dot(sig4_matrix, sig4_matrix.transpose())

    #Calculating Evd of a4:network4

    eigenarrays4_trans = eigenarrays4.transpose()
    a4_sSquare = np.square(eigenexprssions4)

    #Calculating significance of subnetworks of a4

    a4_fractions = a4_sSquare / np.sum(a4_sSquare)

    #-------------------------------------------------

    #Tensor Decomposition

    #Picking 1588 genes out of the matrix

    #   a_T = a1 + a2 + a3 + a4
    #print(a_T.shape)

    #Appending signal matrices e1 to e4

    sig_appendTemp = np.concatenate(
        (sig_matrix.transpose(), sig2_matrix.transpose(),
         sig3_matrix.transpose(), sig4_matrix.transpose()),
        axis=0)
    sig_append = sig_appendTemp.transpose()
    print("dimension of appended e")
    print(sig_append.shape)
    print('\n')

    #Calculating svd of appended e = UsV   U: eigenarray, s: eigenexpressions, V: eigengenes

    eigenarrays, eigenexprssions, eigengenes = np.linalg.svd(
        sig_append, full_matrices=True)

    #Calculating HOEVD of overall network a_T

    a_T_sSquare = np.square(eigenexprssions)
    a_T_append = np.dot(sig_append, sig_append.transpose())
    #print(a_T_sSquare.shape)

    #HOEVD for individual networks
    M = 18 + 12 + 12 + 8
    M_couple = M * (M - 1) / 2

    M1 = 15
    d = 1588

    #Picking 1588 genes out of the matrix

    a1 = a1[0:1588, 0:1588]

    HOEVD(a1, a1_sSquare, eigenarrays, M1, d)

    M2 = 12
    d = 1588
    HOEVD(a2, a2_sSquare, eigenarrays, M2, d)
Beispiel #26
0
    def benchmark_complete(data, ending_density=.02, step=.01):
        '''
        Input: Data array to benchmark on, the ending density at which to stop, and the step between density levels

        Output: Dataframe of output density and RMSE for each method with respect to each input density
        '''
        # In each iteration, randomly remove the minimum nonzero value from a row until
        # the target density is reached, then impute and score.
        #density range to run
        nonzeroscount = np.count_nonzero(data)
        sizel = data.shape
        totalentr = sizel[0] * sizel[1]
        end = ending_density  # final density to test
        begin = (nonzeroscount / totalentr)  # beginning density of the matrix given
        #step=.01 # step of density

        #intialize lists to store
        density_in = []
        RMSE_empca_scores = []
        RMSE_wpca_scores = []
        RMSE_sfi_scores = []
        RMSE_siv_scores = []
        RMSE_sni_scores = []
        RMSE_smi_scores = []
        RMSE_szi_scores = []
        RMSE_wmiC_scores = []
        RMSE_wmiP_scores = []
        Density_empca = []
        Density_wpca = []
        Density_sfi = []
        Density_siv = []
        Density_sni = []
        Density_smi = []
        Density_szi = []
        Density_wmiC = []
        Density_wmiP = []

        #randomly remove values from the known matrix and try to impute them

        for d in reversed(np.arange(end, begin, step)):
            otum = data.T.copy()

            #begin density check
            nonzeroscount = np.count_nonzero(otum)
            sizel = otum.shape
            totalentr = sizel[0] * sizel[1]

            while np.float64((nonzeroscount / totalentr)) > d:
                #remove a min frequency OTU and then check density
                j = np.random.randint(0, len(otum[:][:]) - 1)
                #make sure row is not all zero (all zero row causes singular matrix)
                if sum(list(otum[j][:])) < 1:
                    continue
                m = min(i for i in list(otum[j][:]) if i > 0)
                #make sure removing value will not result in zero row
                if sum(list(otum[j][:])) == m:
                    continue
                otum[j][list(otum[j][:]).index(m)] = 0
                #check density to break
                nonzeroscount = float(np.count_nonzero(otum))
                sizel = otum.shape
                totalentr = float(sizel[0]) * float(sizel[1])

            # coerce the matrix to float and print the new density
            print("Data table of %f generated" % d)
            otum = otum.T.astype(np.float64)

            # make zero unknown for fancy impute, avoid singular matrix by taking transpose
            otum2 = otum.T.copy()
            otum2 = otum2.astype(np.float64)
            otum2[otum2 == 0] = np.nan  #make unknown nan

            #WPCA and EMPCA

            #build weight matrix for WPCA/EMPCA: low weight on missing entries, high weight on observed ones
            weight = otum.copy()
            for i in range(len(otum2.T)):
                for j in range(len(otum2.T[i])):
                    if np.isnan(otum2.T[i][j]):
                        weight[i][j] = 1
                    else:
                        weight[i][j] = 1000

            print("Running EMPCA")
            EMPCAi = EMPCA(n_components=3).fit_reconstruct(otum.copy(), weight)
            print("Running WPCA")
            WPCAi = WPCA(n_components=3).fit_reconstruct(otum.copy(), weight)

            # fancy impute and zeros
            print("Nuclear Norm")
            sni = NuclearNormMinimization(min_value=(np.amin(otum2)),
                                          max_value=(np.amax(otum2))).complete(
                                              otum2.copy())
            print("Running Soft Impute")
            sfi = SoftImpute(shrinkage_value=None,
                             convergence_threshold=0.00001,
                             max_iters=1000,
                             max_rank=min(otum2.shape),
                             n_power_iterations=1,
                             init_fill_method="zero",
                             min_value=(np.amin(otum2)),
                             max_value=(np.amax(otum2)),
                             normalizer=None,
                             verbose=False).complete(otum2.copy())
            print("Running Iterative SVD")
            siv = IterativeSVD(rank=(min(otum2.shape) - 1),
                               convergence_threshold=0.00001,
                               max_iters=1000,
                               gradual_rank_increase=True,
                               svd_algorithm="arpack",
                               init_fill_method="zero",
                               min_value=(np.amin(otum2)),
                               max_value=(np.amax(otum2)),
                               verbose=False).complete(otum2.copy())
            print("Running Matrix Factorization")
            smi = MatrixFactorization(rank=(min(otum2.shape) - 1),
                                      initializer=np.random.randn,
                                      learning_rate=0.01,
                                      patience=5,
                                      l1_penalty=0.05,
                                      l2_penalty=0.05,
                                      min_improvement=0.01,
                                      max_gradient_norm=5,
                                      optimization_algorithm="adam",
                                      min_value=(np.amin(otum2)),
                                      max_value=(np.amax(otum2)),
                                      verbose=False).complete(otum2.copy())
            print("Imputing by filling with zeros for base comparison")
            szi = base.zeros(otum2.copy())
            print("Weighted Mean Interpolation without phylo-distance")
            wmiC = base.wmi_wrapper(X=otum2.copy())
            print("Weighted Mean Interpolation with phylo-distance")
            phylo = pd.read_csv(
                'data/Matched_Pheno_and_Phylo_Data/matched_phylo.csv/matched_phylo.csv'
            )
            wmiP = base.wmi_wrapper(X=otum2.copy(), D_j=phylo)

            # save the results

            #density in (after removed values)
            density_in.append(error.get_density(otum))

            # density imputed
            Density_empca.append(error.get_density(EMPCAi))
            Density_wpca.append(error.get_density(WPCAi))
            Density_sfi.append(error.get_density(sfi))
            Density_siv.append(error.get_density(siv))
            Density_sni.append(error.get_density(sni))
            Density_smi.append(error.get_density(smi))
            Density_szi.append(error.get_density(szi))
            Density_wmiC.append(error.get_density(wmiC))
            Density_wmiP.append(error.get_density(wmiP))

            # RMSE of imputed values
            missing_mask = np.isnan(
                otum2.T
            )  # masking to only check RMSE between values imputed and values removed
            RMSE_empca_scores.append(error.RMSE(data, EMPCAi, missing_mask))
            RMSE_wpca_scores.append(error.RMSE(data, WPCAi, missing_mask))
            RMSE_sfi_scores.append(error.RMSE(data, sfi.T, missing_mask))
            RMSE_siv_scores.append(error.RMSE(data, siv.T, missing_mask))
            RMSE_sni_scores.append(error.RMSE(data, sni.T, missing_mask))
            RMSE_smi_scores.append(error.RMSE(data, smi.T, missing_mask))
            RMSE_szi_scores.append(error.RMSE(data, szi.T, missing_mask))
            RMSE_wmiC_scores.append(error.RMSE(data, wmiC.T, missing_mask))
            RMSE_wmiP_scores.append(error.RMSE(data, wmiP.T, missing_mask))

        RMSEmapping = pd.DataFrame({
            'Density': list(map(int, density_in)),
            'EMPCA': RMSE_empca_scores,
            'Matrix Factorization': RMSE_smi_scores,
            'WPCA': RMSE_wpca_scores,
            'Soft Impute': RMSE_sfi_scores,
            'Iterative SVD': RMSE_siv_scores,
            'Nuclear Norm Minimization': RMSE_sni_scores,
            'Zeros Replace Unknown': RMSE_szi_scores,
            'Weighted-Mean Interpolation Correlation': RMSE_wmiC_scores,
            'Weighted-Mean Interpolation Phylo': RMSE_wmiP_scores
        })
        RMSEmapping.set_index(['Density'], inplace=True)
        Out_density = pd.DataFrame({
            'density': list(map(int, density_in)),
            'EMPCA': Density_empca,
            'Matrix Factorization': Density_smi,
            'WPCA': Density_wpca,
            'Soft Impute': Density_sfi,
            'Iterative SVD': Density_siv,
            'Nuclear Norm Minimization': Density_sni,
            'Zeros Replace Unknown': Density_szi,
            'Weighted-Mean Interpolation Correlation': Density_wmiC,
            'Weighted-Mean Interpolation Phylo': Density_wmiP
        })
        Out_density.set_index(['density'], inplace=True)

        return Out_density, RMSEmapping
Beispiel #27
0
imputed_knn_col = KNN(k=10, orientation="columns").fit_transform(scaled)

# inverse transformation -- we don't want the standard scores
inverse_knn_col = scaler.inverse_transform(imputed_knn_col)

# columns are samples
untransposed_knn_col = inverse_knn_col.transpose()

# write to file
knn_col_df = pd.DataFrame(untransposed_knn_col)
knn_col_df.index = data.index
knn_col_df.columns = data.columns.values
# not to be confused with the Sleipnir KNNImputer output
knn_col_outfile = outfile + "_KNN_fancyimpute_column.pcl"
knn_col_df.to_csv(knn_col_outfile, sep='\t')

print("IterativeSVD...")
# no transformation
imputed_svd = IterativeSVD(rank=10).fit_transform(transposed)

# columns are samples
untransposed_svd = imputed_svd.transpose()

# write to file
svd_df = pd.DataFrame(untransposed_svd)
svd_df.index = data.index
svd_df.columns = data.columns.values
svd_outfile = outfile + "_IterativeSVD.pcl"
svd_df.to_csv(svd_outfile, sep='\t')
Beispiel #28
0
    args = parser.parse_args()
    with open(args.config) as f:
        config = json.load(f)

    data_path = config["data_path"]  #Ground truth data
    corrupt_data_path = config[
        "corrupt_data_path"]  #Data containing missing values
    rank = config["rank"]
    trial_ind = config["trial_ind"]

    # LOAD DATA
    data = pd.read_csv(data_path).values
    data_missing = pd.read_csv(corrupt_data_path).values

    n_row = data_missing.shape[1]  # dimensionality of data space
    non_missing_row_ind = np.where(np.isfinite(np.sum(data_missing, axis=1)))
    na_ind = np.where(np.isnan(data_missing))
    na_count = len(na_ind[0])

    data_impute_SVD = IterativeSVD(rank=rank,
                                   convergence_threshold=0.0005,
                                   max_iters=16).fit_transform(data_missing)
    ReconstructionErrorSVD = sum(
        ((data_impute_SVD[na_ind] - data[na_ind])**2)**0.5) / na_count
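    # Element-wise ((x)**2)**0.5 is just |x|, so this quantity is the mean absolute
    # error over the imputed (NaN) positions rather than a root-mean-square error.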
    print('Reconstruction error (IterativeSVD):')
    print(ReconstructionErrorSVD)

    np.savetxt("./imputed_data_trial_" + str(trial_ind) + "_SVD.csv",
               data_impute_SVD,
               delimiter=",")
Beispiel #29
0
def calculate_cumulative_ratings(owner_id, project_id):
    import numpy as np
    from fancyimpute import SoftImpute, IterativeSVD
    from sklearn.preprocessing import MinMaxScaler

    ROW_WISE = 1
    # COL_WISE = 0

    scaler_top = MinMaxScaler(feature_range=(2, 3))
    scaler_bottom = MinMaxScaler(feature_range=(1, 2))

    # input format: worker_id, task_id, accuracy
    # output format: pivot table of worker_id and task_id with accuracy values
    query = '''
        SELECT
            r.id,
            r.target_id AS worker_id,
            u.username username,
            r.task_id AS task_id,
            weight AS accuracy
        FROM crowdsourcing_rating r
        INNER JOIN crowdsourcing_task t
            ON t.id = r.task_id
        INNER JOIN crowdsourcing_project p
            ON p.id = t.project_id
        INNER JOIN crowdsourcing_taskworker tw
            ON t.id = tw.task_id
                AND tw.worker_id=r.target_id
        INNER JOIN auth_user u ON u.id = r.target_id
        WHERE
            p.group_id = (%(project_id)s)
            AND origin_type=(%(origin_type)s);
    '''

    cursor = connection.cursor()
    cursor.execute(
        query, {
            'project_id': project_id,
            'origin_type': Rating.RATING_REQUESTER,
            'origin_id': owner_id
        })

    worker_ratings_raw = cursor.fetchall()

    # 0 - rating id
    # 1 - worker_id
    # 2 - username
    # 3 - task_id
    # 4 - accuracy
    d = [{
        'worker_id': worker_rating[1],
        'task_id': worker_rating[3],
        'accuracy': worker_rating[4],
    } for worker_rating in worker_ratings_raw]

    usernames = {}
    for rating in worker_ratings_raw:
        usernames['%d' % rating[1]] = rating[2]

    df = DataFrame(d)

    pivoted = pivot_table(df,
                          values='accuracy',
                          index=['worker_id'],
                          columns=['task_id'])
    pivoted = pivoted.reset_index('worker_id')
    pivoted.index.name = None

    # COLUMNS = ["worker_id", "score", "accuracy", "attempted", "correct", "boomerang"]

    data = pivoted.copy(deep=True)
    matrix = data.iloc[:, 1:]  # without worker_id

    # data['accuracy'] = matrix.mean(axis=ROW_WISE) * 100
    # data['attempted'] = matrix.count(axis=ROW_WISE)
    # data['correct'] = matrix.sum(axis=ROW_WISE)

    # data = data[data["attempted"]>=MIN_TASKS]

    # turn incorrect to -1 as imputations will fill with 0
    # matrix[matrix <= 0] = -1

    try:
        mat = IterativeSVD(verbose=False,
                           init_fill_method="mean").complete(matrix)
    except Exception:
        mat = SoftImpute(verbose=False,
                         init_fill_method="mean").complete(matrix)

    data['score'] = mat.mean(axis=ROW_WISE)
    data = data.sort_values(by=['score'], ascending=[False])

    percentile = data['score'].quantile(settings.WORKER_SPLIT_PERCENTILE)

    # Top 25% = 3-2 and Bottom 75% = 2-1
    num_workers = len(data)
    num_workers_top_x = len(data[data['score'] >= percentile])

    top_x = data.head(num_workers_top_x)

    # add extra worker at inflexion point from top set as it will have 2.0 duplicated
    bottom_y = data.tail(num_workers - num_workers_top_x + 1)

    # accuracy = sum(data['correct']) * 100 / sum(data['attempted'])

    top_x_score = scaler_top.fit_transform(
        np.array(top_x['score']).reshape((len(top_x['score']), 1)))
    bottom_y_score = scaler_bottom.fit_transform(
        np.array(bottom_y['score']).reshape((len(bottom_y['score']), 1)))

    # ignore the 1st value of bottom list as it is duplicate one from top list.
    boomerang_scores = np.append(top_x_score, bottom_y_score[1:])

    data['boomerang'] = boomerang_scores

    boomerang_ratings = data.to_dict('records')

    worker_ratings = [{
        "worker_id": r['worker_id'],
        "worker_username": usernames['%d' % r['worker_id']],
        "task_avg": r['boomerang'],
        "requester_avg": 0
    } for r in boomerang_ratings]

    return worker_ratings
data = pd.read_csv("train_with_missing/1.csv", index_col=False)
data.head(5)

# In[6]:

data.isnull().sum(axis=0)

# In[7]:

values = data.values
values.shape

# In[9]:

X_filled_svd = IterativeSVD().fit_transform(values)

# In[12]:

X_filled_svd

# In[42]:

y = values[:, 13]
x = np.delete(X_filled_svd, 0, 1)

# In[43]:

xtr, xt, ytr, yt = train_test_split(x, y, test_size=0.1, random_state=42)

# In[48]: