# ---- KNN imputation (column orientation) ----
# The imputer is run on the standardized matrix (`scaled`).
imputed_knn_col = KNN(k=10, orientation="columns").fit_transform(scaled)

# Undo the scaling: the output should be on the original scale rather than
# expressed as standard scores.
inverse_knn_col = scaler.inverse_transform(imputed_knn_col)

# Flip back so that columns are samples again.
untransposed_knn_col = inverse_knn_col.T

# Re-attach the row/column labels of `data` and write the result to file.
knn_col_df = pd.DataFrame(
    untransposed_knn_col,
    index=data.index,
    columns=data.columns.values,
)
# The suffix keeps this file apart from the Sleipnir KNNImputer output.
knn_col_outfile = outfile + "_KNN_fancyimpute_column.pcl"
knn_col_df.to_csv(knn_col_outfile, sep='\t')

# ---- IterativeSVD imputation ----
print("IterativeSVD...")

# No scaling / inverse transformation here: the imputer is fed `transposed`
# directly.
imputed_svd = IterativeSVD(rank=10).fit_transform(transposed)

# Flip back so that columns are samples again.
untransposed_svd = imputed_svd.T

# Re-attach the row/column labels of `data` and write the result to file.
svd_df = pd.DataFrame(
    untransposed_svd,
    index=data.index,
    columns=data.columns.values,
)
svd_outfile = outfile + "_IterativeSVD.pcl"
svd_df.to_csv(svd_outfile, sep='\t')
def _perform_imputation(job_context: Dict) -> Dict:
    """Combine the expression matrices, impute missing values and normalize.

    Via https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:

     - Combine all microarray samples with a full join to form a
       microarray_expression_matrix (this may end up being a DataFrame)
     - Combine all RNA-seq samples (lengthScaledTPM) with a full outer join
       to form a rnaseq_expression_matrix
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums)
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th
       percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now
       log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but make sure to keep
       track of where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make
       these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
     - Untranspose imputed_matrix (genes are now rows, samples are now
       columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are
       columns

    Args:
        job_context: Job state. Reads ``microarray_inputs``,
            ``rnaseq_inputs`` (both used as pandas DataFrames here) and
            ``input_files``; writes ``time_start``, ``organism`` and
            ``merged_no_qn`` before quantile normalization.

    Returns:
        The context returned by ``smasher._quantile_normalize``, with
        ``time_end`` and ``formatted_command`` set on it.
    """
    job_context['time_start'] = timezone.now()

    # The per-platform matrices are read ready-made from the job context.
    microarray_expression_matrix = job_context['microarray_inputs']
    rnaseq_expression_matrix = job_context['rnaseq_inputs']

    # Sum of the lengthScaledTPM values for each row (gene), and the 10th
    # percentile of those sums.
    rnaseq_row_sums = np.sum(rnaseq_expression_matrix, axis=1)
    rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

    # Drop every row whose sum is strictly below the 10th percentile.
    rows_to_filter = rnaseq_row_sums.index[
        (rnaseq_row_sums < rnaseq_tenth_percentile).values]
    filtered_rnaseq_matrix = rnaseq_expression_matrix.drop(rows_to_filter)

    # log2(x + 1) transform.
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix + 1)

    # Remember where the RNA-seq zeroes are before blanking them out. The
    # mask is labelled by gene and sample, so it stays valid after the join
    # and the filtering below reshuffle and drop rows; positional indices
    # would not.
    rnaseq_zero_mask = log2_rnaseq_matrix == 0
    log2_rnaseq_matrix[rnaseq_zero_mask] = np.nan

    # Full outer join of the microarray and RNA-seq matrices on the index.
    combined_matrix = microarray_expression_matrix.merge(
        log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

    # Keep genes (rows) with at least 70% present values.
    thresh = combined_matrix.shape[1] * .7  # shape is (rows, columns)
    row_filtered_combined_matrix = combined_matrix.dropna(
        axis='index', thresh=thresh)

    # Keep samples (columns) with at least 50% present values.
    # XXX: Find better test data for this!
    col_thresh = row_filtered_combined_matrix.shape[0] * .5
    # Explicit copy so the in-place zero restoration below writes to a frame
    # we own rather than to something pandas may regard as a slice.
    row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(
        axis='columns', thresh=col_thresh).copy()

    # "Reset" the zero values that were set to NA in the RNA-seq samples.
    surviving_genes = row_col_filtered_combined_matrix_samples.index
    for column in rnaseq_zero_mask.columns:
        # Skip purged columns
        if column not in row_col_filtered_combined_matrix_samples:
            continue

        # Genes that were zero for this sample and survived the row filter.
        zero_genes = rnaseq_zero_mask.index[rnaseq_zero_mask[column].values]
        restorable_genes = zero_genes[zero_genes.isin(surviving_genes)]

        # Best effort: a failure is logged and the remaining samples are
        # still processed.
        try:
            row_col_filtered_combined_matrix_samples.loc[
                restorable_genes, column] = 0.0
        except (KeyError, ValueError, TypeError):
            logger.exception("Error when replacing zero")
            continue

    # Transpose so that samples are rows for the imputation.
    transposed_matrix = row_col_filtered_combined_matrix_samples.transpose()

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan)

    # Impute the missing values with IterativeSVD (rank=10).
    imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix)

    # Untranspose (genes are rows and samples are columns again).
    untransposed_imputed_matrix = imputed_matrix.transpose()

    # Convert back to Pandas, re-attaching the labels of the filtered matrix.
    untransposed_imputed_matrix_df = pd.DataFrame.from_records(
        untransposed_imputed_matrix)
    untransposed_imputed_matrix_df.index = row_col_filtered_combined_matrix_samples.index
    untransposed_imputed_matrix_df.columns = row_col_filtered_combined_matrix_samples.columns

    # Quantile normalize where genes are rows and samples are columns.
    # XXX: Refactor QN target acquisition and application before doing this
    # The organism is looked up from the first key of `input_files`.
    job_context['organism'] = Organism.get_object_for_name(
        list(job_context['input_files'].keys())[0])
    job_context['merged_no_qn'] = untransposed_imputed_matrix_df

    # Perform the Quantile Normalization
    job_context = smasher._quantile_normalize(job_context, ks_check=False)

    job_context['time_end'] = timezone.now()
    job_context['formatted_command'] = "create_compendia.py"

    return job_context
# Choose the next batch of rows to blank out. `random.sample` needs a
# sequence: handing it a set raises TypeError on Python 3.11+, so the
# remaining rows are materialized as a list first.
try:
    impute_me = set(
        random.sample(list(rows_to_impute),
                      int(len(all_rows) * iteration_percent)))
except ValueError:
    # Sample larger than the remaining population: take everything left.
    impute_me = set(rows_to_impute)
rows_to_impute = rows_to_impute - impute_me

# A single .loc assignment with a list of labels: chained indexing
# (df[col][rows] = ...) may write to a temporary, and pandas does not accept
# a set as an indexer.
df.loc[list(impute_me), 'SYNTHETIC'] = np.nan
needs_imputation_transposed = df.transpose()

print("Imputing step!")
imputed_matrix = IterativeSVD(
    rank=10).fit_transform(needs_imputation_transposed)
imputed_matrix_transposed = imputed_matrix.transpose()
print("Imputed!")

# Convert back to Pandas, re-attaching the original row/column labels.
df_imputed_matrix_transposed = pd.DataFrame.from_records(
    imputed_matrix_transposed)
df_imputed_matrix_transposed.index = all_rows
df_imputed_matrix_transposed.columns = all_cols
df = df_imputed_matrix_transposed

df.to_csv('synthetic_' + colname + "_" + str(iteration_percent) + '.tsv',
          sep='\t', encoding='utf-8')
# NOTE(review): debugger import; its use is outside the visible chunk.
import pdb