Ejemplo n.º 1
0
# --- KNN (k=10, column orientation) on the standardized matrix ---
knn_imputer = KNN(k=10, orientation="columns")
imputed_knn_col = knn_imputer.fit_transform(scaled)

# Undo the scaling: the output should be on the original scale rather than
# in standard scores.
inverse_knn_col = scaler.inverse_transform(imputed_knn_col)

# Flip back so that columns are samples again.
untransposed_knn_col = inverse_knn_col.T

# The file name spells out "fancyimpute" so it is not confused with the
# Sleipnir KNNImputer output.
knn_col_outfile = outfile + "_KNN_fancyimpute_column.pcl"

# Re-attach the labels of the input data and write a tab-separated file.
knn_col_df = pd.DataFrame(untransposed_knn_col)
knn_col_df.index, knn_col_df.columns = data.index, data.columns.values
knn_col_df.to_csv(knn_col_outfile, sep='\t')

print("IterativeSVD...")
# --- IterativeSVD (rank=10); no scaling is applied before this imputation ---
svd_imputer = IterativeSVD(rank=10)
imputed_svd = svd_imputer.fit_transform(transposed)

# Flip back so that columns are samples again.
untransposed_svd = imputed_svd.T

svd_outfile = outfile + "_IterativeSVD.pcl"

# Re-attach the labels of the input data and write a tab-separated file.
svd_df = pd.DataFrame(untransposed_svd)
svd_df.index, svd_df.columns = data.index, data.columns.values
svd_df.to_csv(svd_outfile, sep='\t')
Ejemplo n.º 2
0
def _perform_imputation(job_context: Dict) -> Dict:
    """Filter, combine and impute the expression matrices, then quantile normalize.

    Procedure, via
    https://github.com/AlexsLemonade/refinebio/issues/508#issuecomment-435879283:
     - Take the microarray_expression_matrix and the rnaseq_expression_matrix
       (lengthScaledTPM) from the job context
     - Calculate the sum of the lengthScaledTPM values for each row (gene) of
       the rnaseq_expression_matrix (rnaseq_row_sums)
     - Calculate the 10th percentile of rnaseq_row_sums
     - Drop all rows in rnaseq_expression_matrix with a row sum < 10th
       percentile of rnaseq_row_sums; this is now filtered_rnaseq_matrix
     - log2(x + 1) transform filtered_rnaseq_matrix; this is now
       log2_rnaseq_matrix
     - Set all zero values in log2_rnaseq_matrix to NA, but keep track of
       where these zeroes are
     - Perform a full outer join of microarray_expression_matrix and
       log2_rnaseq_matrix; combined_matrix
     - Remove genes (rows) with >30% missing values in combined_matrix
     - Remove samples (columns) with >50% missing values in combined_matrix
     - "Reset" zero values that were set to NA in RNA-seq samples (i.e., make
       these zero again) in combined_matrix
     - Transpose combined_matrix; transposed_matrix
     - Perform imputation of missing values with IterativeSVD (rank=10) on
       the transposed_matrix; imputed_matrix
     - Untranspose imputed_matrix (genes are now rows, samples are now columns)
     - Quantile normalize imputed_matrix where genes are rows and samples are
       columns

    Args:
        job_context: Job state. Read keys: 'microarray_inputs' and
            'rnaseq_inputs' (pandas DataFrames, joined on their index) and
            'input_files' (a mapping whose first key is passed to
            Organism.get_object_for_name).

    Returns:
        The context returned by smasher._quantile_normalize, with
        'time_end' and 'formatted_command' set on it. Before that call this
        function sets 'time_start', 'organism' and 'merged_no_qn' (the
        imputed, not yet normalized DataFrame) on the incoming context.
    """
    job_context['time_start'] = timezone.now()

    microarray_expression_matrix = job_context['microarray_inputs']
    rnaseq_expression_matrix = job_context['rnaseq_inputs']

    # Sum of the lengthScaledTPM values for each row (gene) and its 10th percentile
    rnaseq_row_sums = rnaseq_expression_matrix.sum(axis=1)
    rnaseq_tenth_percentile = np.percentile(rnaseq_row_sums, 10)

    # Drop all rows with a row sum < 10th percentile. Written as a negated
    # "<" so rows whose comparison is False (including NaN) are kept.
    rows_to_keep = ~(rnaseq_row_sums < rnaseq_tenth_percentile)
    filtered_rnaseq_matrix = rnaseq_expression_matrix[rows_to_keep.values]

    # log2(x + 1) transform; this is now log2_rnaseq_matrix
    log2_rnaseq_matrix = np.log2(filtered_rnaseq_matrix + 1)

    # Remember where the RNA-seq zeroes are, keyed by sample and stored as
    # row *labels*. Positional indices would be invalidated by the outer join
    # and the row filtering below, which change the row set and its order.
    zero_mask = log2_rnaseq_matrix == 0
    zero_labels_by_sample = {
        column: log2_rnaseq_matrix.index[zero_mask[column].values]
        for column in log2_rnaseq_matrix.columns
    }

    # Set all zero values to NA so they count as missing during filtering
    log2_rnaseq_matrix = log2_rnaseq_matrix.mask(zero_mask)

    # Full outer join of microarray_expression_matrix and log2_rnaseq_matrix
    combined_matrix = microarray_expression_matrix.merge(
        log2_rnaseq_matrix, how='outer', left_index=True, right_index=True)

    # Remove genes (rows) with <70% present values. `thresh` is the minimum
    # number of non-NA values required to keep a row; rounding up gives the
    # same cut-off as the fractional value while passing the documented int.
    thresh = int(np.ceil(combined_matrix.shape[1] * .7))  # shape is (rows, columns)
    row_filtered_combined_matrix = combined_matrix.dropna(axis='index', thresh=thresh)

    # Remove samples (columns) with <50% present values
    # XXX: Find better test data for this!
    col_thresh = int(np.ceil(row_filtered_combined_matrix.shape[0] * .5))
    row_col_filtered_combined_matrix_samples = row_filtered_combined_matrix.dropna(
        axis='columns', thresh=col_thresh)

    # "Reset" the zero values that were set to NA in RNA-seq samples. Work on
    # an explicit copy so the label-based assignment never targets a view.
    combined_matrix_zero = row_col_filtered_combined_matrix_samples.copy()
    for column, zero_labels in zero_labels_by_sample.items():
        # Skip purged (or renamed-by-merge) columns
        if column not in combined_matrix_zero.columns:
            continue

        # Only genes that survived the row filter can be restored
        surviving_labels = zero_labels.intersection(combined_matrix_zero.index)
        if len(surviving_labels):
            combined_matrix_zero.loc[surviving_labels, column] = 0.0

    # Transpose combined_matrix; transposed_matrix
    transposed_matrix = combined_matrix_zero.transpose()

    # Remove -inf and inf
    # This should never happen, but make sure it doesn't!
    transposed_matrix = transposed_matrix.replace([np.inf, -np.inf], np.nan)

    # Perform imputation of missing values with IterativeSVD (rank=10)
    imputed_matrix = IterativeSVD(rank=10).fit_transform(transposed_matrix)

    # Untranspose (genes are rows, samples are columns again) and convert
    # back to Pandas with the labels of the filtered matrix
    untransposed_imputed_matrix_df = pd.DataFrame(
        imputed_matrix.transpose(),
        index=combined_matrix_zero.index,
        columns=combined_matrix_zero.columns,
    )

    # Quantile normalize imputed_matrix where genes are rows and samples are columns
    # XXX: Refactor QN target acquisition and application before doing this
    job_context['organism'] = Organism.get_object_for_name(list(job_context['input_files'].keys())[0])
    job_context['merged_no_qn'] = untransposed_imputed_matrix_df

    # Perform the Quantile Normalization
    job_context = smasher._quantile_normalize(job_context, ks_check=False)
    job_context['time_end'] = timezone.now()
    job_context['formatted_command'] = "create_compendia.py"

    return job_context
    # NOTE(review): this looks like the body of a loop or function whose
    # header is not visible here; rows_to_impute, all_rows, all_cols,
    # iteration_percent and df come from that missing context.

    # Pick the rows to blank out in this pass: a random draw of
    # int(len(all_rows) * iteration_percent) entries from rows_to_impute.
    try:
        impute_me = set(
            random.sample(rows_to_impute,
                          int(len(all_rows) * iteration_percent)))
    except Exception:
        # The draw failed (random.sample raises ValueError when asked for
        # more items than the population holds): take every remaining row.
        # NOTE(review): the broad except also hides other errors. If
        # rows_to_impute is a set (the difference below suggests so),
        # random.sample rejects sets on Python 3.11+ and would land here
        # on every pass -- confirm.
        impute_me = rows_to_impute
    # Remove the chosen rows from the pool of candidates.
    rows_to_impute = rows_to_impute - impute_me

    # Blank the chosen entries of the 'SYNTHETIC' column so they get imputed.
    # NOTE(review): this is chained indexing (df[...][...] = ...); pandas may
    # assign into a temporary copy instead of df -- confirm, and consider
    # df.loc[rows, 'SYNTHETIC'].
    df['SYNTHETIC'][impute_me] = np.nan

    # Impute on the transposed frame, then flip the result back.
    needs_imputation_transposed = df.transpose()
    print("Imputing step!")
    imputed_matrix = IterativeSVD(
        rank=10).fit_transform(needs_imputation_transposed)
    imputed_matrix_transposed = imputed_matrix.transpose()
    print("Imputed!")

    # Convert back to Pandas
    # (the transposed df assigned on the next line is never read: df is
    # rebound to the imputed frame at the end of this fragment)
    df = df.transpose()
    df_imputed_matrix_transposed = pd.DataFrame.from_records(
        imputed_matrix_transposed)
    # Re-attach the row and column labels supplied by the surrounding context.
    df_imputed_matrix_transposed.index = all_rows
    df_imputed_matrix_transposed.columns = all_cols
    df = df_imputed_matrix_transposed

# Write the final frame as a UTF-8, tab-separated file whose name encodes the
# column name and the per-iteration fraction.
synthetic_outfile = "".join(
    ('synthetic_', colname, "_", str(iteration_percent), '.tsv'))
df.to_csv(synthetic_outfile, sep='\t', encoding='utf-8')

import pdb