def concatenate_sort_and_index_bams(out_bam, bams):
    tmp_bam = out_bam.replace('.bam', '_tmp.bam')
    # Drop the UMI tag since it's useless
    vdj_utils.concatenate_and_fix_bams(tmp_bam, bams, drop_tags=[PROCESSED_UMI_TAG])
    # Wrap filenames in str() to prevent pysam crash on unicode input
    tk_bam.sort_and_index(str(tmp_bam), str(out_bam.replace('.bam', '')))
    cr_io.remove(tmp_bam, allow_nonexisting=True)
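
# --- Hedged illustration, not pipeline code ---
# tk_bam.sort_and_index(in_bam, out_prefix) is tenkit's helper; below is a
# minimal sketch of equivalent behavior using pysam's samtools dispatchers.
# The "strip '.bam' to get a prefix" convention is inferred from the call
# above, and _sort_and_index_sketch is a hypothetical name, not the tenkit API.
import pysam

def _sort_and_index_sketch(in_bam, out_prefix):
    out_bam = out_prefix + '.bam'
    pysam.sort('-o', out_bam, in_bam)  # coordinate-sort into <prefix>.bam
    pysam.index(out_bam)               # writes <prefix>.bam.bai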

def rm_files(filenames):
    for filename in filenames:
        cr_io.remove(filename, allow_nonexisting=True)
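
# --- Hedged illustration, not pipeline code ---
# cr_io.remove(..., allow_nonexisting=True) presumably behaves like "rm -f":
# delete the file if present, ignore a missing file, and still surface any
# other OS error. _remove_quiet is a stand-in written under that assumption.
import errno

def _remove_quiet(path):
    try:
        os.remove(path)
    except OSError as exc:
        if exc.errno != errno.ENOENT:  # re-raise anything but "no such file"
            raise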

def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1,
             min_count_threshold=0):
    """Run PLSA on the matrix using an external EM-based PLSA binary.

    The matrix is not normalized prior to the PLSA analysis.

    If desired, only a subset of features (e.g. sample rows) can be selected
    for PLSA analysis. Each feature is ranked by its dispersion relative to
    other features that have a similar mean count. The top `plsa_features`
    as ranked by this method will then be used for the PLSA.

    The number of barcodes *cannot* be subset because of the intricacies of
    PLSA. The `plsa_bcs` argument is still accepted to match the API of the
    lsa and pca subroutines included in this package.

    Args:
        matrix (CountMatrix): The matrix to perform PLSA on.
        temp_dir (str): Existing directory used for the PLSA binary's
            intermediate files.
        plsa_features (int): Number of features to subset from matrix and use
            in PLSA. The top plsa_features ranked by dispersion are used.
        plsa_bcs (int): Number of barcodes to randomly sample for the matrix.
            Ignored; see above.
        n_plsa_components (int): How many PLSA components should be used.
        random_state (int): The seed for the RNG.
        threads (int): Number of threads passed to the PLSA binary.
        min_count_threshold (int): The minimum sum of each row/column for that
            row/column to be passed to PLSA (this filter is applied prior to
            any subsetting that occurs).

    Returns:
        A PLSA object
    """
    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist. Need it to run plsa binary. Aborting..'
        )

    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(random_state)

    # Threshold the rows/columns of the matrix; this will throw an error if an
    # empty matrix results.
    thresholded_matrix, thresholded_bcs, thresholded_features = matrix.select_axes_above_threshold(
        min_count_threshold)

    # Barcode subsetting is not supported for PLSA; warn and use all barcodes.
    if plsa_bcs is not None:
        msg = "PLSA method does not allow subsetting barcodes"
        print(msg)
    plsa_bcs = thresholded_matrix.bcs_dim
    plsa_bc_indices = np.arange(thresholded_matrix.bcs_dim)

    # If requested, select fewer features to use by picking the features with
    # the highest normalized dispersion.
    if plsa_features is None:
        plsa_features = thresholded_matrix.features_dim
    elif plsa_features > thresholded_matrix.features_dim:
        msg = ("You requested {} features but the matrix after thresholding only "
               "included {} features, so the smaller number is being used.").format(
                   plsa_features, thresholded_matrix.features_dim)
        print(msg)
        plsa_features = thresholded_matrix.features_dim

    # Calculate mean and variance of counts after normalizing.
    # Don't transform to log space, in order to preserve the mean-variance
    # relationship.
    m = analysis_stats.normalize_by_umi(thresholded_matrix)
    # Get mean and variance of rows
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    # Now determine how many components.
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT

    likely_matrix_rank = min(plsa_features, plsa_bcs)
    if likely_matrix_rank < n_plsa_components:
        print(("There are fewer nonzero features or barcodes ({}) than requested "
               "PLSA components ({}); reducing the number of components.").format(
                   likely_matrix_rank, n_plsa_components))
        n_plsa_components = likely_matrix_rank

    if (likely_matrix_rank * 0.5) <= float(n_plsa_components):
        print("Requested number of PLSA components is large relative to the matrix "
              "size; an exact approach to matrix factorization may be faster.")

    plsa_mat = thresholded_matrix.select_barcodes(plsa_bc_indices).select_features(
        plsa_feature_indices)

    # Write out the sparse matrix without transforms (code adapted from save_mex).
    plsa_mat.tocoo()
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    with open(out_matrix_fn, 'w') as stream:
        stream.write(
            np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n%%\n'.format(
                'coordinate', 'integer', 'general')))
        stream.write(
            np.compat.asbytes('%i %i %i\n' % (plsa_mat.m.shape[0],
                                              plsa_mat.m.shape[1],
                                              plsa_mat.m.nnz)))
        # Write row, col, val in 1-based indexing
        for r, c, d in itertools.izip(plsa_mat.m.row + 1, plsa_mat.m.col + 1,
                                      plsa_mat.m.data):
            stream.write(np.compat.asbytes("%i %i %i\n" % (r, c, d)))
    del plsa_mat

    # Run the plsa binary, reading in the sparse matrix.
    # Iteration count and tolerance are tuned for 15 PCs.
    proc = tk_subproc.Popen([
        PLSA_BINPATH,
        out_matrix_fn,
        temp_dir,
        '--topics', str(n_plsa_components),
        '--iter', str(3000),
        '--tol', str(0.002),
        '--nt', str(threads),
    ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print(stdout_data)
        raise Exception("plsa binary %s returned error code %d: %s" %
                        (proc, proc.returncode, stderr_data))

    # Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir, "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")

    org_rows_used = get_original_columns_used(thresholded_bcs, plsa_bc_indices)
    transformed_plsa_em_matrix = np.zeros((matrix.bcs_dim, n_plsa_components))
    transformed_plsa_em_matrix[org_rows_used, :] = np.genfromtxt(
        transformed_plsa_em_matrix_file, delimiter=",").astype('float64')
    org_cols_used = get_original_columns_used(thresholded_features,
                                              plsa_feature_indices)
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, org_cols_used] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    # Reorder components by variance explained, as the PLSA binary emits them
    # in arbitrary order.
    new_order = range(n_plsa_components)
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    # Delete temporary files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[org_cols_used]

    # Sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim, n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
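
# --- Hedged illustration, not pipeline code ---
# analysis_stats.get_normalized_dispersion is defined elsewhere. A common
# scheme matching the docstring's "dispersion relative to other features that
# have a similar mean count" is to bin features by mean expression and z-score
# each feature's variance-to-mean ratio within its bin; this sketch assumes
# that scheme and is not the actual implementation.
def _normalized_dispersion_sketch(mu, var, n_bins=20):
    dispersion = var / np.maximum(mu, 1e-12)  # variance-to-mean ratio per feature
    edges = np.unique(np.percentile(mu, np.linspace(0, 100, n_bins)))
    bins = np.digitize(mu, edges)
    out = np.zeros_like(dispersion)
    for b in np.unique(bins):
        idx = bins == b
        d = dispersion[idx]
        sd = d.std() or 1.0  # guard bins where all dispersions are equal
        out[idx] = (d - d.mean()) / sd  # z-score within the mean-expression bin
    return out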

def join(args, outs, chunk_defs, chunk_outs):
    contigs = []
    contig_fastqs = []
    contig_bams = []

    if len(chunk_outs) == 0:
        # No input reads
        # Create empty BAM file
        with open(outs.contig_bam, 'w') as f:
            pass
        outs.contig_bam_bai = None
        # Create empty contig FASTA
        with open(outs.contig_fasta, 'w') as f:
            pass
        outs.contig_fasta_fai = None
        # Create empty contig FASTQ
        with open(outs.contig_fastq, 'w') as f:
            pass
        outs.metrics_summary_json = None
        outs.summary_tsv = None
        outs.umi_summary_tsv = None
        return

    summary_tsvs = []
    umi_summary_tsvs = []

    for chunk_out in chunk_outs:
        if not os.path.isfile(chunk_out.contig_fasta):
            continue
        contigs.append(chunk_out.contig_fasta)
        contig_fastqs.append(chunk_out.contig_fastq)
        contig_bams.append(chunk_out.contig_bam)
        summary_tsvs.append(chunk_out.summary_tsv)
        umi_summary_tsvs.append(chunk_out.umi_summary_tsv)

    cr_io.concatenate_files(outs.contig_fasta, contigs)

    if os.path.getsize(outs.contig_fasta) > 0:
        tk_subproc.check_call('samtools faidx %s' % outs.contig_fasta, shell=True)
        outs.contig_fasta_fai = outs.contig_fasta + '.fai'

    cr_io.concatenate_files(outs.contig_fastq, contig_fastqs)

    if len(summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.summary_tsv, summary_tsvs)
    if len(umi_summary_tsvs) > 0:
        cr_io.concatenate_headered_files(outs.umi_summary_tsv, umi_summary_tsvs)

    if contig_bams:
        # Merge every N BAMs. Trying to merge them all at once
        # risks hitting the filehandle limit.
        n_merged = 0
        while len(contig_bams) > 1:
            to_merge = contig_bams[0:MERGE_BAMS_N]
            tmp_bam = martian.make_path('merged-%04d.bam' % n_merged)
            n_merged += 1
            print("Merging %d BAMs into %s ..." % (len(to_merge), tmp_bam))
            tk_bam.merge(tmp_bam, to_merge, threads=args.__threads)

            # Delete any temporary bams that have been merged
            for in_bam in to_merge:
                if os.path.basename(in_bam).startswith('merged-'):
                    cr_io.remove(in_bam)

            # Pop the input bams and push the merged bam
            contig_bams = contig_bams[len(to_merge):] + [tmp_bam]

        if os.path.basename(contig_bams[0]).startswith('merged-'):
            # We merged at least two chunks together.
            # Rename it to the output bam.
            cr_io.move(contig_bams[0], outs.contig_bam)
        else:
            # There was only a single chunk, so copy it from the input
            cr_io.copy(contig_bams[0], outs.contig_bam)

        tk_bam.index(outs.contig_bam)

        # Make sure the Martian out matches the actual index filename
        outs.contig_bam_bai = outs.contig_bam + '.bai'

    # Merge the assembler summary jsons
    merged_summary = cr_io.merge_jsons_single_level(
        [out.metrics_summary_json for out in chunk_outs])

    with open(outs.metrics_summary_json, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(merged_summary),
                  f,
                  indent=4,
                  sort_keys=True)
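
# --- Hedged illustration, not pipeline code ---
# The loop above merges BAMs in bounded batches so that no single merge opens
# more than MERGE_BAMS_N file handles, re-queueing each intermediate result
# until one file remains. The same rolling pattern on plain lists (merge_fn
# and batch_size are stand-ins, not pipeline names):
def _rolling_merge_sketch(items, merge_fn, batch_size=64):
    n_merged = 0
    while len(items) > 1:
        batch, items = items[:batch_size], items[batch_size:]
        merged = merge_fn(batch, 'merged-%04d' % n_merged)  # merge one bounded batch
        n_merged += 1
        items.append(merged)  # re-queue the result for the next pass
    return items[0]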

def run_plsa(matrix,
             temp_dir,
             plsa_features=None,
             plsa_bcs=None,
             n_plsa_components=None,
             random_state=None,
             threads=1):
    if not os.path.exists(temp_dir):
        raise Exception(
            'Temporary directory does not exist. Need it to run plsa binary. Aborting..'
        )

    if plsa_features is None:
        plsa_features = matrix.features_dim
    if plsa_bcs is None:
        plsa_bcs = matrix.bcs_dim
    if n_plsa_components is None:
        n_plsa_components = analysis_constants.PLSA_N_COMPONENTS_DEFAULT
    if n_plsa_components > plsa_features:
        print("There are fewer nonzero features than PLSA components; "
              "reducing the number of components.")
        n_plsa_components = plsa_features
    if random_state is None:
        random_state = analysis_constants.RANDOM_STATE
    np.random.seed(random_state)

    # Initialize PLSA subsets
    plsa_bc_indices = np.arange(matrix.bcs_dim)
    plsa_feature_indices = np.arange(matrix.features_dim)

    # NOTE: This is retained simply to follow the PCA code.
    # Calculate mean and variance of counts after normalizing.
    # Don't transform to log space in PLSA.
    # Dispersion is not exactly meaningful after the idf transform.
    m = analysis_stats.normalize_by_idf(matrix)
    (mu, var) = analysis_stats.summarize_columns(m.T)
    dispersion = analysis_stats.get_normalized_dispersion(
        mu.squeeze(), var.squeeze())  # TODO set number of bins?
    plsa_feature_indices = np.argsort(dispersion)[-plsa_features:]

    if plsa_bcs < matrix.bcs_dim:
        plsa_bc_indices = np.sort(
            np.random.choice(np.arange(matrix.bcs_dim), size=plsa_bcs,
                             replace=False))

    plsa_mat, _, plsa_features_nonzero = matrix.select_barcodes(
        plsa_bc_indices).select_features(plsa_feature_indices).select_nonzero_axes()
    plsa_feature_nonzero_indices = plsa_feature_indices[plsa_features_nonzero]

    if plsa_mat.features_dim < 2 or plsa_mat.bcs_dim < 2:
        print("Matrix is too small for further downsampling - "
              "num_plsa_bcs and num_plsa_features will be ignored.")
        plsa_mat, _, plsa_features_nonzero = matrix.select_nonzero_axes()
        plsa_feature_nonzero_indices = plsa_features_nonzero

    ### Write out sparse matrix without transforms
    plsa_mat.tocoo()
    out_matrix_fn = os.path.join(temp_dir, 'matrix.mtx')
    sp_io.mmwrite(out_matrix_fn, plsa_mat.m, field='integer', symmetry='general')

    ### Run plsa binary, reading in the sparse matrix
    proc = tk_subproc.Popen([
        PLSA_BINPATH,
        out_matrix_fn,
        temp_dir,
        '--topics', str(n_plsa_components),
        '--nt', str(threads),
    ],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    stdout_data, stderr_data = proc.communicate()
    if proc.returncode != 0:
        print(stdout_data)
        raise Exception("plsa binary %s returned error code %d: %s" %
                        (proc, proc.returncode, stderr_data))

    ### Read back data
    transformed_plsa_em_matrix_file = os.path.join(temp_dir, "transformed_matrix.csv")
    n_components_file = os.path.join(temp_dir, "components.csv")
    variance_explained_file = os.path.join(temp_dir, "topic_relevance.csv")

    transformed_plsa_em_matrix = np.genfromtxt(transformed_plsa_em_matrix_file,
                                               delimiter=",").astype('float64')
    plsa_em_components = np.zeros((n_plsa_components, matrix.features_dim))
    plsa_em_components[:, plsa_feature_nonzero_indices] = np.genfromtxt(
        n_components_file, delimiter=",").astype('float64')
    variance_explained = np.genfromtxt(variance_explained_file,
                                       delimiter=",").astype('float64')

    ### Reorder components by variance explained, as the PLSA binary gives an
    ### arbitrary order
    new_order = range(n_plsa_components)
    variance_explained, new_order = zip(
        *sorted(zip(variance_explained, new_order), reverse=True))
    variance_explained = np.array(variance_explained)
    plsa_em_components = plsa_em_components[new_order, :]
    transformed_plsa_em_matrix = transformed_plsa_em_matrix[:, new_order]

    ### Delete temporary files
    cr_io.remove(transformed_plsa_em_matrix_file, allow_nonexisting=True)
    cr_io.remove(n_components_file, allow_nonexisting=True)
    cr_io.remove(variance_explained_file, allow_nonexisting=True)
    cr_io.remove(out_matrix_fn, allow_nonexisting=True)

    features_selected = np.array(
        [f.id for f in matrix.feature_ref.feature_defs])[plsa_feature_nonzero_indices]

    # Sanity check dimensions
    assert transformed_plsa_em_matrix.shape == (matrix.bcs_dim, n_plsa_components)
    assert plsa_em_components.shape == (n_plsa_components, matrix.features_dim)
    assert variance_explained.shape == (n_plsa_components, )

    return PLSA(transformed_plsa_em_matrix, plsa_em_components,
                variance_explained, dispersion, features_selected)
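
# --- Hedged illustration, not pipeline code ---
# The zip/sort/zip reordering used in both run_plsa variants can be written
# with a single argsort. An equivalent numpy idiom, shown for clarity only
# (_reorder_by_relevance_sketch is a hypothetical name):
def _reorder_by_relevance_sketch(relevance, components, transformed):
    order = np.argsort(relevance)[::-1]  # component indices, descending relevance
    return relevance[order], components[order, :], transformed[:, order]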