def save_lsa_csv(lsa_map, matrix, base_dir):
    """Write every LSA result in ``lsa_map`` to a set of CSV files.

    For each ``(n_components, lsa)`` entry a directory named
    ``<n_components>_components`` is created under ``base_dir`` and the
    files ``projection.csv``, ``components.csv``, ``variance.csv``,
    ``dispersion.csv`` and ``features_selected.csv`` are written into it.

    Args:
        lsa_map: Mapping from number of components to a result object that
            exposes ``transformed_lsa_matrix``, ``components``,
            ``variance_explained``, ``dispersion`` and ``features_selected``.
        matrix: Object providing ``bcs`` (used as the projection row
            labels) and ``feature_ref.feature_defs`` (items with an ``id``).
        base_dir: Directory under which the per-component directories are
            created.
    """
    for n_components, lsa in lsa_map.iteritems():
        out_dir = os.path.join(base_dir, '%d_components' % n_components)
        cr_io.makedirs(out_dir, allow_existing=True)

        # Projection: one labelled column per column actually present.
        projection = lsa.transformed_lsa_matrix
        n_columns = projection.shape[1]
        assert n_columns <= n_components
        projection_header = ['Barcode']
        projection_header.extend('PC-%d' % idx
                                 for idx in xrange(1, n_columns + 1))
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'projection.csv'),
            projection,
            projection_header,
            matrix.bcs)

        # Components: rows are labelled 1..n_components, columns by feature id.
        feature_ids = [f.id for f in matrix.feature_ref.feature_defs]
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'components.csv'),
            lsa.components,
            ['PC'] + feature_ids,
            range(1, n_components + 1))

        # Variance explained, labelled 1..n_components.
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'variance.csv'),
            lsa.variance_explained,
            ['PC', 'Proportion.Variance.Explained'],
            range(1, n_components + 1))

        # Dispersion, labelled by feature id.
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'dispersion.csv'),
            lsa.dispersion,
            ['Feature', 'Normalized.Dispersion'],
            [f.id for f in matrix.feature_ref.feature_defs])

        # TODO: there are two columns here, but only 1 entry in the header...BAD
        selected = lsa.features_selected
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'features_selected.csv'),
            selected,
            ['Feature'],
            range(1, len(selected) + 1))
def save_differential_expression_csv(clustering_key, de, matrix, base_dir,
                                     cluster_names=None,
                                     file_name='differential_expression'):
    """Write a differential expression table to ``<file_name>.csv``.

    The output directory is ``base_dir`` itself when ``clustering_key`` is
    None, otherwise ``base_dir/clustering_key``; it is created if needed.

    Args:
        clustering_key: Optional sub-directory name, or None.
        de: Object whose ``data`` attribute is a 2-D array holding three
            consecutive columns per cluster (mean counts, log2 fold change,
            adjusted p value).
        matrix: Object providing ``feature_ref.feature_defs``; each item's
            ``id`` and ``name`` label the corresponding row.
        base_dir: Base output directory.
        cluster_names: Optional sequence indexed by cluster. When given, the
            column headers read ``Perturbation <name>, ...`` instead of
            ``Cluster <n> ...``.
        file_name: Output file name without the ``.csv`` extension.
    """
    out_dir = base_dir
    if clustering_key is not None:
        out_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(out_dir, allow_existing=True)

    diff_expression_fn = os.path.join(out_dir, file_name + '.csv')
    diff_expression_header = ['Feature ID', 'Feature Name']

    # Floor division keeps the cluster count an integer regardless of
    # whether true division is in effect (``from __future__ import division``
    # or Python 3); a float here would make the loop below fail.
    n_clusters = de.data.shape[1] // 3
    for i in xrange(n_clusters):
        if cluster_names is None:
            diff_expression_header += [
                'Cluster %d Mean Counts' % (i + 1),
                'Cluster %d Log2 fold change' % (i + 1),
                'Cluster %d Adjusted p value' % (i + 1),
            ]
        else:
            diff_expression_header += [
                'Perturbation %s, Mean Counts' % cluster_names[i],
                'Perturbation %s, Log2 fold change' % cluster_names[i],
                'Perturbation %s, Adjusted p value' % cluster_names[i],
            ]

    diff_expression_prefixes = [(f.id, f.name)
                                for f in matrix.feature_ref.feature_defs]
    analysis_io.save_matrix_csv(diff_expression_fn,
                                de.data,
                                diff_expression_header,
                                diff_expression_prefixes)
def save_top_perturbed_genes(base_dir, results_per_perturbation,
                             column_map=TOP_GENES_SUMMARY_MAP,
                             num_genes_to_keep=NUM_TOP_GENES):
    """Write a side-by-side summary of the top rows for each perturbation.

    For every perturbation the sanitized results are restricted to the
    columns named by ``column_map``'s keys and truncated to the first
    ``num_genes_to_keep`` rows. The per-perturbation tables are concatenated
    column-wise and written to ``base_dir/top_perturbed_genes.csv``.

    Nothing is written when ``results_per_perturbation`` is None or empty,
    or when no perturbation yields usable results.

    Args:
        base_dir: Output directory; created if it does not exist.
        results_per_perturbation: Mapping from perturbation name to its
            results, as accepted by ``sanitize_perturbation_results``.
        column_map: Mapping from source column name to the label used in the
            summary header.
        num_genes_to_keep: Maximum number of rows kept per perturbation.
    """
    if not results_per_perturbation:
        return

    cr_io.makedirs(base_dir, allow_existing=True)
    fn = os.path.join(base_dir, 'top_perturbed_genes.csv')

    # keys() and values() of the same unmodified dict iterate in matching
    # order, so the selected columns line up with their labels.
    source_columns = list(column_map.keys())
    column_labels = list(column_map.values())

    list_df_results = []
    summary_df_columns = []
    for perturbation in results_per_perturbation:
        this_results = sanitize_perturbation_results(
            results_per_perturbation.get(perturbation))
        if this_results is None:
            continue

        this_results = this_results[source_columns]
        this_results = this_results[0:num_genes_to_keep]
        # Non-in-place reset avoids mutating a slice of the sanitized frame.
        this_results = this_results.reset_index(drop=True)
        list_df_results.append(this_results)
        summary_df_columns += [
            'Perturbation: ' + perturbation + ', ' + s for s in column_labels
        ]

    # pd.concat raises ValueError on an empty list; treat "no usable
    # results" like the empty-input case above.
    if not list_df_results:
        return

    summary_df = pd.concat(list_df_results, ignore_index=True, axis=1)
    summary_df.columns = summary_df_columns
    summary_df.to_csv(fn, index=False)
def save_gem_class_json(self, base_dir):
    """Serialize ``self.result`` as indented, key-sorted JSON.

    The destination is the path returned by
    ``MultiGenomeAnalysis.json_path(base_dir)``; its parent directory is
    created if it does not already exist.

    Args:
        base_dir: Directory handed to ``MultiGenomeAnalysis.json_path``.
    """
    json_file_path = MultiGenomeAnalysis.json_path(base_dir)
    parent_dir = os.path.dirname(json_file_path)
    cr_io.makedirs(parent_dir, allow_existing=True)

    with open(json_file_path, 'w') as out_file:
        # Sanitize first so the payload only holds JSON-encodable values.
        payload = tk_safe_json.json_sanitize(self.result)
        json.dump(payload, out_file, indent=4, sort_keys=True)
def save_clustering_csv(base_dir, clustering_key, labels, barcodes):
    """Write cluster labels to ``base_dir/clustering_key/clusters.csv``.

    Args:
        base_dir: Base output directory.
        clustering_key: Sub-directory name; created if missing.
        labels: Cluster labels, written under the ``Cluster`` column.
        barcodes: Row labels, written under the ``Barcode`` column.
    """
    target_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(target_dir, allow_existing=True)

    analysis_io.save_matrix_csv(
        os.path.join(target_dir, 'clusters.csv'),
        labels,
        ['Barcode', 'Cluster'],
        barcodes)
def main(args, outs):
    """Merge the k-means and graph-clustering outputs into one location.

    Does nothing when ``args.skip`` is set. Otherwise the two HDF5 inputs
    are combined into ``outs.clustering_h5`` and the sub-directories of both
    CSV inputs are copied into ``outs.clustering_csv``.

    Args:
        args: Stage arguments (``skip``, ``kmeans_h5``, ``graphclust_h5``,
            ``kmeans_csv``, ``graphclust_csv``).
        outs: Stage outputs (``clustering_h5``, ``clustering_csv``).
    """
    if args.skip:
        return

    source_h5s = [args.kmeans_h5, args.graphclust_h5]
    h5_groups = [
        analysis_constants.ANALYSIS_H5_KMEANS_GROUP,
        analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP,
    ]
    analysis_io.combine_h5_files(source_h5s, outs.clustering_h5, h5_groups)

    csv_path = os.path.join(outs.clustering_csv)
    cr_io.makedirs(csv_path, allow_existing=True)
    for source_csv_dir in (args.kmeans_csv, args.graphclust_csv):
        copy_subdirs(source_csv_dir, csv_path)
def save_tsne_csv(tsne, matrix, base_dir):
    """Save a TSNE object to CSV.

    The projection is written to ``base_dir/<tsne.key>_components/projection.csv``
    with one ``TSNE-<n>`` column per column of the transformed matrix.

    Args:
        tsne: Object providing ``key`` and ``transformed_tsne_matrix``.
        matrix: Object whose ``bcs`` label the rows.
        base_dir: Base output directory.
    """
    # Preserve backward compatibility with pre-3.0 CSV files
    # where the CSV directory was named "2_components" and the HDF5 dataset was named "_2"
    out_dir = os.path.join(base_dir, tsne.key + '_components')
    cr_io.makedirs(out_dir, allow_existing=True)

    projection_path = os.path.join(out_dir, 'projection.csv')
    n_tsne_components = tsne.transformed_tsne_matrix.shape[1]

    header = ['Barcode']
    header.extend('TSNE-%d' % idx for idx in xrange(1, n_tsne_components + 1))

    analysis_io.save_matrix_csv(projection_path,
                                tsne.transformed_tsne_matrix,
                                header,
                                matrix.bcs)
def save_differential_expression_csv_from_features(clustering_key, de,
                                                   diff_expression_prefixes,
                                                   base_dir):
    """Write a differential expression table with caller-supplied row labels.

    The table is written to
    ``base_dir/clustering_key/differential_expression.csv``; the directory
    is created if needed.

    Args:
        clustering_key: Sub-directory name under ``base_dir``.
        de: Object whose ``data`` attribute is a 2-D array holding three
            consecutive columns per cluster (mean counts, log2 fold change,
            adjusted p value).
        diff_expression_prefixes: Row labels passed through to
            ``analysis_io.save_matrix_csv``; the header names them
            ``Feature ID`` and ``Feature Name``.
        base_dir: Base output directory.
    """
    out_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(out_dir, allow_existing=True)

    diff_expression_fn = os.path.join(out_dir, 'differential_expression.csv')
    diff_expression_header = ['Feature ID', 'Feature Name']

    # Floor division keeps the cluster count an integer regardless of
    # whether true division is in effect (``from __future__ import division``
    # or Python 3); a float here would make the loop below fail.
    n_clusters = de.data.shape[1] // 3
    for i in xrange(n_clusters):
        diff_expression_header += [
            'Cluster %d Mean Counts' % (i + 1),
            'Cluster %d Log2 fold change' % (i + 1),
            'Cluster %d Adjusted p value' % (i + 1),
        ]

    analysis_io.save_matrix_csv(diff_expression_fn,
                                de.data,
                                diff_expression_header,
                                diff_expression_prefixes)
def save_pca_csv_with_bc_feature(pca_map, barcodes, features, base_dir):
    """Write every PCA result in ``pca_map`` to a set of CSV files.

    For each ``(n_components, pca)`` entry a directory named
    ``<n_components>_components`` is created under ``base_dir``.
    ``projection.csv`` is always written; ``components.csv``,
    ``variance.csv``, ``dispersion.csv`` and ``features_selected.csv`` are
    written only when the corresponding array is non-empty.

    Args:
        pca_map: Mapping from number of components to a result object that
            exposes ``transformed_pca_matrix``, ``components``,
            ``variance_explained``, ``dispersion`` and ``features_selected``.
        barcodes: Row labels for the projection.
        features: Iterable of items with an ``id`` attribute.
        base_dir: Directory under which the per-component directories are
            created.
    """
    for n_components, pca in pca_map.iteritems():
        out_dir = os.path.join(base_dir, '%d_components' % n_components)
        cr_io.makedirs(out_dir, allow_existing=True)

        # Projection: one labelled column per column actually present.
        projection = pca.transformed_pca_matrix
        n_columns = projection.shape[1]
        assert n_columns <= n_components
        projection_header = ['Barcode']
        projection_header.extend('PC-%d' % idx
                                 for idx in xrange(1, n_columns + 1))
        analysis_io.save_matrix_csv(
            os.path.join(out_dir, 'projection.csv'),
            projection,
            projection_header,
            barcodes)

        # FBPCA presently provides 0-sized entries for the following PCA() member variables.
        # This allows us to distinguish FBPCA from IRLBA, and also avoids weird empty files.
        if pca.components.size > 0:
            analysis_io.save_matrix_csv(
                os.path.join(out_dir, 'components.csv'),
                pca.components,
                ['PC'] + [f.id for f in features],
                range(1, n_components + 1))

        if pca.variance_explained.size > 0:
            analysis_io.save_matrix_csv(
                os.path.join(out_dir, 'variance.csv'),
                pca.variance_explained,
                ['PC', 'Proportion.Variance.Explained'],
                range(1, n_components + 1))

        if pca.dispersion.size > 0:
            analysis_io.save_matrix_csv(
                os.path.join(out_dir, 'dispersion.csv'),
                pca.dispersion,
                ['Feature', 'Normalized.Dispersion'],
                [f.id for f in features])

        if pca.features_selected.size > 0:
            # TODO: there are two columns here, but only 1 entry in the header...BAD
            analysis_io.save_matrix_csv(
                os.path.join(out_dir, 'features_selected.csv'),
                pca.features_selected,
                ['Feature'],
                range(1, len(pca.features_selected) + 1))
def save_gem_class_csv(self, base_dir):
    """Write the per-barcode classification to ``gem_classification.csv``.

    The header row is ``barcode, <genome0>, <genome1>, call``. Each data row
    holds the barcode, its two counts and its call, with the generic genome
    placeholders in the call replaced by the actual genome names stored in
    ``self.result``.

    Args:
        base_dir: Directory receiving the CSV; created if it does not exist.
    """
    csv_file_path = os.path.join(base_dir, 'gem_classification.csv')
    cr_io.makedirs(os.path.dirname(csv_file_path), allow_existing=True)

    result = self.result
    with open(csv_file_path, 'wb') as out_file:
        writer = csv.writer(out_file, lineterminator=os.linesep)
        writer.writerow(
            ['barcode', result['genome0'], result['genome1'], 'call'])

        for idx, barcode in enumerate(result['barcode']):
            # Swap the placeholder genome labels for the real genome names.
            call = result['call'][idx]
            call = call.replace(analysis_constants.GEM_CLASS_GENOME0,
                                result['genome0'])
            call = call.replace(analysis_constants.GEM_CLASS_GENOME1,
                                result['genome1'])
            writer.writerow(
                [barcode, result['count0'][idx], result['count1'][idx], call])
def main(args, outs):
    """Collect the CRISPR analysis outputs into ``outs.crispr_analysis``.

    Each input file is copied under the matching name from
    ``protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES``; inputs that are None
    are skipped. The two perturbation-effects directories are copied into
    sub-directories of the same names when they exist.

    Args:
        args: Stage arguments providing the input file and directory paths.
        outs: Stage outputs; ``crispr_analysis`` is the destination
            directory and is created if needed.
    """
    list_of_files = [
        args.protospacer_calls_summary,
        args.protospacer_calls_per_cell,
        args.cells_per_protospacer,
        args.protospacer_umi_thresholds_csv,
        args.protospacer_umi_thresholds_json,
        args.perturbation_efficiencies_by_feature,
        args.perturbations_efficiencies_by_target,
    ]

    cr_io.makedirs(outs.crispr_analysis, allow_existing=True)

    for (file_path, file_name) in itertools.izip(
            list_of_files, protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES):
        if file_path is None:
            continue
        cr_io.copy(file_path, os.path.join(outs.crispr_analysis, file_name))

    effects_dirs = [
        (args.perturbation_effects_by_feature,
         'perturbation_effects_by_feature'),
        (args.perturbation_effects_by_target,
         'perturbation_effects_by_target'),
    ]
    for source_dir, dir_name in effects_dirs:
        # The file inputs above may be None, so the directory inputs are
        # given the same tolerance: os.path.isdir(None) raises TypeError
        # instead of returning False.
        if source_dir is None or not os.path.isdir(source_dir):
            continue
        dest_dir = os.path.join(outs.crispr_analysis, dir_name)
        cr_io.makedirs(dest_dir, allow_existing=True)
        cr_io.copytree(source_dir, dest_dir, allow_existing=True)
def main(args, outs):
    """Assemble the combined analysis HDF5 file and CSV directory tree.

    Does nothing when ``args.skip`` is set. For multi-genome runs the
    multi-genome JSON and CSV trees are copied into the outputs first. The
    first group of the matrix HDF5 file is then copied into the analysis
    file as ``matrix``, the PCA, clustering, differential expression and
    t-SNE HDF5 trees are appended to it, and the four CSV directories are
    copied under ``outs.analysis_csv``.

    Args:
        args: Stage arguments providing the input paths and flags.
        outs: Stage outputs (``analysis`` and ``analysis_csv``).
    """
    if args.skip:
        return

    if args.is_multi_genome:
        cr_io.copytree(args.multi_genome_json, outs.analysis)
        cr_io.copytree(args.multi_genome_csv, outs.analysis_csv)

    analysis_h5 = analysis_io.h5_path(outs.analysis)
    cr_io.makedirs(os.path.dirname(analysis_h5), allow_existing=True)

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(args.matrix_h5, 'r') as matrix:
        with h5.File(analysis_h5, 'w') as out:
            # TODO: copy the first group; fixme when we have a key
            name = matrix.keys()[0]
            matrix.copy(matrix[name], out, name='matrix')

    with tables.open_file(args.pca_h5, 'r') as pca_h5,\
         tables.open_file(args.clustering_h5, 'r') as clustering_h5,\
         tables.open_file(args.diffexp_h5, 'r') as diffexp_h5,\
         tables.open_file(args.tsne_h5, 'r') as tsne_h5,\
         tables.open_file(analysis_h5, 'a') as out:
        # Append each source tree to the analysis file, in this order.
        for source in (pca_h5, clustering_h5, diffexp_h5, tsne_h5):
            source.copy_children(source.root, out.root, recursive=True)

    csv_sources = [
        ('pca', args.pca_csv),
        ('clustering', args.clustering_csv),
        ('diffexp', args.diffexp_csv),
        ('tsne', args.tsne_csv),
    ]
    for subdir_name, source_dir in csv_sources:
        cr_io.copytree(source_dir,
                       os.path.join(outs.analysis_csv, subdir_name))
def save_mex(self, base_dir, save_features_func, metadata=None, compress=True):
    """Save in Matrix Market Exchange format.

    Writes ``matrix.mtx`` and ``barcodes.tsv`` into ``base_dir`` (with a
    ``.gz`` suffix when ``compress`` is true) and delegates the features
    file to ``save_features_func``. The matrix is converted to COO form
    first via ``self.tocoo()``.

    Args:
        base_dir (str): Path to directory to write files in.
        save_features_func (func): Func that takes (FeatureReference, base_dir, compress) and writes
                                   a file describing the features.
        metadata (dict): Optional metadata to encode into the comments as JSON.
                         The caller's dict is not modified.
        compress (bool): Whether to append ``.gz`` to the matrix and barcode
                         file names; also forwarded to ``save_features_func``.

    Raises:
        AssertionError: If the matrix does not have an integer dtype or is
            not a COO matrix after conversion.
    """
    self.tocoo()

    cr_io.makedirs(base_dir, allow_existing=True)

    out_matrix_fn = os.path.join(base_dir, 'matrix.mtx')
    out_barcodes_fn = os.path.join(base_dir, 'barcodes.tsv')
    if compress:
        out_matrix_fn += '.gz'
        out_barcodes_fn += '.gz'

    # This method only supports an integer matrix.
    assert self.m.dtype in ['uint32', 'int32', 'uint64', 'int64']
    assert isinstance(self.m, sp_sparse.coo.coo_matrix)

    rows, cols = self.m.shape

    # Header fields in the file
    rep = 'coordinate'
    field = 'integer'
    symmetry = 'general'

    # Work on a copy so the format version is not injected into the
    # dictionary owned by the caller.
    metadata = metadata.copy() if metadata else {}
    metadata.update({
        'format_version': MATRIX_H5_VERSION,
    })

    metadata_str = json.dumps(metadata)
    comment = 'metadata_json: %s' % metadata_str

    with cr_io.open_maybe_gzip(out_matrix_fn, 'w') as stream:
        # write initial header line
        stream.write(
            np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n'.format(
                rep, field, symmetry)))

        # write comments
        for line in comment.split('\n'):
            stream.write(np.compat.asbytes('%%%s\n' % (line)))

        # write shape spec
        stream.write(
            np.compat.asbytes('%i %i %i\n' % (rows, cols, self.m.nnz)))

        # write row, col, val in 1-based indexing
        for r, c, d in itertools.izip(self.m.row + 1, self.m.col + 1,
                                      self.m.data):
            stream.write(np.compat.asbytes(("%i %i %i\n" % (r, c, d))))

    # both GEX and ATAC provide an implementation of this in respective feature_ref.py
    save_features_func(self.feature_ref, base_dir, compress=compress)

    with cr_io.open_maybe_gzip(out_barcodes_fn, 'w') as f:
        for bc in self.bcs:
            f.write(bc + '\n')