Example 1
def save_lsa_csv(lsa_map, matrix, base_dir):
    for n_components, lsa in lsa_map.iteritems():
        n_components_dir = os.path.join(base_dir, '%d_components' % n_components)
        cr_io.makedirs(n_components_dir, allow_existing=True)

        matrix_fn = os.path.join(n_components_dir, 'projection.csv')
        n_columns = lsa.transformed_lsa_matrix.shape[1]
        assert n_columns <= n_components
        matrix_header = ['Barcode'] + ['PC-%d' % (i + 1) for i in xrange(n_columns)]
        analysis_io.save_matrix_csv(matrix_fn, lsa.transformed_lsa_matrix, matrix_header,
                                    matrix.bcs)

        components_fn = os.path.join(n_components_dir, 'components.csv')
        components_header = ['PC'] + [f.id for f in matrix.feature_ref.feature_defs]
        analysis_io.save_matrix_csv(components_fn, lsa.components, components_header,
                                    range(1, n_components + 1))

        variance_fn = os.path.join(n_components_dir, 'variance.csv')
        variance_header = ['PC', 'Proportion.Variance.Explained']
        analysis_io.save_matrix_csv(variance_fn, lsa.variance_explained, variance_header,
                                    range(1, n_components + 1))

        dispersion_fn = os.path.join(n_components_dir, 'dispersion.csv')
        dispersion_header = ['Feature', 'Normalized.Dispersion']
        analysis_io.save_matrix_csv(dispersion_fn, lsa.dispersion, dispersion_header,
                                    [f.id for f in matrix.feature_ref.feature_defs])

        features_fn = os.path.join(n_components_dir, 'features_selected.csv')
        # TODO: there are two columns here, but only 1 entry in the header...BAD
        features_header = ['Feature']
        analysis_io.save_matrix_csv(features_fn, lsa.features_selected, features_header, range(1, len(lsa.features_selected) + 1))
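For orientation, each entry of lsa_map produces one directory of CSVs under base_dir; with n_components=2 the loop above writes (paths illustrative):

    2_components/projection.csv          # Barcode, PC-1, PC-2 per barcode
    2_components/components.csv          # one row per component, one column per feature ID
    2_components/variance.csv            # PC, Proportion.Variance.Explained
    2_components/dispersion.csv          # Feature, Normalized.Dispersion
    2_components/features_selected.csv   # selected features (see the header TODO above)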
Example 2
def save_differential_expression_csv(clustering_key, de, matrix, base_dir,
                                     cluster_names=None,
                                     file_name='differential_expression'):
    out_dir = base_dir
    if clustering_key is not None:
        out_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(out_dir, allow_existing=True)

    diff_expression_fn = os.path.join(out_dir, file_name + '.csv')
    diff_expression_header = ['Feature ID', 'Feature Name']

    n_clusters = de.data.shape[1] / 3
    for i in xrange(n_clusters):
        if cluster_names is None:
            diff_expression_header += ['Cluster %d Mean Counts' % (i + 1),
                                       'Cluster %d Log2 fold change' % (i + 1),
                                       'Cluster %d Adjusted p value' % (i + 1), ]
        else:
            diff_expression_header += ['Perturbation %s, Mean Counts' % cluster_names[i],
                                       'Perturbation %s, Log2 fold change' % cluster_names[i],
                                       'Perturbation %s, Adjusted p value' % cluster_names[i], ]


    diff_expression_prefixes = [(f.id, f.name) for f in matrix.feature_ref.feature_defs]
    analysis_io.save_matrix_csv(diff_expression_fn,
                                de.data,
                                diff_expression_header,
                                diff_expression_prefixes)
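For reference, a minimal sketch of the de.data layout the header construction above assumes: one row per feature and three columns per cluster (mean counts, log2 fold change, adjusted p-value). DummyDE is a hypothetical stand-in used only for illustration; the real differential-expression object just needs a .data array of that shape.

import numpy as np
from collections import namedtuple

DummyDE = namedtuple('DummyDE', ['data'])  # hypothetical stand-in; only .data is used above

# 2 features x (2 clusters * 3 columns): mean counts, log2 fold change, adjusted p-value
de = DummyDE(data=np.array([
    [10.0, 1.5, 0.01,   2.0, -0.7, 0.200],
    [ 0.5, 0.2, 0.90,   8.1,  2.3, 0.001],
]))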
Example 3
def save_top_perturbed_genes(base_dir,
                             results_per_perturbation,
                             column_map=TOP_GENES_SUMMARY_MAP,
                             num_genes_to_keep=NUM_TOP_GENES):
    if results_per_perturbation is None or results_per_perturbation == {}:
        return
    cr_io.makedirs(base_dir, allow_existing=True)
    fn = os.path.join(base_dir, 'top_perturbed_genes.csv')

    list_df_results = []
    summary_df_columns = []
    for perturbation in results_per_perturbation:
        this_results = sanitize_perturbation_results(
            results_per_perturbation.get(perturbation))

        if this_results is None:
            continue

        this_results = this_results[column_map.keys()]
        this_results = this_results[0:num_genes_to_keep]
        this_results.reset_index(drop=True, inplace=True)
        list_df_results.append(this_results)
        summary_df_columns += [
            'Perturbation: ' + perturbation + ', ' + s
            for s in column_map.values()
        ]

    if not list_df_results:
        # All perturbations were filtered out by sanitize_perturbation_results; nothing to write.
        return

    summary_df = pd.concat(list_df_results, ignore_index=True, axis=1)
    summary_df.columns = summary_df_columns
    summary_df.to_csv(fn, index=False)
Example 4
def save_gem_class_json(self, base_dir):
    json_file_path = MultiGenomeAnalysis.json_path(base_dir)
    cr_io.makedirs(os.path.dirname(json_file_path), allow_existing=True)
    with open(json_file_path, 'w') as f:
        json.dump(tk_safe_json.json_sanitize(self.result),
                  f,
                  indent=4,
                  sort_keys=True)
Example 5
def save_clustering_csv(base_dir, clustering_key, labels, barcodes):
    out_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(out_dir, allow_existing=True)

    clusters_fn = os.path.join(out_dir, 'clusters.csv')

    header = ['Barcode', 'Cluster']
    analysis_io.save_matrix_csv(clusters_fn, labels, header, barcodes)
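A minimal usage sketch (the clustering key, path, and values are hypothetical); this writes /tmp/analysis_csv/clustering/kmeans_3_clusters/clusters.csv with a 'Barcode,Cluster' header and one row per barcode:

import numpy as np

labels = np.array([1, 2, 1])  # cluster assignment per barcode
barcodes = ['AAACCTGAGAAACCAT-1', 'AAACCTGTCAAACGGG-1', 'AAACGGGAGAAACCTA-1']
save_clustering_csv('/tmp/analysis_csv/clustering', 'kmeans_3_clusters', labels, barcodes)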
Example 6
def main(args, outs):
    if args.skip:
        return

    analysis_io.combine_h5_files([args.kmeans_h5, args.graphclust_h5], outs.clustering_h5,
                                 [analysis_constants.ANALYSIS_H5_KMEANS_GROUP,
                                  analysis_constants.ANALYSIS_H5_CLUSTERING_GROUP])

    csv_path = outs.clustering_csv
    cr_io.makedirs(csv_path, allow_existing=True)
    copy_subdirs(args.kmeans_csv, csv_path)
    copy_subdirs(args.graphclust_csv, csv_path)
Example 7
def save_tsne_csv(tsne, matrix, base_dir):
    """Save a TSNE object to CSV"""
    # Preserve backward compatibility with pre-3.0 CSV files
    #   where the CSV directory was named "2_components" and the HDF5 dataset was named "_2"
    key = tsne.key + '_components'

    tsne_dir = os.path.join(base_dir, key)
    cr_io.makedirs(tsne_dir, allow_existing=True)

    matrix_fn = os.path.join(tsne_dir, 'projection.csv')
    n_tsne_components = tsne.transformed_tsne_matrix.shape[1]
    matrix_header = ['Barcode'] + [
        'TSNE-%d' % (i + 1) for i in xrange(n_tsne_components)
    ]
    analysis_io.save_matrix_csv(matrix_fn, tsne.transformed_tsne_matrix,
                                matrix_header, matrix.bcs)
Example 8
def save_differential_expression_csv_from_features(clustering_key, de, diff_expression_prefixes, base_dir):
    out_dir = os.path.join(base_dir, clustering_key)
    cr_io.makedirs(out_dir, allow_existing=True)

    diff_expression_fn = os.path.join(out_dir, 'differential_expression.csv')
    diff_expression_header = ['Feature ID', 'Feature Name']

    n_clusters = de.data.shape[1] / 3
    for i in xrange(n_clusters):
        diff_expression_header += ['Cluster %d Mean Counts' % (i + 1),
                                   'Cluster %d Log2 fold change' % (i + 1),
                                   'Cluster %d Adjusted p value' % (i + 1), ]

    analysis_io.save_matrix_csv(diff_expression_fn,
                                de.data,
                                diff_expression_header,
                                diff_expression_prefixes)
Example 9
def save_pca_csv_with_bc_feature(pca_map, barcodes, features, base_dir):
    for n_components, pca in pca_map.iteritems():
        n_components_dir = os.path.join(base_dir,
                                        '%d_components' % n_components)
        cr_io.makedirs(n_components_dir, allow_existing=True)

        matrix_fn = os.path.join(n_components_dir, 'projection.csv')
        n_columns = pca.transformed_pca_matrix.shape[1]
        assert n_columns <= n_components
        matrix_header = ['Barcode'] + ['PC-%d' % (i + 1) for i in xrange(n_columns)]
        analysis_io.save_matrix_csv(matrix_fn, pca.transformed_pca_matrix,
                                    matrix_header, barcodes)

        # FBPCA presently provides 0-sized entries for the following PCA() member variables.
        #   This allows us to distinguish FBPCA from IRLBA, and also avoids weird empty files.
        if pca.components.size > 0:
            components_fn = os.path.join(n_components_dir, 'components.csv')
            components_header = ['PC'] + [f.id for f in features]
            analysis_io.save_matrix_csv(components_fn, pca.components,
                                        components_header,
                                        range(1, n_components + 1))

        if pca.variance_explained.size > 0:
            variance_fn = os.path.join(n_components_dir, 'variance.csv')
            variance_header = ['PC', 'Proportion.Variance.Explained']
            analysis_io.save_matrix_csv(variance_fn, pca.variance_explained,
                                        variance_header,
                                        range(1, n_components + 1))

        if pca.dispersion.size > 0:
            dispersion_fn = os.path.join(n_components_dir, 'dispersion.csv')
            dispersion_header = ['Feature', 'Normalized.Dispersion']
            analysis_io.save_matrix_csv(dispersion_fn, pca.dispersion,
                                        dispersion_header,
                                        [f.id for f in features])

        if pca.features_selected.size > 0:
            features_fn = os.path.join(n_components_dir,
                                       'features_selected.csv')
            # TODO: there are two columns here, but only 1 entry in the header...BAD
            features_header = ['Feature']
            analysis_io.save_matrix_csv(
                features_fn, pca.features_selected, features_header,
                range(1, len(pca.features_selected) + 1))
Example 10
def save_gem_class_csv(self, base_dir):
    csv_file_path = os.path.join(base_dir, 'gem_classification.csv')
    cr_io.makedirs(os.path.dirname(csv_file_path), allow_existing=True)
    with open(csv_file_path, 'wb') as f:
        writer = csv.writer(f, lineterminator=os.linesep)
        writer.writerow([
            'barcode', self.result['genome0'], self.result['genome1'],
            'call'
        ])
        for i in xrange(len(self.result['barcode'])):
            call = self.result['call'][i]
            call = call.replace(analysis_constants.GEM_CLASS_GENOME0,
                                self.result['genome0'])
            call = call.replace(analysis_constants.GEM_CLASS_GENOME1,
                                self.result['genome1'])
            writer.writerow([
                self.result['barcode'][i],
                self.result['count0'][i],
                self.result['count1'][i],
                call,
            ])
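For orientation, the shape of self.result that save_gem_class_csv relies on (the keys come from the code above; the values here are made up):

result = {
    'genome0': 'GRCh38',                                      # illustrative genome names
    'genome1': 'mm10',
    'barcode': ['AAACCTGAGAAACCAT-1', 'AAACCTGTCAAACGGG-1'],
    'count0': [1520, 12],                                     # per-barcode counts for genome0
    'count1': [8, 1895],                                      # per-barcode counts for genome1
    'call': [analysis_constants.GEM_CLASS_GENOME0,            # placeholder calls, rewritten to the
             analysis_constants.GEM_CLASS_GENOME1],           # actual genome names when written out
}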
Example 11
def main(args, outs):
    list_of_files = [
        args.protospacer_calls_summary, args.protospacer_calls_per_cell,
        args.cells_per_protospacer, args.protospacer_umi_thresholds_csv,
        args.protospacer_umi_thresholds_json,
        args.perturbation_efficiencies_by_feature,
        args.perturbations_efficiencies_by_target
    ]

    cr_io.makedirs(outs.crispr_analysis, allow_existing=True)

    for (file_path, file_name) in itertools.izip(
            list_of_files, protospacer_calling.CRISPR_ANALYSIS_FILE_NAMES):
        if file_path is None:
            continue
        cr_io.copy(file_path, os.path.join(outs.crispr_analysis, file_name))

    if os.path.isdir(args.perturbation_effects_by_feature):
        perturbation_effects_by_feature_dir = os.path.join(
            outs.crispr_analysis, 'perturbation_effects_by_feature')
        cr_io.makedirs(perturbation_effects_by_feature_dir,
                       allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_feature,
                       perturbation_effects_by_feature_dir,
                       allow_existing=True)

    if os.path.isdir(args.perturbation_effects_by_target):
        perturbation_effects_by_target_dir = os.path.join(
            outs.crispr_analysis, 'perturbation_effects_by_target')
        cr_io.makedirs(perturbation_effects_by_target_dir, allow_existing=True)
        cr_io.copytree(args.perturbation_effects_by_target,
                       perturbation_effects_by_target_dir,
                       allow_existing=True)
Example 12
def main(args, outs):
    if args.skip:
        return

    if args.is_multi_genome:
        cr_io.copytree(args.multi_genome_json, outs.analysis)
        cr_io.copytree(args.multi_genome_csv, outs.analysis_csv)

    analysis_h5 = analysis_io.h5_path(outs.analysis)
    cr_io.makedirs(os.path.dirname(analysis_h5), allow_existing=True)

    # Pytables doesn't support variable len strings, so use h5py first
    with h5.File(args.matrix_h5, 'r') as matrix,\
         h5.File(analysis_h5, 'w') as out:
        # TODO: copy the first group; fixme when we have a key
        name = matrix.keys()[0]
        matrix.copy(matrix[name], out, name='matrix')

    with tables.open_file(args.pca_h5, 'r') as pca,\
         tables.open_file(args.clustering_h5, 'r') as clustering,\
         tables.open_file(args.diffexp_h5, 'r') as diffexp,\
         tables.open_file(args.tsne_h5, 'r') as tsne,\
         tables.open_file(analysis_h5, 'a') as out:

        pca.copy_children(pca.root, out.root, recursive=True)
        clustering.copy_children(clustering.root, out.root, recursive=True)
        diffexp.copy_children(diffexp.root, out.root, recursive=True)
        tsne.copy_children(tsne.root, out.root, recursive=True)

    pca_dir = os.path.join(outs.analysis_csv, 'pca')
    cr_io.copytree(args.pca_csv, pca_dir)

    clustering_dir = os.path.join(outs.analysis_csv, 'clustering')
    cr_io.copytree(args.clustering_csv, clustering_dir)

    diffexp_dir = os.path.join(outs.analysis_csv, 'diffexp')
    cr_io.copytree(args.diffexp_csv, diffexp_dir)

    tsne_dir = os.path.join(outs.analysis_csv, 'tsne')
    cr_io.copytree(args.tsne_csv, tsne_dir)
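The assembled outs.analysis_csv directory then contains one subdirectory per analysis type (names from the code above; their contents are produced by the save_*_csv functions in the earlier examples, plus the multi-genome CSVs copied in when is_multi_genome is set):

    analysis_csv/
        pca/
        clustering/
        diffexp/
        tsne/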
Example 13
    def save_mex(self,
                 base_dir,
                 save_features_func,
                 metadata=None,
                 compress=True):
        """Save in Matrix Market Exchange format.
        Args:
          base_dir (str): Path to directory to write files in.
          save_features_func (func): Func that takes (FeatureReference, base_dir, compress) and writes
                                     a file describing the features.
          metadata (dict): Optional metadata to encode into the comments as JSON.
        """
        self.tocoo()

        cr_io.makedirs(base_dir, allow_existing=True)

        out_matrix_fn = os.path.join(base_dir, 'matrix.mtx')
        out_barcodes_fn = os.path.join(base_dir, 'barcodes.tsv')
        if compress:
            out_matrix_fn += '.gz'
            out_barcodes_fn += '.gz'

        # This method only supports an integer matrix.
        assert self.m.dtype in ['uint32', 'int32', 'uint64', 'int64']
        assert type(self.m) == sp_sparse.coo.coo_matrix

        rows, cols = self.m.shape
        # Header fields in the file
        rep = 'coordinate'
        field = 'integer'
        symmetry = 'general'

        metadata = metadata or {}
        metadata.update({
            'format_version': MATRIX_H5_VERSION,
        })

        metadata_str = json.dumps(metadata)
        comment = 'metadata_json: %s' % metadata_str

        with cr_io.open_maybe_gzip(out_matrix_fn, 'w') as stream:
            # write initial header line
            stream.write(
                np.compat.asbytes('%%MatrixMarket matrix {0} {1} {2}\n'.format(
                    rep, field, symmetry)))

            # write comments
            for line in comment.split('\n'):
                stream.write(np.compat.asbytes('%%%s\n' % (line)))

            # write shape spec
            stream.write(
                np.compat.asbytes('%i %i %i\n' % (rows, cols, self.m.nnz)))
            # write row, col, val in 1-based indexing
            for r, c, d in itertools.izip(self.m.row + 1, self.m.col + 1,
                                          self.m.data):
                stream.write(np.compat.asbytes(("%i %i %i\n" % (r, c, d))))

        # both GEX and ATAC provide an implementation of this in respective feature_ref.py
        save_features_func(self.feature_ref, base_dir, compress=compress)

        with cr_io.open_maybe_gzip(out_barcodes_fn, 'w') as f:
            for bc in self.bcs:
                f.write(bc + '\n')
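For reference, the matrix.mtx stream written above starts roughly like this (a toy 3x2 matrix with 4 non-zeros; the metadata_json line depends on MATRIX_H5_VERSION and any caller-supplied metadata):

    %%MatrixMarket matrix coordinate integer general
    %metadata_json: {"format_version": ...}
    3 2 4
    1 1 5
    3 1 1
    1 2 2
    2 2 7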