Beispiel #1
0
def main(args, outs):
    """Estimate GC-bias coefficients and export them on ``outs``.

    ``estimate_gc_bias`` is run on the raw single-cell profiles and the
    ``(quadratic, linear, intercept)`` triple is read from
    ``result['Summary']['quadratic_coefficients']``. If that step raises, a
    martian alarm is emitted and the neutral defaults (intercept=1.0,
    linear=0.0, quadratic=0.0) are exported instead.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``, ``tracks`` and
            ``reference_path`` are read.
        outs: stage outputs; ``linear`` and ``quadratic`` are assigned.
            The intercept is only printed, not exported.
    """
    # NOTE(review): the loaded matrix itself is not used in this stage. The
    # call is kept outside the try block so that an unreadable input still
    # fails the stage up front instead of being downgraded to an alarm below.
    # The previously computed (and unused) cell count was dropped because
    # indexing the first contig raised IndexError on an empty matrix.
    coverage_matrix.load_matrix(args.raw_singlecell_profiles,
                                args.reference_path)
    # Neutral defaults, used when the estimation below fails.
    (intercept, linear, quadratic) = (1.0, 0.0, 0.0)
    try:
        print('DEBUG 0')
        result = estimate_gc_bias(args.raw_singlecell_profiles, args.tracks,
                                  args.reference_path)
        print('DEBUG result')
        print(result)
        # The coefficients are stored highest order first.
        (quadratic, linear,
         intercept) = result['Summary']['quadratic_coefficients']
        print('DEBUG intercept=%f, linear=%f, quadratic=%f' %
              (intercept, linear, quadratic))
    except Exception as error:
        # Deliberate best effort: report the failure and fall through with
        # whatever coefficients are currently bound.
        martian.alarm(
            "stages/copy_number_processor/estimate_gc_bias_coefficients/__init__ encountered an exception. Error: %s"
            % repr(error))
    # Export GC bias coefficients
    outs.linear = linear
    outs.quadratic = quadratic
def load_data(fn, reference_path):
    """Load the coverage matrix, its mask and the barcodes stored in ``fn``.

    Args:
        fn: path of the HDF5 profiles file.
        reference_path: reference path forwarded to
            ``coverage_matrix.load_matrix``.

    Returns:
        ``(X, Xm, barcodes)``: the two values returned by
        ``coverage_matrix.load_matrix`` followed by the ``"barcodes"`` entry
        of the file as a numpy array.

    Raises:
        KeyError: if the file has no ``"barcodes"`` entry (raised by pandas).
    """
    X, Xm = coverage_matrix.load_matrix(fn, reference_path)

    store = pd.HDFStore(fn, "r")
    try:
        barcodes = np.array(store["barcodes"])
    finally:
        # Close the handle even when the barcodes entry is missing.
        store.close()
    return X, Xm, barcodes
def main(args, outs):
    """Write the raw single-cell profiles to the normalized-profiles output.

    The profiles loaded from ``args.raw_singlecell_profiles`` are passed
    unchanged to ``coverage_matrix.store_matrix`` together with the primary
    contig list, the tracks store and the bin size of the input file.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``,
            ``reference_path`` and ``tracks`` are read.
        outs: stage outputs; ``normalized_singlecell_profiles`` is the path
            of the file that is written.
    """
    # The mask returned alongside the profiles is not needed here.
    raw_profiles, _ = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print(chromosomes)

    bin_size = coverage_matrix.get_bin_size(args.raw_singlecell_profiles)
    tracks = pd.HDFStore(args.tracks, 'r')
    try:
        coverage_matrix.store_matrix(
            file_name=outs.normalized_singlecell_profiles,
            chroms=chromosomes,
            profiles=raw_profiles,
            tracks=tracks,
            window_size=bin_size)
    finally:
        # Release the tracks file even if writing the matrix fails.
        tracks.close()
Beispiel #4
0
 def _load_hdf5_file(self, filename, reference_path, reuse_mask_from=None):
     """Populate this object's per-contig state from an HDF5 profiles file.

     Sets ``_contig_manager``, ``_window_size``, ``_barcodes``,
     ``_num_cells``, ``_contig_list``, ``_contig_coverage``,
     ``_contig_mask`` and ``_contig_idx``.

     Args:
         filename: path of the HDF5 profiles file.
         reference_path: reference path used for ``load_matrix`` and for
             building the contig manager.
         reuse_mask_from: optional object exposing a ``_contig_mask`` dict;
             when truthy its masks are used instead of the ones loaded
             from ``filename``.

     Raises:
         ValueError: if ``load_matrix`` returns no contigs.
     """
     X, Xm = load_matrix(filename, reference_path)
     if len(X) == 0:
         raise ValueError(
             "Loading profiles from %s returned zero contigs matching the reference at %s"
             % (filename, reference_path))
     # it's low cost to hang on to a contig_manager instance
     self._contig_manager = contig_manager.contig_manager(reference_path)
     # this list has to match the list inside load_matrix
     primary_contigs = self._contig_manager.primary_contigs(
         allow_sex_chromosomes=True)
     store = pd.HDFStore(filename, "r")
     try:
         self._window_size = store['constants']['window_size']
         if 'barcodes' in store:
             self._barcodes = np.array(store['barcodes'])
             self._num_cells = len(self._barcodes)
         else:
             # No barcodes stored: fall back to the length of the first
             # loaded profile.
             self._barcodes = None
             self._num_cells = len(X[0])
         profile_contigs = set(list_all_contigs(store))
         # i walks X/Xm, which only hold the primary contigs that are
         # present in the file, so it advances only on a match.
         i = 0
         self._contig_list = []
         self._contig_coverage = {}
         self._contig_mask = {}
         self._contig_idx = {}
         for chrom in primary_contigs:
             if chrom in profile_contigs:
                 self._contig_list.append(chrom)
                 self._contig_coverage[chrom] = X[i]
                 if reuse_mask_from:
                     self._contig_mask[chrom] = reuse_mask_from._contig_mask[
                         chrom]
                 else:
                     self._contig_mask[chrom] = Xm[i]
                 nbins = X[i].shape[1]
                 # 1-based start coordinate of every bin on this contig.
                 self._contig_idx[chrom] = np.arange(
                     1, nbins * self._window_size, self._window_size)
                 i = i + 1
     finally:
         # Always release the file handle, including when a key is missing
         # or the reused mask lacks a contig.
         store.close()
def main(args, outs):
    """Cluster the normalized single-cell profiles and write the clusters.

    The default result is a single cluster holding every cell index. It is
    replaced by the output of ``cluster_jedna.cluster`` unless
    ``args.skip_clustering`` is set or clustering raises, in which case an
    alarm is emitted and the single-cluster default is written.

    Args:
        args: stage arguments; ``normalized_singlecell_profiles``,
            ``reference_path`` and ``skip_clustering`` are read.
        outs: stage outputs; ``clusters`` is the path of the JSON file
            that is written.
    """
    normalized_singlecell_profiles, mask = coverage_matrix.load_matrix(
        args.normalized_singlecell_profiles, args.reference_path)

    print('DEBUG generate_final_clusters/__init__.main():')
    print('normalized_singlecell_profiles[0].shape')
    print(normalized_singlecell_profiles[0].shape)

    ncells = normalized_singlecell_profiles[0].shape[0]
    # Fallback: all cells in one cluster. list(...) keeps this a plain list
    # on interpreters where range() is lazy.
    results = [list(range(ncells))]
    try:
        if args.skip_clustering:
            print('Skipping clustering.')
        else:
            # NOTE: a short circuit for more than 500 cells used to live
            # here, but its condition was hard-wired to True, so clustering
            # always ran. The dead branch has been removed.
            results = cluster_jedna.cluster(normalized_singlecell_profiles,
                                            mask,
                                            n_merge=25,
                                            score_cutoff=5)
    except Exception as error:
        # Deliberate best effort: keep the single-cluster fallback.
        martian.alarm(
            "Clustering encountered an exception. Putting all cells in one cluster. Error: %s"
            % repr(error))
    with open(outs.clusters, 'w') as out_file:
        out_file.write(tenkit.safe_json.safe_jsonify(results))
Beispiel #6
0
def main(args, outs):
    """Segment every normalized profile into blocks, contig by contig.

    For each primary contig (sex chromosomes excluded) and each row
    ("node") of the normalized profile on that contig:

    1. a merge factor ``n_merge`` is estimated from the node's raw counts,
       falling back to ``default_n_merge`` when the estimate is not finite
       or exceeds the number of bins;
    2. the mask and the masked profile are both merged by ``n_merge`` with
       averaging, and the profile is divided by the merged mask fraction
       (fractions <= 0.2 become NaN);
    3. ``dvr.partition_profile`` splits the merged profile into blocks whose
       ``Start``/``End`` are rescaled by ``original_bin_size * n_merge``.

    All blocks are passed to ``export_segments`` and the merged profiles are
    saved next to the output for debugging.

    Args:
        args: stage arguments; ``params`` (``dvr_*`` heuristics),
            ``raw_profiles``, ``normalized_profiles`` and
            ``reference_path`` are read.
        outs: stage outputs; ``cnvs`` is the output path, and
            ``outs.cnvs + '_profiles.npy'`` receives the debugging dump.
    """
    # Read in heuristics
    dvr_segment_length = int(args.params["dvr_segment_length"])
    dvr_f_cutoff = args.params["dvr_f_cutoff"]
    dvr_trim_level = args.params["dvr_trim_level"]
    dvr_min_length = int(args.params["dvr_min_length"])
    #
    print('-' * 80)
    print('Entering __init__.main()')
    #
    default_n_merge = 50
    default_bin_size = 2e4
    target_bin_count = 200.0
    confident_genome_fraction = 1.0
    raw_profiles, _ = coverage_matrix.load_matrix(args.raw_profiles,
                                                  args.reference_path)
    chromosomes = coverage_matrix.list_primary_contigs(
        args.normalized_profiles,
        args.reference_path,
        allow_sex_chromosomes=False)  # TODO: allow sex chromosomes
    normalized_profiles_h5 = pd.HDFStore(args.normalized_profiles, 'r')
    segments = []
    profiles_list = []
    # Everything that reads from the store sits inside try/finally so the
    # file handle is released even when a contig fails.
    try:
        try:
            original_bin_size = normalized_profiles_h5["constants"][
                "window_size"]
        except Exception as error:
            # Best effort: fall back to the default bin size.
            print('__init__.main() caught an error %s' % repr(error))
            original_bin_size = default_bin_size
        #
        for chrom_name in chromosomes:
            tmp = []
            chr_profiles = normalized_profiles_h5[
                '/contigs/' + chrom_name].astype(np.float64).values
            mask = coverage_matrix.contig_to_mask(normalized_profiles_h5,
                                                  chrom_name)
            # One mask entry per profile column.
            assert (mask.shape[0] == chr_profiles.shape[1])
            n_nodes = chr_profiles.shape[0]
            # range() instead of xrange() so the loop also runs on Python 3.
            for node_id in range(n_nodes):
                # Dynamically determine bin resolution.
                # NOTE(review): these two calls do not take chrom_name, so
                # they look invariant across contigs and could be computed
                # once per node - confirm the helpers are pure first.
                raw_cell_counts = scn.get_single_cell_counts(
                    raw_profiles, node_id)
                n_bins = raw_cell_counts.shape[0]
                n_merge = scn.estimate_n_merge(raw_cell_counts,
                                               target_bin_count,
                                               confident_genome_fraction)
                if (np.isnan(n_merge) | ~np.isfinite(n_merge) |
                    (n_merge > n_bins)):
                    n_merge = default_n_merge
                bin_size = original_bin_size * n_merge
                #
                # Fraction of unmasked bins inside each merged bin; merged
                # bins with too little support are turned into NaN.
                weights = mask.copy()
                weights = weights.astype(float)
                weights = scn.merge_bins_single_cell_single_chrom(
                    weights, n_merge, average=True)
                weights[weights <= 0.2] = np.nan
                #
                # This is a view, so zeroing the masked bins also edits
                # chr_profiles, which is a local array.
                cnv_profile = chr_profiles[node_id, :]
                cnv_profile[~mask] = 0.0
                #
                # Merge bins, then undo the dilution caused by masked bins.
                cnv_profile = scn.merge_bins_single_cell_single_chrom(
                    cnv_profile, n_merge, average=True)
                cnv_profile /= weights
                #
                # Partition this profile into blocks.
                block = dvr.partition_profile(
                    cnv_profile,
                    segment_length=dvr_segment_length,
                    f_cutoff=dvr_f_cutoff,
                    trim_level=dvr_trim_level,
                    min_length=dvr_min_length)
                block['NodeID'] = node_id
                block['Chr'] = chrom_name
                # Merged-bin indices -> 1-based inclusive coordinates.
                block['Start'] = block['Start'] * bin_size + 1
                block['End'] = (block['End'] + 1) * bin_size
                segments.append(block)
                #
                # Debugging:
                tmp.append(cnv_profile)
            # for node_id
            #
            # Debugging:
            profiles_list.append(tmp)
        # for chrom_name
    finally:
        normalized_profiles_h5.close()
    #
    export_segments(outs.cnvs, segments)
    #
    # Debugging:
    np.save(outs.cnvs + '_profiles.npy', profiles_list)
    print('Leaving __init__.main()')
    print('.' * 80)
Beispiel #7
0
def load_data(file_name, reference_path):
    """Return the ``(profiles, mask)`` pair read from ``file_name``.

    Thin wrapper around ``coverage_matrix.load_matrix``; the result is
    unpacked into exactly two values and handed back as a tuple.
    """
    loaded_profiles, loaded_mask = coverage_matrix.load_matrix(
        file_name, reference_path)
    return loaded_profiles, loaded_mask
def main(args, outs):
    """Score CNV calls for one chunk of nodes and export them.

    The merged calls in ``args.cnv_calls`` are processed first and written
    to ``outs.cnvs``. The unmerged calls in ``args.unmerged_cnv_calls`` are
    then processed with the ``logp`` value returned by the first pass and
    written to ``outs.unmerged_cnvs``. The chunk covers nodes
    ``args.chunk['start']`` up to, but not including, ``args.chunk['end']``.
    """
    martian.log_info('Entering __init__.main()')
    first_node = args.chunk['start']
    last_node = args.chunk['end']  # exclusive
    cell_profiles, cell_mask = coverage_matrix.load_matrix(
        args.raw_profiles,
        args.reference_path,
        start_cell=first_node,
        end_cell=last_node)

    window = coverage_matrix.get_bin_size(args.raw_profiles)

    # CNV calls restricted to the nodes of this chunk
    node_col = COLUMN_NAMES.index("NodeID")
    merged_calls = read_cnv_data(args.cnv_calls, first_node, last_node,
                                 node_col)

    scaling = get_scaling_factors(cell_profiles, merged_calls)
    with open(args.gc_norm_params, "r") as params_file:
        gc_params = json.load(params_file)
    gc_linear = gc_params["linear"]
    gc_quadratic = gc_params["quadratic"]

    reference = contig_manager.contig_manager(args.reference_path)

    # Per-bin track parameters are appended to this list by the loader
    track_params = []
    vesna.load_track_parameters(args.tracks, track_params, reference)

    def score_calls(calls, **extra):
        # Both passes share every argument except the calls themselves and
        # the optional logp keyword.
        return ccs.process_cnv_calls(cell_profiles, cell_mask, track_params,
                                     args.reference_path, args.sex, scaling,
                                     gc_linear, gc_quadratic, calls, window,
                                     **extra)

    logp, scored_calls = score_calls(merged_calls)
    export_segments(outs.cnvs, scored_calls, first_node)

    # Drop the merged calls before loading the unmerged ones
    del merged_calls, scored_calls

    # Second pass: unmerged, broken-up CNV calls
    raw_unmerged = read_cnv_data(args.unmerged_cnv_calls, first_node,
                                 last_node, node_col)
    _, scored_unmerged = score_calls(raw_unmerged, logp=logp)
    export_segments(outs.unmerged_cnvs, scored_unmerged, first_node)

    martian.log_info('Leaving __init__.main()')
    martian.log_info('.' * 80)
Beispiel #9
0
def main(args, outs):
    """Divide raw single-cell profiles by their GC/mappability expectation.

    For every primary contig the expected coverage per bin is
    ``mappability * (1 + linear * g + quadratic * g**2)``, where ``g`` is
    the value returned by ``get_gc``. Each cell's raw profile is divided by
    that expectation and negative results are clipped to zero. A contig
    whose processing raises is reported through an alarm and left out of
    the output.

    Args:
        args: stage arguments; ``raw_singlecell_profiles``,
            ``reference_path``, ``tracks``, ``linear`` and ``quadratic``
            are read.
        outs: stage outputs; ``normalized_singlecell_profiles`` is the path
            of the file that is written.
    """
    normalized_profiles = []
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    print('len(mask)=%d' % len(mask))
    print('len(raw_profiles)=%d' % len(raw_profiles))

    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print('chromosomes:')
    print(chromosomes)
    #
    # Get mappability, GC content:
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters)
    n_cells = raw_profiles[0].shape[0]
    linear = args.linear
    quadratic = args.quadratic
    gc0 = 0.45  # TODO: Replace this with mean of GC in good bins across entire genome
    #
    # Contigs that failed below; they are dropped before export so the
    # contig list stays aligned with normalized_profiles.
    remove = []
    for chrom_index, chrom_name in enumerate(chromosomes):
        try:
            # NOTE(review): ordered_chromosomes is not defined inside this
            # function. Unless it exists at module level, every contig
            # raises NameError here and is removed - confirm.
            mappability = get_mappability(bin_parameters, chrom_name,
                                          ordered_chromosomes)
            gc_gc0 = get_gc(bin_parameters, gc0, chrom_name,
                            ordered_chromosomes)
            print('len(mappability)=%d' % len(mappability))
            print('len(gc_gc0)=%d' % len(gc_gc0))
            print('raw_profiles[chrom_index].shape:')
            print(raw_profiles[chrom_index].shape)
            expectation = mappability * (1.0 + linear * gc_gc0 +
                                         quadratic * gc_gc0 * gc_gc0)
            tmp = np.zeros(raw_profiles[chrom_index].shape, dtype='float')
            for cell in range(n_cells):
                tmp[cell, :] = raw_profiles[chrom_index][cell, :] / expectation
                # Clip negative normalized values to zero.
                tmp[cell, tmp[cell, :] < 0.0] = 0.0
            # for cell
            normalized_profiles.append(tmp)
        except Exception as error:
            # Deliberate best effort: report, then drop this contig.
            message = (
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            martian.alarm(message)
            print(message)
            print(
                'Removing chrom_name=%s, chrom_index=%d (absent from input raw profiles)'
                % (chrom_name, chrom_index))
            remove.append(chrom_name)
        # try/except
    # for chrom
    for chrom_name in remove:
        if chrom_name in chromosomes:
            chromosomes.remove(chrom_name)
    #
    # Export normalized cell profiles
    bin_size = 20000  # TODO: Fetch this value from input raw_profiles h5 file
    tracks = pd.HDFStore(args.tracks, 'r')
    try:
        coverage_matrix.store_matrix(
            file_name=outs.normalized_singlecell_profiles,
            chroms=chromosomes,
            profiles=normalized_profiles,
            tracks=tracks,
            window_size=bin_size)
    finally:
        # Release the tracks file even if writing the matrix fails.
        tracks.close()