Example #1
def join(args, outs, chunk_defs, chunk_outs):
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # factorization checks
    check_factorization(args.factorization)

    # normalization checks
    if args.normalization not in ALLOWED_NORMALIZATIONS:
        martian.exit(
            "Unsupported normalization method provided. Options are {}.".
            format(", ".join(ALLOWED_NORMALIZATIONS)))

    # Reference: check directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # Check reference usability and file formats
    check_reference_format(args.reference_path)

    # aggr csv checks
    if args.aggr_csv is None:
        martian.exit("aggregation csv must be provided")
    check_aggr_csv(args.aggr_csv, args.reference_path)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
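check_refdata returns an (ok, msg) tuple so the validation logic stays separate from the decision to log or exit. A minimal sketch of a check in the same style, assuming a purely hypothetical reference layout (fasta/genome.fa is an assumption, not taken from the source):

import os

def check_refdata_sketch(reference_path):
    # Hypothetical layout check: a reference directory containing a
    # genome FASTA at fasta/genome.fa.
    if not os.path.isdir(reference_path):
        return False, "Reference path does not exist: %s" % reference_path
    if not os.path.isfile(os.path.join(reference_path, "fasta", "genome.fa")):
        return False, "Reference is missing fasta/genome.fa"
    return True, "Reference directory structure looks valid"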
Example #2
def run_assembly(fastq_pref, fasta_pref, args):
    cmd = [
        'vdj_asm',
        'asm',
        fastq_pref,
        fasta_pref,
        '--kmers=' + str(args.min_kmer_count),
        '--min-contig=' + str(args.min_contig_len),
        '--npaths=' + str(args.npaths),
        '--nx=' + str(args.nx),
        '--min-qual=' + str(args.min_qual),
        '--score-factor=' + str(args.score_factor),
        '--qual-factor=' + str(args.qual_factor),
        '--min-sw-score=' + str(args.min_sw_score),
        '--rt-error=' + str(args.rt_error),
        '--subsample-rate=' + str(args.subsample_rate[str(args.gem_group)]),
    ]
    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')
    if not args.use_sw:
        cmd.append('--fast-align')
    if args.min_readpairs_per_umi is not None:
        # If only assembling with read2, adjust this cutoff
        # NOTE: Martian stores the gem_group dict keys as strings
        cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
        cmd.append('--min-umi-reads=' + str(cutoff))
    print >> sys.stderr, 'Running', ' '.join(cmd)
    subprocess.check_call(cmd, cwd=os.getcwd())
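The NOTE about string keys applies because Martian stage args arrive via JSON, and JSON object keys are always strings. A standalone illustration using plain json (no Martian required):

import json

args_json = json.loads('{"subsample_rate": {"1": 0.5}}')
rates = args_json["subsample_rate"]
gem_group = 1
# rates[gem_group] would raise KeyError: the key is the string "1", not the int 1
print(rates[str(gem_group)])  # 0.5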
Example #3
def get_downsample_info(downsample, total_seq_bases):
    """Returns information about input versus requested GB when downsampling
    """
    available_gb = total_seq_bases / 1e9
    requested_gb = None
    requested_rate = None
    downsample_succeeded = True

    # Default to no downsampling; this also covers a downsample dict that
    # specifies neither 'gigabases' nor 'subsample_rate', which would
    # otherwise leave post_downsample_gb unset.
    post_downsample_gb = available_gb

    if downsample is not None:
        if downsample.get('gigabases') is not None:
            requested_gb = float(downsample['gigabases'])
            post_downsample_gb = min(available_gb, requested_gb)
            if available_gb < requested_gb:
                martian.log_info(
                    "Downsample requested more GB than was available; will not downsample."
                )
                downsample_succeeded = False
        elif downsample.get('subsample_rate') is not None:
            requested_rate = float(downsample['subsample_rate'])
            post_downsample_gb = available_gb * requested_rate

    return {
        'available_gb': available_gb,
        'requested_gb': requested_gb,
        'requested_rate': requested_rate,
        'post_downsample_gb': post_downsample_gb,
        'downsample_succeeded': downsample_succeeded
    }
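A worked call with hypothetical inputs: 120 Gbp of available sequence and a 50 GB downsample request succeeds, so the returned dict reports the requested amount:

info = get_downsample_info({'gigabases': 50}, 120e9)
# info == {'available_gb': 120.0, 'requested_gb': 50.0,
#          'requested_rate': None, 'post_downsample_gb': 50.0,
#          'downsample_succeeded': True}

A request for 200 GB would instead log a message, leave post_downsample_gb at 120.0, and set downsample_succeeded to False.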
Example #4
def run_model(dirname, targeted=False):
    if targeted:
        exe = "len_mass_model_targeted"
        iters = 600
    else:
        exe = 'len_mass_model'
        iters = 900

    args = [
        exe,
        'optimize',
        'iter=%d' % iters,
        'data',
        "file=" + os.path.join(dirname, "input.R"),
        'output',
        'file=' + os.path.join(dirname, "output.csv"),
        "refresh=10",
        "init=" + os.path.join(dirname, "init.R"),
        "random",
        "seed=4072180573",
    ]

    print " ".join(args)
    proc = subprocess.Popen(args,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    (stdoutdata, stderrdata) = proc.communicate()
    martian.log_info(stdoutdata)
    martian.log_info(stderrdata)
    return (proc.returncode == 0)
Example #5
def split(args):
    contig_info = get_contig_info(args)
    length_bases = contig_info["contig_lengths"].values()
    total_bins = 0
    for bases in length_bases:
        bins, over = divmod(bases, args.profile_bin_size)
        total_bins += bins
        if over > 0:
            total_bins += 1

    with open(args.per_cell_summary_metrics) as f:
        summary_line_count = len(f.readlines())
    total_nodes = 2 * (summary_line_count - 1) - 1
    martian.log_info("Bins: %d, Nodes: %d" % (total_bins, total_nodes))

    # Observed worst-case dlconverter memory use occurs when it copies two
    # matrices sized (total_nodes x total_bins): the float64 read depth
    # (2 copies x 8 bytes per entry) and the int8 ploidy (2 copies x 1 byte
    # per entry), for a theoretical 18 bytes per entry, i.e.
    # 18 * total_nodes * total_bins. Observed usage is closer to 21x, so
    # 24x was originally used to be safe.
    #
    # Estimated hit for a 1000-cell dataset is 7GB.
    # Update 5/29/18: Test dataset JD-100_79 broke these assumptions and
    # requested slightly more memory than expected. The easiest solution is
    # to add a bit of a buffer: 26x per entry plus the flat 6GB added below.
    # This is an ad-hoc solution, but it *should* be safe generally. Better
    # to err on the side of not crashing things.
    mem_bytes = total_nodes * total_bins * 26
    mem_gb = int(math.ceil(mem_bytes / (1024.0 * 1024.0 * 1024.0))) + 6

    martian.log_info("Bins: %d, Nodes: %d, Bytes: %d" %
                     (total_bins, total_nodes, mem_bytes))

    return {'chunks': [], 'join': {'__mem_gb': mem_gb, '__threads': 2}}
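A sanity check of the 26x heuristic with hypothetical numbers (1000 cells and a human-scale genome at a 20kb bin size, chosen to reproduce the ~7GB figure in the comment above):

total_nodes = 2 * 1000 - 1         # 1999 nodes for a 1000-cell tree
total_bins = 3.1e9 / 20000         # ~155,000 bins (assumed bin size)
mem_bytes = total_nodes * total_bins * 26
print(mem_bytes / 1024.0 ** 3)     # ~7.5 GB, in line with the estimate above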
Example #6
def join(args, outs, chunk_defs, chunk_outs):
    contig_info = get_contig_info(args)
    with open(outs.contig_info_json, 'w') as outfile:
        json.dump(contig_info, outfile)

    call = [
        "dlconverter", args.sample_id, "--output", outs.output_for_dloupe,
        "--description", args.sample_desc, "--node-profile-h5",
        args.normalized_node_profiles, "--contig-info-json",
        outs.contig_info_json, "--merged-bed", args.node_cnv_calls,
        "--tree-data", args.tree_data, "--tracks", args.tracks,
        "--per-cell-summary", args.per_cell_summary_metrics
    ]

    gene_annotation_path = tk_ref.get_loupe_genes(args.reference_path)
    if os.path.exists(gene_annotation_path):
        call.extend(["--gene-annotations", gene_annotation_path])

    # the sample desc may be unicode, so encode the whole
    # argument list as UTF-8 before passing it to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    martian.log_info("Running dlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("dlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_dloupe = None
        martian.throw("Could not generate .dloupe file: \n%s" % e.output)
Example #7
def main(args, outs):
    metrics = {}
    for fname in args.metrics:
        if fname is not None:
            with open(fname, 'r') as f:
                metrics.update(json.load(f))

    # Normalize "NaN" values
    for key in metrics:
        value = metrics[key]
        if str(value) == 'NaN' or (isinstance(value, float) and np.isnan(value)):
            metrics[key] = None

    # add version info
    metrics['cellranger-atac_version'] = martian.get_pipelines_version()

    if len(metrics) > 0:
        martian.log_info('Writing out summary_metrics')
        with open(outs.metrics, 'w') as outfile:
            outfile.write(tenkit.safe_json.safe_jsonify(metrics, pretty=True))

    # compile summary.csv metrics
    # load library info and fake libraries as species
    metric_registry = MetricAnnotations()
    metrics_csv_dict = {}
    if args.library_info is not None:
        with open(args.library_info, 'r') as f:
            library_info = pickle.load(f)
        library_list = [library_info[n]['library_id'] for n in library_info.keys()]
        metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=library_list))

    # load species level metrics
    ctg_mgr = ReferenceManager(args.reference_path)
    metrics_csv_dict.update(metric_registry.compile_summary_metrics(metrics, species_list=ctg_mgr.list_species()))
    write_dict_to_csv(outs.metrics_csv, metrics_csv_dict, sort=True)
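The NaN normalization matters because json.dump emits a literal NaN for float('nan'), which is not valid JSON and breaks strict parsers; replacing it with None serializes as null. A quick illustration (the metric name is made up):

import json

metrics = {'frac_mapped': float('nan')}
print(json.dumps(metrics))   # {"frac_mapped": NaN} -- not valid JSON
metrics['frac_mapped'] = None
print(json.dumps(metrics))   # {"frac_mapped": null}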
Example #8
def log_ps(pid=None):
    MAX_LINES = 6
    ps_cmd = ["ps", "--sort=-rss", "-eo", "pid,pmem,rss,uid,cmd"]

    ps_log = textwrap.dedent("""
            ---- START of ps output ----
            Output of the 'ps' command is intended to diagnose whether there are
            other memory intensive processes running on the same machine.  Process
            names are limited to the first five characters, and Supernova processes
            show up as "exe."  Note that the Supernova process that failed here will
            *not* be in the list, as it has already exited.  The RSS column describes
            the physical memory used by the process in kB.

            """)

    if pid is not None:
        ps_log += ("Note that the process id for this Supernova process was "
                   + str(pid) + "\n\n")

    ps_log += "Running command " + " ".join(ps_cmd) + "\n"
    try:
        ps_out = subprocess.check_output(ps_cmd)
        # ps output is sorted by descending RSS, so the top memory consumers
        # are the first lines (after the header), not the last.
        for line in ps_out.split("\n")[:MAX_LINES]:
            fields = line.split()
            if len(fields) == 5:
                ps_log += "\t%8s %5s %12s %s %5.5s\n" % tuple(fields)
            else:
                ps_log += " ".join(fields) + "\n"
    except Exception as e:
        ps_log += "Running ps command failed: " + str(e) + "\n"
    ps_log += "---- END of ps output ----\n"
    martian.log_info(ps_log)
Example #9
def run_assembly(fastq_pref, fasta_pref, args):
    cmd = [
        'vdj_asm', 'asm', fastq_pref, fasta_pref,
        '--kmers=' + str(args.min_kmer_count),
        '--min-contig=' + str(args.min_contig_len),
        '--min-qual=' + str(args.min_qual),
        '--score-factor=' + str(args.score_factor),
        '--qual-factor=' + str(args.qual_factor),
        '--min-sw-score=' + str(args.min_sw_score),
        '--rt-error=' + str(args.rt_error)
    ]

    if not cr_chem.has_umis(args.chemistry_def):
        martian.log_info('Assembly without UMIs is not fully supported.')

    cutoff = args.min_readpairs_per_umi[str(args.gem_group)]
    if cr_chem.is_paired_end(args.chemistry_def):
        cmd.append('--min-umi-reads=' + str(2 * cutoff))
    else:
        cmd.append('--min-umi-reads=' + str(cutoff))
        cmd.append('--single-end')

    if args.use_unmapped:
        cmd.append('--use-unmapped')

    #cmd.append('--mixture-filter')

    print >> sys.stderr, 'Running', ' '.join(cmd)
    tk_subproc.check_call(cmd, cwd=os.getcwd())
Example #10
def prune(matrix, num_analysis_bcs=None, random_state=None):
    """Remove all cells that show no counts. If num_analysis_bcs is provided, it choses those number of barcodes.
    Finally, it returns a modified input matrix"""

    np.random.seed(0 if random_state is None else random_state)

    if matrix is None:
        return None

    if num_analysis_bcs:
        num_bcs = len(matrix.bcs)
        bc_indices = np.sort(np.random.choice(np.arange(num_bcs), size=min(num_analysis_bcs, num_bcs), replace=False))
        matrix = matrix.select_barcodes(bc_indices)

    nbcs = matrix.bcs_dim

    # keep barcodes that have at least one peak left
    peaks_per_bc = (matrix.m > 0).sum(axis=0)
    keep_bcs = np.squeeze(np.array(peaks_per_bc != 0))

    matrix = matrix.select_barcodes(np.where(keep_bcs)[0])
    nbcs_new = matrix.bcs_dim
    martian.log_info("filtered out {} barcodes".format(nbcs - nbcs_new))

    return matrix
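The np.squeeze call is needed because summing a scipy.sparse matrix along an axis returns a 2-D numpy.matrix of shape (1, n_barcodes) rather than a 1-D array. A standalone illustration, assuming matrix.m is a scipy sparse peaks-by-barcodes matrix:

import numpy as np
import scipy.sparse as sp

m = sp.csc_matrix(np.array([[0, 2, 0],
                            [0, 1, 3]]))            # peaks x barcodes
peaks_per_bc = (m > 0).sum(axis=0)                  # matrix([[0, 2, 1]]), shape (1, 3)
keep_bcs = np.squeeze(np.array(peaks_per_bc != 0))  # array([False, True, True])
print(np.where(keep_bcs)[0])                        # [1 2]: barcode 0 is filtered out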
Example #11
def main(args, outs):
    paired_end = cr_chem.is_paired_end(args.chemistry_def)

    # Write compressed files
    outs.read1s += cr_constants.LZ4_SUFFIX
    outs.read2s += cr_constants.LZ4_SUFFIX

    cutadapt_out = os.path.join(os.path.dirname(outs.chunked_reporter), 'cutadapt_stdout')
    with open(cutadapt_out, 'w') as cut_stdout:
        status = run_cutadapt(args,
                              outs.read1s, outs.read2s,
                              args.chemistry_def,
                              cut_stdout)

    if args.read2s_chunk is None:
        outs.read2s = None

    if status != 0:
        martian.log_info('Error while running cutadapt')
    else:
        reporter = vdj_report.VdjReporter(primers=cr_utils.get_primers_from_dicts(args.primers))
        get_vdj_trim_metrics(reporter,
                             cutadapt_out,
                             paired_end)
        reporter.save(outs.chunked_reporter)
Example #12
def join(args, outs, chunk_defs, chunk_outs):
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as outfile:
        contig_info = get_contig_info(args.reference_path)
        json.dump(contig_info, outfile)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter",
        args.sample_id,
        args.pipestance_type,
        "--matrix",
        args.feature_barcode_matrix,
        "--analysis",
        args.analysis,
        "--output",
        outs.output_for_cloupe,
        "--description",
        '"' + args.sample_desc + '"',
        "--peaks",
        args.peaks,
        "--fragmentsindex",
        args.fragments_index,
        "--geneannotations",
        reference.genes,
        "--contiginfo",
        contig_info_fn,
    ]

    if args.metrics_json is not None:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv is not None:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json is not None:
        call.extend(["--gemgroups", gem_group_index_json])
    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # the sample desc may be unicode, so encode the whole
    # argument list as UTF-8 before passing it to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example #13
def split(args):
    # determine the number of FASTQ files for each gem group and library
    # type: {gem_group: {library_type: fastq_file_count}}
    chunk_counts = defaultdict(lambda: defaultdict(int))
    for chunk in args.chunks:
        chunk_counts[chunk["gem_group"]][chunk["library_type"]] += 1

    single_library = all(
        len(library_counts) == 1
        for library_counts in chunk_counts.values())

    if single_library:
        martian.log_info(
            'Single library in input. No need to check barcode compatibility.')
        # `[]` for the chunks will skip the main
        return {'chunks': [], 'join': {}}

    if args.num_reads_to_check_barcode is not None:
        num_reads_to_check_barcode = args.num_reads_to_check_barcode
    else:
        num_reads_to_check_barcode = cr_constants.NUM_READS_TO_CHECK_BARCODE
    chunks = []
    for chunk in args.chunks:
        chunk_def = chunk
        chunk_def['num_reads_per_chunk_to_check_barcode'] = int(
            tk_stats.robust_divide(
                num_reads_to_check_barcode,
                chunk_counts[chunk["gem_group"]][chunk["library_type"]]))
        chunks.append(chunk_def)

    return {'chunks': chunks, 'join': {'__mem_gb': 4}}
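tk_stats.robust_divide is used here to split the read budget evenly across the FASTQ files of each gem group / library type. A hypothetical stand-in (an assumption about its behavior, not the tenkit implementation): float division that returns NaN on a zero denominator instead of raising:

def robust_divide_sketch(numerator, denominator):
    # Hypothetical stand-in for tk_stats.robust_divide.
    if denominator == 0:
        return float('nan')
    return float(numerator) / float(denominator)

# e.g. a budget of 1,000,000 reads across 4 FASTQ files -> 250,000 per chunk
print(int(robust_divide_sketch(10**6, 4)))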
Example #14
def process_fastq_chunk_no_demult(seq_iters, filenames, file_cache,
                                  _interleave_map, summary_counts,
                                  max_reads=-1):

    if _interleave_map is None:
        interleave_map = range(len(seq_iters))
    else:
        interleave_map = _interleave_map

    read_iterators = itertools.izip(*seq_iters)
    n = 0

    for read_set in read_iterators:
        # Log the counts for each sample index
        summary_counts[DEMULTIPLEX_INVALID_SAMPLE_INDEX] += 1

        target_streams = [file_cache.get(x) for x in filenames]

        for i in range(len(read_set)):
            target_index = interleave_map[i]
            read_set[i].write(target_streams[target_index])

        n += 1
        if n % 10**5 == 0:
            martian.log_info("Reads processed %i" % n)

        if max_reads > 0 and n >= max_reads:
            break
Example #15
def _read_stitched_coverage(profiles_h5,
                            reference_path,
                            window_size,
                            mask_data=None):
    #
    # load profiles into memory
    #
    profiles = crdna_profiles.ProfilesData2(profiles_h5,
                                            reference_path,
                                            load_conf_filter=False,
                                            reuse_mask_from=mask_data)
    #
    # apply mappability mask
    #
    nbins, n_unmasked = profiles.apply_mask(use_default_mask=True,
                                            use_conf_filter=False)
    martian.log_info("%s: (%d/%d) unmasked bins" %
                     (profiles_h5, n_unmasked, nbins))
    #
    # rebin data to requested window size
    #
    norm_factor = window_size / profiles.get_window_size()
    profiles.aggregate(norm_factor)
    #
    # calculate dpcv and local_dpcv
    # only include autosomes
    #
    _, _, coverage = profiles.get_stitched_coverage(
        allow_sex_chromosomes=False)

    return coverage, profiles
Example #16
    def pick_common_indexes(self, fastqs):
        index_counts = self.get_index_counts(fastqs)

        items_list = index_counts.items()
        items_list.sort(key=lambda x: x[1], reverse=True)
        total_counts = sum(v for (k,v) in items_list)

        c = 0
        i = 0
        for i in range(len(index_counts)):
            c += items_list[i][1]

            if c > 0.90 * total_counts:
                break

        # index of the last barcode needed to account for 90% of reads
        c90 = i

        # median # of observations of barcodes accounting for the 90%
        num_obs_good_bcs = numpy.median([ count for (bc, count) in items_list[:(c90+1)] ])
        martian.log_info("Median counts of good barcodes in 2e6 reads: %s" % num_obs_good_bcs)

        min_obs_bc = max(num_obs_good_bcs / 250, 20)

        # only demultiplex a reasonable number of sample indices
        if len(items_list) > MAX_INDICES:
            min_obs_bc = max(min_obs_bc, items_list[MAX_INDICES][1])

        good_bcs = [ k for (k,v) in items_list if v > min_obs_bc ]
        noise_bcs = [ k for (k,v) in items_list if v <= min_obs_bc ]

        return (good_bcs, noise_bcs)
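A worked trace with hypothetical counts shows how the cutoff falls out: with index counts {A: 600, B: 300, C: 50, D: 30, E: 20}, the cumulative sum first exceeds 90% of the 1000 total at index 2:

import numpy

items_list = [('A', 600), ('B', 300), ('C', 50), ('D', 30), ('E', 20)]
c90 = 2                                          # 600 + 300 + 50 = 950 > 900
num_obs_good_bcs = numpy.median([600, 300, 50])  # 300.0
min_obs_bc = max(num_obs_good_bcs / 250, 20)     # max(1.2, 20) == 20
good_bcs = [k for (k, v) in items_list if v > min_obs_bc]
print(good_bcs)   # ['A', 'B', 'C', 'D']; 'E' (20 reads) falls below the cutoff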
Example #17
def main(args, outs):
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter", args.sample_id, args.pipestance_type, "--matrix",
        args.filtered_gene_bc_matrices_h5, "--analysis",
        get_analysis_h5_path(args), "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example #18
def main(args, outs):
    if args.pipestance_type != "count" and args.pipestance_type != "aggr":
        martian.exit("The type argument must be one of: count, aggr")

    if args.pipestance_type == "count":
        pname = "SC_RNA_COUNTER_CS"
    if args.pipestance_type == "aggr":
        pname = "SC_RNA_AGGREGATOR_CS"

    if not os.path.exists(args.pipestance_path):
        martian.exit("Invalid pipestance path: %s" % args.pipestance_path)

    # check to see if an analysis file exists.  If it doesn't, then
    # this is likely a barnyard sample, and we cannot generate a
    # .loupe file (CELLRANGER-773).
    analysis_h5_path = os.path.join(args.pipestance_path,
                                    "outs/analysis/analysis.h5")

    # 1.2.0 location only
    internal_count_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_COUNTER_CS/SC_RNA_COUNTER/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    internal_aggr_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_AGGREGATOR_CS/SC_RNA_AGGREGATOR/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    if not os.path.exists(analysis_h5_path) \
            and not os.path.exists(internal_count_h5_path) \
            and not os.path.exists(internal_aggr_h5_path):
        martian.exit(
            "Could not find single-species analysis HDF5 file. " +
            "Loupe Cell Browser files are not generated for multi-species experiments."
        )

    # has to be 1.2 or higher
    cellranger_pd_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_PD")
    cellranger_cs_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_CS")
    if os.path.exists(cellranger_pd_before_1_2_path) or os.path.exists(
            cellranger_cs_before_1_2_path):
        martian.exit(
            "mkloupe is only supported for Cell Ranger 1.2 and later.")

    call = [
        "crconverter", args.sample_id, pname, "--pipestance",
        args.pipestance_path, "--output", outs.output_for_cloupe
    ]

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example #19
def split(args):
    # validate
    if args.kit_type not in ["5'", "3'"]:
        martian.exit("Kit type is not one of 5' or 3'.")
    # group by gene
    tx_dict = get_gene_pred_dict(args.transcripts)
    tx_by_name2 = defaultdict(list)
    valid_chroms = set(args.valid_chroms)
    for tx in tx_dict.itervalues():
        if tx.chromosome in valid_chroms:
            tx_by_name2[tx.name2].append(tx)

    singletons = [
        x[0] for x in tx_by_name2.itervalues() if len(x) == 1
        and args.lower_size_cutoff <= len(x[0]) <= args.upper_size_cutoff
    ]

    avg = np.mean([len(x) for x in singletons])
    med = np.median([len(x) for x in singletons])
    martian.log_info(
        '{} singleton genes under consideration (avg size = {}, median size = {})'
        .format(len(singletons), avg, med))

    def tx_to_str(singletons):
        for x in singletons:
            yield x.get_gene_pred()

    chunks = [{
        'tx_subset': list(x),
        '__mem_gb': 4
    } for x in grouper(tx_to_str(singletons), 20)]
    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
Example #20
def main(args, outs):
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter", args.sample_id, args.pipestance_type, "--matrix",
        args.filtered_gene_bc_matrices_h5, "--analysis",
        get_analysis_h5_path(args), "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    # the sample desc may be unicode, so encode the whole
    # argument list as UTF-8 before passing it to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example #21
def join(args, outs, chunk_defs, chunk_outs):
    if not chunk_defs or chunk_defs[0].skip:
        martian.log_info('Skipping peak annotation')
        outs.peak_annotation = None
        return

    chunk_peak_annotations = [chunk.peak_annotation for chunk in chunk_outs if chunk.peak_annotation is not None]
    combine_csv(chunk_peak_annotations, outs.peak_annotation)
Example #22
def main(args, outs):
    hostname = socket.gethostname()

    if args.output_format == 'bam' and args.read_group is None:
        martian.exit(
            "Please specify a read_group to populate the @RG field of the BAM file"
        )

    if args.sample_id is not None:
        if not re.match("^[\w-]+$", args.sample_id):
            martian.exit(
                "Sample name may only contain letters, numbers, underscores, and dashes: "
                + args.sample_id)

    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, longranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Check open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
Example #23
def main(args, outs):
    """For each slice produce a fasta file sampling reads from that slice. 
    We split our section of the genome into a bunch of 20kb chunks. For each
    chunk we sample an identical number of paired end reads. The name of each
    read encodes the true position that it was sampled from."""

    # Grab basic stats for the read lengths and quality scores
    with open(args.basic_stats) as stats_fp:
        stats = json.load(stats_fp)

    # Fix the random seed
    np.random.seed(0)

    # Info is a map we use everywhere to track the sampling parameters:
    # r1_len: the length of read1
    # r2_len: the length of read2
    # insert_size_map: a map of insert size (as a string) to frequency
    # q_score_map: a map of quality score (as a string) to frequency

    info = {'r1_len': stats['r1_len'], 'r2_len': stats['r2_len']}

    info['q_score_map'] = {
        '30': stats['bc_q30_bases'],
        '20': stats['bc_q20_bases'] - stats['bc_q30_bases'],
        '0': stats['bc_tot_bases'] - stats['bc_q20_bases']
    }

    with open(args.insert_sizes) as stats_is_fp:
        info['insert_size_map'] = json.load(stats_is_fp)['60']

    # How many samples will we make from each window?
    samples = int(
        round(2.0 * args.target_coverage *
              (float(args.window_size) / (stats['r1_len'] + stats['r2_len']))))

    martian.log_info("Using %i samples per %i bin" %
                     (samples, args.window_size))
    output_path = martian.make_path("chnk.fasta")
    output = open(output_path, "w")

    ref = reference.open_reference(args.reference_path)
    # Loop over every window in every locus.
    for (chrom, start, end) in (tk_io.get_locus_info(l) for l in args.loci):
        cur = start
        while cur < end:
            # Sample |samples| reads from chrom:cur-chrom:cur+window_size
            # and put the results in the output file
            perbin(chrom, cur, ref, output, info, args.window_size, samples)
            cur += args.window_size
    output.close()
    outs.tmp = output_path
    outs.samples_per_bin = samples
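The samples formula is easier to see with hypothetical numbers: 150 bp paired reads, a 20 kb window, and a 30x coverage target give 4000 sampled read pairs per window:

r1_len, r2_len = 150, 150
window_size = 20000
target_coverage = 30.0
samples = int(round(2.0 * target_coverage *
                    (float(window_size) / (r1_len + r2_len))))
print(samples)   # 4000: 2 * 30 * (20000 / 300)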
Example #24
def log_ps():
    MAX_LINES = 6
    ps_cmd = ["ps", "--sort=-rss", "-eo", "pid,pmem,rss,comm,uid"]
    ps_log = "Running command " + " ".join(ps_cmd) + "\n"
    try:
        ps_out = subprocess.check_output(ps_cmd)
        for i, line in enumerate(ps_out.split("\n")):
            if i == MAX_LINES:
                break
            fields = line.split()
            # Skip blank or malformed lines rather than crashing on a
            # tuple of the wrong length.
            if len(fields) == 5:
                ps_log += "%8s %5s %12s %5.5s %s\n" % tuple(fields)
    except Exception as e:
        ps_log += "Running ps command failed: " + str(e) + "\n"
    martian.log_info(ps_log)
Example #25
def join(args, outs, chunk_defs, chunk_outs):
    """Joins the various chunk outputs and computes
    further summary metrics based on the merged metrics."""
    martian.log_info("Combining miscellaneous summary managers")
    misc_sm = combine_summary_managers([chunk.misc_sm for chunk in chunk_outs])

    martian.log_info("Computing summary metrics")
    compute_summary_metrics(misc_sm)

    with open(outs.summary, 'w') as outfile:
        outfile.write(
            tenkit.safe_json.safe_jsonify(
                misc_sm.get_summarizer('metrics').dict, pretty=True))
Example #26
    def gen_metric_helptext(self, keys):
        """Processes a metrics dictionary and generates helptext for keys if present in metrics.csv"""
        output = []
        for key in keys:
            if key in self.metric_data:
                metric_info = self.metric_data[key]
                if metric_info.help_description is not None:
                    output += [[
                        metric_info.full_name, [metric_info.help_description]
                    ]]
            else:
                martian.log_info(
                    '{} not found in registered metrics'.format(key))
        return output
Example #27
    def load_fragments_filtered(self, fn, bcs_to_use):
        ''' Load fragment data for coalescence calculation '''

        martian.log_info("loading fragment data")

        def fragment_filter(frags):
            #return np.logical_and(frags.num_reads > 1, frags.bc.isin(bcs_to_use))
            return frags.bc.isin(bcs_to_use)

        frags = kt_hdf.read_data_frame_filtered(
            fn,
            fragment_filter,
            query_cols=['bc', 'num_reads', 'chrom', 'start_pos'])
        return frags
Example #28
def do_not_make_cloupe(args):
    """
    Returns True if there is a reason why this stage should not attempt to
    generate a .cloupe file
    """
    if args.no_secondary_analysis:
        martian.log_info(
            "Skipping .cloupe generation by instruction (--no-secondary-analysis)"
        )
        return True
    if args.analysis is None:
        martian.log_info(
            "Skipping .cloupe generation due to missing analysis folder")
        return True
    if not os.path.exists(args.filtered_gene_bc_matrices_h5):
        martian.log_info(
            "Skipping .cloupe generation due to missing or zero-length gene-barcode matrix"
        )
        return True
    genomes = cr_matrix.GeneBCMatrices.load_genomes_from_h5(
        args.filtered_gene_bc_matrices_h5)
    if len(genomes) > 1:
        martian.log_info(
            "Skipping .cloupe generation due to multiple species in the gene-barcode matrix"
        )
        return True
    return False
Example #29
def do_not_make_cloupe(args):
    """
    Returns True if there is a reason why this stage should not attempt to
    generate a .cloupe file
    """
    if args.no_secondary_analysis:
        martian.log_info(
            "Skipping .cloupe generation by instruction (--no-secondary-analysis)"
        )
        return True
    if args.analysis is None or not os.path.exists(args.analysis):
        martian.log_info(
            "Skipping .cloupe generation due to missing analysis hdf5 file")
        return True
    if args.feature_barcode_matrix is None or not os.path.exists(
            args.feature_barcode_matrix):
        martian.log_info(
            "Skipping .cloupe generation due to missing or zero-length feature-barcode matrix"
        )
        return True
    ref_mgr = ReferenceManager(args.reference_path)
    if len(ref_mgr.list_species()) > 1:
        martian.log_info(
            "Skipping .cloupe generation as the sample is composed of multiple genomes"
        )
        return True
    return False
Example #30
def estimate_mean_coverage(targets_file, bam_in, read_filter=lambda x: True):

    if targets_file is not None:
        target_regions_dict = tk_io.get_target_regions_dict(open(targets_file))

        # Pick a random sample of target regions to estimate overall depth on
        targets = [(chrom, start, end)
                   for (chrom, regions) in target_regions_dict.items()
                   for (start, end) in regions if end - start > 0]
        if len(targets) == 0:
            martian.log_info("No non-empty target regions")
            return 1.0

        np.random.seed(0)
        regions_to_sample = min(EXONS_SAMPLE_COVERAGE, len(targets))
        region_indices = np.random.choice(len(targets),
                                          regions_to_sample,
                                          replace=False)
        sample_targets = [targets[idx] for idx in region_indices]

    else:
        # Pick a series of random intervals on the genome to measure coverage
        np.random.seed(0)

        if sum(bam_in.lengths) < 1e6:
            num_windows = WGS_WINDOWS_SMALL_GENOME
        else:
            num_windows = WGS_WINDOWS_SAMPLE_COVERAGE

        chrom_probs = np.array(bam_in.lengths, dtype=np.float) / sum(
            bam_in.lengths)
        rand_chroms = np.random.choice(len(bam_in.lengths),
                                       num_windows,
                                       replace=True,
                                       p=chrom_probs)

        starts = [
            np.random.randint(max(bam_in.lengths[chrom] - WGS_WINDOW_SIZE, 1))
            for chrom in rand_chroms
        ]
        sample_targets = [(bam_in.references[chrom], start,
                           min(start + WGS_WINDOW_SIZE, bam_in.lengths[chrom]))
                          for (chrom, start) in zip(rand_chroms, starts)]

    mean_depth = float(
        np.mean([
            mean_coverage_region(bam_in, region, read_filter)
            for region in sample_targets
        ]))
    return mean_depth
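The WGS branch draws windows proportionally to chromosome length via the p argument of np.random.choice. A standalone sketch with made-up chromosome sizes:

import numpy as np

np.random.seed(0)
lengths = np.array([200e6, 100e6, 50e6])     # hypothetical chromosome sizes
chrom_probs = lengths / lengths.sum()        # [0.571..., 0.285..., 0.142...]
rand_chroms = np.random.choice(len(lengths), 1000, replace=True, p=chrom_probs)
print(np.bincount(rand_chroms) / 1000.0)     # roughly matches chrom_probs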