Beispiel #1
0
def join(args, outs, chunk_defs, chunk_outs):
    """Write the contig info JSON and run dlconverter to build the .dloupe file.

    The contig info returned by ``get_contig_info(args)`` is dumped to
    ``outs.contig_info_json`` and passed to dlconverter together with the
    other inputs found on ``args``.  If dlconverter exits with a non-zero
    status, ``outs.output_for_dloupe`` is set to None and the failure is
    reported through ``martian.throw``.
    """
    contig_info = get_contig_info(args)
    with open(outs.contig_info_json, 'w') as outfile:
        json.dump(contig_info, outfile)

    call = [
        "dlconverter", args.sample_id, "--output", outs.output_for_dloupe,
        "--description", args.sample_desc, "--node-profile-h5",
        args.normalized_node_profiles, "--contig-info-json",
        outs.contig_info_json, "--merged-bed", args.node_cnv_calls,
        "--tree-data", args.tree_data, "--tracks", args.tracks,
        "--per-cell-summary", args.per_cell_summary_metrics
    ]

    # Gene annotations are optional: only pass them when the file exists.
    gene_annotation_path = tk_ref.get_loupe_genes(args.reference_path)
    if os.path.exists(gene_annotation_path):
        call.extend(["--gene-annotations", gene_annotation_path])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    martian.log_info("Running dlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("dlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_dloupe = None
        martian.throw("Could not generate .dloupe file: \n%s" % e.output)
Beispiel #2
0
def main(args, outs):
    """Run crconverter to build the .cloupe file for this pipestance.

    Returns early with ``outs.output_for_cloupe`` set to None when
    ``do_not_make_cloupe(args)`` is true.  If crconverter exits with a
    non-zero status, ``outs.output_for_cloupe`` is set to None and the
    failure is reported through ``martian.throw``.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter", args.sample_id, args.pipestance_type, "--matrix",
        args.filtered_gene_bc_matrices_h5, "--analysis",
        get_analysis_h5_path(args), "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    # Optional inputs are only passed when they are set.
    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Beispiel #3
0
def main(args, outs):
    """Run crconverter to build the .cloupe file for this pipestance.

    Returns early with ``outs.output_for_cloupe`` set to None when
    ``do_not_make_cloupe(args)`` is true.  If crconverter exits with a
    non-zero status, ``outs.output_for_cloupe`` is set to None and the
    failure is reported through ``martian.throw``.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    gem_group_index_json = get_gem_group_index_json(args, outs)

    call = [
        "crconverter", args.sample_id, args.pipestance_type, "--matrix",
        args.filtered_gene_bc_matrices_h5, "--analysis",
        get_analysis_h5_path(args), "--output", outs.output_for_cloupe,
        "--description", args.sample_desc
    ]

    # Optional inputs are only passed when they are set.
    if args.metrics_json:
        call.extend(["--metrics", args.metrics_json])
    if args.aggregation_csv:
        call.extend(["--aggregation", args.aggregation_csv])
    if gem_group_index_json:
        call.extend(["--gemgroups", gem_group_index_json])

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Beispiel #4
0
def main(args, outs):
    """Validate an existing pipestance and run crconverter on it.

    Exits through ``martian.exit`` when the pipestance type is not
    ``count`` or ``aggr``, when the pipestance path does not exist, when
    no analysis HDF5 file can be found in any of the known locations, or
    when the pipestance contains a CELLRANGER_PD / CELLRANGER_CS
    directory.  If crconverter exits with a non-zero status,
    ``outs.output_for_cloupe`` is set to None and the failure is reported
    through ``martian.throw``.
    """
    # Pick the pipeline name in a single chain so the two supported types
    # and the rejection of anything else are decided in one place.
    if args.pipestance_type == "count":
        pname = "SC_RNA_COUNTER_CS"
    elif args.pipestance_type == "aggr":
        pname = "SC_RNA_AGGREGATOR_CS"
    else:
        martian.exit("The type argument must be one of: count, aggr")

    pipestance_exists = os.path.exists(args.pipestance_path)
    if not pipestance_exists:
        martian.exit("Invalid pipestance path: %s" % args.pipestance_path)

    # check to see if an analysis file exists.  If it doesn't, then
    # this is likely a barnyard sample, and we cannot generate a
    # .loupe file (CELLRANGER-773);
    analysis_h5_path = os.path.join(args.pipestance_path,
                                    "outs/analysis/analysis.h5")

    # 1.2.0 location only
    internal_count_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_COUNTER_CS/SC_RNA_COUNTER/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    internal_aggr_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_AGGREGATOR_CS/SC_RNA_AGGREGATOR/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    if not os.path.exists(analysis_h5_path) \
            and not os.path.exists(internal_count_h5_path) \
            and not os.path.exists(internal_aggr_h5_path):
        martian.exit(
            "Could not find single-species analysis HDF5 file. " +
            "Loupe Cell Browser files are not generated for multi-species experiments."
        )

    # has to be 1.2 or higher
    cellranger_pd_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_PD")
    cellranger_cs_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_CS")
    if os.path.exists(cellranger_pd_before_1_2_path) or os.path.exists(
            cellranger_cs_before_1_2_path):
        martian.exit(
            "mkloupe is only supported for Cell Ranger 1.2 and later.")

    call = [
        "crconverter", args.sample_id, pname, "--pipestance",
        args.pipestance_path, "--output", outs.output_for_cloupe
    ]

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Beispiel #5
0
def main(args, outs):
    """Run bucket_fastq_by_bc on one chunk of interleaved reads.

    Output goes to the directory that contains ``outs.default``.  Optional
    chunk/arg values that are unset are passed to the tool as the literal
    string "none" (or 1 for the gem group).
    """
    chunk = args.chunk
    assert(chunk['reads_interleaved'])
    if not chunk['reads_interleaved']:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw("must supply a read1 and read2 when reads_interleave == False")

    output_dir = os.path.dirname(os.path.realpath(outs.default))
    barcode_whitelist = (BARCODE_LOCATION + "/" + args.barcode_whitelist + ".txt"
                         if args.barcode_whitelist else "none")

    # Fall back to placeholder values for anything the chunk leaves unset.
    gem_group = chunk["gem_group"] or 1
    barcode_read = chunk["barcode"] or "none"
    barcode_counts = args.barcode_counts or "none"
    sample_index = chunk["sample_index"] or "none"
    read_group_string = chunk["read_group"] or "none"

    command = [
        'bucket_fastq_by_bc',
        '-reads=' + chunk["read1"],
        '-read_group_string=' + read_group_string,
        '-barcodes=' + barcode_read,
        '-barcodeCounts=' + barcode_counts,
        '-bcConfidenceThreshold=' + str(args.bc_confidence_threshold),
        '-output_directory=' + output_dir,
        '-sample_index_reads=' + sample_index,
        '-gem_group=' + str(gem_group),
        '-barcode_whitelist=' + barcode_whitelist,
        '-interleaved=' + str(chunk['reads_interleaved']),
        '-max_expected_barcode_errors=' + str(args.max_expected_barcode_errors),
        '-buckets=' + str(args.buckets),
    ]
    subprocess.check_call(command)
Beispiel #6
0
def join(args, outs, chunk_defs, chunk_outs):
    """Write contig info and run crconverter to build the .cloupe file.

    Returns early with ``outs.output_for_cloupe`` set to None when
    ``do_not_make_cloupe(args)`` is true.  If crconverter exits with a
    non-zero status, ``outs.output_for_cloupe`` is set to None and the
    failure is reported through ``martian.throw``.
    """
    if do_not_make_cloupe(args):
        outs.output_for_cloupe = None
        return

    reference = ReferenceManager(args.reference_path)

    contig_info_fn = martian.make_path("contig_info.json")
    with open(contig_info_fn, 'w') as contig_info_file:
        json.dump(get_contig_info(args.reference_path), contig_info_file)

    gem_group_index_json = get_gem_group_index_json(args, outs)

    # Required arguments, one flag/value pair per line.
    call = ["crconverter", args.sample_id, args.pipestance_type]
    call += ["--matrix", args.feature_barcode_matrix]
    call += ["--analysis", args.analysis]
    call += ["--output", outs.output_for_cloupe]
    call += ["--description", '"' + args.sample_desc + '"']
    call += ["--peaks", args.peaks]
    call += ["--fragmentsindex", args.fragments_index]
    call += ["--geneannotations", reference.genes]
    call += ["--contiginfo", contig_info_fn]

    # Optional arguments are only passed when a value is available.
    optional_args = (
        ("--metrics", args.metrics_json),
        ("--aggregation", args.aggregation_csv),
        ("--gemgroups", gem_group_index_json),
    )
    for flag, value in optional_args:
        if value is not None:
            call.extend([flag, value])

    transcript_gene_types = get_annotation_gene_types(args)
    if transcript_gene_types is not None:
        call.extend(["--geneannotationtypes", ",".join(transcript_gene_types)])

    # The sample description may be unicode, so the subprocess receives
    # every argument encoded as UTF-8 ...
    encoded_call = [arg.encode('utf-8') for arg in call]

    # ... while the log line uses the unencoded 'call', because log_info
    # itself attempts to encode the message (TODO: should log_info figure
    # out the encoding of the input string)
    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        converter_output = tk_subproc.check_output(encoded_call)
        martian.log_info("crconverter output: %s" % converter_output)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
def main(args, outs):
    """Run barcodeqc on every barcode FASTQ file and record the outputs.

    Does nothing when ``args.run_qc`` is false.  Otherwise
    ``outs.summary_chunk`` is initialised with empty 'barcode', 'read1'
    and 'read2' lists, and the JSON path produced for each barcode file is
    appended to the 'barcode' list.  A non-zero exit from barcodeqc is
    reported through ``martian.throw``.
    """
    if not args.run_qc:
        return

    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    # Keep only the files whose read type matches the barcode read.
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]
    outs.summary_chunk = {
        'barcode': [],
        'read1': [],
        'read2': []
    }
    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [ 'barcodeqc', bf.filename, output_json_path, "--whitelist", whitelist_path,
                         "--bc-start-index", str(args.bc_start_index), "--bc-length",
                         str(args.bc_length)]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            # "as" form is valid on Python 2.6+ and Python 3 alike.
            martian.throw("Could not QC barcodes: return code %s" % e.returncode)

        outs.summary_chunk['barcode'].append(output_json_path)
Beispiel #8
0
    def main(self):
        """Load stage metadata and dispatch to the module's split, main or join.

        For a 'split' run the recorded result is written to 'stage_defs'
        and nothing else happens.  For 'main' and 'join' runs the outs
        record is written back to metadata afterwards; any other run type
        is reported through ``martian.throw``.
        """
        # Load args and retvals from metadata.
        args = martian.Record(self.metadata.read('args'))
        run_type = self._run_type

        if run_type == 'split':
            def call_split():
                return self._module.split(args)

            def record_split():
                return self._record_result(call_split)

            self._run(record_split)
            self.metadata.write('stage_defs', self._result)
            return

        outs = martian.Record(self.metadata.read('outs'))

        if run_type == 'main':
            def call_main():
                return self._module.main(args, outs)

            self._run(call_main)
        elif run_type == 'join':
            chunk_defs = []
            for raw_def in self.metadata.read('chunk_defs'):
                chunk_defs.append(martian.Record(raw_def))
            chunk_outs = []
            for raw_out in self.metadata.read('chunk_outs'):
                chunk_outs.append(martian.Record(raw_out))

            def call_join():
                return self._module.join(args, outs, chunk_defs, chunk_outs)

            self._run(call_join)
        else:
            martian.throw('Invalid run type %s' % self._run_type)

        # Write the output as JSON.
        self.metadata.write('outs', outs.items())
Beispiel #9
0
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.default``.

    Interleaved chunks pass ``chunk['read1']`` straight to the aligner;
    otherwise read1 and read2 are passed as a list, and both must be
    present.
    """
    chunk = args.chunk

    if not chunk['reads_interleaved']:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])
    else:
        reads = chunk['read1']

    alignment = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner

    aligner_params = {
        'ref_fasta': tenkit.reference.get_fasta(args.reference_path),
        'algorithm': args.aligner_method,
    }
    alignment.output_alignment(aligner=aligner_name,
                               aligner_params=aligner_params,
                               num_threads=args.__threads,
                               sample=args.read_group_sample)
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.output``.

    Interleaved chunks pass ``chunk['read1']`` straight to the aligner;
    otherwise read1 and read2 are passed as a list, and both must be
    present.  The chunk's read group string is turned into a read-group
    header for the aligner.
    """
    # this silences a weird non-failure in --strict=error mode
    # TODO(lhepler): remove this when martian upstream handles this itself
    outs.default = []

    chunk = args.chunk

    if not chunk['reads_interleaved']:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw(
                "must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])
    else:
        reads = chunk['read1']

    alignment = tenkit.align.Aligner(reads, outs.output)
    aligner_name = args.aligner

    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])
    aligner_params = {
        'ref_fasta': reference_fasta,
        'algorithm': args.aligner_method,
    }
    alignment.output_alignment(aligner=aligner_name,
                               aligner_params=aligner_params,
                               num_threads=args.__threads,
                               read_group_header=rg_header)
Beispiel #11
0
def main(args, outs):
    """Run barcodeqc on every barcode FASTQ file and record the outputs.

    Does nothing when ``args.run_qc`` is false.  Otherwise
    ``outs.qc_summary`` is replaced by a dict with empty 'barcode',
    'read1' and 'read2' lists, and the JSON path produced for each barcode
    file is appended to the 'barcode' list.  A non-zero exit from
    barcodeqc is reported through ``martian.throw``.
    """
    if not args.run_qc:
        return

    # Read the output directory before outs.qc_summary is overwritten below.
    out_base = os.path.dirname(outs.qc_summary)
    whitelist_path = tk_preflight.check_barcode_whitelist(
        args.barcode_whitelist)
    file_infos = [tk_fasta.IlmnFastqFile(path) for path in args.input_files]

    # Keep only the files whose read type matches the barcode read.
    bc_file_type = args.file_read_types_map[args.bc_read_type]
    barcode_files = [f for f in file_infos if f.read == bc_file_type]

    # Note: this is Martian 3 incompatible; revert back to summary_chunk if merging
    # back into master (also applies to additional references to `qc_summary` in the main function)
    #
    # see https://github.com/10XDev/tenkit/commit/2c59c9a24b0e7cd81945544f62ffde7ab632ed42
    outs.qc_summary = {'barcode': [], 'read1': [], 'read2': []}
    for idx, bf in enumerate(barcode_files):
        output_json_path = os.path.join(out_base, "output_%d_BC.json" % idx)
        subproc_args = [
            'barcodeqc', bf.filename, output_json_path, "--whitelist",
            whitelist_path, "--bc-start-index",
            str(args.bc_start_index), "--bc-length",
            str(args.bc_length)
        ]
        if args.bc_read_type == "I2" and args.rc_i2_read:
            subproc_args.append("--rc")
        try:
            tk_proc.check_call(subproc_args)
        except subprocess.CalledProcessError as e:
            # "as" form is valid on Python 2.6+ and Python 3 alike.
            martian.throw("Could not QC barcodes: return code %s" %
                          e.returncode)

        # needs to be summary_chunk in Martian 3
        outs.qc_summary['barcode'].append(output_json_path)
Beispiel #12
0
 def _run(self, cmd):
     """Invoke ``cmd``, wrapped in the profiler selected by the job's profile mode.

     'mem', 'line' and 'cpu' modes run ``cmd`` through ``_run_profiler``;
     the 'line' and 'cpu' modes additionally write a text report to the
     metadata.  Any other mode calls ``cmd`` directly.
     """
     mode = self.jobinfo.profile_mode

     if mode == 'mem':
         self._run_profiler(cmd, _MemoryProfile(), 'profile_mem_txt')
         return

     if mode == 'line':
         line_prof = None
         try:
             line_prof = line_profiler.LineProfiler()
         except NameError:
             martian.throw(
                 'Line-level profiling was requested, but line_profiler was not found.'
             )
         for func in self.funcs:
             line_prof.add_function(func)
         self._run_profiler(cmd, line_prof, 'profile_line_bin')
         report = StringIO()
         line_prof.print_stats(stream=report)
         self.metadata.write_raw('profile_line_txt', report.getvalue())
         return

     if mode == 'cpu':
         cpu_prof = cProfile.Profile()
         self._run_profiler(cmd, cpu_prof, 'profile_cpu_bin')
         report = StringIO()
         cpu_stats = pstats.Stats(cpu_prof, stream=report)
         cpu_stats = cpu_stats.sort_stats('cumulative')
         cpu_stats.print_stats()
         self.metadata.write_raw('profile_cpu_txt', report.getvalue())
         return

     if self.jobinfo.profile_mode and self.jobinfo.profile_mode != 'disable':
         # Give the profiler a little bit of time to attach.
         time.sleep(0.5)
     cmd()
Beispiel #13
0
def load_alerts():
    """Load and validate the alert definitions from alarms-supernova.json.

    The file is read from ``ALARMS_LOCATION``.  Every alert of every stage
    is passed to ``check_alert``.  A file that is not valid JSON is
    reported through ``martian.throw``.

    Returns:
        The parsed JSON object, iterated here as a mapping of stage to a
        list of alerts.
    """
    alerts_file = os.path.join(ALARMS_LOCATION, "alarms-supernova.json")
    # Use a context manager so the handle is closed even if reading fails.
    with open(alerts_file, "r") as handle:
        json_string = handle.read()
    try:
        alerts = json.loads(json_string)
    except ValueError:
        # json.loads signals malformed input with ValueError (or a
        # subclass); anything else should not be masked as a format error.
        martian.throw("Incorrectly formatted alarms-supernova.json file.")
    # items() is available on both Python 2 and Python 3 dicts.
    for stage, alert_list in alerts.items():
        for alert in alert_list:
            check_alert(stage, alert)
    return alerts
Beispiel #14
0
 def base_mask(read):
     """Return the mask letter followed by the read length for one read.

     Reads named "R..." get "Y".  Reads named "I..." get "N", "I" or "Y"
     depending on the enclosing scope's ignore_dual_index, dual_indexed
     and sample_index_read values.  Any other name is reported through
     ``martian.throw``.
     """
     kind = read["read_name"][0]
     if kind == "R":
         letter = "Y"
     elif kind == "I":
         if ignore_dual_index and read["read_name"] != sample_index_read:
             letter = "N"
         elif dual_indexed or read["read_name"] == sample_index_read:
             letter = "I"
         else:
             letter = "Y"
     else:
         martian.throw("read name was not recognized: %s" % read["read_name"])
         return None
     return letter + str(read["read_length"])
Beispiel #15
0
def main(args, outs):
    """
    Run the vlconverter executable with inputs that should be available in the outs
    folder at the end of the pipeline run.  This will generate "output_for_vloupe.vloupe"
    in the stage folder.

    Memory usage not expected to be excessive with this (thus no custom split/join
    as of yet); it will need to load a few full files (bam.bai, fasta.fai) into memory.

    If any of the concat reference BAM, consensus BAM or contig BAM index is
    unset or not a file, a message is logged and nothing is generated.  If
    vlconverter exits with a non-zero status, outs.output_for_vloupe is set
    to None and the failure is reported through martian.throw.
    """
    if args.concat_ref_bam is None or not os.path.isfile(args.concat_ref_bam) or \
       args.consensus_bam is None or not os.path.isfile(args.consensus_bam) or \
       args.contig_bam_bai is None or not os.path.isfile(args.contig_bam_bai):
        martian.log_info(
            'One or more bam files missing - cannot make vloupe file')
        return

    call = [
        "vlconverter", args.sample_id, args.pipestance_type, "--output",
        outs.output_for_vloupe, "--reference-bam", args.concat_ref_bam,
        "--reference-bam-index", args.concat_ref_bam_bai, "--reference-fasta",
        args.concat_ref_fasta, "--reference-fasta-index",
        args.concat_ref_fasta_fai, "--reference-annotations",
        args.concat_ref_annotations_json, "--clonotypes", args.clonotypes_csv,
        "--consensus-bam", args.consensus_bam, "--consensus-bam-index",
        args.consensus_bam_bai, "--consensus-annotations",
        args.consensus_annotations_json, "--consensus-fasta",
        args.consensus_fasta, "--consensus-fasta-index",
        args.consensus_fasta_fai, "--contig-bam-relative-path",
        args.contig_bam_relative_path, "--contig-bam-index",
        args.contig_bam_bai, "--contig-annotations",
        args.contig_annotations_json, "--contig-bed",
        args.contig_annotations_bed, "--contig-fasta", args.contig_fasta,
        "--contig-fasta-index", args.contig_fasta_fai, "--description",
        args.sample_desc
    ]

    # the sample desc may be unicode, so send the whole
    # set of args str utf-8 to check_output
    unicode_call = [arg.encode('utf-8') for arg in call]

    # but keep the arg 'call' here because log_info inherently
    # attempts to encode the message... (TODO: should log_info
    # figure out the encoding of the input string)
    martian.log_info("Running vlconverter: %s" % " ".join(call))
    try:
        results = tk_subproc.check_output(unicode_call)
        martian.log_info("vlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_vloupe = None
        martian.throw("Could not generate .vloupe file: \n%s" % e.output)
Beispiel #16
0
def split(args):
    """Plan the variant-calling chunks and their resource requests.

    The variant-calling mode and caller come from ``tk_io.get_vc_mode``.
    In the "precalled" and "precalled_plus" modes the precalled VCF is
    copied into the stage directory and indexed, and its ".gz" path is
    handed to every chunk as ``split_input``.  Unless the mode is plain
    "precalled", the caller must be 'freebayes' or 'gatk'; 'gatk' also
    requires an existing jar path.

    Returns:
        ``{'chunks': [...]}``: sized BAM chunks over the primary contigs,
        or a single chunk for ``args.restrict_locus`` when that is set.

    Raises:
        NotSupportedException: if the variant caller is neither
            'freebayes' nor 'gatk'.
    """
    vc_mode, variant_caller, precalled_filename, gatk_path = tk_io.get_vc_mode(
        args.vc_precalled, args.variant_mode)
    precalled_file = None
    if vc_mode == "precalled" or vc_mode == "precalled_plus":
        mem_gb = 8
        threads = 1
        precalled_file = martian.make_path("precalled_vcf.vcf")
        tenkit.log_subprocess.check_call(
            ['cp', precalled_filename, precalled_file])
        tk_tabix.index_vcf(precalled_file)
        precalled_file = precalled_file + ".gz"
    if vc_mode != "precalled":
        if variant_caller == 'freebayes':
            mem_gb = 5
            threads = 1
        elif variant_caller == "gatk":
            mem_gb = 8
            threads = 2
            # make sure the gatk jar file exists
            if gatk_path is None:
                martian.throw(
                    "variant_caller 'gatk' selected, must supply path to gatk jar file -- e.g. \"gatk:/path/to/GenomeAnalysisTK.jar\""
                )

            gatk_loc = gatk_path
            if not (os.path.exists(gatk_loc)):
                martian.throw(
                    "variant_caller 'gatk' selected, gatk jar file does not exist: %s"
                    % gatk_loc)
        else:
            # Report the offending caller (not the mode); str() keeps the
            # message buildable even when the caller is None.
            raise NotSupportedException('Variant caller not supported: ' +
                                        str(variant_caller))

    primary_contigs = tk_reference.load_primary_contigs(args.reference_path)
    bam_chunk_size_gb = 3.0

    if args.restrict_locus is None:
        loci = tk_chunks.get_sized_bam_chunks(args.input,
                                              bam_chunk_size_gb,
                                              contig_whitelist=primary_contigs,
                                              extra_args={
                                                  '__mem_gb': mem_gb,
                                                  '__threads': threads,
                                                  'split_input': precalled_file
                                              })
    else:
        loci = [{'locus': args.restrict_locus}]

    return {'chunks': loci}
Beispiel #17
0
def write_stage_alerts(stage, path, alerts_file="alerts.list"):
    """Write the alerts defined for ``stage`` to ``path/alerts_file``.

    ``path`` is created if it does not exist.  Each alert is written as a
    "#" line followed by its metric, threshold, compare, action and
    message, one per line.  A stage with no alerts is reported through
    ``martian.throw`` before the output file is opened.
    """
    alerts = load_alerts()
    out_file = os.path.join(path, alerts_file)
    if not os.path.exists(path):
        os.makedirs(path)
    # "in" works on Python 2 and 3 (dict.has_key is Python 2 only); check
    # before opening so a missing stage does not leave a handle open.
    if stage not in alerts:
        martian.throw("No alerts found for stage %s" % stage)
    with open(out_file, "w") as out_handle:
        for alert in alerts[stage]:
            out_handle.write("#\n")
            out_handle.write(alert["metric"] + "\n")
            out_handle.write(str(alert["threshold"]) + "\n")
            out_handle.write(alert["compare"] + "\n")
            out_handle.write(alert["action"] + "\n")
            out_handle.write(alert["message"] + "\n")
Beispiel #18
0
def split(args):
    """Create one 4 GB chunk per primary contig, with an 8 GB join.

    Returns an empty chunk list and empty join settings when
    ``args.fragments`` is None.  Otherwise ``args.peaks`` and
    ``args.cell_barcodes`` must both be set.
    """
    if args.fragments is None:
        return {'chunks': [], 'join': {}}

    if args.peaks is None:
        martian.throw("peaks BED file expected")
    if args.cell_barcodes is None:
        martian.throw("cell barcodes CSV file expected")

    reference = ReferenceManager(args.reference_path)
    contig_names = reference.primary_contigs(allow_sex_chromosomes=True)

    return {
        'chunks': [{'contig': name, '__mem_gb': 4} for name in contig_names],
        'join': {'__mem_gb': 8},
    }
Beispiel #19
0
def main(args, outs):
    """
    Run the vlconverter executable with inputs that should be available in the outs
    folder at the end of the pipeline run.  This will generate "output_for_vloupe.vloupe"
    in the stage folder.

    Memory usage not expected to be excessive with this (thus no custom split/join
    as of yet); it will need to load a few full files (bam.bai, fasta.fai) into memory.

    If any of the concat reference BAM, consensus BAM or contig BAM index is
    unset or not a file, a message is logged and nothing is generated.  If
    vlconverter exits with a non-zero status, outs.output_for_vloupe is set
    to None and the failure is reported through martian.throw.
    """
    # os.path.isfile(None) raises, so treat an unset path as a missing file
    # and take the same "cannot make vloupe file" exit.
    if args.concat_ref_bam is None or not os.path.isfile(args.concat_ref_bam) or \
       args.consensus_bam is None or not os.path.isfile(args.consensus_bam) or \
       args.contig_bam_bai is None or not os.path.isfile(args.contig_bam_bai):
        martian.log_info(
            'One or more bam files missing - cannot make vloupe file')
        return

    call = [
        "vlconverter", args.sample_id, args.pipestance_type, "--output",
        outs.output_for_vloupe, "--reference-bam", args.concat_ref_bam,
        "--reference-bam-index", args.concat_ref_bam_bai, "--reference-fasta",
        args.concat_ref_fasta, "--reference-fasta-index",
        args.concat_ref_fasta_fai, "--reference-annotations",
        args.concat_ref_annotations_json, "--clonotypes", args.clonotypes_csv,
        "--consensus-bam", args.consensus_bam, "--consensus-bam-index",
        args.consensus_bam_bai, "--consensus-annotations",
        args.consensus_annotations_json, "--consensus-fasta",
        args.consensus_fasta, "--consensus-fasta-index",
        args.consensus_fasta_fai, "--contig-bam-relative-path",
        args.contig_bam_relative_path, "--contig-bam-index",
        args.contig_bam_bai, "--contig-annotations",
        args.contig_annotations_json, "--contig-bed",
        args.contig_annotations_bed, "--contig-fasta", args.contig_fasta,
        "--contig-fasta-index", args.contig_fasta_fai, "--description",
        args.sample_desc
    ]

    martian.log_info("Running vlconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("vlconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        # "as" form is valid on Python 2.6+ and Python 3 alike.
        outs.output_for_vloupe = None
        martian.throw("Could not generate .vloupe file: \n%s" % e.output)
Beispiel #20
0
def main(args, outs):
    """Find discordant read-pair loci for one chunk and write the results.

    Writes an empty BEDPE (``None`` data frame) and returns when there is
    no chromosome, no start positions or no barcode whitelist.  Otherwise
    the loci from ``tk_readpairs.get_discordant_loci`` go to
    ``outs.sv_calls`` and the read counts to
    ``outs.discordant_read_counts`` as JSON.
    """
    if args.chrom is None or len(args.starts) == 0 or args.barcode_whitelist is None:
        tk_sv_io.write_sv_df_to_bedpe(None, outs.sv_calls)
        return

    max_insert, ins_logsf_fun = tk_sv_utils.get_insert_size_info(
        args.insert_sizes, MAX_INSERT_SIZE_PRC)
    if max_insert is None:
        martian.throw('No Q60 reads')

    # This is slightly bigger than the maximum "normal" insert
    min_call_insert, _ = tk_sv_utils.get_insert_size_info(
        args.insert_sizes, MIN_SV_INSERT_SIZE_PRC)
    min_sv_len = max(args.min_sv_len, min_call_insert)
    martian.log_info('Setting min_sv_len to {}'.format(min_sv_len))

    with open(args.basic_summary, 'r') as summary_file:
        summary = json.load(summary_file)

    # Per-SV-type chimera rates, all built on the far chimera rate.
    far_rate = summary['far_chimera_rate']
    inv_rate = far_rate + summary['same_dir_chimera_rate']
    dup_rate = far_rate + summary['outward_dir_chimera_rate']
    chimera_rates = {
        tk_readpairs.DEL_STR: far_rate,
        tk_readpairs.INV_STR: inv_rate,
        tk_readpairs.TDUP_STR: dup_rate,
        tk_readpairs.TRANS_STR: far_rate,
    }

    loci_options = dict(
        chrom=str(args.chrom),
        starts=args.starts,
        stops=args.stops,
        min_mapq=args.min_mapq,
        min_insert=0,
        max_insert=max_insert,
        max_merge_range=args.merge_range_factor * max_insert,
        min_sv_len=min_sv_len,
        max_sv_len=args.max_sv_len,
        ins_logsf_fun=ins_logsf_fun,
        min_lr_to_call=args.min_lr_to_call,
        min_reads_to_call=args.min_reads_to_call,
        chimera_rate=chimera_rates,
        reads_as_qual=True,
    )
    df, read_counts, _ = tk_readpairs.get_discordant_loci(args.possorted_bam,
                                                          **loci_options)

    # Need to convert to dict because defaultdict doesn't get pickled properly
    for count_kind in ('split', 'pair'):
        read_counts[count_kind] = dict(read_counts[count_kind])
    tk_sv_io.write_sv_df_to_bedpe(df, outs.sv_calls)
    with open(outs.discordant_read_counts, 'w') as counts_file:
        counts_file.write(tenkit.safe_json.safe_jsonify(read_counts))
def main(args, outs):
    """Align one chunk of reads and write the alignment to ``outs.default``.

    Interleaved chunks pass ``chunk['read1']`` straight to the aligner;
    otherwise read1 and read2 are passed as a list, and both must be
    present.  The thread count comes from
    ``martian.get_threads_allocation()``.
    """
    chunk = args.chunk

    if not chunk['reads_interleaved']:
        if chunk['read1'] is None or chunk['read2'] is None:
            martian.throw("must supply a read1 and read2 when reads_interleave == False")
        reads = [chunk['read1']]
        if chunk['read2'] is not None:
            reads.append(chunk['read2'])
    else:
        reads = chunk['read1']

    alignment = tenkit.align.Aligner(reads, outs.default)
    aligner_name = args.aligner

    reference_fasta = tenkit.reference.get_fasta(args.reference_path)
    rg_header = tk_bam.make_rg_header(chunk['read_group'])
    aligner_params = {
        'ref_fasta': reference_fasta,
        'algorithm': args.aligner_method,
    }
    alignment.output_alignment(aligner=aligner_name,
                               aligner_params=aligner_params,
                               num_threads=martian.get_threads_allocation(),
                               read_group_header=rg_header)
Beispiel #22
0
def main(args, outs):
    """
    Read the run's RunInfo.xml and RTA settings and record them on outs.

    Sets outs.si_read_type, outs.rc_i2_read and outs.split_by_tile.

    run_path must be the top-level Illumina flowcell directory
    """
    run_path = args.run_path
    if not os.path.exists(run_path):
        martian.throw("Run directory does not exist: %s" % run_path)

    run_info = tk_bcl.load_run_info(os.path.join(run_path, "RunInfo.xml"))
    read_info, _flowcell = run_info
    outs.si_read_type = get_si_read_type(read_info)

    rta_version, rc_i2_read, bcl_params = tk_bcl.get_rta_version(run_path)
    martian.log_info("BCL folder RTA Version: %s" % rta_version)
    martian.log_info("BCL params: %s" % str(bcl_params))
    martian.log_info("RC'ing i2 read: %s" % str(rc_i2_read))
    outs.rc_i2_read = rc_i2_read

    by_tile = _split_by_tile(args)
    martian.log_info("Splitting by tile: %s" % str(by_tile))
    outs.split_by_tile = by_tile
Beispiel #23
0
def read_sv_bedpe_to_df(bedpe):
    """Read a BEDPE file of SV calls into a DataFrame with standard columns.

    Args:
        bedpe: path or file-like object accepted by ``pd.read_csv``, or
            None.

    Returns:
        A DataFrame with the columns chrom1, start1, stop1, chrom2,
        start2, stop2, name, qual, strand1, strand2, filters, info.
        Columns beyond the twelfth are dropped.  Optional columns missing
        from the input are filled with defaults: a running row number for
        ``name``, 1 for ``qual`` and '.' for the rest.  An empty frame
        with those columns is returned when ``bedpe`` is None or
        ``pd.read_csv`` raises ValueError.

    Input with fewer than 6 columns is reported through ``martian.throw``.
    """
    col_names = [
        'chrom1', 'start1', 'stop1', 'chrom2', 'start2', 'stop2', 'name',
        'qual', 'strand1', 'strand2', 'filters', 'info'
    ]
    if bedpe is None:
        return pd.DataFrame(columns=col_names)

    try:
        df = pd.read_csv(bedpe,
                         sep='\t',
                         header=None,
                         index_col=None,
                         comment='#')
    except ValueError:
        return pd.DataFrame(columns=col_names)
    if df.shape[1] < 6:
        martian.throw('BEDPE file must have at least 6 columns')
    ncols = min(len(col_names), df.shape[1])
    df = df.iloc[:, 0:ncols]
    df.columns = col_names[0:ncols]
    df[['chrom1', 'chrom2']] = df[['chrom1', 'chrom2']].astype(object)
    df[['start1', 'stop1', 'start2',
        'stop2']] = df[['start1', 'stop1', 'start2', 'stop2']].astype(int)

    if 'name' not in df.columns:
        # np.arange takes a scalar stop value; passing a tuple raises.
        df['name'] = np.arange(len(df))
    # Remaining optional columns, in BEDPE order, with their defaults.
    optional_defaults = (('qual', 1), ('strand1', '.'), ('strand2', '.'),
                         ('filters', '.'), ('info', '.'))
    for column, default in optional_defaults:
        if column not in df.columns:
            df[column] = default

    return df
Beispiel #24
0
def split(args):
    """Define the chunks for this stage: one per (barcode group, lane) pair.

    When the FASTQs were not split by tile, a single chunk with no lane and
    no barcodes is returned. Otherwise the lanes are discovered from the
    per-tile FASTQ file names, the common barcodes are cut into groups of
    BARCODE_GROUP_SIZE, and a final set of chunks is added per lane for the
    invalid sample index.
    """
    # Nothing was split by tile: a single pass-through chunk is enough.
    if not args.split_by_tile:
        return {'chunks': [{'lane': None, 'bcs': []}]}

    # From here on the FASTQs are assumed to live in per-tile directories.
    tile_fastq_pattern = os.path.join(
        args.demultiplexed_fastq_path, "Tile*", "*.fastq*")
    fastq_paths = glob.glob(tile_fastq_pattern)

    if len(fastq_paths) == 0:
        martian.throw("No FASTQ files were found.")

    # Distinct lanes present across all of the files.
    parsed_files = [BclProcessorFastqFile(path) for path in fastq_paths]
    lanes = sorted({parsed.lane for parsed in parsed_files})

    # Incoming barcodes arrive in reverse-frequency order; sorting them
    # lexicographically spreads the work more evenly across groups.
    ordered_bcs = sorted(args.common_bcs)

    chunks = []
    # Consecutive slices of BARCODE_GROUP_SIZE barcodes; the last slice holds
    # whatever remainder is left over.
    for start in range(0, len(ordered_bcs), BARCODE_GROUP_SIZE):
        group = ordered_bcs[start:start + BARCODE_GROUP_SIZE]
        for lane in lanes:
            chunks.append({'__mem_gb': CHUNK_MEM_GB, 'lane': lane, 'bcs': group})

    # Finally, the leftovers (si_X).
    for lane in lanes:
        chunks.append({
            '__mem_gb': CHUNK_MEM_GB,
            'lane': lane,
            'bcs': [DEMULTIPLEX_INVALID_SAMPLE_INDEX],
        })

    return {'chunks': chunks}
Beispiel #25
0
def validate_input(args):
    """Check the stage arguments before entering the main flow path.

    Verifies the gem_group settings through tk_preflight, the presence and
    type of the required fields of every sample_def entry, the input_mode
    value, and the consistency of the optional downsample settings.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    def check_key(n, dict_in, name, tys):
        # Entry `n` must contain `name`, with a value whose exact type is
        # one of `tys`.
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" %
                         (n, name))
        if type(dict_in[name]) not in tys:
            martian.exit(
                "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                % (n, name, str(tys), type(dict_in[name])))

    list_or_null = [list, type(None)]
    for entry_index, entry in enumerate(args.sample_def):
        check_key(entry_index, entry, "read_path", [str, unicode])
        check_key(entry_index, entry, "lanes", list_or_null)
        check_key(entry_index, entry, "gem_group", [int, type(None)])
        # The field naming the samples depends on how the FASTQs were made.
        if args.input_mode == "BCL_PROCESSOR":
            check_key(entry_index, entry, "sample_indices", list_or_null)
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(entry_index, entry, "sample_names", list_or_null)

    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    if args.downsample is None:
        return

    # Exactly one of the two downsampling keys may be present.
    has_gigabases = "gigabases" in args.downsample
    has_rate = "subsample_rate" in args.downsample
    assert has_gigabases or has_rate
    assert not (has_gigabases and has_rate)
    if has_rate and args.downsample['subsample_rate'] is not None:
        assert args.downsample['subsample_rate'] <= 1.0
Beispiel #26
0
def main(args, outs):
    """Collect the FASTQ chunks for every sample_def entry.

    Each entry is dispatched on its 'fastq_mode' to the matching chunk
    builder; the combined chunks are stored in outs.chunks, checked, and the
    chemistry of the first chunk is used to populate outs.chemistry_def and
    outs.barcode_whitelist.
    """
    gem_groups_ok, gem_groups_msg = tk_preflight.check_gem_groups(
        args.sample_def)
    if not gem_groups_ok:
        martian.exit(gem_groups_msg)

    outs.chunks = []
    for sample_def in args.sample_def:
        fastq_mode = sample_def['fastq_mode']

        # Pick the chunk builder that understands this FASTQ layout.
        if fastq_mode == tk_constants.BCL_PROCESSOR_FASTQ_MODE:
            build_chunks = main_bcl_processor
        elif fastq_mode == tk_constants.ILMN_BCL2FASTQ_FASTQ_MODE:
            build_chunks = main_ilmn_bcl2fastq
        else:
            build_chunks = None

        if build_chunks is None:
            sample_chunks = []
            martian.throw("Unrecognized fastq_mode: %s" % fastq_mode)
        else:
            sample_chunks = build_chunks(args.sample_id, sample_def,
                                         args.chemistry_name,
                                         args.custom_chemistry_def)

        # Every sample_def entry has to contribute at least one chunk.
        if len(sample_chunks) == 0:
            martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

        outs.chunks += sample_chunks

    # Also covers the case of an empty sample_def list.
    if len(outs.chunks) == 0:
        martian.exit(cr_constants.NO_INPUT_FASTQS_MESSAGE)

    check_chunk_fastqs(outs.chunks)
    check_chunk_chemistries(outs.chunks)

    # Output chemistry and barcode whitelist, taken from the first chunk.
    first_chunk = outs.chunks[0]
    outs.chemistry_def = first_chunk['chemistry']
    outs.barcode_whitelist = cr_chem.get_barcode_whitelist(outs.chemistry_def)
Beispiel #27
0
def check_alert(stage, alert):
    """Validate the structure of a single alert definition.

    Args:
        stage: unused by this function; kept for the callers' signature.
        alert: dict describing the alert. It must contain the keys "action",
            "metric", "compare", "threshold" and "message"; "compare" must
            be "<" or ">" and "threshold" must be exactly an int or a float.

    Returns:
        None when the alert is well formed.

    Raises:
        Whatever ``martian.throw`` raises when a check fails. Details about a
        missing key or a bad "compare" value are printed to stdout first.
    """
    required_keys = ["action", "metric", "compare", "threshold", "message"]
    for key in required_keys:
        # `in` replaces dict.has_key(), which no longer exists on Python 3.
        if key not in alert:
            # Single-argument print(...) produces the same text under both
            # Python 2 and Python 3.
            print("%s  is missing in " % key)
            print(alert)
            martian.throw("incorrectly formatted alert, see stdout.")

    if alert["compare"] not in ("<", ">"):
        print(alert)
        martian.throw("invalid value for compare in alert")

    # Exact type match on purpose: subclasses such as bool are rejected.
    threshold_type = type(alert["threshold"])
    if threshold_type not in (int, float):
        martian.throw("%s: invalid type for threshold" % threshold_type)
def main(args, outs):
    """Fail immediately: this entry point only reports that no chunks exist."""
    error_message = 'No chunks defined.'
    martian.throw(error_message)
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    Steps performed here:
      * check that exactly one downsampling option (gigabases,
        subsample_rate, target_reads) is set when args.downsample is given;
      * check that gem_group is either null or an int for all sample_def
        entries, and set it to 1 everywhere when all are null (the
        sample_def entries are modified in place);
      * validate input_mode and the required fields of each sample_def entry;
      * locate the FASTQ files of each entry, accumulate the predicted
        read/base counts and build one chunk dict per set of files;
      * for gigabases / target_reads downsampling, scale each chunk's
        subsample_rate by the derived global rate;
      * populate outs.chunks and outs.read_groups, write the downsampling
        summary as JSON to outs.downsample_info, and run check_fastqs.
    """

    def check_key(n, dict_in, name, tys):
        # Entry `n` of sample_def must contain `name`, with a value whose
        # exact type is one of `tys`.
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))

        if type(dict_in[name]) not in tys:
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s" % (n, name, str(tys), type(dict_in[name])))

    def capped_rate(requested, available):
        # Fraction of the input to keep so that about `requested` remains,
        # never more than 1.0. With nothing available there is nothing to
        # downsample, so keep everything instead of dividing by zero.
        if available == 0:
            return 1.0
        return min(1.0, float(requested) / float(available))

    global_subsample_rate = 1.0
    downsample_gigabases = False
    downsample_reads = False
    if args.downsample is not None:
        # Make sure that exactly one downsampling option is specified.
        options_supplied = 0
        for subsample_key in ["gigabases", "subsample_rate", "target_reads"]:
            if args.downsample.get(subsample_key, None) is not None:
                options_supplied += 1
        assert options_supplied == 1

        if args.downsample.get('subsample_rate') is not None:
            global_subsample_rate = args.downsample['subsample_rate']
            assert global_subsample_rate <= 1.0
        elif args.downsample.get('target_reads') is not None:
            downsample_reads = True
        else:
            downsample_gigabases = True

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])

    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases / reads, summed over all sample_def entries
    total_seq_bases = 0
    total_seq_reads = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # A per-entry subsample_rate, when present, multiplies the global one
        if 'subsample_rate' in read_chunk:
            subsample_rate = global_subsample_rate * read_chunk['subsample_rate']
        else:
            subsample_rate = global_subsample_rate

        # Optional in-read barcode settings, copied onto every chunk of
        # this entry.
        bc_in_read = {}
        if 'bc_in_read' in read_chunk:
            if read_chunk['bc_in_read'] is not None:
                bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
                bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name

        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                # NOTE(review): zip() stops at the shortest list, so a length
                # mismatch between reads/barcodes/sis silently drops files
                # here, whereas the ILMN_BCL2FASTQ branch exits on a
                # mismatch -- confirm this is intended.
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r, 'read2': None, 'reads_interleaved': True, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            # NOTE(review): check_key accepts None for sample_names, but the
            # loops below iterate over it -- confirm None cannot reach here.
            sample_names = read_chunk['sample_names']

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipline")

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r1, 'read2': r2, 'reads_interleaved': False, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))
    martian.log_info("            Predict %d total reads" % total_seq_reads)

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")

    if downsample_gigabases and args.downsample['gigabases'] is not None:
        # Calculate global downsample rate from the requested amount of bases
        global_subsample_rate = capped_rate(float(args.downsample['gigabases'])*1e9, total_seq_bases)
        martian.log_info("Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f" \
         % (float(args.downsample['gigabases']), float(total_seq_bases)/1e9, global_subsample_rate))

        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate
    elif downsample_reads:
        # Calculate global downsample rate from the requested number of reads
        global_subsample_rate = capped_rate(args.downsample['target_reads'], total_seq_reads)
        martian.log_info("Input data downsampling: Requested: %.2f M reads, Estimated Input: %.2f M reads, Downsample Rate: %.3f" \
         % (float(args.downsample['target_reads'])/1e6, float(total_seq_reads)/1e6, global_subsample_rate))

        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    # log info about input vs requested GB
    # first, set defaults
    available_gb = float(total_seq_bases)/1e9
    requested_gb = None
    available_reads = total_seq_reads
    requested_reads = None
    requested_rate = None
    post_downsample_gb = requested_gb
    downsample_succeeded = True

    if args.downsample is not None and args.downsample.get('gigabases') is not None:
        requested_gb = float(args.downsample['gigabases'])
        post_downsample_gb = min(available_gb, requested_gb)
        if available_gb < requested_gb:
            martian.log_info("Downsample requested more GB than was available; will not downsample.")
            downsample_succeeded = False

    elif args.downsample is not None and args.downsample.get('subsample_rate') is not None:
        requested_rate = float(args.downsample['subsample_rate'])
        post_downsample_gb = available_gb * requested_rate

    elif args.downsample is not None and args.downsample.get('target_reads') is not None:
        # post_downsample_gb stays None for read-count based downsampling
        requested_reads = float(args.downsample['target_reads'])

    downsample_info = {}
    downsample_info['available_gb'] = available_gb
    downsample_info['requested_gb'] = requested_gb
    downsample_info['available_reads'] = available_reads
    downsample_info['requested_reads'] = requested_reads
    downsample_info['requested_rate'] = requested_rate
    downsample_info['post_downsample_gb'] = post_downsample_gb
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
Beispiel #30
0
def main(args, outs):
    """Fail immediately: this entry point only reports that no chunks exist."""
    error_message = "No chunks defined"
    martian.throw(error_message)