Example 1
    def post(self):
        self._posted = True

        ## if alarm file does not exist, do nothing
        if os.path.exists(self._alarms_file):
            with open(self._alarms_file, 'r') as fp:
                alerts = json.loads(fp.read())
        else:
            return

        meta_alarm = []
        exit_str = ''

        for k, v in alerts.iteritems():
            if k not in self.SN_ALERT_HANDLERS:
                meta_alarm.append("unknown key {} in {} (BUG)".format(
                    k, self._alarms_file))
            elif k == self.SN_EXIT:
                exit_str = ';'.join(v)
            else:
                handler = self.SN_ALERT_HANDLERS[k]
                for post in v:
                    handler(post)

        for alarm in meta_alarm:
            martian.alarm(alarm)

        if len(exit_str) > 0:
            self.exit(exit_str)
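For context, post() expects the alarms file to be a JSON object mapping handler keys to lists of message strings. A minimal sketch of a compatible file, using invented keys (the real key set is whatever SN_ALERT_HANDLERS and SN_EXIT define):

import json

# Hypothetical alarms-file contents; each key routes to its handler,
# and the SN_EXIT key's messages are joined with ';' and passed to exit().
alerts = {
    "alarm": ["coverage is lower than expected"],
    "exit": ["fatal: no reads found"],
}

with open("alarms.json", "w") as fp:
    fp.write(json.dumps(alerts))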
Example 2
def main(args, outs):
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    # Get mappability, GC content
    ncells = raw_profiles[0].shape[0]
    # Default:
    (intercept, linear, quadratic) = (1.0, 0.0, 0.0)
    # Sum up all single-cell profiles
    try:
        print('DEBUG 0')
        result = estimate_gc_bias(args.raw_singlecell_profiles, args.tracks,
                                  args.reference_path)
        print('DEBUG result')
        print(result)
        (quadratic, linear,
         intercept) = result['Summary']['quadratic_coefficients']
        print('DEBUG intercept=%f, linear=%f, quadratic=%f' %
              (intercept, linear, quadratic))
    except Exception as error:
        martian.alarm(
            "stages/copy_number_processor/estimate_gc_bias_coefficients/__init__ encountered an exception. Error: %s"
            % repr(error))
    # try/except
    #
    # Export scale factor and GC bias coefficients
    outs.linear = linear
    outs.quadratic = quadratic
Example 3
def split(args):
    '''We just align each chunk independently -- joining will happen in the join step of SORT_READS'''

    # Pull some reads from fastq files -- bail out if it's less than 25bp
    fastq_tests = [x['read1'] for x in args.chunks]

    for fastq_test in fastq_tests:
        with open(fastq_test) as in_file:
            reader = tk_fasta.read_generator_fastq(in_file)
            for name, read, qual in itertools.islice(reader, 10):
                if len(read) < MIN_READ_LENGTH:
                    martian.alarm(
                        "BWA-MEM can't handle reads <25bp -- reads will be unmapped."
                    )

    # estimated amount of memory needed to process genome is 2x(num gigabases)+4GB
    reference_pyfasta = tenkit.reference.open_reference(args.reference_path)
    reference_bases = sum(
        len(reference_pyfasta[contig]) for contig in reference_pyfasta)
    base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0**3)))

    mem_in_gb = base_mem_in_gb + 4
    chunks = [{
        'chunk': x,
        '__threads': args.num_threads,
        '__mem_gb': mem_in_gb
    } for x in args.chunks]
    return {'chunks': chunks}
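As a quick sanity check of the memory formula in split() (roughly 2 GB per gigabase of reference, plus a 4 GB base), with an invented human-sized genome:

import math

reference_bases = 3.1e9  # illustrative reference size
base_mem_in_gb = int(math.ceil(2 * reference_bases / (1024.0 ** 3)))  # 6
mem_in_gb = base_mem_in_gb + 4
print(mem_in_gb)  # 10 GB requested per chunk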
Example 4
 def issue(self, metric, value, format_string=""):
     for alert in self.alerts:
         ## find the right metric
         if alert["metric"] == metric:
             ## should we trigger?
             if (alert["compare"] == ">") ^ (value < alert["threshold"]):
                 ## optional formatting of alert message with format_string or value
                 if len(format_string) == 0:
                     format_string = str(value)
                 message = alert["message"].replace("{}", format_string)
                 ## issue an alert
                 if alert["action"] == "alarm":
                     martian.alarm(message)
                 elif alert["action"] == "exit":
                     martian.exit(message)
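The trigger test above uses XOR: (alert["compare"] == ">") ^ (value < alert["threshold"]) fires when value >= threshold for ">" rules and when value < threshold for "<" rules. A minimal sketch of a rule this method could consume -- the field names mirror the lookups above, but the values are invented:

alerts = [{
    "metric": "mapped_read_fraction",
    "compare": "<",
    "threshold": 0.50,
    "action": "alarm",  # "exit" would call martian.exit instead
    "message": "Mapped read fraction is {}; check sample quality.",
}]

# issue("mapped_read_fraction", 0.42, "42.0%") would substitute "42.0%"
# into the message and call martian.alarm(...).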
Example 5
def main(args, outs):
    normalized_singlecell_profiles, mask = coverage_matrix.load_matrix(
        args.normalized_singlecell_profiles, args.reference_path)

    print('DEBUG generate_final_clusters/__init__.main():')
    print('normalized_singlecell_profiles[0].shape')
    print(normalized_singlecell_profiles[0].shape)

    ncells = normalized_singlecell_profiles[0].shape[0]
    results = [range(ncells)]
    try:
        if args.skip_clustering:
            print('Skipping clustering.')
        else:
            ## NOTE: this is a temporary short circuit of clustering when there are more than
            ## 500 cells. We will revisit this module and fix the issue later.
            if True:  # ncells < 500:
                # results = cluster_jedna.cluster(normalized_singlecell_profiles, mask, n_merge=25, score_cutoff=10)
                results = cluster_jedna.cluster(normalized_singlecell_profiles,
                                                mask,
                                                n_merge=25,
                                                score_cutoff=5)
            else:
                martian.alarm(
                    "Too many cells for clustering. Putting all cells in one cluster."
                )
            # if ncells else
        # if skip_clustering else
    except Exception as error:
        martian.alarm(
            "Clustering encountered an exception. Putting all cells in one cluster. Error: %s"
            % repr(error))
    # try/except
    #
    out_file = open(outs.clusters, 'w')
    out_file.write(tenkit.safe_json.safe_jsonify(results))
    out_file.close()
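Note that the fallback results = [range(ncells)] encodes a single cluster containing every cell index, so downstream consumers always receive a list of clusters. With three cells, for example, the serialized output would be:

[[0, 1, 2]]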
Example 6
def main(args, outs):

    with open(args.summary) as sf:
        metrics = json.load(sf)
    
    with open(CRDNA_ALARMS) as af:
        alarms = json.load(af)

    howbadisit = tenkit.alarms.evaluate_alarms(alarms, metrics)

    with open(outs.alarms, 'w') as of:
        of.write(tenkit.safe_json.safe_jsonify(howbadisit))

    with open(outs.alarms_summary, 'w') as sf:
        sf.write("10X Genomics -- Pipeline Run Details\n")
        sf.write("-" * 40 + "\n")
        sf.write("Sample ID: %s\n" % args.sample_id)
        sf.write("Reference: %s\n" % args.reference_path)

        for oopsie in howbadisit:
            sf.write("ALERT: %s is %s. %s\n" % (oopsie["title"], oopsie["formatted_value"], oopsie["message"]))

    if len(howbadisit) > 0:
        martian.alarm("There were %i sequencing alerts. Look at alarms_summary.txt for details.\n" % len(howbadisit))
Example 7
def join(args, outs, chunk_defs, chunk_outs):
    cell_barcodes = {}
    full_counts = {}
    for chunk_out in chunk_outs:
        if not chunk_out.counts:
            continue
        for (species, bc_counts) in chunk_out.counts.iteritems():
            if species not in full_counts:
                full_counts[species] = {}
            for (bc, count) in bc_counts.iteritems():
                full_counts[species][bc] = count

    for species in full_counts.iterkeys():
        sorted_data = sorted(full_counts[species].items(),
                             key=lambda item: item[1],
                             reverse=True)
        bc_counts = [item[1] for item in sorted_data]

        ## Define cells by first taking one log width. Then refine by choosing
        ## one log width from the 99th percentile amongst the cells
        max_count = bc_counts[0]
        min_count = float(max_count) / np.power(10, args.log_width)

        count_99th = np.percentile([x for x in bc_counts if x >= min_count],
                                   99)
        min_count = float(count_99th) / np.power(10, args.log_width)

        # implement force_cells if supplied to overrule min_count
        if args.force_cells is not None and args.force_cells > 0:
            index = min(args.force_cells, len(bc_counts)) - 1
            min_count = max(bc_counts[index], 1)
            martian.log_info("Using force_cells")

        if species not in cell_barcodes:
            cell_barcodes[species] = {}
        for i, (bc, count) in enumerate(sorted_data, start=1):
            if count < min_count:
                break
            cell_barcodes[species][bc] = count
            if i >= MAX_CELLS:
                martian.log_info("%s: hit maximum number of cells "\
                    "(%d)"%(species, MAX_CELLS))
                min_count = count
                break

        martian.log_info("%s: max count %d, min count %d" %
                         (species, max_count, min_count))

        # some logging
        ncell = len(cell_barcodes[species])
        nobs = len(bc_counts)
        if len(cell_barcodes[species]) > 0:
            mean = np.mean(cell_barcodes[species].values())
            median = np.median(cell_barcodes[species].values())
        else:
            mean = 0
            median = 0
        print("{}: {} cells of {} obs, cell barcode reads: "
              "mean = {:.2f}, median = {:.1f}").format(species, ncell, nobs,
                                                       mean, median)

    # alarm user
    with open(CRDNA_ALARMS) as af:
        alarms = json.load(af)
    # filter alarms
    alarms = [
        alarm for alarm in alarms
        if alarm['id'] in ['not_enough_cells', 'too_many_cells']
    ]
    num_cells = sum([len(bc_cts) for bc_cts in cell_barcodes.itervalues()])
    alarm_results = tk_alarms.evaluate_alarms(alarms, {'num_cells': num_cells})
    for a in alarm_results:
        martian.alarm("%s is %s. %s\n" %
                      (a["title"], a["formatted_value"], a["message"]))

    outs.cell_barcodes = cell_barcodes
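The two-pass threshold above works by first keeping barcodes within one log width of the top count, then re-anchoring the window at the 99th percentile of that provisional cell set. A small numeric sketch with invented counts:

import numpy as np

log_width = 1.0  # hypothetical args.log_width
bc_counts = [10000, 9500, 9000, 8000, 400, 50, 10]  # sorted descending

max_count = bc_counts[0]
min_count = float(max_count) / np.power(10, log_width)  # 1000.0

count_99th = np.percentile([x for x in bc_counts if x >= min_count], 99)
min_count = float(count_99th) / np.power(10, log_width)  # ~998.5

print(len([c for c in bc_counts if c >= min_count]))  # 4 cells called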
Example 8
 def __del__(self):
     if not self._posted:
         martian.alarm(
             "C++ alarms were not posted, but left in file (BUG).")
     else:
         self.check_delete()
Example 9
def main(args, outs):
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match("^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
        args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"\
            " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "\
                "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "\
                "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None and 
            max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "\
                "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample.keys()) > 0:
        keys = args.downsample.keys()
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "\
                "both.")
        key = keys[0]
        value = args.downsample[key]
        param_map = {"target_reads" : "maxreads", "gigabases" : "downsample"}
        try:
            bad_value = float(value) < 1e-12
        except ValueError:
            bad_value = True
        if bad_value:
            cs_key = param_map[key]
            martian.exit("Command line argument %s must be a positive number" %
                cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)
    contig_defs_json_path = os.path.join(args.reference_path, "fasta", 
        "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta", 
        "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
        faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s"%str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks
    chrom_length_dict = ref.get_contig_lengths()

    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig must be at least %d bases "\
            "in length."%(",".join(offending_contigs_exit), contig_length_exit,
                          contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "\
            "Every primary contig is recommended to be at least %d bases "\
            "in length."%(",".join(offending_contigs_warn), contig_length_warn,
                          contig_length_warn))

    # Open file handles limit 
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
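For reference, a hypothetical sample_def entry that would pass the FASTQ checks above in ILMN_BCL2FASTQ mode (all values invented):

sample_def = [{
    "read_path": "/data/runs/flowcell1/fastq_path",  # absolute, must exist
    "lanes": [1, 2],                                 # or None for all lanes
    "sample_names": ["sample1"],       # required in ILMN_BCL2FASTQ mode
    "library_id": "lib-01",            # letters, numbers, _ and - only
}]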
Example 10
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.
       Demultiplex outputs a series of FASTQ files with filenames of the form:
       read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].
    """

    def check_key(n, dict_in, name, tys):
        if not dict_in.has_key(name):
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))

        if not (type(dict_in[name]) in tys):
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s" % (n, name, str(tys), type(dict_in[name])))


    if args.downsample is not None:
        if len(args.downsample.keys()) > 1:
            martian.exit("More than one downsampling mode requested. Please select a single downsampling mode")

        (k,v) = args.downsample.items()[0]
        if not k in ["gigabases", "subsample_rate", "target_reads"]:
            martian.exit("Unrecognized downsampling mode: %s" % k)

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])

    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0
    # Predicted input reads
    total_input_reads = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes",  [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # Each sample_def entry can have a separate pre-applied downsampling rate
        # We adjust the estimated data in that chunk to account for this
        # subsampling
        chunk_subsample_rate = read_chunk.get('subsample_rate', 1.0)

        bc_in_read = {}
        if read_chunk.has_key('bc_in_read'):
            if read_chunk['bc_in_read'] is not None:
                bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
                bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name

        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            read_length = 100 # Should be overwritten below

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = 2*read_length

            martian.log_info("Input data: Predict %f GB from %s. (%d bp per read pair)" % (float(sample_seq_bases)/1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases
            total_input_reads += float(sample_seq_bases)/read_length

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r,b,si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id, gem_group, flowcell, lane)
                    new_chunk = {
                        'read1': r, 'read2': None, 'reads_interleaved': True, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']

            read_length1 = None
            read_length2 = None
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length1 = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length2 = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            if read_length1 is None and read_length2 is None:
                martian.exit("No input FASTQs were found for the requested parameters.")
            elif read_length1 is None:
                martian.exit("No input FASTQs were found for Read1.")
            elif read_length2 is None:
                martian.exit("No input FASTQs were found for Read2.")

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = read_length1 + read_length2

            martian.log_info("Input data: Predict %f GB from %s. (%d bp per read pair)" % (float(sample_seq_bases)/1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases
            total_input_reads += float(sample_seq_bases)*2/(read_length1 +
                read_length2)

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipline")

                # calculate chunks
                for r1,r2,b,si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id, gem_group, flowcell, lane)
                    new_chunk = {
                        'read1': r1, 'read2': r2, 'reads_interleaved': False, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")


    #
    # Downsampling setup
    #

    # The total available input raw gigabases of input data (est_gb), and the base pairs per read pair (bp_per_read_pair)
    # are estimated above.
    est_gb = float(total_seq_bases) / 1e9

    downsample = args.downsample if args.downsample is not None else {}

    # Possible BC subsampling -- try to get the requested amount of data _after_ bc subsampling
    est_gb_post_bc = est_gb * downsample.get("bc_subsample_rate", 1.0)

    # Aim high to ensure that we won't be left with too few reads
    fudge_factor = 1.00

    downsample_succeeded = True

    if downsample.has_key("gigabases"):
        read_sample_rate = min(1.0, fudge_factor * downsample['gigabases'] / est_gb_post_bc)
        requested_read_pairs = int(1e9 * downsample['gigabases'] / bp_per_read_pair)
        downsample_succeeded = est_gb_post_bc >= downsample['gigabases']

    elif downsample.has_key("target_reads"):
        requested_read_pairs = int(downsample['target_reads'] / 2)
        est_read_pair_post_bc = 1e9 * est_gb_post_bc / bp_per_read_pair
        read_sample_rate = min(1.0, fudge_factor * requested_read_pairs / est_read_pair_post_bc)
        downsample_succeeded = est_read_pair_post_bc >= requested_read_pairs

    elif downsample.has_key("subsample_rate"):
        read_sample_rate = min(1.0, downsample["subsample_rate"] / downsample.get("bc_subsample_rate", 1.0))
        requested_read_pairs = None
    else:
        if len(downsample.keys()) > 0:
            martian.exit("Unrecognized downsample request: %s.\n Please use 'gigabases', 'target_reads', or 'subsample_rate'" % str(downsample))
        read_sample_rate = 1.0
        requested_read_pairs = None

    ## Alert if user requests analysis on too many reads
    ## Three CS scenarios:
    ## no downsampling
    ## "gigabases" downsampling
    ## "target_reads" downsampling
    READ_THRESHOLD = 5*1000*1000*1000
    est_reads_post_ds = (requested_read_pairs * 2
                         if requested_read_pairs is not None
                         else total_input_reads)
    martian.log_info("Estimate %.3f M reads entering pipeline" %
                     (est_reads_post_ds / 1e6))
    if est_reads_post_ds > READ_THRESHOLD:
        martian.alarm("We will be processing data from %.1f billion reads "\
            "and the pipeline run time will likely exceed 24 hours. Please "\
            "consult the 10x support website for guidance on run times. You "\
            "can reduce the number of reads using the downsample/maxreads "\
            "command-line option." % (est_reads_post_ds/1e9))

    martian.log_info("Downsampling request: %s" % str(downsample))
    martian.log_info("Base pairs per read pair: %s" % bp_per_read_pair)
    martian.log_info("Estimated Input: %.2f GB, Initial Downsample Rate: %.3f. Requested total reads: %s" % (est_gb, read_sample_rate, str(requested_read_pairs)))

    # Copy over the per-chunk subsample rates
    if read_sample_rate is not None:
        for chunk in chunks:
            chunk['subsample_rate'] = chunk.get('subsample_rate', 1.0) * read_sample_rate
            if downsample.has_key("bc_subsample_rate"):
                chunk["bc_subsample_rate"] = downsample["bc_subsample_rate"]

    outs.requested_read_pairs = requested_read_pairs

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = [rg for rg in read_groups]

    downsample_info = {}
    downsample_info['available_gb'] = est_gb
    downsample_info['requested_gb'] = downsample.get('gigabases', None)
    downsample_info['requested_rate'] = read_sample_rate
    downsample_info['post_downsample_gb'] = float(requested_read_pairs * bp_per_read_pair) / 1e9 if requested_read_pairs is not None else None
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)

    # Give out full path to BC whitelist
    if args.barcode_whitelist:
        outs.barcode_whitelist_path = bc_utils.barcode_whitelist_path(args.barcode_whitelist)
    else:
        outs.barcode_whitelist_path = None
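To make the downsampling arithmetic concrete, here is the "gigabases" branch with invented numbers:

est_gb = 120.0                 # estimated available input
bp_per_read_pair = 300         # 2 x 150 bp reads
est_gb_post_bc = est_gb * 1.0  # no bc_subsample_rate requested
fudge_factor = 1.00

gigabases = 60.0               # user requests 60 GB of data
read_sample_rate = min(1.0, fudge_factor * gigabases / est_gb_post_bc)  # 0.5
requested_read_pairs = int(1e9 * gigabases / bp_per_read_pair)  # 200000000
print(read_sample_rate, requested_read_pairs)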
Example 11
def main(args, outs):
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read",
                                                  3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print "Checking FASTQ folder..."
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess

        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

        for read_chunk in args.sample_def:
            sample_names = read_chunk['sample_names']
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    total_reads = 0.0
    global_avg = 0.0
    num_files = 0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
        num_files += 1
    global_avg = global_avg / num_files
    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    exit_msg = "We observe many reads shorter than 125 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly, and the algorithm has not been tested on short reads. Because reads are too short, execution will be terminated."
    warn_msg = "We observe many reads shorter than 150 bases.The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly."
    if global_avg < 125:
        martian.exit(exit_msg)
    elif global_avg < 149:
        martian.alarm(warn_msg)
Example 12
def main(args, outs):
    # Analysis parameters
    lead_trim = args.trim_length
    analysis_params = {}
    analysis_params['lead_trim'] = lead_trim
    analysis_params['analysis_version'] = martian.get_pipelines_version()
    analysis_params_output_file = open(outs.analysis_params, 'w')
    analysis_params_output_file.write(json.dumps(analysis_params))
    analysis_params_output_file.close()

    # Summary metrics
    summary_metrics = {}

    # Add longranger version to summary so we get it everywhere the summary.json goes
    # We'll also pipe it to the customer csv
    summary_metrics['longranger_version'] = martian.get_pipelines_version()

    basic_metrics = json.load(open(args.basic_results, 'r'))
    for (k, v) in basic_metrics.items():
        summary_metrics[k] = v

    # Get the set of instrument IDs observed in the BAM file
    if args.bam_file is not None:
        instrument_ids = get_instrument_ids(args.bam_file)
        summary_metrics['instrument_ids'] = ";".join(instrument_ids)
    else:
        summary_metrics['instrument_ids'] = ''

    # Copy over single_partition results
    sp_metrics = json.load(open(args.single_partition_results, 'r'))
    for (k, v) in sp_metrics.items():
        summary_metrics[k] = v

    # Load the duplicate summary results
    # only include the overall dup rate in the customer metrics
    dup_metrics = json.load(open(args.duplicate_summary, 'r'))

    key = 'full_use_bcs'
    dup_counts = dup_metrics[key]

    if dup_counts is None:
        key = 'full_ignore_bcs'
        dup_counts = dup_metrics[key]

    mean_tag = "mean_dup_rate"
    sd_tag = "sd_dup_rate"
    optical_tag = "optical_dup_rate"
    dup_frac_tag = "dup_fraction"

    if dup_counts:
        dd = {int(k): v for (k, v) in dup_counts.items()}
        n_dups = sum([v * (k - 1) for (k, v) in dd.items() if k > 1])
        n_non_dups = sum(dd.values())

        mean_dup_rate = tk_stats.robust_divide(float(n_dups + n_non_dups),
                                               n_non_dups)
        summary_metrics[mean_tag] = mean_dup_rate

        # Customer facing dup rate on 0 - 1 scale
        summary_metrics[dup_frac_tag] = (mean_dup_rate - 1.0) / mean_dup_rate

        optical_dup_count = dup_metrics['optical_' + key]['count']
        summary_metrics[optical_tag] = tk_stats.robust_divide(
            float(optical_dup_count), n_non_dups)

        sd_terms = [(k - mean_dup_rate)**2.0 * v for (k, v) in dd.items()]
        sd_dup_rate = math.sqrt(
            tk_stats.robust_divide(sum(sd_terms), sum(dd.values())))
        summary_metrics[sd_tag] = sd_dup_rate
    else:
        summary_metrics[dup_frac_tag] = 0.0
        summary_metrics[mean_tag] = 1.0
        summary_metrics[sd_tag] = 0.0
        summary_metrics[optical_tag] = 0.0

    # Load the bias results
    bias_results = json.load(open(args.coverage_results, 'r'))
    summary_depth_info = bias_results['summary_depth_info']
    mean_depth, median_depth, zero_cov_fract = get_depth_info(
        summary_depth_info)
    on_target_bases = get_on_target_bases(summary_depth_info)
    depth_positional_cv = get_depth_positional_cv(summary_depth_info,
                                                  COVERAGE_TRIM_TAIL)

    summary_depth_info = bias_results['summary_depth_info_deduped']
    mean_depth_deduped, median_depth_deduped, garb = get_depth_info(
        summary_depth_info)
    depth_positional_cv_deduped = get_depth_positional_cv(
        summary_depth_info, COVERAGE_TRIM_TAIL)

    # low coverage tail for customers, based on deduped coverage profile
    summary_metrics['low_cov_' +
                    str(CUSTOMER_LEFT_TAIL_COVERAGE)] = get_depth_tail_fract(
                        summary_depth_info,
                        CUSTOMER_LEFT_TAIL_COVERAGE,
                        left_tail=True)

    if bias_results['target_info'] != {}:
        target_info = bias_results['target_info']
        summary_metrics['fraction_on_target'] = tk_stats.robust_divide(
            float(target_info['on_target_bases']), target_info['total_bases'])
    else:
        summary_metrics['fraction_on_target'] = None

    summary_metrics['detected_sex'] = bias_results.get('detected_sex')
    summary_metrics['mean_depth'] = mean_depth
    summary_metrics['male_chromosome_copies'] = bias_results.get(
        'male_chromosome_copies')
    summary_metrics['median_depth'] = median_depth
    summary_metrics['mean_depth_deduped'] = mean_depth_deduped
    summary_metrics['median_depth_deduped'] = median_depth_deduped
    summary_metrics['on_target_bases'] = on_target_bases
    summary_metrics['depth_positional_cv'] = depth_positional_cv
    summary_metrics['depth_positional_cv_deduped'] = depth_positional_cv_deduped
    summary_metrics['zero_cov_fract'] = zero_cov_fract

    # Compute fraction of reads in high-coverage spikes
    cov_data = bias_results['summary_depth_info_deduped']
    _, conf_median, _ = get_depth_info(cov_data)
    conf_median = max(conf_median, 1)
    cov_variance = conf_median + (conf_median * depth_positional_cv_deduped)**2
    cov_sigma = math.sqrt(cov_variance)
    high_cutoff = conf_median + 5.0 * cov_sigma
    cov_data = {int(k): v for (k, v) in cov_data.iteritems()}
    total = sum(float(k * v) for (k, v) in cov_data.iteritems())
    outlier = sum(
        float(k * v) for (k, v) in cov_data.iteritems() if k > high_cutoff)
    summary_metrics['high_coverage_pileup_fraction'] = tk_stats.robust_divide(
        outlier, total)

    # Add metrics from variant_results
    if args.variant_results is not None:
        with open(args.variant_results) as variant_results_file:
            variant_results = json.load(variant_results_file)
        summary_metrics.update(variant_results)

    # Copy of coalescence results
    coa_metrics = json.load(open(args.filter_barcodes_results))
    for (k, v) in coa_metrics.items():
        summary_metrics[k] = v

    if args.sv_results is not None:
        with open(args.sv_results) as sv_results_file:
            sv_results = json.load(sv_results_file)
        summary_metrics.update(sv_results)

    if args.short_del_results is not None:
        with open(args.short_del_results) as short_del_results_file:
            short_del_results = json.load(short_del_results_file)
        new_res = {}
        for k, v in short_del_results.iteritems():
            new_res['short_del_' + k] = v
        summary_metrics.update(new_res)

    # Length mass results
    # Only copy scalar results
    if args.length_mass_results is not None:
        with open(args.length_mass_results) as length_mass_file:
            lm_results = json.load(length_mass_file)
            for (k, v) in lm_results.iteritems():
                if type(v) in (str, int, float) or v is None:
                    summary_metrics[k] = v

    # Reference genome information
    reference_name = tenkit.reference.get_genome(args.reference_path)
    summary_metrics['reference_name'] = reference_name
    ref_fasta = tenkit.reference.open_reference(args.reference_path)
    summary_metrics['reference_contigs'] = reference_contigs = len(ref_fasta)
    summary_metrics['reference_bases'] = reference_bases = sum(
        len(ref_fasta[contig]) for contig in ref_fasta)
    martian.log_info("Reference: %s, %d contigs, %d bases" %
                     (reference_name, reference_contigs, reference_bases))

    # Check for SV blacklist (only check if SV calling is enabled)
    summary_metrics['sv_blacklist_present'] = True
    if args.sv_results is not None:
        if not tenkit.reference.is_tenx(args.reference_path):
            if not os.path.exists(
                    tenkit.reference.get_sv_blacklist(args.reference_path)):
                summary_metrics['sv_blacklist_present'] = False
                martian.alarm(
                    "WARNING: Pipeline run without a region blacklist for SV calling. SV calls may contain many false positives due to problematic regions in the reference."
                )

    # Gelbead lot information
    if args.lot_info is not None:
        with open(args.lot_info) as lot_info_file:
            lot_info_results = json.load(lot_info_file)
        summary_metrics.update(lot_info_results)

    # Downsampling information
    if args.downsample_info is not None:
        with open(args.downsample_info) as downsample_info_file:
            downsample_info_results = json.load(downsample_info_file)
        summary_metrics.update(downsample_info_results)

    # Summary metrics are now finalized -- evaluate alarms
    # Select alarm file -- right now we always use the same one
    alarm_rules = tenkit.alarms.load_rules(args.targets)
    alarms = tenkit.alarms.evaluate_alarms(alarm_rules, summary_metrics)

    # Write alarm file
    with open(outs.alarms, 'w') as alarms_output_file:
        alarms_output_file.write(tenkit.safe_json.safe_jsonify(alarms))

    # Log alarms to martian
    with open(outs.alarms_summary, 'w') as al_summary_file:

        def wl(s):
            al_summary_file.write(s + "\n")

        wl("10X Genomics - Pipeline Run Details")
        wl("-" * 40)
        wl("Sample ID: %s" % args.sample_id)
        wl("Genome: %s" % tenkit.reference.get_genome(args.reference_path))
        wl("Reference Path: %s" % args.reference_path)
        wl("Targets file: %s" % args.targets)
        if alarms is not None and len(alarms) > 0:
            wl("")
            wl("Sequencing Metric Alarms:")
            wl("-" * 40)
            for alarm in alarms:
                wl("%s [%s] -- %s" %
                   (alarm['level'], alarm['title'], alarm['message']))
        else:
            wl("")
            wl("No alarms raised.")

    summary_output_file = open(outs.summary, 'w')
    summary_output_file.write(
        tenkit.safe_json.safe_jsonify(summary_metrics, pretty=True))
    summary_output_file.close()

    # Generate CS summary metrics CSV
    sv_calls_metric = "num_calls"
    metrics_key_map = copy.deepcopy(CS_METRICS_KEY_MAP)
    metrics_key_map.append([sv_calls_metric, "large_sv_calls"])

    if args.targets is None:
        metrics_key_map.append(
            ["short_del_calledDEL_num_calls", "short_deletion_calls"])
    else:
        metrics_key_map.append(
            ["short_del_total_del_numPosCalls", "short_deletion_calls"])

    generate_summary_cs_csv(metrics_key_map, summary_metrics, outs.summary_cs)
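The duplicate-rate metrics above reduce to simple ratios over a histogram of duplicate-group sizes. A worked example with invented counts:

dd = {1: 90, 2: 10}  # 90 singleton molecules, 10 seen twice
n_dups = sum(v * (k - 1) for (k, v) in dd.items() if k > 1)  # 10 extra copies
n_non_dups = sum(dd.values())  # 100 distinct molecules

mean_dup_rate = float(n_dups + n_non_dups) / n_non_dups  # 1.10
dup_fraction = (mean_dup_rate - 1.0) / mean_dup_rate     # ~0.091
print(mean_dup_rate, dup_fraction)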
Example 13
def main(args, outs):
    normalized_profiles = []
    raw_profiles, mask = coverage_matrix.load_matrix(
        args.raw_singlecell_profiles, args.reference_path)
    print('len(mask)=%d' % len(mask))
    print('len(raw_profiles)=%d' % len(raw_profiles))

    chromosomes = coverage_matrix.list_primary_contigs(
        args.raw_singlecell_profiles, args.reference_path)
    print('chromosomes:')
    print(chromosomes)
    n_chrom = len(chromosomes)
    #
    # Get mappability, GC content:
    bin_parameters = []
    vesna.load_track_parameters(args.tracks, bin_parameters)
    n_cells = raw_profiles[0].shape[0]
    linear = args.linear
    quadratic = args.quadratic
    gc0 = 0.45  # TODO: Replace this with mean of GC in good bins across entire genome
    #
    remove = []
    for chrom_index, chrom_name in enumerate(chromosomes):
        try:
            mappability = get_mappability(bin_parameters, chrom_name,
                                          ordered_chromosomes)
            gc_gc0 = get_gc(bin_parameters, gc0, chrom_name,
                            ordered_chromosomes)
            print('len(mappability)=%d' % len(mappability))
            print('len(gc_gc0)=%d' % len(gc_gc0))
            print('raw_profiles[chrom_index].shape:')
            print(raw_profiles[chrom_index].shape)
            expectation = mappability * (1.0 + linear * gc_gc0 +
                                         quadratic * gc_gc0 * gc_gc0)
            #print('expectation')
            #print(expectation.tolist())
            tmp = np.zeros(raw_profiles[chrom_index].shape, dtype='float')
            for cell in range(n_cells):
                #print('tmp[cell, :] before:')
                #print(tmp[cell, :].tolist())
                tmp[cell, :] = raw_profiles[chrom_index][cell, :] / expectation
                tmp[cell, tmp[cell, :] < 0.0] = 0.0
                #print('tmp[cell, :] after:')
                #print(tmp[cell, :].tolist())
            # for cell
            normalized_profiles.append(tmp)
        except Exception as error:
            martian.alarm(
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            print(
                "stages/copy_number_processor/normalize_gc_bias/__init__ encountered an exception. Error: %s"
                % repr(error))
            print(
                'Removing chrom_name=%s, chrom_index=%d (absent from input raw profiles)'
                % (chrom_name, chrom_index))
            remove.append(chrom_name)
        # try/except
    # for chrom
    for chrom_name in remove:
        if chrom_name in chromosomes:
            chromosomes.remove(chrom_name)
        # if chrom_name
    # for chrom_name
    #
    # Export normalized cell profiles
    bin_size = 20000  # TODO: Fetch this value from input raw_profiles h5 file
    tracks = pd.HDFStore(args.tracks, 'r')
    coverage_matrix.store_matrix(file_name=outs.normalized_singlecell_profiles,
                                 chroms=chromosomes,
                                 profiles=normalized_profiles,
                                 tracks=tracks,
                                 window_size=bin_size)
    tracks.close()
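The per-bin model applied above is expectation = mappability * (1 + linear * (gc - gc0) + quadratic * (gc - gc0)^2), assuming get_gc returns GC content already centered at gc0. A minimal sketch for one cell, with invented arrays and coefficients:

import numpy as np

linear, quadratic, gc0 = -0.5, 2.0, 0.45  # hypothetical fitted values
mappability = np.array([1.0, 0.9, 0.8])
gc = np.array([0.40, 0.45, 0.55])

gc_gc0 = gc - gc0
expectation = mappability * (1.0 + linear * gc_gc0 + quadratic * gc_gc0 ** 2)
raw = np.array([100.0, 90.0, 85.0])
normalized = raw / expectation
normalized[normalized < 0.0] = 0.0  # clamp negatives, as in the loop above
print(normalized)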