コード例 #1
0
ファイル: preflight.py プロジェクト: basedata10/cellranger
def check_sample_def(sample_def):
    """Validate every entry of a sample definition before the pipeline runs.

    Args:
        sample_def: iterable of dict entries. Each entry is read for the keys
            ``read_path`` and ``lanes`` and is then handed to
            ``tk_preflight.check_sample_indices``.

    Raises:
        PreflightException: if a FASTQ folder is unset, relative, missing,
            not accessible or empty, or if a lane is not an integer.
    """
    hostname = socket.gethostname()

    # gem_group settings are checked once, across the whole list.
    check(tk_preflight.check_gem_groups(sample_def))

    # The parenthesised single-argument form prints identically on
    # Python 2 and Python 3.
    print("Checking FASTQ folder...")
    # A distinct loop name keeps the ``sample_def`` parameter from being
    # rebound to one of its own elements.
    for sample_entry in sample_def:
        read_path = sample_entry["read_path"]
        # Reject None/empty up front; otherwise ``startswith`` below would
        # fail with an AttributeError instead of a preflight message.
        if not read_path:
            raise PreflightException(
                "Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            raise PreflightException(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            raise PreflightException(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            raise PreflightException(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            raise PreflightException("Specified FASTQ folder is empty: " +
                                     read_path)

        # ``lanes`` may be None; in that case no per-lane check is made here.
        lanes = sample_entry["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    raise PreflightException(
                        "Lanes must be a comma-separated list of numbers.")

        check(tk_preflight.check_sample_indices(sample_entry))
コード例 #2
0
ファイル: __init__.py プロジェクト: umccr/longranger
def main(args, outs):
    """Preflight checks for the read group, sample id and FASTQ inputs.

    Every failed check ends the stage through ``martian.exit`` with a
    user-facing message. On success the result of
    ``tk_preflight.record_package_versions`` is written to the Martian log.

    Args:
        args: stage arguments; reads ``output_format``, ``read_group``,
            ``sample_id`` and ``sample_def`` (a list of dict entries).
        outs: stage outputs; not used by this function.
    """
    hostname = socket.gethostname()

    if args.output_format == 'bam' and args.read_group is None:
        martian.exit(
            "Please specify a read_group to populate the @RG field of the BAM file"
        )

    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit(
                "Sample name may only contain letters, numbers, underscores, and dashes: "
                + args.sample_id)

    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        # Reject None/empty up front; otherwise ``startswith`` below would
        # fail with an AttributeError instead of a clean exit message.
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, longranger does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        # library_id is optional; only validate it when present.
        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        # ``lanes`` may be None; in that case no per-lane check is made here.
        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Check open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
コード例 #3
0
ファイル: __init__.py プロジェクト: mosquitoCat/cellranger
def main_bcl_processor(sample_id, sample_def, chemistry_arg,
                       custom_chemistry_def):
    """Build the chunk list for one BCL_PROCESSOR-style sample_def entry.

    For each sample index string reported by
    ``tk_preflight.check_sample_indices`` the chemistry is inferred, the
    FASTQ files for every read type in ``cr_constants.FASTQ_READ_TYPES`` are
    looked up, and chunks are constructed when the file lists validate.

    Args:
        sample_id: passed through to ``construct_chunks``.
        sample_def: dict entry; reads ``read_path`` and ``lanes``.
        chemistry_arg: passed to the chemistry inference helper.
        custom_chemistry_def: chemistry used when the inferred name equals
            ``cr_chem.CUSTOM_CHEMISTRY_NAME``.

    Returns:
        List of chunks accumulated over all sample indices.
    """
    sample_index_strings, msg = tk_preflight.check_sample_indices(sample_def)
    if sample_index_strings is None:
        martian.exit(msg)

    fastq_dir = sample_def['read_path']
    lane_filter = sample_def['lanes']

    collected = []
    for sample_index in sample_index_strings:
        # Work out which chemistry applies (read-type => fastq filename mapping).
        try:
            chemistry_name = cr_chem.infer_sc3p_chemistry_bcl_processor(
                chemistry_arg, fastq_dir, sample_index, lane_filter)
        except cr_chem.NoInputFastqsException:
            # Nothing on disk for this sample index; move on to the next one.
            continue

        chemistry = (custom_chemistry_def
                     if chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME
                     else cr_chem.get_chemistry(chemistry_name))

        read_type_map = cr_chem.get_read_type_map(
            chemistry, tk_constants.BCL_PROCESSOR_FASTQ_MODE)

        # Gather the FASTQ files for every destination read type.
        filename_lists = {
            dest_read_type: tk_fasta.find_input_fastq_files_10x_preprocess(
                fastq_dir, read_type_map[dest_read_type], sample_index,
                lane_filter)
            for dest_read_type in cr_constants.FASTQ_READ_TYPES
        }

        fill_in_missing_reads(filename_lists)
        if not validate_fastq_lists(filename_lists):
            continue

        collected.extend(
            construct_chunks(filename_lists,
                             sample_id,
                             sample_def,
                             reads_interleaved=True,
                             chemistry=chemistry))

    return collected
コード例 #4
0
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    The function validates ``args.sample_def``, locates the FASTQ files for
    each entry (by sample index in BCL_PROCESSOR mode, by sample name in
    ILMN_BCL2FASTQ mode), builds one chunk dict per file set, applies the
    requested downsampling rate to each chunk, and writes a JSON summary of
    the downsampling request to ``outs.downsample_info``.

    Args:
        args: stage arguments; reads ``downsample``, ``sample_def``,
            ``input_mode`` and ``sample_id``.
        outs: stage outputs; ``chunks`` and ``read_groups`` are assigned and
            ``downsample_info`` is used as the path of the JSON summary.

    Invalid input ends the stage through ``martian.exit`` (or
    ``martian.throw`` for an unrecognised ``input_mode``).
    """

    def check_key(n, dict_in, name, tys):
        # Exit when entry `n` of sample_def lacks `name`, or holds a value
        # whose exact type is not one of `tys`.
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" % (n, name))

        if not (type(dict_in[name]) in tys):
            martian.exit("Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s" % (n, name, str(tys), type(dict_in[name])))

    global_subsample_rate = 1.0
    downsample_gigabases = False
    downsample_reads = False
    if args.downsample is not None:
        # Exactly one downsampling option must be specified. This is user
        # input, so it is validated explicitly rather than with `assert`,
        # which is stripped when Python runs with -O.
        options_supplied = 0
        for subsample_key in ["gigabases", "subsample_rate", "target_reads"]:
            if args.downsample.get(subsample_key, None) is not None:
                options_supplied += 1
        if options_supplied != 1:
            martian.exit("Exactly one of gigabases, subsample_rate, or target_reads must be specified in the downsample argument.")

        if args.downsample.get('subsample_rate') is not None:
            global_subsample_rate = args.downsample['subsample_rate']
            if not (global_subsample_rate <= 1.0):
                martian.exit("subsample_rate must not be greater than 1.0, got: %s" % str(global_subsample_rate))
        elif args.downsample.get('target_reads') is not None:
            downsample_reads = True
        else:
            downsample_gigabases = True

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all([x is None for x in gem_groups])
    all_int = all([type(x) is int for x in gem_groups])

    if not (all_null or all_int):
        martian.exit("Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer")

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0
    total_seq_reads = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes",  [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        # A per-entry subsample_rate, when present, scales the global rate.
        if 'subsample_rate' in read_chunk:
            subsample_rate = global_subsample_rate * read_chunk['subsample_rate']
        else:
            subsample_rate = global_subsample_rate

        # Optional in-read barcode settings, merged into every chunk below.
        bc_in_read = {}
        if 'bc_in_read' in read_chunk:
            if read_chunk['bc_in_read'] is not None:
                bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
                bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name

        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index, lanes)
                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r, 'read2': None, 'reads_interleaved': True, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            sample_names = read_chunk['sample_names']
            # check_key accepts None for this field, but iterating None below
            # would raise a bare TypeError; exit with a clear message instead.
            if sample_names is None:
                martian.exit("Entry in sample_def for read_path '%s' missing required field: sample_names" % path)

            sample_seq_bases = 0
            sample_seq_reads = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    predicted_seq_reads, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases
                    sample_seq_reads += predicted_seq_reads

            martian.log_info("Input data: Predict %f GB from %s" % (float(sample_seq_bases)/1e9, path))
            total_seq_bases += sample_seq_bases
            total_seq_reads += sample_seq_reads

            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                # TODO confirm that this works with cellranger
                si_read, bc_read = ("I1", "I2")
                if 'barcode_read' in read_chunk and read_chunk['barcode_read'] == 'I1':
                    si_read, bc_read = ("I2", "I1")
                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit("Read1, Read2, and Barcode files are mismatched. Exiting pipline")

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = ':'.join([sample_id, library_id, str(gem_group), flowcell, lane])
                    new_chunk = {
                        'read1': r1, 'read2': r2, 'reads_interleaved': False, 'barcode': b,
                        'sample_index': si, 'barcode_reverse_complement': False, 'gem_group': gem_group,
                        'subsample_rate': subsample_rate, 'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" % (float(total_seq_bases)/1e9))
    martian.log_info("            Predict %d total reads" % total_seq_reads)

    if len(chunks) == 0:
        martian.exit("No input FASTQs were found for the requested parameters.")

    if downsample_gigabases and args.downsample['gigabases'] is not None:
        # Calculate global downsample rate. When no bases are predicted there
        # is nothing to downsample, so keep everything rather than dividing
        # by zero.
        if total_seq_bases > 0:
            global_subsample_rate = min(1.0, float(args.downsample['gigabases'])*1e9 / float(total_seq_bases))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f" \
         % (float(args.downsample['gigabases']), float(total_seq_bases)/1e9, global_subsample_rate))

        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate
    elif downsample_reads:
        # Same zero guard as above, for the predicted read count.
        if total_seq_reads > 0:
            global_subsample_rate = min(1.0, float(args.downsample['target_reads'])/float(total_seq_reads))
        else:
            global_subsample_rate = 1.0
        martian.log_info("Input data downsampling: Requested: %.2f M reads, Estimated Input: %.2f M reads, Downsample Rate: %.3f" \
         % (float(args.downsample['target_reads'])/1e6, float(total_seq_reads)/1e6, global_subsample_rate))

        for chunk in chunks:
            chunk['subsample_rate'] = chunk['subsample_rate'] * global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    # log info about input vs requested GB
    # first, set defaults
    available_gb = float(total_seq_bases)/1e9
    requested_gb = None
    available_reads = total_seq_reads
    requested_reads = None
    requested_rate = None
    post_downsample_gb = None
    downsample_succeeded = True

    if args.downsample is not None and args.downsample.get('gigabases') is not None:
        requested_gb = float(args.downsample['gigabases'])
        post_downsample_gb = min(available_gb, requested_gb)
        if available_gb < requested_gb:
            martian.log_info("Downsample requested more GB than was available; will not downsample.")
            downsample_succeeded = False

    elif args.downsample is not None and args.downsample.get('subsample_rate') is not None:
        requested_rate = float(args.downsample['subsample_rate'])
        post_downsample_gb = available_gb * requested_rate

    elif args.downsample is not None and args.downsample.get('target_reads') is not None:
        # post_downsample_gb is left as None for a target_reads request.
        requested_reads = float(args.downsample['target_reads'])

    downsample_info = {}
    downsample_info['available_gb'] = available_gb
    downsample_info['requested_gb'] = requested_gb
    downsample_info['available_reads'] = available_reads
    downsample_info['requested_reads'] = requested_reads
    downsample_info['requested_rate'] = requested_rate
    downsample_info['post_downsample_gb'] = post_downsample_gb
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
コード例 #5
0
def main(args, outs):
    """Preflight checks for sample inputs, reference, targets and variant-calling options.

    Every failed check ends the stage through ``martian.exit`` with a
    user-facing message. On success the result of
    ``tk_preflight.record_package_versions`` is written to the Martian log.

    Args:
        args: stage arguments; reads ``sample_id``, ``sample_def``,
            ``reference_path``, ``sex``, ``check_executables``, ``targets``,
            ``target_blacklist``, ``restrict_locus``, ``vc_precalled``,
            ``vc_mode``, ``vc_ground_truth``, ``sv_min_qv`` and
            ``sv_ground_truth``.
        outs: stage outputs; not used by this function.
    """
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # FASTQ input
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        # library_id is optional; only validate it when present.
        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        ok, msg = tk_preflight.check_sample_indices(sample_def)
        if not ok:
            martian.exit(msg)

    # Reference
    MAX_CONTIGS = 1000
    ok, msg = tk_preflight.check_refdata(args.reference_path, MAX_CONTIGS)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # Sex (given reference)
    if args.sex is not None:
        if args.sex.lower() not in ["m", "male", "f", "female"]:
            martian.exit("Sex of sample must be 'm', 'male', 'f', or 'female'.")
    else:
        # No sex given: the reference must provide the chromosome lists.
        # Load the male-specific list once and reuse it below.
        male_chrom = tenkit.reference.load_male_chromosomes(args.reference_path)
        if male_chrom is None:
            martian.exit("Must specify sex of sample, or use a reference package that includes a sex_chromosomes.tsv file.\nFor more details, see http://support.10xgenomics.com/genome-exome/software/pipelines/latest/advanced/references")

        ref = tenkit.reference.open_reference(args.reference_path)
        for m in male_chrom:
            if m not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Male-specific chromosome '%s' does not exist in reference" % m)

        auto_chrom = tenkit.reference.load_autosomal_chromosomes(args.reference_path)
        if auto_chrom is None:
            martian.exit("No autosomal chromosome listed in sex_chromosomes.tsv. Please list an autosomal chromosome to use as a reference for sex determination")

        for a in auto_chrom:
            if a not in ref:
                martian.exit("Reference issue in sex_chromosomes.tsv. Autosomal chromosome '%s' does not exist in reference" % a)

    # Open file handles limit - per LONGRANGER-1758, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    # Targets
    if args.targets is not None:
        tk_preflight.check_file("targets", args.targets, hostname)
        tk_preflight.check_bed(args.targets, args.reference_path)

        if args.target_blacklist is None:
            # The parenthesised single-argument form prints identically on
            # Python 2 and Python 3.
            print("\nWARNING: You selected targeted mode but did not provide a --cnvfilter.\nPlease note this may result in a high number of false positive CNV calls.\nFor more details, see http://support.10xgenomics.com/genome-exome/software\n")

    # Target blacklist
    if args.target_blacklist is not None:
        tk_preflight.check_file("cnvfilter", args.target_blacklist, hostname)
        tk_preflight.check_bed(args.target_blacklist, args.reference_path)

    # Restrict locus
    if tenkit.reference.is_tenx(args.reference_path):
        if args.restrict_locus is not None:
            if not re.match(r"^chr[A-Za-z0-9]{1,2}:[0-9]+\.\.[0-9]+$", args.restrict_locus):
                martian.exit("restrict_locus must be of the form 'chrXX:start..end'.")

    # Pre-called
    if args.vc_precalled is not None:
        tk_preflight.check_file("pre-called VCF", args.vc_precalled, hostname)
        check_vcf(args.vc_precalled, args)

    # VC mode
    if not re.match(r"^(disable|freebayes|gatk:/.*\.jar|precalled:/.*\.vcf)$", args.vc_mode):
        martian.exit("vc_mode must be of the form 'freebayes', 'gatk:/path/to/gatk_jar_file.jar', 'disable'.")

    if args.vc_precalled is None and args.vc_mode == "disable":
        martian.exit("Because you have not provided a pre-called VCF file, variant calling cannot be disabled.")

    # Split on the first colon only, so a jar/VCF path that itself contains
    # ':' is kept intact.
    vc_args = args.vc_mode.split(":", 1)
    vc_mode = vc_args[0]
    if vc_mode == "precalled":
        if args.vc_precalled is not None:
            martian.exit("Please specify a pre-called VCF file using only one method.")
        precalled_vars_path = vc_args[1]
        tk_preflight.check_file("pre-called VCF", precalled_vars_path, hostname)
        check_vcf(precalled_vars_path, args)
    elif vc_mode == "gatk":
        jar_path = vc_args[1]
        if not jar_path.startswith('/'):
            martian.exit("Specified GATK jar file must be an absolute path: %s" % jar_path)
        if not os.path.exists(jar_path):
            martian.exit("On machine: %s, specified GATK jar file does not exist: %s" % (hostname, jar_path))
        if os.path.isdir(jar_path):
            martian.exit("Please specify a GATK jar file, not a folder.")
        if args.check_executables:
            check_gatk(jar_path, hostname)

        check_gatk_ref(args.reference_path)

    # VC ground truth
    if args.vc_ground_truth is not None:
        tk_preflight.check_file("VCF ground truth", args.vc_ground_truth, hostname)
        check_vcf(args.vc_ground_truth, args)

    # SV min QV
    if args.sv_min_qv is not None and args.sv_min_qv < 0:
        martian.exit("sv_min_qv must be a positive integer.")

    # SV ground truth
    if args.sv_ground_truth is not None:
        tk_preflight.check_file("SV ground truth", args.sv_ground_truth, hostname)

    martian.log_info(tk_preflight.record_package_versions())
コード例 #6
0
def join(args, outs, chunk_defs, chunk_outs):
    """Preflight checks for sample id, downsampling, FASTQ inputs, trimming and reference.

    Every failed check ends the stage through ``martian.exit`` with a
    user-facing message. On success the result of
    ``tk_preflight.record_package_versions`` is written to the Martian log.

    Args:
        args: stage arguments; reads ``sample_id``, ``force_cells``,
            ``downsample``, ``fastq_mode``, ``sample_def``, ``trim_def``,
            ``factorization``, ``reference_path`` and ``check_executables``.
        outs: stage outputs; not used by this function.
        chunk_defs: not used by this function.
        chunk_outs: not used by this function.
    """
    # Sample ID / pipestance name
    check_sample_id(args.sample_id)

    # force_cells
    check_force_cells(args.force_cells)

    # downsample
    if args.downsample is not None:
        if len(args.downsample) == 0:
            martian.exit("downsample must be a non-empty dictionary.")
        if len(args.downsample) > 1:
            martian.exit("Please supply either subsample_rate or gigabases but not both.")
        # Exactly one key remains; fetch it without relying on keys()
        # returning an indexable list.
        key = next(iter(args.downsample))
        if key not in ['subsample_rate', 'gigabases']:
            martian.exit("Please supply either subsample_rate or gigabases as the downsample argument. '%s' is invalid" % key)
        value = args.downsample[key]
        try:
            # Compare the converted number, not the raw value, so numeric
            # strings such as "-1" are judged by their value. Written as
            # `not (>=)` so that NaN is rejected as well.
            bad_value = not (float(value) >= 1e-12)
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            bad_value = True
        if bad_value:
            martian.exit("Command line argument for downsampling must be a positive number")

    # FASTQ mode
    if args.fastq_mode is not None:
        if args.fastq_mode not in ['ILMN_BCL2FASTQ', 'BCL_PROCESSOR']:
            martian.exit("Unsupported fastq_mode. Options are ILMN_BCL2FASTQ and BCL_PROCESSOR, provided: {}".
                         format(args.fastq_mode))

    # FASTQ input (sample_def)
    hostname = socket.gethostname()
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def.get("read_path")
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs in each entry of 'sample_def' argument")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, cellranger-atac does not have permission to open FASTQ folder: %s" % (
                         hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        # library_id is optional; only validate it when present.
        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, BCL2FASTQ_SEQNAMES["R1"], sample_name, lanes)
                r2 = find_func(read_path, BCL2FASTQ_SEQNAMES["R2"], sample_name, lanes)
                # R1 and R2 files must pair up one-to-one.
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            # Also reached when fastq_mode is None, which the earlier
            # fastq_mode check skips.
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # trim_def['R1'] and ['R2'] must be identical.
    if args.trim_def is not None:
        if len(args.trim_def) == 0:
            martian.exit("trim_def must be a non-empty dictionary.")
        if "R1" not in args.trim_def or "R2" not in args.trim_def:
            martian.exit("trim_def must have R1, R2 fields.")
        if args.trim_def["R1"] != args.trim_def["R2"]:
            martian.exit("trim_def['R1'] and trim_def['R2'] must be identical.")

    # factorization.
    check_factorization(args.factorization)

    # # Reference
    # ref directory structure and timestamps
    ok, msg = check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)

    # usability and format check
    check_reference_format(args.reference_path)

    # Open file handles limit
    if args.check_executables:
        check_filehandle_limit()

    martian.log_info(tk_preflight.record_package_versions())
コード例 #7
0
def main(args, outs):
    """Run the preflight checks for a supernova run.

    Each problem is reported through martian.exit (martian.throw for an
    unrecognized input_mode); both are presumably terminal -- confirm against
    the martian adapter.  The checks run in this order:

      1. a barcode whitelist is configured;
      2. every sample_def entry declares a 16-base barcode on read 1 or 2;
      3. every read_path is an absolute, existing, accessible, non-empty
         folder, and library_id / lanes are well formed;
      4. the open file handle limit (only when args.check_executables is set);
      5. input FASTQs can be located for the configured input_mode and pass
         check_fastqs;
      6. the optional downsample settings are numeric and in range.

    The estimated mean read length is logged and issued as the
    "mean_read_length" preflight alert.

    Args:
        args: stage arguments; this function reads barcode_whitelist,
            sample_def, check_executables, input_mode and downsample.
        outs: stage outputs (not used here).
    """
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    # A barcode whitelist is mandatory.
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    # Every sample_def entry must declare a 16-base barcode located on
    # read 1 or read 2; missing fields fall back to values that fail.
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read",
                                                  3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print("Checking FASTQ folder...")
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit - per SUPERNOVA-152, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    # Compile the list of FASTQ files that the run would consume.
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess

        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

        for (idx, read_chunk) in enumerate(args.sample_def):
            sample_names = read_chunk['sample_names']
            # check_key accepts None for sample_names, but the lookup below
            # has to iterate the names; report the problem instead of
            # failing with a TypeError.
            if sample_names is None:
                martian.exit(
                    "Entry %d in sample_def missing required field: sample_names"
                    % idx)
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            # Both the "R1" and the "R3" files of every sample are collected.
            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    # Nothing found: there is no point in continuing.
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    # Make sure the files are okay before sampling them.
    check_fastqs(fastq_files)

    # Sum the per-file read estimates and average the per-file read lengths.
    total_reads = 0.0
    global_avg = 0.0
    num_files = 0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
        num_files += 1
    global_avg = global_avg / num_files
    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    PreflightAlert = alerts.AlertLogger(stage="preflight")
    PreflightAlert.issue("mean_read_length", global_avg)

    # verify type and range for downsampling parameters
    # Note that non-numerical values for bc_subsample_rate and target_reads in mro trickle down as 'None'
    if args.downsample is not None:
        bc_subsample_rate = args.downsample.get("bc_subsample_rate", None)
        if bc_subsample_rate is not None:
            if not isinstance(bc_subsample_rate, (float, int)):
                martian.exit(
                    "Specified barcode fraction: %s is not a fraction. Please specify a valid float between 0 and 1."
                    % str(bc_subsample_rate))
            if bc_subsample_rate <= 0 or bc_subsample_rate > 1:
                martian.exit(
                    "Specified barcode fraction: %s is not between 0 and 1. Please specify a valid float between 0 and 1."
                    % str(bc_subsample_rate))
            if abs(bc_subsample_rate) < 1e-5:
                martian.exit(
                    "Specified barcode fraction: %s is too close to 0 and thus impractical."
                    % str(bc_subsample_rate))

        target_reads = args.downsample.get("target_reads", None)
        if target_reads is not None:
            if not isinstance(target_reads, (int, float)):
                martian.exit(
                    "Specified maxreads: %s is not a number. Please specify an integer larger than one for maxreads"
                    % str(target_reads))
            if target_reads < 1:
                martian.exit(
                    "Specified maxreads: %s is less than one. Please specify an integer larger than one for maxreads"
                    % str(target_reads))
コード例 #8
0
def main(args, outs):
    """Validate the stage arguments before the pipeline starts.

    Problems are reported through martian.exit (presumably terminal --
    confirm against the martian adapter).  The checks cover, in order: the
    sample id, force_cells, the ploidy bounds in cnv_params, the downsample
    option, every sample_def entry (FASTQ folder, library id, lanes and the
    presence of input FASTQs for the configured fastq_mode), the reference
    (structure, contig definitions, number and length of primary contigs)
    and, when args.check_executables is set, the open file handle limit.
    The package versions are logged at the end.

    Args:
        args: stage arguments; this function reads sample_id, force_cells,
            cnv_params, downsample, sample_def, fastq_mode, reference_path
            and check_executables.
        outs: stage outputs (not used here).
    """
    hostname = socket.gethostname()

    # Sample ID / pipestance name
    if args.sample_id is not None:
        if not re.match(r"^[\w-]+$", args.sample_id):
            martian.exit("Sample name may only contain letters, numbers, underscores, and dashes: " + args.sample_id)

    # Check numerical options
    # types are already checked by mrp so only need to check ranges
    if args.force_cells is not None and (args.force_cells < 1 or
        args.force_cells > 20000):
        martian.exit("MRO parameter force_cells must be a positive integer"
            " <= 20000.")

    # check min_ploidy, max_ploidy
    if args.cnv_params is not None:
        min_ploidy = args.cnv_params.get("min_ploidy", None)
        max_ploidy = args.cnv_params.get("max_ploidy", None)
        if min_ploidy is not None and min_ploidy <= 0:
            martian.exit("Command line argument soft-min-avg-ploidy must be a "
                "positive real number.")
        if max_ploidy is not None and (max_ploidy <= 0 or max_ploidy > 8.0):
            martian.exit("Command line argument soft-max-avg-ploidy must be a "
                "positive real number <= 8.")
        if (min_ploidy is not None and max_ploidy is not None and
            max_ploidy <= min_ploidy):
            martian.exit("Command line arguments must satisfy "
                "soft-min-avg-ploidy < soft-max-avg-ploidy.")

    # check downsample options
    if args.downsample is not None and len(args.downsample) > 0:
        # list() keeps the indexing below valid when keys() returns a view.
        keys = list(args.downsample.keys())
        if len(keys) > 1:
            martian.exit("Please supply either maxreads or downsample but not "
                "both.")
        key = keys[0]
        value = args.downsample[key]
        param_map = {"target_reads" : "maxreads", "gigabases" : "downsample"}
        try:
            # Compare the converted number, not the raw value, so numeric
            # strings are judged by magnitude; "not >=" also rejects NaN.
            bad_value = not float(value) >= 1e-12
        except (TypeError, ValueError):
            # TypeError covers None and other non-numeric objects.
            bad_value = True
        if bad_value:
            # Fall back to the raw key so an unexpected key still produces a
            # readable message instead of a KeyError.
            cs_key = param_map.get(key, key)
            martian.exit("Command line argument %s must be a positive number" %
                cs_key)

    # FASTQ input
    for idx, sample_def in enumerate(args.sample_def):
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit("Specified FASTQ folder must be an absolute path: %s" % read_path)
        if not os.path.exists(read_path):
            martian.exit("On machine: %s, specified FASTQ folder does not exist: %s" % (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit("On machine: %s, longranger does not have permission to open FASTQ folder: %s" % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match(r"^[\w-]+$", library_id):
                martian.exit("Library name may only contain letters, numbers, underscores, and dashes: " + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not tk_preflight.is_int(lane):
                    martian.exit("Lanes must be a comma-separated list of numbers.")

        if args.fastq_mode == "BCL_PROCESSOR":
            sample_indices, msg = tk_preflight.check_sample_indices(sample_def)
            if sample_indices is None:
                martian.exit(msg)

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            reads = []
            for sample_index in sample_indices:
                # process interleaved reads
                reads.extend(find_func(read_path, "RA", sample_index, lanes))
            if len(reads) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        elif args.fastq_mode == "ILMN_BCL2FASTQ":
            sample_names = sample_def.get("sample_names", None)
            if sample_names is None:
                martian.exit("Entry {} in sample_def missing required field: sample_names".format(idx))
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            reads1 = []
            reads2 = []
            for sample_name in sample_names:
                r1 = find_func(read_path, "R1", sample_name, lanes)
                r2 = find_func(read_path, "R2", sample_name, lanes)
                # Every R1 file needs its R2 counterpart.
                if len(r1) != len(r2):
                    martian.exit("Entry {} in sample_defs are missing input FASTQs.".format(idx))
                reads1.extend(r1)
                reads2.extend(r2)
            if len(reads1) == 0 and len(reads2) == 0:
                martian.exit("No input FASTQs were found for the requested parameters.")
        else:
            martian.exit("Unrecognized fastq_mode: {}".format(args.fastq_mode))

    # Reference
    ok, msg = tk_preflight.check_refdata(args.reference_path, max_contigs=None)
    if ok:
        martian.log_info(msg)
    else:
        martian.exit(msg)
    contig_defs_json_path = os.path.join(args.reference_path, "fasta",
        "contig-defs.json")
    faidx_path = os.path.join(args.reference_path, "fasta",
        "genome.fa.fai")
    error_msg = contig_manager.verify_contig_defs(contig_defs_json_path,
        faidx_path)
    if error_msg is not None:
        martian.exit(error_msg)

    try:
        ref = contig_manager.contig_manager(args.reference_path)
    except Exception as e:
        martian.exit("Unexpected error occurred.\n%s"%str(e))

    # too many contigs
    primary = ref.primary_contigs(allow_sex_chromosomes=True)
    num_primary_contigs = len(primary)
    if num_primary_contigs > 100:
        martian.exit("There can be at most 100 primary contigs.")

    # contig length checks: below contig_length_exit is fatal, below
    # contig_length_warn only raises an alarm.
    chrom_length_dict = ref.get_contig_lengths()

    contig_length_exit = 500 * 1000
    contig_length_warn = 10 ** 7
    offending_contigs_warn = []
    offending_contigs_exit = []
    for c in primary:
        clen = chrom_length_dict[c]
        if clen < contig_length_exit:
            offending_contigs_exit.append(c)
        elif clen < contig_length_warn:
            offending_contigs_warn.append(c)
    if len(offending_contigs_exit) > 0:
        martian.exit("Primary contig(s) \"%s\" are shorter than %d bases. "
            "Every primary contig must be at least %d bases "
            "in length."%(",".join(offending_contigs_exit), contig_length_exit,
                          contig_length_exit))
    elif (not args.check_executables) and len(offending_contigs_warn) > 0:
        # The alarm is only raised when check_executables is not set.
        martian.alarm("Primary contig(s) \"%s\" are shorter than %d bases. "
            "Every primary contig is recommended to be at least %d bases "
            "in length."%(",".join(offending_contigs_warn), contig_length_warn,
                          contig_length_warn))

    # Open file handles limit
    if args.check_executables:
        ok, msg = tk_preflight.check_open_fh()
        if not ok:
            martian.exit(msg)

    martian.log_info(tk_preflight.record_package_versions())
コード例 #9
0
ファイル: __init__.py プロジェクト: mosquitoCat/cellranger
def main(args, outs):
    """Preflight validation of the stage arguments.

    Checks the gem groups, every FASTQ folder / lane list / sample index
    set in args.sample_def, the transcriptome and V(D)J references, the
    chemistry configuration, the open file handle limit (only when
    args.check_executables is set) and the mutually exclusive cell-count
    options.  Every failure is reported through martian.exit.

    Args:
        args: stage arguments.
        outs: stage outputs (not used here).
    """
    def _require(outcome):
        # Unpack an (ok, msg) pair and report msg when the check failed.
        passed, message = outcome
        if not passed:
            martian.exit(message)

    machine = socket.gethostname()

    print("Checking sample info...")
    _require(tk_preflight.check_gem_groups(args.sample_def))

    print("Checking FASTQ folder...")
    for entry in args.sample_def:
        fastq_dir = entry["read_path"]

        # Folder sanity: absolute, present, accessible and non-empty.
        if not fastq_dir.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                fastq_dir)
        if not os.path.exists(fastq_dir):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            martian.exit(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            martian.exit("Specified FASTQ folder is empty: " + fastq_dir)

        # A missing lane list (None) means there is nothing to verify.
        lane_list = entry["lanes"]
        for lane_id in ([] if lane_list is None else lane_list):
            if not is_int(lane_id):
                martian.exit(
                    "Lanes must be a comma-separated list of numbers.")

        _require(tk_preflight.check_sample_indices(entry))

    if args.reference_path is None and args.vdj_reference_path is None:
        martian.exit(
            "Must specify either reference_path or vdj_reference_path.")

    print("Checking transcriptome...")
    if args.reference_path is not None:
        _require(cr_preflight.check_refdata(args.reference_path))

    if args.vdj_reference_path is not None:
        _require(vdj_preflight.check_refdata(args.vdj_reference_path))

    print("Checking chemistry...")
    _require(cr_chem.check_chemistry_defs())
    _require(cr_chem.check_chemistry_arg(args.chemistry_name))

    # A custom chemistry additionally needs a valid definition.
    if args.chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME:
        _require(cr_chem.check_chemistry_def(args.custom_chemistry_def))

    # Open file handles limit - per CELLRANGER-824, only check this on the execution machine.
    # We can tell if we're on the execution machine by looking at args.check_executables
    if args.check_executables:
        print("Checking system environment...")
        _require(tk_preflight.check_open_fh())

    print("Checking optional arguments...")
    both_cell_options = (args.recovered_cells is not None
                         and args.force_cells is not None)
    if both_cell_options:
        martian.exit(
            "Cannot specify both --force-cells and --expect-cells (or --cells) in the same run."
        )

    cr_preflight.record_package_versions()
コード例 #10
0
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.
       Demultiplex outputs a series of FASTQ files with filenames of the form:
       read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

       For every sample_def entry the matching FASTQ files are located, their
       sequence volume is estimated with fastq_data_estimate, and one chunk
       dictionary is built per read file (BCL_PROCESSOR, interleaved) or per
       R1/R2 pair (ILMN_BCL2FASTQ).  The chunks are stored in outs.chunks,
       the distinct read group strings in outs.read_groups, and the
       downsampling summary is written as JSON to outs.downsample_info.

       When args.downsample has no subsample_rate but does have gigabases,
       the subsample rate of every chunk is scaled so that the estimated
       total input matches the requested number of gigabases.

       Args:
           args: stage arguments; this function reads downsample,
               sample_def, sample_id and input_mode.
           outs: stage outputs; chunks, read_groups and downsample_info
               are filled in.
    """

    validate_input(args)

    # A subsample_rate that is absent *or* explicitly null means "keep
    # everything"; dict.get() alone would hand back None for a null value
    # and break the multiplication below.
    global_subsample_rate = 1.0
    if args.downsample is not None:
        requested_rate = args.downsample.get('subsample_rate', None)
        if requested_rate is not None:
            global_subsample_rate = requested_rate

    # Predicted input bases
    total_seq_bases = 0

    chunks = []
    read_groups = set()

    for read_chunk in args.sample_def:
        chunk_rate = read_chunk.get('subsample_rate', None)
        if chunk_rate is None:
            chunk_rate = 1.0
        subsample_rate = global_subsample_rate * chunk_rate

        # Barcode location overrides, copied into every chunk of this entry.
        bc_in_read = {}
        if read_chunk.get('bc_in_read', None) is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded', False)
        if unbarcoded:
            martian.log_info('Flagged as unbarcoded: processing as bulk data')

        sample_id = args.sample_id
        library_id = read_chunk.get('library_id', 'MissingLibrary')

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name

        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            # First pass: estimate the amount of sequence in this entry.
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_10x_preprocess
            for sample_index in sample_index_strings:
                read_paths = find_func(path, "RA", sample_index, lanes)
                for read in read_paths:
                    _, predicted_seq_bases = fastq_data_estimate(read)
                    sample_seq_bases += predicted_seq_bases

            martian.log_info("Input data: Predict %f GB from %s" %
                             (sample_seq_bases / 1e9, path))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per interleaved read file.
            for sample_index in sample_index_strings:
                read_paths = find_func(path, "RA", sample_index, lanes)
                # cell barcodes and sample indices are embedded in the index reads
                si_read, bc_read = ("I1", "I2")

                # allow empty sample index case if all reads in lane are same sample
                sis = find_func(path, si_read, sample_index, lanes)
                if sis is None or len(sis) == 0:
                    sis = [None] * len(read_paths)

                barcodes = find_func(path, bc_read, sample_index, lanes)
                if unbarcoded or len(barcodes) == 0:
                    barcodes = [None] * len(read_paths)

                # calculate chunks
                for r, b, si in zip(read_paths, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    if sample_id is not None:
                        rg_string = ':'.join(
                            str(item) for item in
                            [sample_id, library_id, gem_group, flowcell, lane])
                    else:
                        rg_string = 'None:None:None:None:None'
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            r1_read, r2_read, si_read, bc_read = \
                (BCL2FASTQ_SEQNAMES["read1"], BCL2FASTQ_SEQNAMES["read2"],
                 BCL2FASTQ_SEQNAMES["sample_index"], BCL2FASTQ_SEQNAMES["barcode"])
            sample_names = read_chunk["sample_names"]

            # First pass: estimate the amount of sequence in this entry.
            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult
            for sample_name in sample_names:
                for seq_name in (r1_read, r2_read):
                    read_paths = find_func(path, seq_name, sample_name, lanes)
                    for read_fn in read_paths:
                        _, predicted_seq_bases = fastq_data_estimate(read_fn)
                        sample_seq_bases += predicted_seq_bases

            martian.log_info("Input data: Predict %f GB from %s" %
                             (sample_seq_bases / 1e9, path))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per R1/R2 pair.
            for sample_name in sample_names:
                r1_reads = find_func(path, r1_read, sample_name, lanes)
                r2_reads = find_func(path, r2_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                sis = find_func(path, si_read, sample_name, lanes)
                if sis is None or len(sis) == 0:
                    sis = [None] * len(r1_reads)

                barcodes = find_func(path, bc_read, sample_name, lanes)
                if unbarcoded or len(barcodes) == 0:
                    martian.log_info(
                        'No barcodes available: ignoring sc processing')
                    barcodes = [None] * len(r1_reads)

                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit(
                        "Read1, Read2, and Barcode files are mismatched. Exiting pipeline"
                    )

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    if sample_id is not None:
                        rg_string = ':'.join(
                            str(item) for item in
                            [sample_id, library_id, gem_group, flowcell, lane])
                    else:
                        rg_string = 'None:None:None:None:None'
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" %
                     (total_seq_bases / 1e9))

    if len(chunks) == 0:
        martian.exit(
            "No input FASTQs were found for the requested parameters.")

    # Downsampling by gigabases only applies when no explicit rate was given.
    if args.downsample is not None and args.downsample.get('subsample_rate', None) is None \
            and args.downsample.get('gigabases', None) is not None:
        if total_seq_bases > 0:
            global_subsample_rate = min(
                1.0, args.downsample['gigabases'] * 1e9 / total_seq_bases)
        else:
            # No predicted input: nothing to downsample, and dividing by
            # zero would abort the stage.
            global_subsample_rate = 1.0
        martian.log_info(
            "Input data downsampling: Requested: %.2f GB, Estimated Input: %.2f GB, Downsample Rate: %.3f"
            % (args.downsample['gigabases'], total_seq_bases / 1e9,
               global_subsample_rate))
        for chunk in chunks:
            chunk['subsample_rate'] *= global_subsample_rate

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    downsample_info = get_downsample_info(args.downsample, total_seq_bases)
    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)
コード例 #11
0
def check_sample_def(sample_def, feature_ref=None, pipeline=None):
    """Validate every entry of a sample_def list.

    Verifies the gem groups, rejects duplicated entries, checks each FASTQ
    folder, lane list and sample index set, and finally checks the
    library_type of each entry against the rules of the given pipeline.

    Args:
        sample_def: list of sample definition dictionaries.
        feature_ref: optional feature reference; when given, a non gene
            expression library_type must match one of its feature_defs.
        pipeline: pipeline identifier compared against
            cr_constants.PIPELINE_COUNT and cr_constants.PIPELINE_VDJ.

    Raises:
        PreflightException: on the first validation failure.
    """
    machine = socket.gethostname()

    check(tk_preflight.check_gem_groups(sample_def))

    # Sorting the identifying tuples turns duplicates into neighbours.
    fingerprints = [(entry.get("read_path"), entry.get("sample_names"),
                     entry.get("sample_indices"), entry.get("lanes"))
                    for entry in sample_def]
    fingerprints.sort()

    for current, following in zip(fingerprints, fingerprints[1:]):
        if current == following:
            raise PreflightException(
                "Duplicated entry in the input FASTQ data. Please use a unique combination of fastq path and sample name."
                + "\nPath: %s" % current[0]
                + "\nNote in demux mode, a unique combination fastq path, sample indices, and lanes is required."
            )

    print("Checking FASTQ folder...")
    for entry in sample_def:
        fastq_dir = entry["read_path"]

        # Folder sanity: given, absolute, present, accessible, non-empty.
        if not fastq_dir.strip():
            raise PreflightException(
                "Empty fastq path specifed. Please specify an absolute path.")
        if not fastq_dir.startswith('/'):
            raise PreflightException(
                "Specified FASTQ folder must be an absolute path: %s" %
                fastq_dir)
        if not os.path.exists(fastq_dir):
            raise PreflightException(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (machine, fastq_dir))
        if not os.access(fastq_dir, os.X_OK):
            raise PreflightException(
                "On machine: %s, cellranger does not have permission to open FASTQ folder: %s"
                % (machine, fastq_dir))
        if not os.listdir(fastq_dir):
            raise PreflightException("Specified FASTQ folder is empty: " +
                                     fastq_dir)

        # A missing lane list (None) means there is nothing to verify.
        lane_list = entry["lanes"]
        for lane_id in ([] if lane_list is None else lane_list):
            if not is_int(lane_id):
                raise PreflightException(
                    "Lanes must be a comma-separated list of numbers.")

        check(tk_preflight.check_sample_indices(entry))

        if pipeline == cr_constants.PIPELINE_COUNT:
            allowed = ", ".join(["'%s'" % name for name in ALLOWED_LIBRARY_TYPES])
            lib_type = entry.get("library_type", None)

            # An empty string is rejected with its own message.
            if lib_type == '':
                raise PreflightException(
                    ("library_type field may not be an empty string."
                     "\nThe 'library_type' field in the libraries csv"
                     " must be one of %s, or start with '%s'") %
                    (allowed, cellranger.rna.library.CUSTOM_LIBRARY_TYPE_PREFIX))

            # Anything else must be absent, an allowed type, or carry the
            # custom prefix.
            if lib_type is not None and \
                    lib_type not in ALLOWED_LIBRARY_TYPES and \
                    not lib_type.startswith(cellranger.rna.library.CUSTOM_LIBRARY_TYPE_PREFIX):
                raise PreflightException(
                    ("Unknown library_type: '%s'."
                     "\nThe 'library_type' field in the libraries csv"
                     " must be one of %s, or start with '%s'") %
                    (lib_type, allowed, cellranger.rna.library.CUSTOM_LIBRARY_TYPE_PREFIX))

            # Non gene-expression libraries need a matching feature type in
            # the feature reference, when one was supplied.
            needs_feature_match = (
                feature_ref is not None and
                lib_type is not None and
                lib_type != cr_constants.GENE_EXPRESSION_LIBRARY_TYPE)
            if needs_feature_match:
                declared = any(feature.feature_type == lib_type
                               for feature in feature_ref.feature_defs)
                if not declared:
                    raise PreflightException(
                        "You declared a library with library_type = '%s', but there are no features declared with that feature_type in the feature reference." % lib_type
                        + "\nCheck that the 'library_type' field in the libraries csv matches at least 1 entry in the 'feature_type' field in the feature reference csv"
                    )

        elif pipeline == cr_constants.PIPELINE_VDJ:
            # library type can be missing, or VDJ
            lib_type = entry.get("library_type", None)
            if lib_type is not None:
                if lib_type != lib_constants.VDJ_LIBRARY_TYPE:
                    raise PreflightException(
                        "You declared a library with library_type = '%s'. For the vdj pipeline, the library_type field in sample_def must be missing or '%s'" % (
                            lib_type, lib_constants.VDJ_LIBRARY_TYPE))
コード例 #12
0
ファイル: __init__.py プロジェクト: umccr/longranger
def main(args, outs):
    """Combine reads from multiple input FASTQ files, and potentially trim.

    Demultiplex outputs a series of FASTQ files with filenames of the form:
    read-[RA|I1|I2]_si-AGTAACGT_lane-001_chunk_001.fastq[.gz].

    The stage performs four steps:

    1. Validate every entry of ``args.sample_def`` (required keys, value
       types, consistent ``gem_group`` settings).  When every ``gem_group``
       is null the entries are updated in place to ``gem_group = 1``.
    2. Locate the FASTQ files of each entry with the finder matching
       ``args.input_mode`` and accumulate the predicted number of bases
       reported by ``fastq_data_estimate``.
    3. Build one chunk dict per FASTQ file (or R1/R2 pair) together with the
       set of read-group strings.
    4. Derive the read subsampling rate from ``args.downsample`` and fold it
       into the per-chunk ``subsample_rate``.

    Args:
        args: stage arguments.  Fields read here: ``sample_def``,
            ``input_mode``, ``sample_id``, ``downsample``,
            ``downsample_overage`` and ``barcode_whitelist``.
        outs: stage outputs.  Fields written here: ``requested_read_pairs``,
            ``chunks``, ``read_groups`` and ``barcode_whitelist_path``.
            ``outs.downsample_info`` is used as the path of the JSON summary
            file that is written.
    """
    def check_key(n, dict_in, name, tys):
        """Exit unless entry `n` of sample_def has `name` with a type in `tys`."""
        if name not in dict_in:
            martian.exit("Entry %d in sample_def missing required field: %s" %
                         (n, name))

        # Exact type match on purpose (no isinstance): subclasses are rejected.
        if not (type(dict_in[name]) in tys):
            martian.exit(
                "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
                % (n, name, str(tys), type(dict_in[name])))

    # Check for self-consistent gem_group settings in the sample_def entries
    gem_groups = [x['gem_group'] for x in args.sample_def]
    all_null = all(x is None for x in gem_groups)
    all_int = all(type(x) is int for x in gem_groups)

    if not (all_null or all_int):
        martian.exit(
            "Inconsistent gem_group tags. Please specify all gem_group tags as null, or all gem_group tags with an integer"
        )

    # If all gem_groups are set to null, then set them all to 1
    if all_null:
        for sample_item in args.sample_def:
            sample_item['gem_group'] = 1

    # Predicted input bases
    total_seq_bases = 0

    # verify input mode upfront
    if args.input_mode not in ["BCL_PROCESSOR", "ILMN_BCL2FASTQ"]:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    for (idx, sample_item) in enumerate(args.sample_def):
        # validate fields
        check_key(idx, sample_item, "read_path", [str, unicode])
        check_key(idx, sample_item, "lanes", [list, type(None)])
        check_key(idx, sample_item, "gem_group", [int, type(None)])
        if args.input_mode == "BCL_PROCESSOR":
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
        elif args.input_mode == "ILMN_BCL2FASTQ":
            check_key(idx, sample_item, "sample_names", [list, type(None)])

    interleaved_read_type = "RA"

    chunks = []
    read_groups = set()

    # Fallback R1/R2 read lengths for ILMN_BCL2FASTQ mode; they should be
    # overwritten by fastq_data_estimate below.  Without these defaults a
    # sample_def whose R1 or R2 files cannot be found raised NameError when
    # bp_per_read_pair was computed, instead of reaching the
    # "No input FASTQs were found" exit further down.  The value mirrors the
    # default used in the BCL_PROCESSOR branch.  They are bound once, outside
    # the loop, so a later entry keeps the last measured lengths exactly as
    # before.
    read_length1 = read_length2 = 100

    for read_chunk in args.sample_def:
        # Each sample_def entry can have a separate pre-applied downsampling rate
        # We adjust the estimated data in that chunk to account for this
        # subsampling
        chunk_subsample_rate = read_chunk.get('subsample_rate', 1.0)

        # Optional in-read barcode description, copied onto every chunk
        bc_in_read = {}
        if read_chunk.get('bc_in_read') is not None:
            bc_in_read['bc_in_read'] = read_chunk['bc_in_read']
            bc_in_read['bc_length'] = read_chunk['bc_length']

        path = read_chunk['read_path']
        lanes = read_chunk['lanes']
        gem_group = read_chunk['gem_group']
        unbarcoded = read_chunk.get('unbarcoded')
        sample_id = args.sample_id
        library_id = read_chunk.get('library', 'MissingLibrary')

        # Which index read carries the sample index and which the barcode.
        # This depends only on the sample_def entry, so decide it once here.
        # TODO confirm that this works with cellranger
        if read_chunk.get('barcode_read') == 'I1':
            si_read, bc_read = ("I2", "I1")
        else:
            si_read, bc_read = ("I1", "I2")

        # split on BCL_PROCESSOR / ILMN_BCL2FASTQ
        # the main difference is that BCL_PROCESSOR uses interleaved reads and labels FASTQs by sample index;
        # whereas ILMN_BCL2FASTQ uses R1/R2 and labels by sample name

        if args.input_mode == "BCL_PROCESSOR":
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            sample_seq_bases = 0
            read_length = 100  # Should be overwritten below

            find_func = tk_fasta.find_input_fastq_files_10x_preprocess

            # First pass: estimate the amount of data in this entry
            for sample_index in sample_index_strings:
                # process interleaved reads
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = 2 * read_length

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per interleaved FASTQ file
            for sample_index in sample_index_strings:
                reads = find_func(path, interleaved_read_type, sample_index,
                                  lanes)
                sis = find_func(path, si_read, sample_index, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(reads)

                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_index, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(reads)
                else:
                    barcodes = [None] * len(reads)

                # calculate chunks
                # NOTE(review): zip() silently truncates if the three lists
                # differ in length; only the ILMN_BCL2FASTQ branch checks.
                for r, b, si in zip(reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r,
                        'read2': None,
                        'reads_interleaved': True,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

        elif args.input_mode == "ILMN_BCL2FASTQ":
            # NOTE(review): check_key above accepts sample_names = None, but
            # the loops below iterate it directly -- confirm None cannot
            # actually reach this stage.
            sample_names = read_chunk['sample_names']

            sample_seq_bases = 0
            find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

            # First pass: estimate the amount of data in this entry
            for sample_name in sample_names:
                # process read 1
                reads = find_func(path, "R1", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length1 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases
                # process read 2
                reads = find_func(path, "R2", sample_name, lanes)
                for read in reads:
                    _, predicted_seq_bases, read_length2 = fastq_data_estimate(
                        read)
                    sample_seq_bases += predicted_seq_bases

            sample_seq_bases = chunk_subsample_rate * sample_seq_bases
            bp_per_read_pair = read_length1 + read_length2

            martian.log_info(
                "Input data: Predict %f GB from %s. (%d bp per read pair)" %
                (float(sample_seq_bases) / 1e9, path, bp_per_read_pair))
            total_seq_bases += sample_seq_bases

            # Second pass: build one chunk per R1/R2 file pair
            for sample_name in sample_names:
                r1_reads = find_func(path, "R1", sample_name, lanes)
                r2_reads = find_func(path, "R2", sample_name, lanes)

                sis = find_func(path, si_read, sample_name, lanes)

                # allow empty sample index case if all reads in lane are same sample
                if sis is None or sis == []:
                    sis = [None] * len(r1_reads)

                # in Chromium chemistry... there shouldn't be separate barcode reads...
                if not unbarcoded:
                    barcodes = find_func(path, bc_read, sample_name, lanes)
                    if len(barcodes) == 0:
                        barcodes = [None] * len(r1_reads)
                else:
                    barcodes = [None] * len(r1_reads)

                # again, with Chromium, the barcodes should be an array of Nones, but
                # just in case...
                if not (len(r1_reads) == len(r2_reads) == len(barcodes)):
                    martian.log_info("Read 1 files: %s" % str(r1_reads))
                    martian.log_info("Read 2 files: %s" % str(r2_reads))
                    martian.log_info("Barcode files: %s" % str(barcodes))
                    martian.exit(
                        "Read1, Read2, and Barcode files are mismatched. Exiting pipline"
                    )

                # calculate chunks
                for r1, r2, b, si in zip(r1_reads, r2_reads, barcodes, sis):
                    (flowcell, lane) = get_run_data(r1)
                    rg_string = tk_bam.pack_rg_string(sample_id, library_id,
                                                      gem_group, flowcell,
                                                      lane)
                    new_chunk = {
                        'read1': r1,
                        'read2': r2,
                        'reads_interleaved': False,
                        'barcode': b,
                        'sample_index': si,
                        'barcode_reverse_complement': False,
                        'gem_group': gem_group,
                        'subsample_rate': chunk_subsample_rate,
                        'read_group': rg_string
                    }
                    new_chunk.update(bc_in_read)
                    chunks.append(new_chunk)
                    read_groups.add(rg_string)

    martian.log_info("Input data: Predict %f total GB" %
                     (float(total_seq_bases) / 1e9))

    if len(chunks) == 0:
        martian.exit(
            "No input FASTQs were found for the requested parameters.")

    #
    # Downsampling setup
    #

    # The total available input raw gigabases of input data (est_gb) is derived
    # from the estimate above; bp_per_read_pair holds the value computed for
    # the last sample_def entry.
    est_gb = float(total_seq_bases) / 1e9

    downsample = args.downsample if args.downsample is not None else {}

    # Possible BC subsampling -- try to get the requested amount of data _after_ bc subsampling
    est_gb_post_bc = est_gb * downsample.get("bc_subsample_rate", 1.0)

    # Aim high to ensure that we won't be left with too few reads
    # if the rest of pipeline can trim this down for us.
    fudge_factor = args.downsample_overage

    downsample_succeeded = True

    if "gigabases" in downsample:
        read_sample_rate = min(
            1.0, fudge_factor * downsample['gigabases'] / est_gb_post_bc)
        requested_read_pairs = int(1e9 * downsample['gigabases'] /
                                   bp_per_read_pair)
        # NOTE(review): this is True when MORE data is requested than is
        # estimated to be available, which looks inverted for a flag named
        # "succeeded" -- confirm against the consumers of downsample_info
        # before changing it.
        downsample_succeeded = downsample['gigabases'] > est_gb_post_bc

    elif "target_reads" in downsample:
        # target_reads counts individual reads; two reads make one pair
        requested_read_pairs = int(downsample['target_reads'] / 2)
        est_read_pair_post_bc = 1e9 * est_gb_post_bc / bp_per_read_pair
        read_sample_rate = min(
            1.0, fudge_factor * requested_read_pairs / est_read_pair_post_bc)
        # NOTE(review): same comparison direction as the gigabases case above.
        downsample_succeeded = requested_read_pairs > est_read_pair_post_bc

    elif "subsample_rate" in downsample:
        read_sample_rate = min(
            1.0, downsample["subsample_rate"] /
            downsample.get("bc_subsample_rate", 1.0))
        requested_read_pairs = None

    else:
        read_sample_rate = 1.0
        requested_read_pairs = None

    martian.log_info("Downsampling request: %s" % str(downsample))
    martian.log_info("Base pairs per read pair: %s" % bp_per_read_pair)
    martian.log_info(
        "Estimated Input: %.2f GB, Initial Downsample Rate: %.3f. Requested total reads: %s"
        % (est_gb, read_sample_rate, str(requested_read_pairs)))

    # Fold the global rate into the per-chunk subsample rates.  Every branch
    # above binds read_sample_rate to a number, so no None check is needed.
    for chunk in chunks:
        chunk['subsample_rate'] = chunk.get('subsample_rate',
                                            1.0) * read_sample_rate
        if "bc_subsample_rate" in downsample:
            chunk["bc_subsample_rate"] = downsample["bc_subsample_rate"]

    outs.requested_read_pairs = requested_read_pairs

    martian.log_info("Input reads: %s" % str(chunks))
    outs.chunks = chunks
    outs.read_groups = list(read_groups)

    if requested_read_pairs is not None:
        post_downsample_gb = float(requested_read_pairs *
                                   bp_per_read_pair) / 1e9
    else:
        post_downsample_gb = None

    downsample_info = {}
    downsample_info['available_gb'] = est_gb
    downsample_info['requested_gb'] = downsample.get('gigabases', None)
    downsample_info['requested_rate'] = read_sample_rate
    downsample_info['post_downsample_gb'] = post_downsample_gb
    downsample_info['downsample_succeeded'] = downsample_succeeded

    with open(outs.downsample_info, 'w') as downsample_out:
        tenkit.safe_json.dump_numpy(downsample_info, downsample_out)

    check_fastqs(outs.chunks)

    # Give out full path to BC whitelist
    if args.barcode_whitelist:
        outs.barcode_whitelist_path = BARCODE_LOCATION + "/" + args.barcode_whitelist + ".txt"
    else:
        outs.barcode_whitelist_path = None
コード例 #13
0
def main(args, outs):
    hostname = socket.gethostname()
    tk_preflight.record_package_versions()

    ## no barcode whitelist
    if args.barcode_whitelist is None:
        martian.exit("No barcode whitelist specified.")

    ## there must be a barcode in each sample
    ## and it should be 16 bases long
    ## and it should be on read 1 or read 2
    for sd in args.sample_def:
        if sd.get("bc_length", 0) != 16 or sd.get("bc_in_read",
                                                  3) not in [1, 2]:
            martian.exit("Barcode must be 16 bases and on read1 or read2.")

    print "Checking FASTQ folder..."
    for sample_def in args.sample_def:
        read_path = sample_def["read_path"]
        if not read_path:
            martian.exit("Must specify a read_path containing FASTQs.")
        if not read_path.startswith('/'):
            martian.exit(
                "Specified FASTQ folder must be an absolute path: %s" %
                read_path)
        if not os.path.exists(read_path):
            martian.exit(
                "On machine: %s, specified FASTQ folder does not exist: %s" %
                (hostname, read_path))
        if not os.access(read_path, os.X_OK):
            martian.exit(
                "On machine: %s, supernova does not have permission to open FASTQ folder: %s"
                % (hostname, read_path))
        if not os.listdir(read_path):
            martian.exit("Specified FASTQ folder is empty: " + read_path)

        library_id = sample_def.get("library_id")
        if library_id is not None:
            if not re.match("^[\w-]+$", library_id):
                martian.exit(
                    "Library name may only contain letters, numbers, underscores, and dashes: "
                    + library_id)

        lanes = sample_def["lanes"]
        if lanes is not None:
            for lane in lanes:
                if not is_int(lane):
                    martian.exit(
                        "Lanes must be a comma-separated list of numbers.")

    # Open file handles limit
    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)

    ## compile a list of fastq files
    fastq_files = []
    if args.input_mode == "BCL_PROCESSOR":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "sample_indices", [list, type(None)])
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])

        main_read_type = "RA"
        find_func = tk_fasta.find_input_fastq_files_10x_preprocess

        for read_chunk in args.sample_def:
            sample_index_strings, msg = tk_preflight.check_sample_indices(
                read_chunk)
            if sample_index_strings is None:
                martian.exit(msg)

            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_index in sample_index_strings:
                reads = find_func(path, main_read_type, sample_index, lanes)
                fastq_files.extend(reads)
    elif args.input_mode == "ILMN_BCL2FASTQ":
        # Validate the sample_def fields are correct
        for (idx, sample_item) in enumerate(args.sample_def):
            # validate
            check_key(idx, sample_item, "read_path", [str, unicode])
            check_key(idx, sample_item, "lanes", [list, type(None)])
            check_key(idx, sample_item, "sample_names", [list, type(None)])

        find_func = tk_fasta.find_input_fastq_files_bcl2fastq_demult

        for read_chunk in args.sample_def:
            sample_names = read_chunk['sample_names']
            path = read_chunk['read_path']
            lanes = read_chunk['lanes']

            for sample_name in sample_names:
                reads = find_func(path, "R1", sample_name, lanes)
                fastq_files.extend(reads)
                reads = find_func(path, "R3", sample_name, lanes)
                fastq_files.extend(reads)
    else:
        martian.throw("Unrecognized input_mode: %s" % args.input_mode)

    ## if we found nothing then break
    if len(fastq_files) == 0:
        martian.exit(
            "No input FASTQs were found with the requested lanes and sample indices."
        )

    ## make sure they are okay first
    check_fastqs(fastq_files)

    total_reads = 0.0
    global_avg = 0.0
    num_files = 0
    for fn in fastq_files:
        reads_fn, avg_read_len_fn = estimate_read_count_and_length(
            fn, num_reads=1000)
        total_reads += reads_fn
        global_avg += avg_read_len_fn
        num_files += 1
    global_avg = global_avg / num_files
    martian.log_info(
        "Estimated read length = %.1f, Estimated total read input = %.1f" %
        (global_avg, total_reads))

    exit_msg = "We observe many reads shorter than 125 bases. The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly, and the algorithm has not been tested on short reads. Because reads are too short, execution will be terminated."
    warn_msg = "We observe many reads shorter than 150 bases.The ideal read length for Supernova is 150 bases. Reads shorter than the ideal length are likely to yield a lower quality assembly."
    if global_avg < 125:
        martian.exit(exit_msg)
    elif global_avg < 149:
        martian.alarm(warn_msg)