def check_reference_format(reference_path):
    """Check file formats for files present in the reference"""
    try:
        contig_manager = ReferenceManager(reference_path)
    except Exception as e:
        martian.exit("Contig manager could not be initialized, Error:\n%s" % str(e))

    # formatting
    error_msg = contig_manager.verify_contig_defs()
    if error_msg is not None:
        martian.exit(error_msg)

    # file check -- accessing the genes property touches the underlying annotation
    # file, so a missing or unreadable file surfaces here
    contig_manager.genes

    # check if motif file is in right format (naming convention)
    if len(contig_manager.list_species()) == 1:
        motif_format_checker(contig_manager.motifs)

    # checks for valid bed file formats in regions/
    faidx_file = os.path.join(reference_path, 'fasta', 'genome.fa.fai')

    bed_format_checker(contig_manager.tss_track, faidx_file)
    bed_format_checker(contig_manager.transcripts_track, faidx_file)
    bed_format_checker(contig_manager.ctcf_track, faidx_file)
    bed_format_checker(contig_manager.blacklist_track, faidx_file)
    bed_format_checker(contig_manager.dnase_track, faidx_file)
    bed_format_checker(contig_manager.enhancer_track, faidx_file)
    bed_format_checker(contig_manager.promoter_track, faidx_file)
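Inside check_reference_format, the seven bed_format_checker calls could equivalently be written as a loop; a minimal sketch using only the names already shown above:

    # sketch: same checks as above, expressed as a loop over the BED tracks
    for track in (contig_manager.tss_track,
                  contig_manager.transcripts_track,
                  contig_manager.ctcf_track,
                  contig_manager.blacklist_track,
                  contig_manager.dnase_track,
                  contig_manager.enhancer_track,
                  contig_manager.promoter_track):
        bed_format_checker(track, faidx_file)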
Example n. 2
def split(args):
    # validate
    if args.kit_type not in ["5'", "3'"]:
        martian.exit("Kit type is not one of 5' or 3'.")
    # group by gene
    tx_dict = get_gene_pred_dict(args.transcripts)
    tx_by_name2 = defaultdict(list)
    valid_chroms = set(args.valid_chroms)
    for tx in tx_dict.itervalues():
        if tx.chromosome in valid_chroms:
            tx_by_name2[tx.name2].append(tx)

    singletons = [
        x[0] for x in tx_by_name2.itervalues() if len(x) == 1
        and args.lower_size_cutoff <= len(x[0]) <= args.upper_size_cutoff
    ]

    avg = np.mean([len(x) for x in singletons])
    med = np.median([len(x) for x in singletons])
    martian.log_info(
        '{} singleton genes under consideration (avg size = {}, median size = {})'
        .format(len(singletons), avg, med))

    def tx_to_str(singletons):
        for x in singletons:
            yield x.get_gene_pred()

    chunks = [{
        'tx_subset': list(x),
        '__mem_gb': 4
    } for x in grouper(tx_to_str(singletons), 20)]
    return {'chunks': chunks, 'join': {'__mem_gb': 32}}
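grouper is not defined in this snippet; a minimal sketch of one possible implementation, following the standard itertools recipe with the None padding stripped (the pipeline's own helper may differ):

from itertools import izip_longest  # Python 2; itertools.zip_longest in Python 3

def grouper(iterable, n):
    # yield tuples of up to n items; the last group is not padded
    args = [iter(iterable)] * n
    for group in izip_longest(*args):
        yield tuple(item for item in group if item is not None)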
Example n. 3
def split(args):
    chunk_dict = defaultdict(list)

    # TENKIT-88 If we don't want to run qc, just have a blank set of chunks
    if not args.run_qc:
        return {'chunks': []}

    # find projects inside run dir
    dirnames = next(os.walk(args.fastq_path))[1]

    # this should be fine even if --reports-dir and --stats-dir are elsewhere
    # theoretically, you could name your sample 'Reports' or 'Stats' and
    # move the --reports-dir and --stats-dir somewhere else but at that
    # point you're just being daft
    project_dirs = [dn for dn in dirnames if dn not in ('Reports', 'Stats')]
    if not project_dirs:
        # add root folder
        project_dirs = ['.']

    for project in sorted(project_dirs):
        project_path = os.path.join(args.fastq_path, project)
        # split by detected sample (all types)
        r1_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'R1', None, None)
        r2_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'R2', None, None)
        r3_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'R3', None, None)
        r4_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'R4', None, None)
        i1_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'I1', None, None)
        i2_files = tk_fasta.find_input_fastq_files_bcl2fastq_demult(
            project_path, 'I2', None, None)

        # group files together by like sample
        all_files = r1_files + r2_files + r3_files + r4_files + i1_files + i2_files

        for path in sorted(all_files):
            file_spec = tk_fasta.IlmnFastqFile(path)

            # do not add chunk if Undetermined
            if file_spec.prefix != 'Undetermined':
                chunk_dict[(project, file_spec.lane, file_spec.prefix,
                            file_spec.s)].append(path)

    # if all reads were undetermined or bcl2fastq didn't generate anything
    if len(chunk_dict) == 0:
        martian.exit(
            "No FASTQs matched your sample sheet's lane, sample, and indexes. Please recheck your sample sheet."
        )

    chunk_defs = [{
        'input_files': file_list,
        'project': tup[0],
        'lane': tup[1],
        'sample': tup[2],
        'subfolder': tup[3]
    } for tup, file_list in sorted(chunk_dict.items())]
    return {'chunks': chunk_defs}
Example n. 4
def main(args, outs):
    try:
        run_preflight_checks(args)
    except cr_preflight.PreflightException as e:
        martian.exit(e.msg)

    cr_preflight.record_package_versions()
def check_factorization(factorization):
    """checks if factorization is valid"""
    if factorization is not None:
        if len(factorization) == 0:
            martian.exit("factorization must be a non-empty list.")
        if not all(elem in ALLOWED_FACTORIZATIONS for elem in factorization):
            martian.exit("Unsupported factorization provided. Options are {}.".
                         format(", ".join(ALLOWED_FACTORIZATIONS)))
Example n. 6
def check_key(n, dict_in, name, tys):
    if name not in dict_in:
        martian.exit("Entry %d in sample_def missing required field: %s" %
                     (n, name))
    if type(dict_in[name]) not in tys:
        martian.exit(
            "Entry %d in sample_def for '%s' has incorrect type -- expecting %s, got %s"
            % (n, name, str(tys), type(dict_in[name])))
def check_spec(spec):
    # rule: if spec doesn't contain csv, it must contain sample/indices, with optional lanes and project.
    if not spec.get('csv'):
        # allow None for lanes, downstream will default to all
        if not spec.get('sample') or not spec.get('indices'):
            martian.exit(
                "Samplesheet spec without CSV must include lanes, sample and indices keys."
            )
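Illustrative spec dictionaries (key values are hypothetical):

check_spec({'csv': 'samplesheet.csv'})                                # ok: CSV provided
check_spec({'sample': 'S1', 'indices': ['SI-GA-A1'], 'lanes': None})  # ok: lanes may be None
check_spec({'sample': 'S1'})                                          # exits: indices missing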
Example n. 8
def check_chunk_chemistries(chunks):
    """ Ensure all samples were generated with the same chemistry. """
    unique_chemistries = set([chunk['chemistry']['name'] for chunk in chunks])
    descriptions = map(cr_chem.get_chemistry_description_from_name,
                       list(unique_chemistries))
    if len(unique_chemistries) > 1:
        martian.exit(
            "Found multiple chemistries: %s. Combined analysis of libraries generated with different chemistries is not supported."
            % ', '.join(descriptions))
def check_force_cells(force_cells, ulimit=20000):
    """check if force cells is correctly specified"""
    if force_cells is not None:
        if len(force_cells) == 0:
            martian.exit("force_cells must be a non-empty dictionary.")
        for force_cells_k in force_cells.keys():
            if force_cells[force_cells_k] < 1 or force_cells[force_cells_k] > ulimit:
                martian.exit("MRO parameter force-cells[{}] must be a positive integer <= {}: {}".
                             format(force_cells_k, ulimit, force_cells[force_cells_k]))
Example n. 10
def check_gatk_ref(reference_path):

    if not os.path.exists(os.path.join(reference_path, "fasta", "genome.dict")):
        msg = "GATK requires that you create a .dict reference index file.\n"
        msg += "Note: to use this reference with GATK, also run this Picard command:\n"
        msg += "java -jar /path/to/picard.jar CreateSequenceDictionary R=%s O=%s\n" % (os.path.join(reference_path, "fasta", "genome.fa"), os.path.join(reference_path, "fasta", "genome.dict"))
        msg += "If you have GATK4, use this command:\n"
        msg += "java -jar /path/to/gatk4.jar CreateSequenceDictionary -R=%s" % os.path.join(reference_path, "fasta", "genome.fa")
        martian.exit(msg)
def check_vmem_for_reference(ref):
    """Finds out vmem required to load a genome reference fully"""
    refpath = os.path.join(ref, "fasta", "genome.fa")
    if not os.path.exists(refpath):
        hostname = socket.gethostname()
        martian.exit("Your reference does not contain the expected files, or they are not readable. Please check your reference folder on {}.".format(hostname))
    refsize = os.path.getsize(refpath) / 1e9
    vmem_gb = int(np.ceil(refsize)) + 4
    return vmem_gb
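For example, a 3.1 GB genome.fa gives refsize = 3.1, so vmem_gb = int(ceil(3.1)) + 4 = 8.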
Example n. 12
def check_rta_complete(folder_path):
    """
    :return: path to valid RTAComplete.txt in folder_path
    :rtype: string
    """
    hostname = socket.gethostname()
    check_folder("sequencing run", folder_path, hostname)
    rta_complete = os.path.join(folder_path, "RTAComplete.txt")
    if not os.path.exists(rta_complete):
        martian.exit("On machine: %s, run does not appear to be complete yet.  RTAComplete.txt not found." % hostname)
    return rta_complete
def contain_three_columns(in_file, check_top_n=100):
    """Check that the first check_top_n rows of the peak input file have exactly 3 tab-separated columns."""
    with open(in_file) as bediter:
        for i, row in enumerate(bediter):
            fields = row.strip().split('\t')
            if len(fields) != 3:
                martian.exit("Peak input file must contain only 3 columns")

            if i > check_top_n:
                break
        return
Example n. 14
def check_bcl2fastq_v1(hostname):
    try:
        subprocess.check_call(["which", "configureBclToFastq.pl"])
    except subprocess.CalledProcessError:
        martian.exit(
            "On machine: %s, bcl2fastq or configureBclToFastq.pl not found on PATH."
            % hostname)
    print "configureBclToFastq.pl version on %s: < 2.0" % hostname
    try:
        subprocess.check_call(["which", "perl"])
    except subprocess.CalledProcessError:
        martian.exit("On machine: %s, perl not found on PATH." % hostname)
Example n. 15
def join(args, outs, chunk_defs, chunk_outs):
    if args.raw_matrix is None:
        outs.filtered_matrix = None
        return

    # consume cell barcodes across all species and raise errors if not found
    if args.cell_barcodes is None:
        martian.exit("cell barcodes not provided")
    cell_barcodes = utils.load_cell_barcodes(args.cell_barcodes, with_species=True)

    # Read the peak matrix file and keep only cell barcodes.
    # Cell barcodes specified externally (such as in reanalyzer) may be absent from the
    # raw matrix because they are missing from the fragments file; drop those here.
    present_cell_barcodes = {}
    peak_matrix = cr_matrix.CountMatrix.load_h5_file(args.raw_matrix)
    peak_matrix_bcs = set(peak_matrix.bcs)
    for species in cell_barcodes:
        present_cell_barcodes[species] = set()
        for bc in cell_barcodes[species]:
            if bc not in peak_matrix_bcs:
                martian.log_info("{} not found in the raw peak - bc matrix".format(bc))
            else:
                present_cell_barcodes[species].add(bc)

    peak_matrix = peak_matrix.filter_barcodes(present_cell_barcodes)
    if peak_matrix.features_dim == 0:
        martian.log_info("data has no peaks, skipping the clustering analysis")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    peak_matrix = prune(peak_matrix, num_analysis_bcs=args.num_analysis_bcs, random_state=args.random_seed)

    if peak_matrix.bcs_dim <= analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of cell barcodes present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    if peak_matrix.features_dim < analysis_constants.MAX_N_CLUSTERS_DEFAULT:
        martian.log_info("Insufficient number of peaks present after processing")
        outs.filtered_matrix = None
        outs.filtered_matrix_mex = None
        return

    # save processed matrix
    peak_matrix.save_h5_file(outs.filtered_matrix, sw_version=martian.get_pipelines_version())
    if not os.path.exists(outs.filtered_matrix_mex):
        os.mkdir(outs.filtered_matrix_mex)
    atac_matrix.save_mex(peak_matrix, outs.filtered_matrix_mex,
                         cr_lib_constants.ATACSEQ_LIBRARY_TYPE,
                         sw_version=martian.get_pipelines_version())
Example n. 16
def main(args, outs):
    if args.reanalyze and not args.aggregation_csv:
        return # CSV not required for reanalyze

    outs.sample_defs = parse_sample_sheet(args.pipestance_root, args.aggregation_csv)
    if args.reanalyze and args.matrix_h5:
        library_map = cr_matrix.get_gem_group_index(args.matrix_h5)
        matrix_library_ids = set([library_id for library_id, gem_group in library_map.values()])
        csv_library_ids = set([row[cr_constants.AGG_ID_FIELD] for row in outs.sample_defs])
        if matrix_library_ids != csv_library_ids:
            martian.exit("Library IDs specified in CSV (%s) do not match those contained in the input matrix (%s)"
             % (csv_library_ids, matrix_library_ids))
    copy_csv(args.aggregation_csv, outs.aggregation_csv)
Example n. 17
def check_runinfo_xml(folder_path):
    """
    :return: path to valid RunInfo.xml in folder_path
    :rtype: string
    """
    hostname = socket.gethostname()
    check_folder("sequencing run", folder_path, hostname)
    runinfo = os.path.join(folder_path, "RunInfo.xml")
    if not os.path.exists(runinfo):
        martian.exit("On machine: %s, RunInfo.xml not found. Cannot verify run was 10X-prepped." % hostname)
    if not os.access(runinfo, os.R_OK):
        martian.exit("On machine: %s, insufficient permission to open RunInfo.xml." % hostname)
    return runinfo
Example n. 18
def main(args, outs):
    if args.pipestance_type != "count" and args.pipestance_type != "aggr":
        martian.exit("The type argument must be one of: count, aggr")

    if args.pipestance_type == "count":
        pname = "SC_RNA_COUNTER_CS"
    if args.pipestance_type == "aggr":
        pname = "SC_RNA_AGGREGATOR_CS"

    pipestance_exists = os.path.exists(args.pipestance_path)
    if not pipestance_exists:
        martian.exit("Invalid pipestance path: %s" % args.pipestance_path)

    # check to see if an analysis file exists.  If it doesn't, then
    # this is likely a barnyard sample, and we cannot generate a
    # .loupe file (CELLRANGER-773);
    analysis_h5_path = os.path.join(args.pipestance_path,
                                    "outs/analysis/analysis.h5")

    # 1.2.0 location only
    internal_count_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_COUNTER_CS/SC_RNA_COUNTER/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    internal_aggr_h5_path = os.path.join(
        args.pipestance_path,
        "SC_RNA_AGGREGATOR_CS/SC_RNA_AGGREGATOR/SC_RNA_ANALYZER/SUMMARIZE_ANALYSIS/fork0/files/analysis/analysis.h5"
    )

    if not os.path.exists(analysis_h5_path) \
            and not os.path.exists(internal_count_h5_path) \
            and not os.path.exists(internal_aggr_h5_path):
        martian.exit(
            "Could not find single-species analysis HDF5 file. " +
            "Loupe Cell Browser files are not generated for multi-species experiments."
        )

    # has to be 1.2 or higher
    cellranger_pd_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_PD")
    cellranger_cs_before_1_2_path = os.path.join(args.pipestance_path,
                                                 "CELLRANGER_CS")
    if os.path.exists(cellranger_pd_before_1_2_path) or os.path.exists(
            cellranger_cs_before_1_2_path):
        martian.exit(
            "mkloupe is only supported for Cell Ranger 1.2 and later.")

    call = [
        "crconverter", args.sample_id, pname, "--pipestance",
        args.pipestance_path, "--output", outs.output_for_cloupe
    ]

    martian.log_info("Running crconverter: %s" % " ".join(call))
    try:
        results = subprocess.check_output(call)
        martian.log_info("crconverter output: %s" % results)
    except subprocess.CalledProcessError as e:
        outs.output_for_cloupe = None
        martian.throw("Could not generate .cloupe file: \n%s" % e.output)
Example n. 19
def main(args, outs):
    hostname = socket.gethostname()

    print "Checking run folder..."
    tk_preflight.check_rta_complete(args.run_path)

    print "Checking RunInfo.xml..."
    runinfo = tk_preflight.check_runinfo_xml(args.run_path)

    if not args.allow_no_barcodes:
        ok, msg = check_reads(runinfo)
        if not ok:
            martian.exit(msg)

    print "Checking system environment..."
    ok, msg = tk_preflight.check_ld_library_path()
    if not ok:
        martian.exit(msg)

    # Presence of SampleSheet.csv interferes with demux.
    # Ask customer to move it. Under older RTA, bcl2fastq looks for it
    # in Data/Intensities/BaseCalls while under newer RTA, it looks for it
    # at the top of the run folder.
    bc_dir = os.path.join(args.run_path, "Data", "Intensities", "BaseCalls")

    external = True
    try:
        import kitten
        external = False
    except ImportError:
        pass

    for ss_dir in [args.run_path, bc_dir]:
        ilmn_sample_sheet = os.path.join(ss_dir, "SampleSheet.csv")

        if external and os.path.exists(ilmn_sample_sheet):
            martian.exit(
                "On machine: %s, SampleSheet.csv found in run folder that would interfere with demux:\n%s\nPlease move, rename, or delete the file and run demux again."
                % (hostname, ilmn_sample_sheet))

    if args.check_executables:
        print "Checking bcl2fastq..."
        # Determine the RTA version of the run and whether this instrument
        # requires the I2 read to be reverse-complemented
        (rta_version, rc_i2_read,
         bcl_params) = tenkit.bcl.get_rta_version(args.run_path)
        martian.log_info("RTA Version: %s" % rta_version)
        martian.log_info("BCL Params: %s" % str(bcl_params))

        # Determine the best available bcl2fastq version to use
        # Will call martian.exit() with an error message if there isn't
        # a compatible version available
        (major_ver,
         full_ver) = tenkit.bcl.check_bcl2fastq(hostname, rta_version)
        martian.log_info("Running bcl2fastq mode: %s.  Version: %s" %
                         (major_ver, full_ver))

    ok, msg = tk_preflight.check_open_fh()
    if not ok:
        martian.exit(msg)
Example n. 20
 def issue(self, metric, value, format_string=""):
     for alert in self.alerts:
         ## find the right metric
         if alert["metric"] == metric:
             ## should we trigger?
             if (alert["compare"] == ">") ^ (value < alert["threshold"]):
                 ## optional formatting of alert message with format_string or value
                 if len(format_string) == 0:
                     format_string = str(value)
                 message = alert["message"].replace("{}", format_string)
                 ## issue an alert
                 if alert["action"] == "alarm":
                     martian.alarm(message)
                 elif alert["action"] == "exit":
                     martian.exit(message)
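An illustrative self.alerts entry consumed by the method above (metric name and values are hypothetical):

example_alerts = [{
    "metric": "frac_valid_barcodes",   # hypothetical metric name
    "threshold": 0.80,
    "compare": "<",
    "action": "alarm",
    "message": "Low fraction of valid barcodes: {}",
}]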
Example n. 21
def check_file(file_type, file_path, hostname):
    if not file_path.startswith('/'):
        martian.exit("Specified %s file must be an absolute path: %s" % (file_type, file_path))
    if not os.path.exists(file_path):
        martian.exit("On machine: %s, specified %s file does not exist: %s" % (hostname, file_type, file_path))
    if os.path.isdir(file_path):
        martian.exit("On machine: %s, specified %s file is a folder: %s" % (hostname, file_type, file_path))
    if not os.access(file_path, os.R_OK):
        martian.exit("On machine: %s, specified %s file is not readable: %s" % (hostname, file_type, file_path))
Example n. 22
def check_folder(folder_type, folder_path, hostname, permission=os.X_OK):
    if not folder_path.startswith('/'):
        martian.exit("Specified %s folder must be an absolute path: %s" % (folder_type, folder_path))
    if not os.path.exists(folder_path):
        martian.exit("On machine: %s, specified %s folder does not exist: %s" % (hostname, folder_type, folder_path))
    if not os.path.isdir(folder_path):
        martian.exit("On machine: %s, specified %s path is not a folder: %s" % (hostname, folder_type, folder_path))
    if not os.access(folder_path, permission):
        martian.exit("On machine: %s, insufficient permissions on %s folder: %s" % (hostname, folder_type, folder_path))
Example n. 23
def main(args, outs):
    in_h5_list = [args.map_track, args.genome_tracks]
    out = pd.HDFStore(outs.tracks, "w")

    ## first add in mappability and GC tracks
    msizes = {}
    for in_h5 in in_h5_list:
        if in_h5 is None:
            continue
        if not os.path.exists(in_h5):
            martian.exit("Could not find " + in_h5)
            continue
        indata = pd.HDFStore(in_h5, "r")
        for key in indata.keys():
            X = indata[key]
            #
            # row-wise (axis=0) concatenation for the constants section
            #
            if key in out:
                out[key] = pd.concat([out[key], X], axis=0)
            else:
                out[key] = X
            chrom = key.split("/")[-1]
            msizes[chrom] = X.shape[0]
        indata.close()

    ## convert confident windows into numpy and store
    if args.confident_windows is None or not os.path.exists(
            args.confident_windows):
        for chrom, length in msizes.iteritems():
            out["/CONF/" + chrom] = pd.Series(np.ones(length, dtype=float))
    else:
        conf = defaultdict(list)
        for line in open(args.confident_windows):
            fields = line.strip().split()
            chrom, perc = fields[0], float(fields[3])
            conf[chrom].append(perc)
        for chrom, length in msizes.iteritems():
            X = np.array(conf[chrom])
            cbins = (X > crdna.constants.CONFIDENT_BIN_THRESHOLD).sum()
            if X.shape[0] == 0 or cbins == 0:
                out["/CONF/" + chrom] = pd.Series(np.ones(length, dtype=float))
            else:
                assert X.shape[0] == length
                out["/CONF/" + chrom] = pd.Series(X)
    out.close()
Example n. 24
def main(args, outs):
    ''' Convert sample_def = { "libraries_csv": "/path/to/libraries.csv" } into a
        standard sample_def map used by the rest of the pipeline. Only used by the
        CS pipeline to handle the --libraries cmd-line argument.'''

    if len(args.raw_sample_def) == 1:
        if args.raw_sample_def[0].keys() == ["libraries"]:
            # We've got a 'libraries mode' argument coming in -- load & check the CSV, and expand it into a normal sample def
            try:
                outs.sample_def = cr_preflight.expand_libraries_csv(
                    args.raw_sample_def[0]["libraries"])
                return
            except cr_preflight.PreflightException as e:
                martian.exit(e.msg)

    # Default case -- just copy over the sample_def
    outs.sample_def = args.raw_sample_def
Example n. 25
def sequencer_detection_message(fastq_files):
    seqrs = set()
    # accumulate (sequencer, status) set
    for fastq in fastq_files:
        with gzip.open(fastq) as f:
            head = ""
            line = f.readline()
            if len(line) > 0:
                if line[0] == "@":
                    head = line
                else:
                    martian.exit(
                        "Incorrectly formatted first read in FASTQ file: %s" %
                        fastq)

        iid, fcid = parse_readhead(head)
        seqr, msg = infer_sequencer_with_message(iid, fcid)
        # add a (sequencer, status) signal for each inferred sequencer
        for sr in seqr:
            seqrs.add((sr, msg))

    # get a list of sequencing platforms
    platforms = set()
    for platform, _ in seqrs:
        platforms.add(platform)
    sequencers = list(platforms)

    # if no sequencer detected at all
    message = ""
    fails = 0
    for platform, status in seqrs:
        if status == fail_msg:
            fails += 1
    if fails == len(seqrs):
        message = "could not detect the sequencing platform(s) used to generate the input FASTQ files"
        return message, sequencers

    # if partial or no detection failures
    if fails > 0:
        message = "could not detect the sequencing platform used to generate some of the input FASTQ files, "
    message += "detected the following sequencing platforms- "
    for platform, status in seqrs:
        if status != fail_msg:
            message += platform + " " + status + ", "
    message = message.strip(", ")
    return message, sequencers
Example n. 26
def join(args, outs, chunk_defs, chunk_outs):
    outs.updated_sample_defs = [
        chunk_out.updated_sample_def for chunk_out in chunk_outs
    ]

    if any("batch" in sample_def for sample_def in outs.updated_sample_defs):
        ncells = 0
        for sample_def in outs.updated_sample_defs:
            with cr_mol_counter.MoleculeCounter.open(
                    sample_def[cr_constants.AGG_H5_FIELD], 'r') as mc:
                library_info = mc.get_library_info()
                ncells += sum(
                    mc.get_num_filtered_barcodes_for_library(i)
                    for i in xrange(len(library_info)))
        if ncells > CBC_MAX_NCELLS:
            martian.exit(
                "You provided {:,} cells in total, but chemistry batch correction only supports up to {:,} cells."
                .format(ncells, CBC_MAX_NCELLS))
def get_run_data(fn):
    """ Parse flowcell + lane from the first FASTQ record. 
    NOTE: we don't check whether there are multiple FC / lanes in this file.
    """
    if fn[-2:] == 'gz':
        reader = gzip.open(fn)
    else:
        reader = open(fn, 'r')
        
    gen = tk_fasta.read_generator_fastq(reader)

    try:
        (name, seq, qual) = gen.next()
        (flowcell, lane) = re.split(':', name)[2:4]
        return (flowcell, lane)
    except StopIteration:
        # empty fastq
        martian.exit("FASTQ is empty: %s" % fn)
Example n. 28
def main_bcl_processor(sample_id, sample_def, chemistry_arg,
                       custom_chemistry_def):
    chunks = []

    sample_index_strings, msg = tk_preflight.check_sample_indices(sample_def)
    if sample_index_strings is None:
        martian.exit(msg)

    path = sample_def['read_path']
    lanes = sample_def['lanes']

    for sample_index in sample_index_strings:
        # Determine the read-type => fastq filename mapping
        try:
            chemistry_name = cr_chem.infer_sc3p_chemistry_bcl_processor(
                chemistry_arg, path, sample_index, lanes)
        except cr_chem.NoInputFastqsException:
            continue

        if chemistry_name == cr_chem.CUSTOM_CHEMISTRY_NAME:
            chemistry = custom_chemistry_def
        else:
            chemistry = cr_chem.get_chemistry(chemistry_name)

        read_type_map = cr_chem.get_read_type_map(
            chemistry, tk_constants.BCL_PROCESSOR_FASTQ_MODE)

        # Collect the fastq files for each read type
        filename_lists = {}
        for dest_read_type in cr_constants.FASTQ_READ_TYPES:
            src_read_type = read_type_map[dest_read_type]
            filename_lists[dest_read_type] = tk_fasta.find_input_fastq_files_10x_preprocess(
                path, src_read_type, sample_index, lanes)

        fill_in_missing_reads(filename_lists)
        if validate_fastq_lists(filename_lists):
            chunks += construct_chunks(filename_lists,
                                       sample_id,
                                       sample_def,
                                       reads_interleaved=True,
                                       chemistry=chemistry)

    return chunks
Example n. 29
def validate_csv(csv_file, entry_type, entry_colname):
    if not os.path.exists(csv_file):
        martian.exit("Specified %s file does not exist: %s" % (entry_type, csv_file))
    elif not os.access(csv_file, os.R_OK):
        martian.exit("Specified %s file is not readable, please check file permissions: %s" % (entry_type, csv_file))
    with open(csv_file) as f:
        header = f.readline().strip().split(',')
        if header[0] != entry_colname:
            martian.exit("First line of %s file must be a header line, with '%s' as the first column." % (entry_type, entry_colname))
        counts = sum(1 for line in f) # count remaining lines
    if counts == 0:
        martian.exit("Specified %s file must contain at least one entry." % entry_type)
    return counts
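A hypothetical usage, assuming a barcodes CSV whose header line starts with the expected column name:

# barcodes.csv (hypothetical):
#   barcode
#   AAACCTGAGAAACCAT-1
#   AAACCTGAGAAACCGC-1
n_entries = validate_csv("barcodes.csv", "cell barcode", "barcode")  # returns 2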
Example n. 30
def check_folder_or_create(folder_type, folder_path, hostname, permission=os.X_OK):
    if not folder_path.startswith('/'):
        martian.exit("Specified %s folder must be an absolute path: %s" % (folder_type, folder_path))
    if os.path.exists(folder_path):
        if not os.path.isdir(folder_path):
            martian.exit("On machine: %s, specified %s path is not a folder: %s" % (hostname, folder_type, folder_path))
        if not os.access(folder_path, permission):
            martian.exit("On machine: %s, insufficient permissions on %s folder: %s" % (hostname, folder_type, folder_path))
    else:
        try:
            os.makedirs(folder_path)
        except OSError:
            martian.exit("On machine: %s, could not create %s folder: %s" % (hostname, folder_type, folder_path))