Exemple #1
0
def main(config_file, fastq_dir):
    """Trim barcoded FASTQ files and length-filter the trimmed output.

    Loads the YAML configuration, collects every ``*.fastq`` file directly
    inside ``fastq_dir`` and groups them with ``combine_pairs``.  For each
    group, the barcode is derived from the first filename; groups whose
    barcode is not listed under ``config["barcodes"]`` are skipped.  The
    sequence configured for the sample type is trimmed from the front of
    each file in the group, and the trimmed files are re-paired and run
    through the read-length filters using the "fastq-sanger" format.

    Args:
        config_file: path to a YAML file providing a "barcodes" mapping
            (barcode -> sequence whose item 0 is reported as the sample ID
            and item 1 as the sample type) and a "to_trim" mapping
            (sample type -> sequences to trim; item 0 is applied to the
            first file of a group, item 1 to the second).
        fastq_dir: directory searched (non-recursively) for ``*.fastq``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects.  Switch to yaml.safe_load if the config
        # file can ever come from an untrusted source.
        config = yaml.load(in_handle)

    barcode_info = config["barcodes"]
    # Single-argument print(...) behaves identically under the Python 2
    # print statement and the Python 3 print function.
    print("Processing %s." % (fastq_dir))
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print("Found %s in %s. " % (in_files, fastq_dir))
    print("Combining paired-end files, if found.")
    pairs = combine_pairs(in_files)
    # Wrap in a 1-tuple so a tuple-valued result is formatted as a whole
    # instead of being unpacked as multiple format arguments.
    print("Calulcated pairs: %s." % (pairs,))
    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print("Detected barcode: %s" % barcode)
        if barcode not in barcode_info:
            print("barcode %s not found in the YAML file, skipping." % (
                barcode))
            continue
        print("Sample ID: %s" % (barcode_info[barcode][0]))
        sample_type = barcode_info[barcode][1]
        print("Sample type: %s" % (sample_type))
        to_trim = config["to_trim"][sample_type]
        print("Trimming off %s and any bases before it from %s." %
              (to_trim[0], pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s." %
                  (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))
    # Flatten whatever the trim step returned, then regroup the trimmed
    # files so mates are filtered together.
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for trimmed in out_files:
        if len(trimmed) > 1:
            filter_reads_by_length(trimmed[0], trimmed[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(trimmed[0], "fastq-sanger")
Exemple #2
0
def main(config_file, fastq_dir):
    """Trim barcoded FASTQ files and length-filter the trimmed output.

    Loads the YAML configuration, collects every ``*.fastq`` file directly
    inside ``fastq_dir`` and groups them with ``combine_pairs``.  For each
    group, the barcode is derived from the first filename; groups whose
    barcode is not listed under ``config["barcodes"]`` are skipped.  The
    sequence configured for the sample type is trimmed from the front of
    each file in the group, and the trimmed files are re-paired and run
    through the read-length filters using the "fastq-sanger" format.

    Args:
        config_file: path to a YAML file providing a "barcodes" mapping
            (barcode -> sequence whose item 0 is reported as the sample ID
            and item 1 as the sample type) and a "to_trim" mapping
            (sample type -> sequences to trim; item 0 is applied to the
            first file of a group, item 1 to the second).
        fastq_dir: directory searched (non-recursively) for ``*.fastq``.
    """
    with open(config_file) as in_handle:
        # NOTE(review): yaml.load without an explicit Loader can construct
        # arbitrary Python objects.  Switch to yaml.safe_load if the config
        # file can ever come from an untrusted source.
        config = yaml.load(in_handle)

    barcode_info = config["barcodes"]
    # Single-argument print(...) behaves identically under the Python 2
    # print statement and the Python 3 print function.
    print("Processing %s." % (fastq_dir))
    in_files = glob.glob(os.path.join(fastq_dir, "*.fastq"))
    print("Found %s in %s. " % (in_files, fastq_dir))
    print("Combining paired-end files, if found.")
    pairs = combine_pairs(in_files)
    # Wrap in a 1-tuple so a tuple-valued result is formatted as a whole
    # instead of being unpacked as multiple format arguments.
    print("Calulcated pairs: %s." % (pairs,))
    out_files = []
    for pair in pairs:
        barcode = _determine_barcode_from_filename(pair[0])
        print("Detected barcode: %s" % barcode)
        if barcode not in barcode_info:
            print("barcode %s not found in the YAML file, skipping." % (barcode))
            continue
        print("Sample ID: %s" % (barcode_info[barcode][0]))
        sample_type = barcode_info[barcode][1]
        print("Sample type: %s" % (sample_type))
        to_trim = config["to_trim"][sample_type]
        print("Trimming off %s and any bases before it from %s."
              % (to_trim[0], pair[0]))
        out_files.append(_trim_from_front(pair[0], to_trim[0]))
        if len(pair) > 1:
            print("Trimming off %s and any bases before it from %s."
                  % (to_trim[1], pair[1]))
            out_files.append(_trim_from_front(pair[1], to_trim[1]))
    # Flatten whatever the trim step returned, then regroup the trimmed
    # files so mates are filtered together.
    out_files = list(flatten(out_files))
    out_files = combine_pairs(out_files)
    for trimmed in out_files:
        if len(trimmed) > 1:
            filter_reads_by_length(trimmed[0], trimmed[1], "fastq-sanger")
        else:
            filter_single_reads_by_length(trimmed[0], "fastq-sanger")
def remove_short_reads(fastq_files, dirs, lane_config):
    """
    remove reads from a single or pair of fastq files which fall below
    a length threshold, then delete the input files

    The threshold is read from lane_config["algorithm"]["min_read_length"]
    and defaults to 20 bases when that key is absent.

    fastq_files -- the input fastq file(s); treated as a pair when
                   is_pair() says so, otherwise only the first is filtered
    dirs        -- not used by this function
    lane_config -- configuration holding the "algorithm" section and the
                   quality format setting

    returns the result of fastq.filter_reads_by_length for a pair, or a
    one-element list holding the result of
    fastq.filter_single_reads_by_length for a single file
    """
    min_length = int(lane_config["algorithm"].get("min_read_length", 20))
    supplied_quality_format = _get_quality_format(lane_config)
    # Anything other than "illumina" is handled as sanger-encoded.
    if supplied_quality_format == "illumina":
        quality_format = "fastq-illumina"
    else:
        quality_format = "fastq-sanger"

    if is_pair(fastq_files):
        fastq1, fastq2 = fastq_files
        out_files = fastq.filter_reads_by_length(fastq1, fastq2, quality_format, min_length)
    else:
        out_files = [fastq.filter_single_reads_by_length(fastq_files[0],
                                                         quality_format, min_length)]
    # Explicit loop instead of map(): map() is lazy on Python 3, so using it
    # for side effects would silently leave the input files in place.
    for fastq_file in fastq_files:
        os.remove(fastq_file)
    return out_files
Exemple #4
0
def _remove_short_reads(fastq_files, dirs, lane_config):
    """
    filter out reads shorter than a fixed minimum length (20 bases) from
    a single fastq file or a pair of fastq files

    the quality encoding passed to the filter is "fastq-illumina" when the
    lane is configured as "illumina" and "fastq-sanger" otherwise; the
    dirs argument is not used

    """
    min_length = 20
    quality_format = ("fastq-illumina"
                      if _get_quality_format(lane_config) == "illumina"
                      else "fastq-sanger")

    if not is_pair(fastq_files):
        # NOTE(review): the single-file result is returned as-is, not
        # wrapped in a list -- confirm callers expect that shape.
        return fastq.filter_single_reads_by_length(
            fastq_files[0], quality_format, min_length)

    first_mate, second_mate = fastq_files
    return fastq.filter_reads_by_length(first_mate, second_mate,
                                        quality_format, min_length)
Exemple #5
0
def _remove_short_reads(fastq_files, dirs, lane_config):
    """
    filter out reads shorter than a fixed minimum length (20 bases) from
    a single fastq file or a pair of fastq files

    the quality encoding passed to the filter is "fastq-illumina" when the
    lane is configured as "illumina" and "fastq-sanger" otherwise; the
    dirs argument is not used

    """
    min_length = 20
    quality_format = ("fastq-illumina"
                      if _get_quality_format(lane_config) == "illumina"
                      else "fastq-sanger")

    if not is_pair(fastq_files):
        # Single-file result is wrapped in a one-element list.
        single_result = fastq.filter_single_reads_by_length(
            fastq_files[0], quality_format, min_length)
        return [single_result]

    first_mate, second_mate = fastq_files
    return fastq.filter_reads_by_length(first_mate, second_mate,
                                        quality_format, min_length)
Exemple #6
0
def remove_short_reads(fastq_files, dirs, lane_config):
    """
    remove reads from a single or pair of fastq files which fall below
    a length threshold, then delete the input files

    The threshold is read from lane_config["algorithm"]["min_read_length"]
    and defaults to 20 bases when that key is absent.

    fastq_files -- the input fastq file(s); treated as a pair when
                   is_pair() says so, otherwise only the first is filtered
    dirs        -- not used by this function
    lane_config -- configuration holding the "algorithm" section and the
                   quality format setting

    returns the result of fastq.filter_reads_by_length for a pair, or a
    one-element list holding the result of
    fastq.filter_single_reads_by_length for a single file
    """
    min_length = int(lane_config["algorithm"].get("min_read_length", 20))
    supplied_quality_format = _get_quality_format(lane_config)
    # Anything other than "illumina" is handled as sanger-encoded.
    if supplied_quality_format == "illumina":
        quality_format = "fastq-illumina"
    else:
        quality_format = "fastq-sanger"

    if is_pair(fastq_files):
        fastq1, fastq2 = fastq_files
        out_files = fastq.filter_reads_by_length(fastq1, fastq2,
                                                 quality_format, min_length)
    else:
        out_files = [
            fastq.filter_single_reads_by_length(fastq_files[0], quality_format,
                                                min_length)
        ]
    # Explicit loop instead of map(): map() is lazy on Python 3, so using it
    # for side effects would silently leave the input files in place.
    for fastq_file in fastq_files:
        os.remove(fastq_file)
    return out_files