Example #1
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.
       -joined_fp : file path to paired-end assembled fastq file
       -index_fp : file path to index / barcode reads fastq file

       This function iterates through the joined-reads file and the index file.
       Only those index reads in the file at index_fp whose headers match a
       record in the joined-pairs file at joined_fp are written to file.

     WARNING: Assumes reads are in the same order in both files,
              except for cases in which the corresponding
              read in the joined_fp file is missing (i.e. pairs 
              failed to assemble).

    """

    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path,ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')


    # Set up iterators
    index_fastq_iter = MinimalFastqParser(ih, strict=False)
    joined_fastq_iter = MinimalFastqParser(jh, strict=False) 
    # Write barcodes / index reads that we observed within
    # the joined paired-ends. Raise an error if the index and
    # joined data fall out of sync (index file exhausted first).
    for joined_label,joined_seq,joined_qual in joined_fastq_iter:
        index_label,index_seq,index_qual = index_fastq_iter.next()
        while joined_label != index_label:
            try:
                index_label,index_seq,index_qual = index_fastq_iter.next()
            except StopIteration:
                raise StopIteration("\n\nReached end of index-reads file" +
                                    " before iterating through joined paired-end-reads file!" +
                                    " Except for missing paired-end reads that did not survive" +
                                    " assembly, your index and paired-end reads files must be in" +
                                    " the same order! Also, check that the index-reads and" +
                                    " paired-end reads have identical headers. The last joined" +
                                    " paired-end ID processed was:\n'%s'\n" % (joined_label))
        else:
            fastq_string = '@%s\n%s\n+\n%s\n'\
                            %(index_label,index_seq,index_qual)
            fbc_fh.write(fastq_string)
    
    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
Example #2
def write_synced_barcodes_fastq(joined_fp, index_fp):
    """Writes new index file based on surviving assembled paired-ends.
       -joined_fp : file path to paired-end assembled fastq file
       -index_fp : file path to index / barcode reads fastq file

       This function iterates through the joined-reads file and the index file.
       Only those index reads in the file at index_fp whose headers match a
       record in the joined-pairs file at joined_fp are written to file.

     WARNING: Assumes reads are in the same order in both files,
              except for cases in which the corresponding
              read in the joined_fp file is missing (i.e. pairs
              failed to assemble).

    """

    # open files (handles normal / gzipped data)
    jh = qiime_open(joined_fp)
    ih = qiime_open(index_fp)

    # base new index file name on joined paired-end file name:
    j_path, ext = os.path.splitext(joined_fp)
    filtered_bc_outfile_path = j_path + '_barcodes.fastq'
    fbc_fh = open(filtered_bc_outfile_path, 'w')

    # Set up iterators
    index_fastq_iter = MinimalFastqParser(ih, strict=False)
    joined_fastq_iter = MinimalFastqParser(jh, strict=False)
    # Write barcodes / index reads that we observed within
    # the joined paired-ends. Raise an error if the index and
    # joined data fall out of sync (index file exhausted first).
    for joined_label, joined_seq, joined_qual in joined_fastq_iter:
        index_label, index_seq, index_qual = index_fastq_iter.next()
        while joined_label != index_label:
            try:
                index_label, index_seq, index_qual = index_fastq_iter.next()
            except StopIteration:
                raise StopIteration("\n\nReached end of index-reads file" +
                                    " before iterating through joined paired-end-reads file!" +
                                    " Except for missing paired-end reads that did not survive" +
                                    " assembly, your index and paired-end reads files must be in" +
                                    " the same order! Also, check that the index-reads and" +
                                    " paired-end reads have identical headers. The last joined" +
                                    " paired-end ID processed was:\n\'%s\'\n" % (joined_label))
        else:
            fastq_string = '@%s\n%s\n+\n%s\n'\
                % (index_label, index_seq, index_qual)
            fbc_fh.write(fastq_string)

    ih.close()
    jh.close()
    fbc_fh.close()

    return filtered_bc_outfile_path
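
A minimal usage sketch for the function above (both variants behave the same); the file paths are hypothetical, and the snippet assumes os, qiime_open and MinimalFastqParser are already imported as in the example:

# Hypothetical inputs: an assembled paired-end fastq and its index-read fastq.
joined_fp = 'joined_reads.fastq'
index_fp = 'barcodes.fastq.gz'      # qiime_open also handles gzipped input

# Writes 'joined_reads_barcodes.fastq' next to the joined-reads file
# and returns its path.
synced_bc_fp = write_synced_barcodes_fastq(joined_fp, index_fp)
print 'Synced barcode file:', synced_bc_fp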
Example #3
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)
      
    verbose = opts.verbose
    
    input_seqs_fp = opts.input_seqs_fp
    fileSize = get_file_size(input_seqs_fp)
    if opts.file_type:
        fileType = opts.file_type
    else:
        fileType = splitext(input_seqs_fp)[1].split('.')[1]
    output_fp = opts.output_fp
    
    # if the output fp isn't specified, create one
    if not output_fp:
        input_file_basename, input_file_ext = \
         splitext(split(input_seqs_fp)[1])
        output_fp = '%s_counts.txt' % (input_file_basename)

    input_seqs = open(input_seqs_fp, "U")

    output = open(output_fp, "w")

    # count the number of seqs for each unique sample
    number_seqs_bySample = {}
    printcounter = 0
    if fileType == 'fasta' or fileType == 'fa':
        for label, seq in MinimalFastaParser(input_seqs):
            matchID = re.match('^.*barcodelabel=(.*);$',label)
            sampleID = matchID.group(1)
            if sampleID in number_seqs_bySample:
                number_seqs_bySample[sampleID] += 1
            else:
                number_seqs_bySample[sampleID] = 1
            if printcounter == 1000:
                pos = input_seqs.tell()
                display_progress(pos, fileSize)
                printcounter = 0
            printcounter += 1
    elif fileType == 'fastq' or fileType == 'fq':
        for label, seq, qual in MinimalFastqParser(input_seqs,strict=False):
            matchID = re.match('^.*barcodelabel=(.*);$',label)
            sampleID = matchID.group(1)
            if sampleID in number_seqs_bySample:
                number_seqs_bySample[sampleID] += 1
            else:
                number_seqs_bySample[sampleID] = 1
            if printcounter == 1000:
                pos = input_seqs.tell()
                display_progress(pos, fileSize)
                printcounter = 0
            printcounter += 1
    else:
        print "Invalid file type"
    for key in number_seqs_bySample:
        output.write('%s\t%s\n' %(key,number_seqs_bySample[key]))

    sys.stdout.write('\n')
    input_seqs.close()
    output.close()
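
For reference, a small sketch of the usearch-style label format the regex above expects; the label string is hypothetical:

import re

label = 'HWI-ST157:1:1101:11111:22222;barcodelabel=SampleA;'   # hypothetical header
match = re.match('^.*barcodelabel=(.*);$', label)
print match.group(1)   # prints 'SampleA'
# Labels without a trailing 'barcodelabel=...;' annotation would make
# re.match return None, so group(1) would raise AttributeError.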
Example #4
def filter_fastq_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fastq file to include only sequences listed in seqs_to_keep """
    input_seqs = MinimalFastqParser(open(input_seqs_fp, 'U'), strict=False)
    output_f = open(output_seqs_fp, 'w')
    return filter_fastq(input_seqs, output_f, seqs_to_keep, negate)
def convert_fastaqual(fasta_file_path, output_directory='.',
        multiple_output_files=False, ascii_increment=33,
        full_fastq=False, full_fasta_headers=False,
        per_file_buffer_size=100000):
    '''Takes a FASTQ file, generates FASTA and QUAL file(s)
    
    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.
    per_file_buffer_size:  Approximate number of buffered characters to hold
     per output file before flushing to disk (multiple_output_files mode only).'''

    # rename this to avoid confusion...
    fastq_fp = fasta_file_path

    # if we are NOT using multiple output files, then open our two (and only)
    # output files here
    if not multiple_output_files:
        fasta_out_fp = get_filename_with_new_ext(fastq_fp,
                                                 '.fna',
                                                 output_directory)
        qual_out_fp = get_filename_with_new_ext(fastq_fp,
                                                 '.qual',
                                                 output_directory)

        fasta_out_f = open(fasta_out_fp, 'w')
        qual_out_f = open(qual_out_fp, 'w')

    else:
        fasta_out_lookup = defaultdict(str)
        qual_out_lookup = defaultdict(str)

    for header, sequence, qual in MinimalFastqParser(open(fastq_fp, 'U'),
                                                     strict=False):
        label = header.split()[0]
        sample_id = label.split('_')[0]

        if multiple_output_files:
            fasta_out_fp = get_filename_with_new_ext(fastq_fp,
                                     '_' + sample_id + '.fna',
                                     output_directory)

            qual_out_fp = get_filename_with_new_ext(fastq_fp,
                                     '_' + sample_id + '.qual',
                                     output_directory)

        if full_fasta_headers:
            label = header

        #convert quality scores
        qual_scores = []
        for qual_char in qual:
            if (ord(qual_char) - ascii_increment) < 0: 
                raise ValueError,("Output qual scores are negative values. "
                 "Use different ascii_increment value than %s" %
                 str(ascii_increment))
            else:
                qual_scores.append(str(ord(qual_char) - ascii_increment))

        #write QUAL file, 60 qual scores per line
        qual_record = '>' + label + '\n'
        for i in range(0, len(qual_scores), 60):
            qual_record += ' '.join(qual_scores[i:i+60]) + '\n'

        if multiple_output_files:
            qual_out_lookup[qual_out_fp] += qual_record
        else:
            qual_out_f.write(qual_record)

        #write FASTA file
        fasta_record = '>%s\n%s\n' % (label, sequence)
        if multiple_output_files:
            fasta_out_lookup[fasta_out_fp] += fasta_record
        else:
            fasta_out_f.write(fasta_record)

        # if we're writing multiple output files, flush a buffer to disk once
        # it grows past per_file_buffer_size, to avoid holding everything in
        # memory (and to avoid keeping many OS filehandles open at once)
        if multiple_output_files:
            if len(fasta_out_lookup[fasta_out_fp]) >= per_file_buffer_size:
                fasta_f = open(fasta_out_fp, 'a')
                fasta_f.write(fasta_out_lookup[fasta_out_fp])
                fasta_f.close()
                fasta_out_lookup[fasta_out_fp] = ''

                qual_f = open(qual_out_fp, 'a')
                qual_f.write(qual_out_lookup[qual_out_fp])
                qual_f.close()
                qual_out_lookup[qual_out_fp] = ''

    # flush any remaining buffered records (multiple-file mode),
    # or close the single pair of output files
    if multiple_output_files:
        for fasta_out_fp, records in fasta_out_lookup.iteritems():
            if records:
                fasta_f = open(fasta_out_fp, 'a')
                fasta_f.write(records)
                fasta_f.close()

        for qual_out_fp, records in qual_out_lookup.iteritems():
            if records:
                qual_f = open(qual_out_fp, 'a')
                qual_f.write(records)
                qual_f.close()
    else:
        fasta_out_f.close()
        qual_out_f.close()
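
A usage sketch for the converter above; the input path is hypothetical, and get_filename_with_new_ext is assumed to come from the same module as convert_fastaqual:

# Writes seqs.fna and seqs.qual into the current directory (single-file
# mode); with multiple_output_files=True it would write one buffered
# .fna/.qual pair per SampleID instead.
convert_fastaqual('seqs.fastq',
                  output_directory='.',
                  multiple_output_files=False,
                  ascii_increment=33)     # use 64 for older Illumina (1.3-1.7) encodings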
Example #6
        basename = splitext(filein)[0]

        #make directory to store cleaned sequences in
        if not exists(folderout + basename):
            mkdir(folderout + basename)
        else:
            print "Round", basename, "already cleaned"
            continue

        currfolder = ''.join([folderout, basename, "/", basename])
        print "==ROUND " + basename + "=="
        #convert fastq to fasta if needed
        if args.q:
            print "==Converting to FASTA=="
            f = open(''.join([folderout, basename, ".fasta"]), 'w')
            for header, seq, qual in MinimalFastqParser(folderin + filein,
                                                        strict=False):
                f.write(''.join([">", header, '\n', seq, '\n']))
            f.close()
            filein = ''.join([folderout, basename, ".fasta"])

        print "==Cleaning input sequences=="

        log = open(currfolder + "-cleanup.log", 'w')
        log.write(''.join([
            "====================\nFile in: ", folderin, filein,
            "\nOutput Folder: ", currfolder, "\n3' primer: ", args.ep,
            "\nMin length: ",
            str(args.l), "\nMin duplicates: ",
            str(args.d), "\n====================\n"
        ]))
        #parse in sequences as (header, seq) tuples
def process_fastq_single_end_read_file(fastq_read_f,
                                       fastq_barcode_f,
                                       barcode_to_sample_id,
                                       store_unassigned=False,
                                       max_bad_run_length=0,
                                       phred_quality_threshold=2,
                                       min_per_read_length_fraction=0.75,
                                       rev_comp=False,
                                       rev_comp_barcode=False,
                                       seq_max_N=0,
                                       start_seq_id=0,
                                       filter_bad_illumina_qual_digit=False,
                                       log_f=None,
                                       histogram_f=None,
                                       barcode_correction_fn=None,
                                       max_barcode_errors=1.5,
                                       strict_header_match=True,
                                       phred_to_ascii_f=None):
    """parses fastq single-end read file
    """
    header_index = 0
    sequence_index = 1
    quality_index = 2

    seq_id = start_seq_id
    # grab the first lines and then seek back to the beginning of the file
    try:
        fastq_read_f_line1 = fastq_read_f.readline()
        fastq_read_f_line2 = fastq_read_f.readline()
        fastq_read_f.seek(0)
    except AttributeError:
        fastq_read_f_line1 = fastq_read_f[0]
        fastq_read_f_line2 = fastq_read_f[1]

    # determine the version of casava that was used to generate the fastq
    # to determine how to compare header lines and decode ascii phred scores
    post_casava_v180 = is_casava_v180_or_later(fastq_read_f_line1)
    if post_casava_v180:
        check_header_match_f = check_header_match_180_or_later
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii33
    else:
        check_header_match_f = check_header_match_pre180
        if phred_to_ascii_f == None:
            phred_to_ascii_f = phred_to_ascii64

    # determine the last unacceptable quality character
    if phred_quality_threshold != None:
        last_bad_quality_char = phred_to_ascii_f(phred_quality_threshold)
    else:
        # disable quality filter
        last_bad_quality_char = ''

    # compute the barcode length, if they are all the same.
    # this is useful for selecting a subset of the barcode read
    # if it's too long (e.g., for technical reasons on the sequencer)
    barcode_lengths = set(
        [len(bc) for bc, sid in barcode_to_sample_id.items()])
    if len(barcode_lengths) == 1:
        barcode_length = barcode_lengths.pop()
    else:
        barcode_length = None

    # compute the minimum read length as a fraction of the length of the input read
    min_per_read_length = min_per_read_length_fraction * len(
        fastq_read_f_line2)

    # prep data for logging
    input_sequence_count = 0
    count_barcode_not_in_map = 0
    count_too_short = 0
    count_too_many_N = 0
    count_bad_illumina_qual_digit = 0
    count_barcode_errors_exceed_max = 0
    sequence_lengths = []
    seqs_per_sample_counts = {}
    for bc_data, read_data in izip(
            MinimalFastqParser(fastq_barcode_f, strict=False),
            MinimalFastqParser(fastq_read_f, strict=False)):
        input_sequence_count += 1
        # Confirm match between barcode and read headers
        if strict_header_match and \
           (not check_header_match_f(bc_data[header_index],read_data[header_index])):
            raise FastqParseError(
                "Headers of barcode and read do not match. Can't continue. "
                "Confirm that the barcode fastq and read fastq that you are "
                "passing match one another.")
        else:
            header = read_data[header_index]

        # Grab the barcode sequence
        if barcode_length:
            # because an extra (e.g. thirteenth) cycle is sometimes run for
            # technical reasons, this step looks only at the first
            # barcode_length bases. note that the barcode is
            # rev-comp'ed after this step if requested, since
            # the extra base is a technical artefact, not
            # barcode sequence.
            barcode = bc_data[sequence_index][:barcode_length]
        else:
            barcode = bc_data[sequence_index]
        if rev_comp_barcode:
            barcode = DNA.rc(barcode)
        # Grab the read sequence
        sequence = read_data[1]
        # Grab the read quality
        quality = read_data[2]

        # correct the barcode (if applicable) and map to sample id
        num_barcode_errors, corrected_barcode, correction_attempted, sample_id = \
         correct_barcode(barcode,barcode_to_sample_id,barcode_correction_fn)
        # skip samples with too many errors
        if (num_barcode_errors > max_barcode_errors):
            count_barcode_errors_exceed_max += 1
            continue

        # skip unassignable samples unless otherwise requested
        if sample_id == None:
            if not store_unassigned:
                count_barcode_not_in_map += 1
                continue
            else:
                sample_id = 'Unassigned'

        quality_filter_result, sequence, quality =\
          quality_filter_sequence(header,
                                  sequence,
                                  quality,
                                  max_bad_run_length,
                                  last_bad_quality_char,
                                  min_per_read_length,
                                  seq_max_N,
                                  filter_bad_illumina_qual_digit)

        # process quality result
        if quality_filter_result != 0:
            # if the quality filter didn't pass record why and
            # move on to the next record
            if quality_filter_result == 1:
                count_too_short += 1
            elif quality_filter_result == 2:
                count_too_many_N += 1
            elif quality_filter_result == 3:
                count_bad_illumina_qual_digit += 1
            else:
                raise ValueError(
                    "Unknown quality filter result: %d" % quality_filter_result)
            continue

        sequence_lengths.append(len(sequence))

        try:
            seqs_per_sample_counts[sample_id] += 1
        except KeyError:
            seqs_per_sample_counts[sample_id] = 1

        if rev_comp:
            sequence = DNA.rc(sequence)
            quality = quality[::-1]

        fasta_header = '%s_%s %s orig_bc=%s new_bc=%s bc_diffs=%d' %\
          (sample_id,seq_id,header,barcode,corrected_barcode,num_barcode_errors)
        yield fasta_header, sequence, quality, seq_id
        seq_id += 1

    # Add sample IDs with zero counts to dictionary for logging
    for curr_sample_id in barcode_to_sample_id.values():
        if curr_sample_id not in seqs_per_sample_counts.keys():
            seqs_per_sample_counts[curr_sample_id] = 0

    if log_f != None:
        log_str = format_split_libraries_fastq_log(
            count_barcode_not_in_map, count_too_short, count_too_many_N,
            count_bad_illumina_qual_digit, count_barcode_errors_exceed_max,
            input_sequence_count, sequence_lengths, seqs_per_sample_counts)
        log_f.write(log_str)

    if len(sequence_lengths) and histogram_f != None:
        counts, bin_edges = make_histograms(sequence_lengths)
        histogram_str = format_histogram_one_count(counts, bin_edges)
        histogram_f.write(histogram_str)
        histogram_f.write('\n--\n\n')
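
The function above is a generator, so a caller is expected to drain it and write the demultiplexed reads itself; a minimal sketch, with hypothetical file names and barcode map:

barcode_to_sample_id = {'ACAGTGCATGCT': 'Sample1',    # hypothetical 12-base barcodes
                        'TGACCAGTTTGA': 'Sample2'}

read_f = open('reads.fastq', 'U')
barcode_f = open('barcodes.fastq', 'U')
out_f = open('seqs.fna', 'w')

for fasta_header, sequence, quality, seq_id in process_fastq_single_end_read_file(
        read_f, barcode_f, barcode_to_sample_id,
        rev_comp_barcode=True,          # depends on how the barcodes were sequenced
        max_barcode_errors=1.5):
    out_f.write('>%s\n%s\n' % (fasta_header, sequence))

out_f.close()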
Example #8
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads
    
    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parsed sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be 
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads 
        according to the forward primers in the mapping file. If primer is 
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current 
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
         process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "bbbbbbbbbbbb"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            MinimalFastqParser(fastq1, strict=False),
            MinimalFastqParser(fastq2, strict=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],\
                read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len,
                                            rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2,
                                     char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))
    if output_bc_not_oriented:
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
Example #9
def check_header_match_180_or_later(header1, header2):
    """Confirm that read1 and read2 headers match, ignoring the read number.

        These contain information on the read number, so can differ
    """
    header1 = header1.split(':')
    header2 = header2.split(':')
    for e1, e2 in zip(header1, header2):
        if e1.split(' ')[0] != e2.split(' ')[0]:
            return False

    return True


print "Printing labels that do not match before space character:"
mismatched_labels_found = False

for read1_data, read2_data in izip(MinimalFastqParser(read1, strict=False),
                                   MinimalFastqParser(read2, strict=False)):

    if not check_header_match_180_or_later(read1_data[header_index],
                                           read2_data[header_index]):
        print "Mismatched labels: %s, %s" % (read1_data[header_index],
                                             read2_data[header_index])
        mismatched_labels_found = True
    if not (len(read1_data[sequence_index]) == len(read1_data[quality_index])):
        print "Sequence and quality score lengths do not match for read 1 label %s " % read1_data[
            header_index]
    if not (len(read2_data[sequence_index]) == len(read2_data[quality_index])):
        print "Sequence and quality score lengths do not match for read 2 label %s " % read2_data[
            header_index]

if not mismatched_labels_found:
    # body assumed for this example; the original snippet is truncated here
    print "No mismatched labels found."
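
For reference, a pair of CASAVA 1.8+ style headers that would pass the check above, and one that would not (the header strings are illustrative):

h1 = 'M00176:17:000000000-A0CNA:1:1:15487:1773 1:N:0:0'   # read 1
h2 = 'M00176:17:000000000-A0CNA:1:1:15487:1773 2:N:0:0'   # read 2 of the same cluster
print check_header_match_180_or_later(h1, h2)   # True: only the part after the space differs

h3 = 'M00176:17:000000000-A0CNA:1:1:15487:1774 2:N:0:0'   # different y coordinate
print check_header_match_180_or_later(h1, h3)   # False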
Example #10
    def test_parse(self):
        """sequence and info objects should correctly match"""
        for label, seq, qual in MinimalFastqParser('data/fastq.txt'):
            self.assertTrue(label in data)
            self.assertEqual(seq, data[label]["seq"])
            self.assertEqual(qual, data[label]["qual"])
Example #11
def convert_fastaqual(fasta_file_path,
                      output_directory='.',
                      multiple_output_files=False,
                      ascii_increment=33,
                      full_fastq=False,
                      full_fasta_headers=False):
    '''Takes a FASTQ file, generates FASTA and QUAL file(s)
    
    fasta_file_path:  filepath of input FASTQ file.
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.'''

    fastq_file_path = fasta_file_path

    fasta_output = {}
    qual_output = {}
    fastq_file = open(fasta_file_path, 'U')

    # Need to open file the first time as "w", thereafter open as "a"
    sample_ids_written = {}

    for fastq_data in izip(MinimalFastqParser(fastq_file, strict=False)):
        sequence = fastq_data[0][1]
        qual = fastq_data[0][2]
        header = fastq_data[0][0]
        label = header.split()[0]
        sample_id = label.split('_')[0]

        if len(sequence) != len(qual):
            raise KeyError,("Number of quality scores "+\
            "(%d) does not match number of positions (%d) for label: %s" %\
             (len(qual), len(sequence), label))

        if not multiple_output_files:
            output_fasta = path.join(output_directory, \
                path.splitext(path.split(fastq_file_path)[1])[0] + '.fna')
            output_qual = path.join(output_directory, \
                path.splitext(path.split(fastq_file_path)[1])[0] + '.qual')

            if output_fasta in sample_ids_written.keys():
                sample_ids_written[output_fasta] = True
            else:
                sample_ids_written[output_fasta] = False
            try:
                # Create new file if first time writing, else append
                if sample_ids_written[output_fasta]:
                    fasta_o = open(output_fasta, 'a')
                    qual_o = open(output_qual, 'a')
                else:
                    fasta_o = open(output_fasta, 'w')
                    qual_o = open(output_qual, 'w')
            except IOError:
                raise IOError,("Could not open output FASTA or QUAL files, "+\
                 "please check file permissions.")

            fasta_output[sample_id] = output_fasta
            qual_output[sample_id] = output_qual

        if multiple_output_files:
            if sample_id not in fasta_output:
                output_fasta = path.join(output_directory, \
                    path.splitext(path.split(fastq_file_path)[1])[0] + \
                     '_' + sample_id + '.fna')

                if output_fasta in sample_ids_written.keys():
                    sample_ids_written[output_fasta] = True
                else:
                    sample_ids_written[output_fasta] = False

                try:
                    if sample_ids_written[output_fasta]:
                        fasta_output[sample_id] = open(output_fasta, 'a')
                    else:
                        fasta_output[sample_id] = open(output_fasta, 'w')
                except IOError:
                    raise IOError,("Could not open output FASTA file: %s" %\
                     output_fasta + '\n')
                fasta_output[sample_id] = output_fasta

            if sample_id not in qual_output:
                output_qual = path.join(output_directory, \
                 path.splitext(path.split(fastq_file_path)[1])[0] +'_'+ \
                 sample_id +'.qual')
                try:
                    if sample_ids_written[output_fasta]:
                        qual_output[sample_id] = open(output_qual, 'a')
                    else:
                        qual_output[sample_id] = open(output_qual, 'w')

                    #qual_output[sample_id] = open(output_qual,'a')
                except IOError:
                    fastq_file.close()
                    raise IOError,("Could not open QUAL file for writing: %s" %\
                     output_qual + '\n')
                qual_output[sample_id] = output_qual

        if full_fasta_headers: label = header

        fasta_o = open(fasta_output[sample_id], 'a')
        qual_o = open(qual_output[sample_id], 'a')

        #write Fasta file
        fasta_o.write('>' + label + '\n')
        fasta_o.write(sequence + '\n')

        #convert quality scores
        qual_chars = list(qual)
        qual_scores = []
        for qual_char in qual_chars:
            if (ord(qual_char) - ascii_increment) < 0:
                raise ValueError("Output qual scores are negative values. "
                                 "Use a different ascii_increment value than %s" %
                                 str(ascii_increment))
            else:
                qual_scores.append(ord(qual_char) - ascii_increment)

        #write QUAL file
        score_numbers = []
        for i, qual_score in enumerate(qual_scores):
            score_numbers.append(i)
        qual_o.write('>' + label + '\n')
        for i, qual_score in enumerate(qual_scores):
            if i % 60 == 0 and i != 0:
                qual_o.write('\n')
            qual_o.write(str(qual_score))
            if (i + 1) % 60 != 0 and i != max(score_numbers):
                qual_o.write(' ')
        qual_o.write('\n')
        if multiple_output_files:
            fasta_o.close()
            qual_o.close()
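
The ascii_increment handling above is the standard offset decoding of fastq quality characters; a quick illustration of what the conversion loop computes (the values follow directly from ord()):

ascii_increment = 33              # Sanger / Illumina 1.8+ offset; use 64 for Illumina 1.3-1.7
print ord('I') - ascii_increment  # 73 - 33 = 40, a high-quality base
print ord('#') - ascii_increment  # 35 - 33 = 2, a low-quality base
print ord('!') - ascii_increment  # 33 - 33 = 0, the lowest Sanger score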