def assign_seqs(file_data,
                ids_bcs_added_field,
                bc_lens,
                all_bcs,
                keep_barcode=False,
                barcode_type="golay_12",
                max_bc_errors=1.5,
                start_index=1,
                write_unassigned_reads=False,
                disable_bc_correction=False,
                added_demultiplex_field=None):
    """ Demultiplexes, writes seqs/qual files, returns log data

    file_data:  dict of open file objects.  Keys read here are
     'fasta_files' and 'qual_files' (inputs), 'demultiplexed_seqs_f' and
     'demultiplexed_qual_f' (assigned reads), and 'unassigned_seqs_f' and
     'unassigned_qual_f' (reads that could not be assigned).  The qual
     outputs are only touched when 'qual_files' is non-empty.
    ids_bcs_added_field: dict of (barcode,added_demultiplex): SampleID
    bc_lens:  Lengths of all barcodes from largest to smallest.
    all_bcs:  List of all barcode sequences.
    keep_barcode:  If True, will not remove barcode from output files.
    barcode_type:  Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    max_bc_errors:  Number of changes allowed for error correcting barcodes,
     for generic barcodes, specifies the number of mismatches allowed.
    start_index:  Specifies the first number used to enumerate output sequences.
    write_unassigned_reads:  If True, will write sequences that could not be
     demultiplexed into a separate output file.
    disable_bc_correction:  Only tests for exact matches to barcodes.
    added_demultiplex_field:  Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.

    Returns a 4-tuple (log_data, bc_freqs, seq_counts, corrected_bc_count):
    log_data:  result of initialize_log_data, with the entry for each
     record's log id incremented once per record that has a log id.
    bc_freqs:  defaultdict(int) counting how often each raw barcode (as
     returned by get_demultiplex_data) was seen.
    seq_counts:  total number of records read.
    corrected_bc_count:  two-item list; index 0 counts records reported as
     'corrected' by get_output_ids, index 1 those reported 'not_corrected'.
    """
    log_data = initialize_log_data(ids_bcs_added_field)
    bc_freqs = defaultdict(int)

    seq_counts = 0
    enum_val = start_index
    corrected_bc_count = [0, 0]

    # Decided once up front: qual output is written only when qual input
    # files were supplied.
    has_qual = bool(file_data['qual_files'])

    for fasta_label, fasta_seq, qual_seq in _iter_seq_records(file_data,
                                                               has_qual):
        seq_counts += 1

        bc, corrected_bc, num_errors, added_field = get_demultiplex_data(
            ids_bcs_added_field, fasta_label, fasta_seq, bc_lens, all_bcs,
            barcode_type, max_bc_errors, disable_bc_correction,
            added_demultiplex_field)

        bc_freqs[bc] += 1

        sample_id, log_id, bc_corrected_result = get_output_ids(
            ids_bcs_added_field, corrected_bc, num_errors, added_field,
            max_bc_errors, enum_val)

        if bc_corrected_result == 'corrected':
            corrected_bc_count[0] += 1
        elif bc_corrected_result == 'not_corrected':
            corrected_bc_count[1] += 1

        label_line = get_label_line(sample_id, fasta_label, bc,
                                    corrected_bc, num_errors)

        is_unassigned = sample_id.startswith("Unassigned")
        if is_unassigned and write_unassigned_reads:
            # Unassigned reads always pass True in the argument position
            # that receives keep_barcode for assigned reads.
            write_fasta_line(file_data['unassigned_seqs_f'],
                             fasta_seq, label_line, True, len(bc))
            if has_qual:
                write_qual_line(file_data['unassigned_qual_f'],
                                list(qual_seq), label_line, True, len(bc))
        elif not is_unassigned:
            write_fasta_line(file_data['demultiplexed_seqs_f'],
                             fasta_seq, label_line, keep_barcode, len(bc))
            if has_qual:
                write_qual_line(file_data['demultiplexed_qual_f'],
                                list(qual_seq), label_line, keep_barcode,
                                len(bc))

        if log_id:
            log_data[log_id] += 1

        # The enumeration advances for every record read, assigned or not.
        enum_val += 1

    return log_data, bc_freqs, seq_counts, corrected_bc_count


def _iter_seq_records(file_data, has_qual):
    """ Yields (fasta_label, fasta_seq, qual_seq) for every input record

    With has_qual True, fasta and qual files are paired positionally and
    their records are read in lockstep; otherwise only the fasta files are
    read and qual_seq is None.
    """
    if has_qual:
        for curr_fasta, curr_qual in zip(file_data['fasta_files'],
                                         file_data['qual_files']):
            paired_records = izip(MinimalFastaParser(curr_fasta),
                                  MinimalQualParser(curr_qual,
                                                    full_header=True))
            for (fasta_label, fasta_seq), (qual_label, qual_seq) in \
                    paired_records:
                yield fasta_label, fasta_seq, qual_seq
    else:
        for curr_fasta in file_data['fasta_files']:
            for fasta_label, fasta_seq in MinimalFastaParser(curr_fasta):
                yield fasta_label, fasta_seq, None
def convert_fastq(fasta_file_path, qual_file_path, output_directory='.',
        multiple_output_files=False, ascii_increment=33,
        full_fastq=False, full_fasta_headers=False,
        per_file_buffer_size=100000):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)
    
    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.  The SampleID is the
     part of the fasta label before the first underscore.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.
    per_file_buffer_size:  Only used with multiple_output_files.  Records for
     each output file are held in memory and written out once at least this
     many characters are pending for that file, so that only one output
     handle is open at a time.

    Each per-sample output file is created (truncated) the first time it is
    written during a call and appended to afterwards, so existing files from
    an earlier run are replaced, as in single-file mode.

    Raises KeyError if a QUAL header does not match the FASTA label or the
    sequence and quality lengths differ, and ValueError if a shifted quality
    score falls outside the ASCII range 32-126.  Input and output files are
    closed even when an error is raised.'''

    def build_record(fasta_data, qual_data):
        # Validate one FASTA/QUAL pair; return (sample_id, FASTQ text).
        fasta_header, sequence = fasta_data[0], fasta_data[1]
        qual_header, qual = qual_data[0], qual_data[1]

        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]

        # check whether the entries are actually (at least nominally) synch'd
        if qual_header != label:
            raise KeyError(("QUAL header (%s) does not match "
                            "FASTA header (%s)") % (qual_header, label))

        if len(sequence) != len(qual):
            raise KeyError(("Sequence length does not match QUAL length for "
                            "label (%s)") % label)

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        # shift each score by ascii_increment (default 33) and emit the
        # corresponding character, which represents that position's quality
        encoded_quals = []
        for qual_score in qual:
            ascii_code = qual_score + ascii_increment
            if ascii_code < 32 or ascii_code > 126:
                raise ValueError("Cannot convert quality score to ASCII code"
                                 " between 32 and 126: " + str(qual_score) +
                                 " using ascii_increment = " +
                                 str(ascii_increment))
            encoded_quals.append(chr(ascii_code))

        record = '@%s\n%s\n+%s\n%s\n' % (fastq_sequence_header,
                                         sequence,
                                         fastq_quality_header,
                                         ''.join(encoded_quals))
        return sample_id, record

    # State for multiple-output mode: pending text chunks per output path,
    # their combined length, and the paths already created during this call.
    pending = {}
    pending_size = {}
    started_paths = set()

    def flush(out_path):
        # Write and clear whatever is buffered for out_path.
        chunks = pending.get(out_path)
        if not chunks:
            return
        mode = 'a' if out_path in started_paths else 'w'
        out_file = open(out_path, mode)
        try:
            out_file.write(''.join(chunks))
        finally:
            out_file.close()
        started_paths.add(out_path)
        pending[out_path] = []
        pending_size[out_path] = 0

    fastq_file = None

    with open(fasta_file_path, 'U') as fasta_file:
        with open(qual_file_path, 'U') as qual_file:
            try:
                # if we're not using multiple output files, we can open the
                # one (and only) output file right now
                if not multiple_output_files:
                    output_file_path = get_filename_with_new_ext(
                        fasta_file_path, '.fastq', output_directory)
                    fastq_file = open(output_file_path, 'w')

                # iterate through the FASTA and QUAL files entry by entry
                # (assume the entries are synchronized)
                for fasta_data, qual_data in izip(
                        MinimalFastaParser(fasta_file),
                        MinimalQualParser(qual_file)):
                    sample_id, record = build_record(fasta_data, qual_data)

                    if not multiple_output_files:
                        fastq_file.write(record)
                        continue

                    output_file_path = get_filename_with_new_ext(
                        fasta_file_path,
                        '_' + sample_id + '.fastq',
                        output_directory)
                    pending.setdefault(output_file_path, []).append(record)
                    pending_size[output_file_path] = \
                        pending_size.get(output_file_path, 0) + len(record)

                    if pending_size[output_file_path] >= per_file_buffer_size:
                        flush(output_file_path)

                # write the last buffered seqs to their output files
                for output_file_path in list(pending):
                    flush(output_file_path)
            finally:
                if fastq_file is not None:
                    fastq_file.close()
# Example #3
# 0
def convert_fastq(fasta_file_path,
                  qual_file_path,
                  output_directory='.',
                  multiple_output_files=False,
                  ascii_increment=33,
                  full_fastq=False,
                  full_fasta_headers=False):
    '''Takes a FASTA and QUAL file, generates FASTQ file(s)
    
    fasta_file_path:  filepath of input FASTA file.
    qual_file_path:  filepath of input QUAL file (needed for making FASTQ files)
    output_directory:  Directory to output converted files.
    multiple_output_files:  Make one file per SampleID.  The SampleID is the
     part of the fasta label before the first underscore.
    ascii_increment:  Conversion value for fastq ascii character to numeric
     quality score.
    full_fastq:  Write labels to both sequence and quality score lines.
    full_fasta_headers:  Retain all data on fasta label, instead of breaking at
     first whitespace.

    Output files are named after the FASTA file's base name, with
    "_<SampleID>" added before the ".fastq" extension in multiple-output
    mode.  A file is created (truncated) the first time it is written during
    a call; no output file is created when the inputs contain no records.

    Raises KeyError if a QUAL header does not match the FASTA label or the
    number of quality scores differs from the sequence length, ValueError if
    a shifted quality score falls outside the ASCII range 32-126, and IOError
    if an output file cannot be opened.  All files opened here are closed
    before returning, including when an error is raised.'''

    base_name = path.splitext(path.split(fasta_file_path)[1])[0]

    def open_output(output_file_path, mode):
        # Open an output file, reporting the offending path on failure.
        try:
            return open(output_file_path, mode)
        except IOError:
            raise IOError("Could not open FASTQ file for writing: "
                          + output_file_path + '\n')

    def build_record(fasta_data, qual_data):
        # Validate one FASTA/QUAL pair; return (sample_id, FASTQ text).
        fasta_header, sequence = fasta_data[0], fasta_data[1]
        qual_header, qual = qual_data[0], qual_data[1]

        label = fasta_header.split()[0]
        sample_id = label.split('_')[0]

        if qual_header != label:
            raise KeyError("Fasta(%s) and qual(%s) headers don't match" %
                           (label, qual_header))

        if len(qual) != len(sequence):
            raise KeyError("Number of quality scores "
                           "(%d) does not match number of positions (%d) "
                           "for label: %s" %
                           (len(qual), len(sequence), label))

        if full_fasta_headers:
            fastq_sequence_header = fasta_header
        else:
            fastq_sequence_header = label

        if full_fastq:
            fastq_quality_header = fastq_sequence_header
        else:
            fastq_quality_header = ''

        # shift each score by ascii_increment (default 33) and emit the
        # corresponding character, which represents that position's quality
        encoded_quals = []
        for qual_score in qual:
            ascii_code = qual_score + ascii_increment
            if ascii_code < 32 or ascii_code > 126:
                raise ValueError("Cannot convert quality score to ASCII code"
                                 " between 32 and 126: " + str(qual_score) +
                                 " using ascii_increment = " +
                                 str(ascii_increment))
            encoded_quals.append(chr(ascii_code))

        record = ('@' + fastq_sequence_header + '\n' +
                  sequence + '\n' +
                  '+' + fastq_quality_header + '\n' +
                  ''.join(encoded_quals) + '\n')
        return sample_id, record

    # sample_id -> output path, for samples already written during this call
    output_paths = {}
    # the one output handle used when not splitting by sample; opened lazily
    single_output = None

    with open(fasta_file_path, 'U') as fasta_file:
        with open(qual_file_path, 'U') as qual_file:
            try:
                for fasta_data, qual_data in izip(
                        MinimalFastaParser(fasta_file),
                        MinimalQualParser(qual_file)):
                    sample_id, record = build_record(fasta_data, qual_data)

                    if not multiple_output_files:
                        if single_output is None:
                            single_output = open_output(
                                path.join(output_directory,
                                          base_name + '.fastq'),
                                'w')
                        single_output.write(record)
                        continue

                    # Need to open file the first time as "w", thereafter
                    # open as "a".  The handle is closed after every record
                    # so the number of open files does not grow with the
                    # number of samples.
                    output_file_path = output_paths.get(sample_id)
                    if output_file_path is None:
                        output_file_path = path.join(
                            output_directory,
                            base_name + '_' + sample_id + '.fastq')
                        output_paths[sample_id] = output_file_path
                        mode = 'w'
                    else:
                        mode = 'a'

                    fastq_file = open_output(output_file_path, mode)
                    try:
                        fastq_file.write(record)
                    finally:
                        fastq_file.close()
            finally:
                if single_output is not None:
                    single_output.close()