Example #1
    def test_process_id_map(self):
        """process_id_map should return correct results on small test map"""
        s = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tDescription
#fake data
x\tAA\tACGT\t3\tsample_x
y\t"AC"\tACGT\t4\t"sample_y"
z\tGG\tACGT\t5\tsample_z"""
        f = StringIO(s)
        f.name = 'test.xls'
        headers, id_map, description_map, run_description, errors, warnings = \
            process_id_map(f)

        self.assertEqual(headers, ['BarcodeSequence', 'LinkerPrimerSequence',
                                   'X'])
        self.assertEqual(id_map, {
            'x': {'X': '3', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AA'},
            'y': {'X': '4', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AC'},
            'z': {'X': '5', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'GG'},
        })
        self.assertEqual(description_map, {
            'x': 'sample_x',
            'y': 'sample_y',
            'z': 'sample_z',
        })
        self.assertEqual(run_description, ['fake data'])
        self.assertEqual(errors, [])
        self.assertEqual(warnings, [])
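Note: this test exercises an older process_id_map signature that returns six values, splitting the Description column out into description_map; the functions in the later examples unpack only five values. A minimal sketch of the two call patterns (assuming the respective QIIME versions and a hypothetical map.txt):

# Older signature (as tested above): six return values.
headers, id_map, description_map, run_description, errors, warnings = \
    process_id_map(open('map.txt', 'U'))

# Later signature (as in the examples below): five return values, where
# mapping_data is a list of row lists with the SampleID in column 0.
headers, mapping_data, run_description, errors, warnings = \
    process_id_map(open('map.txt', 'U'))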
Example #3
def add_qiime_labels(mapping_f,
                     fasta_dir,
                     filename_column,
                     output_dir=".",
                     count_start=0):
    """ Main function for combining fasta files, writing valid QIIME labels
    
    mapping_f:  open file object of the metadata mapping file
    fasta_dir:  Directory of fasta files to combine into a single file
    filename_column:  Column of metadata mapping file containing fasta filenames
    output_dir:  Directory to write output combined file to
    count_start:  Number to start enumeration of fasta labels with
    
    """
    
    headers, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_f, has_barcodes=False,
                       disable_primer_check=True,
                       added_demultiplex_field=None,
                       variable_len_barcodes=False)
    
    fasta_name_to_sample_id = check_mapping_data(mapping_data, headers,
                                                 filename_column)

    fasta_files = get_fasta_fps(fasta_dir, fasta_name_to_sample_id.keys())

    write_combined_fasta(fasta_name_to_sample_id, fasta_files, output_dir,
                         counter=count_start)
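A minimal usage sketch (the paths and the filename column name are hypothetical; the mapping file must contain a column listing each sample's fasta filename):

mapping_f = open('mapping.txt', 'U')  # hypothetical mapping file
add_qiime_labels(mapping_f,
                 fasta_dir='input_fastas/',        # hypothetical directory
                 filename_column='InputFileName',  # hypothetical column name
                 output_dir='combined_out/',
                 count_start=0)
mapping_f.close()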
Example #4
def get_mapping_details(mapping_fp):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file
    
    mapping_fp: filepath to mapping file
    """
    
    mapping_f = open(mapping_fp, "U")
    
    # Only using the id_map and the errors from parsing the mapping file.
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_f)
        
    mapping_f.close()
    
    # Errors indicate problems with SampleIDs or headers
    if errors:
        raise ValueError('Error in mapping file, please validate '
                         'mapping file with check_id_map.py')

    # create dict of dicts with SampleID:{each header:mapping data}
    
    id_map = {}
    
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}

    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]
         
    sample_ids = id_map.keys()
    
    barcode_seqs = []
    raw_linkerprimer_seqs = []
    
    for curr_id in id_map:
        barcode_seqs.append(id_map[curr_id]['BarcodeSequence'])
        raw_linkerprimer_seqs.append(id_map[curr_id]['LinkerPrimerSequence'])
    
    # remove duplicates    
    raw_linkerprimer_seqs = set(raw_linkerprimer_seqs)
    
    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)
    
    return set(sample_ids), set(barcode_seqs), set(linker_primer_seqs)
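expand_degeneracies itself is not shown in these examples. A plausible sketch of what it does, expanding IUPAC degenerate bases into every concrete sequence with itertools.product (illustrative only, not necessarily QIIME's implementation):

from itertools import product

# Standard IUPAC degenerate-base table.
IUPAC_BASES = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
               'R': 'AG', 'Y': 'CT', 'S': 'CG', 'W': 'AT',
               'K': 'GT', 'M': 'AC', 'B': 'CGT', 'D': 'AGT',
               'H': 'ACT', 'V': 'ACG', 'N': 'ACGT'}

def expand_degeneracies_sketch(raw_primers):
    """Expand each degenerate primer into all concrete DNA sequences."""
    expanded = []
    for primer in raw_primers:
        for combo in product(*[IUPAC_BASES[base] for base in primer]):
            expanded.append(''.join(combo))
    return expanded

# expand_degeneracies_sketch(['ACR']) -> ['ACA', 'ACG']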
Example #5
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer
    mapping_fp:  mapping filepath
    """
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
                       disable_primer_check=True)

    if errors:
        for curr_err in errors:
            if curr_err.startswith("Duplicate SampleID"):
                raise ValueError('Errors were found with mapping file, ' +
                                 'please run validate_mapping_file.py to ' +
                                 'identify problems.')

    # create dict of dicts with SampleID:{each header:mapping data}

    id_map = {}

    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}

    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]

    reverse_primers = {}

    for curr_id in id_map.keys():
        try:
            reverse_primers[curr_id] =\
                [str(DNA(curr_rev_primer).rc()) for curr_rev_primer in
                 id_map[curr_id]['ReversePrimer'].split(',')]
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, " +
                           "please include a 'ReversePrimer' column.")

    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError(
                "Problems found with reverse primers, please " +
                "check mapping file with validate_mapping_file.py")

    return reverse_primers
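Given a mapping file whose ReversePrimer column holds comma-separated primers, the returned dictionary maps each SampleID to the reverse complements of those primers. A hypothetical example:

# A mapping row for 'sample1' with ReversePrimer 'ATTCAG,ATTCAC' yields the
# reverse complement of each primer:
reverse_primers = get_rev_primer_seqs('mapping.txt')  # hypothetical file
# {'sample1': ['CTGAAT', 'GTGAAT']}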
Example #6
def get_rev_primer_seqs(mapping_fp):
    """ Parses mapping file to get dictionary of SampleID:Rev primer
    mapping_fp:  mapping filepath
    """
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_fp, has_barcodes=False,
                       disable_primer_check=True)

    if errors:
        for curr_err in errors:
            if curr_err.startswith("Duplicate SampleID"):
                raise ValueError('Errors were found with mapping file, '
                                 'please run check_id_map.py to identify '
                                 'problems.')

    # create dict of dicts with SampleID:{each header:mapping data}
    
    id_map = {}
    
    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}

    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]
    
    reverse_primers = {}
    
    for curr_id in id_map.keys():
        try:
            reverse_primers[curr_id] = \
                [DNA.rc(curr_rev_primer) for curr_rev_primer in
                 id_map[curr_id]['ReversePrimer'].split(',')]
        except KeyError:
            raise KeyError("Reverse primer not found in mapping file, "
                           "please include a 'ReversePrimer' column.")

    # Check for valid reverse primers
    # Will have been detected as warnings from mapping file
    for curr_err in errors:
        if curr_err.startswith("Invalid DNA sequence detected"):
            raise ValueError("Problems found with reverse primers, please "
                             "check mapping file with check_id_map.py")
    
    return reverse_primers
Example #8
def check_map(mapping_file,
              barcode_type="golay_12",
              added_demultiplex_field=None):
    """ Gets header, mapping data, halts execution if there are errors
    
    mapping_file:  list of lines of metadata mapping file
    barcode_type:  Specified barcode, can be golay_12, hamming_8,
     variable_length, or an integer specifying length.
    added_demultiplex_field:  Uses data supplied in metadata mapping field
     and demultiplexes according to data in fasta labels.
    """
    
    if barcode_type == 0:
        has_barcodes = False
        var_len_barcodes = False
    elif barcode_type == 'variable_length':
        has_barcodes = True
        var_len_barcodes = True
    else:
        has_barcodes = True
        var_len_barcodes = False
    
    header, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_file, has_barcodes=has_barcodes,
                       disable_primer_check=True,
                       added_demultiplex_field=added_demultiplex_field,
                       variable_len_barcodes=var_len_barcodes)

    # Need to specifically detect varied length barcodes, otherwise won't know
    # how much of sequence to slice off for barcode reads
    for warning in warnings:
        if "differs than length" in warning:
            raise ValueError("Detected variable length barcodes, if these "
                             "are being used, use -b variable_length")
    # Halt on errors, as these are serious problems with mapping file.
    # These include non-DNA characters in the barcodes, duplicate
    # barcodes or duplicate barcodes/added demultiplex fields, duplicate
    # SampleIDs, or header problems.
    if errors:
        raise ValueError("Errors found in mapping file, please check "
                         "mapping file with check_id_map.py")
    
    return header, mapping_data
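A usage sketch, assuming mapping_file is the list of lines of a metadata mapping file as the docstring requires:

map_lines = open('mapping.txt', 'U').readlines()  # hypothetical file

# Default fixed-length golay_12 barcodes:
header, mapping_data = check_map(map_lines)

# Variable-length barcodes must be declared explicitly, otherwise the
# warning check above raises ValueError:
header, mapping_data = check_map(map_lines, barcode_type='variable_length')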
Example #10
def check_map(infile, has_barcodes=True, disable_primer_check=False):
    """Check mapping file and extract list of valid barcodes, primers """
    hds, id_map, dsp, run_description, errors, warnings = \
        process_id_map(infile, is_barcoded=has_barcodes,
                       disable_primer_check=disable_primer_check)
    barcode_to_sample_id = {}
    
    primer_seqs_lens = {}
    all_primers = {}

    for sample_id, sample in id_map.items():
        barcode_to_sample_id[sample['BarcodeSequence'].upper()] = sample_id
        if not disable_primer_check:
            raw_primer = sample['LinkerPrimerSequence'].upper()
            expanded_primers = expand_degeneracies(raw_primer)
            curr_bc_primers = {}
            for primer in expanded_primers:
                curr_bc_primers[primer] = len(primer)
                all_primers[primer] = len(primer)
            primer_seqs_lens[sample['BarcodeSequence']] = curr_bc_primers

    return hds, id_map, barcode_to_sample_id, warnings, errors, \
        primer_seqs_lens, all_primers
Example #11
    forward_primers = []
    reverse_primers = []
    for curr_primer in raw_forward_primers:
        forward_primers.append(
            compile(''.join([iupac[symbol] for symbol in curr_primer])))
    for curr_primer in raw_reverse_primers:
        reverse_primers.append(
            compile(''.join([iupac[symbol] for symbol in curr_primer])))

    return forward_primers, reverse_primers


map_fp = open(argv[1], "U")

header, mapping_data, run_description, errors, warnings = process_id_map(
    map_fp)
forward_primers, reverse_primers = get_primers(header, mapping_data)

seqs = open(argv[2], "U")

out_seqs = open(argv[3], "w")
log_out = open(argv[4], "w")

f_count = 0
r_count = 0
no_seq_left = 0

for label, seq in MinimalFastaParser(seqs):
    start_slice = 0
    end_slice = -1
    for curr_primer in forward_primers:
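The get_primers fragment above is truncated and does not show the iupac lookup it relies on; presumably it maps each IUPAC code to a regex character class. A self-contained sketch of that idea (the table and the primer value are assumptions):

from re import compile

# Assumed IUPAC-to-regex table: each degenerate base becomes a character class.
iupac = {'A': 'A', 'C': 'C', 'G': 'G', 'T': 'T',
         'R': '[AG]', 'Y': '[CT]', 'S': '[CG]', 'W': '[AT]',
         'K': '[GT]', 'M': '[AC]', 'B': '[CGT]', 'D': '[AGT]',
         'H': '[ACT]', 'V': '[ACG]', 'N': '[ACGT]'}

curr_primer = 'GTGCCAGCMGCCGCGGTAA'  # e.g. the 515F primer
pattern = compile(''.join([iupac[symbol] for symbol in curr_primer]))
match = pattern.search('TTGTGCCAGCAGCCGCGGTAATT')
# match.start() == 2; the degenerate M matched the concrete base A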
Example #12
def get_mapping_details(mapping_fp,
                        suppress_barcode_checks=False,
                        suppress_primer_checks=False):
    """ Returns SampleIDs, Barcodes, Primer seqs from mapping file

    mapping_fp: filepath to mapping file
    suppress_barcode_checks: If True, will skip getting barcodes from mapping
     file and searching for these in sequences.
    suppress_primer_checks: If True, will skip getting primers from mapping
     file and searching for these in sequences.
    """

    mapping_f = open(mapping_fp, "U")

    # Only using the id_map and the errors from parsing the mapping file.
    hds, mapping_data, run_description, errors, warnings = \
        process_id_map(mapping_f)

    mapping_f.close()

    # Should raise errors for barcodes or primers unless suppressed, and
    # should raise errors for headers or duplicate SampleIDs in any case.
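    # Error strings end with a 'row,column' location; in the mapping file
    # BarcodeSequence is column 1 and LinkerPrimerSequence is column 2.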
    loc_bcs = ",1"
    loc_primers = ",2"
    if errors:
        for curr_error in errors:
            # Halt when header has error
            if curr_error.startswith("Found header field"):
                raise ValueError('Error in mapping file, please validate '
                                 'mapping file with validate_mapping_file.py')
            elif curr_error.endswith(loc_bcs):
                # Halt for barcode errors unless suppressed
                if suppress_barcode_checks:
                    continue
                else:
                    raise ValueError(
                        'Error in mapping file, please validate '
                        'mapping file with validate_mapping_file.py')
            elif curr_error.endswith(loc_primers):
                # Halt for primer errors unless suppressed
                if suppress_primer_checks:
                    continue
                else:
                    raise ValueError(
                        'Error in mapping file, please validate '
                        'mapping file with validate_mapping_file.py')
            # Raise error on duplicate sample IDs
            elif curr_error.startswith("Duplicate SampleID"):
                raise ValueError('Error in mapping file, please validate '
                                 'mapping file with validate_mapping_file.py')

    # create dict of dicts with SampleID:{each header:mapping data}

    id_map = {}

    for curr_data in mapping_data:
        id_map[curr_data[0]] = {}

    for header in range(len(hds)):
        for curr_data in mapping_data:
            id_map[curr_data[0]][hds[header]] = curr_data[header]

    sample_ids = id_map.keys()

    barcode_seqs = []
    raw_linkerprimer_seqs = []

    for curr_id in id_map:
        if not suppress_barcode_checks:
            barcode_seqs.append(id_map[curr_id]['BarcodeSequence'])
        if not suppress_primer_checks:
            raw_linkerprimer_seqs.append(
                id_map[curr_id]['LinkerPrimerSequence'])

    # remove duplicates
    raw_linkerprimer_seqs = set(raw_linkerprimer_seqs)

    linker_primer_seqs = expand_degeneracies(raw_linkerprimer_seqs)

    return set(sample_ids), set(barcode_seqs), set(linker_primer_seqs)
Example #14
def extract_barcodes(fastq1,
                     fastq2=None,
                     output_dir=".",
                     input_type="barcode_single_end",
                     bc1_len=6,
                     bc2_len=6,
                     rev_comp_bc1=False,
                     rev_comp_bc2=False,
                     char_delineator=":",
                     switch_bc_order=False,
                     map_fp=None,
                     attempt_read_orientation=False,
                     disable_header_match=False):
    """ Main program function for extracting barcodes from reads

    fastq1: Open fastq file 1.
    fastq2: None or open fastq file 2.
    output_dir: Directory to write output parsed sequences to.
    input_type: Specifies the type of parsing to be done.
    bc1_len: Length of barcode 1 to be parsed from fastq1
    bc2_len: Length of barcode 2 to be parsed from fastq2, or from end of a
     stitched read.
    rev_comp_bc1: If True, reverse complement bc1 before writing.
    rev_comp_bc2: If True, reverse complement bc2 before writing.
    char_delineator: Specify character that immediately precedes the barcode
        for input_type of barcode_in_label.
    switch_bc_order: Normally, barcode 1 will be written first, followed by
        barcode 2 in a combined output fastq file. If True, the order will be
        reversed. Only applies to stitched reads processing, as other barcode
        orders are dictated by the parameter chosen for the fastq files.
    map_fp: open file object of mapping file, requires a LinkerPrimerSequence
        and ReversePrimer field to be present. Used for orienting reads.
    attempt_read_orientation: If True, will attempt to orient the reads
        according to the forward primers in the mapping file. If primer is
        detected in current orientation, leave the read as is, but if reverse
        complement is detected (or ReversePrimer is detected in the current
        orientation) the read will either be written to the forward (read 1) or
        reverse (read 2) reads for the case of paired files, or the read will be
        reverse complemented in the case of stitched reads.
    disable_header_match: if True, suppresses checks between fastq headers.
    """

    # Turn off extra file creation for single read.
    if input_type == "barcode_single_end" and attempt_read_orientation:
        attempt_read_orientation = False
    if attempt_read_orientation:
        header, mapping_data, run_description, errors, warnings =\
            process_id_map(map_fp)
        forward_primers, reverse_primers = get_primers(header, mapping_data)
        output_bc_not_oriented = open(
            join(output_dir, "barcodes_not_oriented.fastq.incomplete"), "w")
        fastq1_out_not_oriented = open(
            join(output_dir, "reads1_not_oriented.fastq.incomplete"), "w")
        fastq2_out_not_oriented = open(
            join(output_dir, "reads2_not_oriented.fastq.incomplete"), "w")
    else:
        forward_primers = None
        reverse_primers = None
        output_bc_not_oriented = None
        fastq1_out_not_oriented = None
        fastq2_out_not_oriented = None

    output_bc_fastq = open(join(output_dir, "barcodes.fastq.incomplete"), "w")
    if input_type in ["barcode_single_end", "barcode_paired_stitched"]:
        output_fastq1 = open(join(output_dir, "reads.fastq.incomplete"), "w")
        output_fastq2 = None
        final_fastq1_name = join(output_dir, "reads.fastq")
    elif input_type in ["barcode_paired_end"]:
        output_fastq1 = open(join(output_dir, "reads1.fastq.incomplete"), "w")
        output_fastq2 = open(join(output_dir, "reads2.fastq.incomplete"), "w")
        final_fastq1_name = join(output_dir, "reads1.fastq")
    else:
        output_fastq1 = None
        output_fastq2 = None

    if not fastq2:
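        # Fake read-2 records so the paired parse loop below still yields
        # pairs when only one fastq file is supplied.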
        fastq2 = cycle(["@", "AAAAAAAAAAAA", "+", "AAAAAAAAAAAA"])
        not_paired = True
    else:
        not_paired = False

    check_header_match_f = get_casava_version(fastq1)

    header_index = 0

    for read1_data, read2_data in izip(
            parse_fastq(fastq1, strict=False, enforce_qual_range=False),
            parse_fastq(fastq2, strict=False, enforce_qual_range=False)):
        if not disable_header_match:
            if not check_header_match_f(read1_data[header_index],
                                        read2_data[header_index]):
                raise FastqParseError(
                    "Headers of read1 and read2 do not match. Can't continue. "
                    "Confirm that the fastq sequences that you are "
                    "passing match one another. --disable_header_match can be "
                    "used to suppress header checks.")

        if input_type == "barcode_single_end":
            process_barcode_single_end_data(read1_data, output_bc_fastq,
                                            output_fastq1, bc1_len, rev_comp_bc1)

        elif input_type == "barcode_paired_end":
            process_barcode_paired_end_data(
                read1_data, read2_data, output_bc_fastq, output_fastq1,
                output_fastq2, bc1_len, bc2_len, rev_comp_bc1, rev_comp_bc2,
                attempt_read_orientation, forward_primers, reverse_primers,
                output_bc_not_oriented, fastq1_out_not_oriented,
                fastq2_out_not_oriented)

        elif input_type == "barcode_paired_stitched":
            process_barcode_paired_stitched(
                read1_data, output_bc_fastq, output_fastq1, bc1_len, bc2_len,
                rev_comp_bc1, rev_comp_bc2, attempt_read_orientation,
                forward_primers, reverse_primers, output_bc_not_oriented,
                fastq1_out_not_oriented, switch_bc_order)

        elif input_type == "barcode_in_label":
            if not_paired:
                curr_read2_data = False
            else:
                curr_read2_data = read2_data
            process_barcode_in_label(read1_data, curr_read2_data,
                                     output_bc_fastq, bc1_len, bc2_len,
                                     rev_comp_bc1, rev_comp_bc2, char_delineator)

    output_bc_fastq.close()
    rename(output_bc_fastq.name, join(output_dir, "barcodes.fastq"))
    if output_fastq1:
        output_fastq1.close()
        rename(output_fastq1.name, final_fastq1_name)
    if output_fastq2:
        output_fastq2.close()
        rename(output_fastq2.name, join(output_dir, "reads2.fastq"))
    if output_bc_not_oriented:
        rename(output_bc_not_oriented.name,
               join(output_dir, "barcodes_not_oriented.fastq"))
    if fastq1_out_not_oriented:
        rename(fastq1_out_not_oriented.name,
               join(output_dir, "reads1_not_oriented.fastq"))
    if fastq2_out_not_oriented:
        rename(fastq2_out_not_oriented.name,
               join(output_dir, "reads2_not_oriented.fastq"))
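A minimal usage sketch for paired-end reads (filenames and barcode lengths are hypothetical; output_dir must already exist):

fastq1 = open('lane1_R1.fastq', 'U')
fastq2 = open('lane1_R2.fastq', 'U')
extract_barcodes(fastq1,
                 fastq2=fastq2,
                 output_dir='parsed_barcodes/',
                 input_type='barcode_paired_end',
                 bc1_len=8,
                 bc2_len=8)
# Produces barcodes.fastq, reads1.fastq, and reads2.fastq in parsed_barcodes/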
Example #15
    def run_demultiplex_and_trim(self, opts, **kwargs):
        """
            The main part of the script that pulls the various
            manipulations together. It takes arguments from the command
            line as well as **kwargs (currently only specifying gzip or not).
        """

        import logging
        self.logger = logging.getLogger('demultip')

        sample_primer_dict = {}

        if not opts:
            sys.exit("command line options not getting to main method")

        metafile = opts.m

        # extract .gz to temp file location
        if 'gzipFilename' in kwargs:
            self.logger.info("Incoming kwargs detected...gzip file?")
            #sequence_file = kwargs.get('gzipFilename')
        else:
            self.logger.info("No kwargs, normal Fastq file")
            #sequence_file = opts.f
        self.logger.info("processing {0} total sequences".format(
            str((self.r1_tot + self.r2_tot) / 4)))
        self.logger.info(
            "using the first {0} bases of primer in search".format(
                self.search_length))

        # extract the relevant data from the metadata file; could be changed
        # to a non-QIIME 1 parser
        self.logger.info("Getting header and mapping data...")
        header, mapping_data, run_description, errors, warnings = process_id_map(
            metafile)
        self.logger.debug("metadata headers {0}".format(header))
        self.logger.debug("csv mapping data from {0}...\n{1}".format(
            metafile, "\n".join([str(x) for x in mapping_data])))

        # get the primer regex search patterns
        self.logger.info("Generating regex search patterns...")
        forward_primers, forward_primers_rc, reverse_primers, reverse_primers_rc = self.create_primer_regex_patterns(
            header, mapping_data)
        self.primer_pattern_dict_list = {
            'fp': forward_primers,
            'fprc': forward_primers_rc,
            'rp': reverse_primers,
            'rprc': reverse_primers_rc
        }

        self.logger.debug("forward_primer patterns\n{0}\n".format("\n".join([
            str(x.pattern) for x in self.primer_pattern_dict_list.get('fp')
        ])))
        self.logger.debug("reverse_primers patterns\n{0}\n".format("\n".join([
            str(x.pattern) for x in self.primer_pattern_dict_list.get('rp')
        ])))

        # replace all extra characters in header with underscore
        intab = '.-+|=:;,&$'
        outtab = '__________'
        trantab = maketrans(intab, outtab)

        for samples in mapping_data:
            try:
                sample_primer_dict[samples[header.index('SampleID')].translate(
                    trantab)] = (samples[header.index('LinkerPrimerSequence')],
                                 samples[header.index('ReversePrimer')])
            except Exception as e:
                self.logger.error(
                    "Can not find {0} in header fields, please make sure metadata file has the required fields"
                    .format(e))

        self.logger.debug("sample_primer_dict...{0}".format(
            "\n".join(str(x) for x in sample_primer_dict.items())))
        self.logger.info("Starting demultiplex process...")

        bar = progressbar.ProgressBar(
            max_value=(self.r1_tot + self.r2_tot) / 4,
            redirect_stdout=True)

        for r1, r2 in itertools.izip(self.R1.itervalues(),
                                     self.R2.itervalues()):
            #self.logger.debug("r1 {0}".format(r1))
            #self.logger.debug("r2 {0}".format(r2))

            pair_seq_dict = {'r1': r1, 'r2': r2}
            self.logger.debug("new read pair\n")
            self.logger.debug("processing new read pair {0}".format(
                pair_seq_dict.keys()))

            self.logger.debug("processing seq ID - R1 {0}... R2 {1}".format(
                r1.id, r2.id))
            self.logger.debug("R1 sequence - {0}...".format(r1.seq[0:50]))
            self.logger.debug("R2 sequence - {0}...".format(r2.seq[0:50]))

            self.sample_id = ""
            # because we process two sequences at a time (R1 and R2)
            self.processed_seqs += 2

            self.f_primer_found = []
            self.r_primer_found = []

            self.logger.debug("Looking in pair read for patterns...")

            search_result = self.regex_search_through_sequence(
                pair_seq_dict, self.primer_pattern_dict_list)
            #self.logger.debug("pre read correction search_result - {0}".format(search_result))
            #search_result = self.correct_orientation_of_reads(search_result)
            #self.logger.debug("post read correction search_result - {0}".format(search_result))

            try:
                if type(search_result) == list and len(search_result) > 1:
                    self.logger.debug("search result - {0}".format(
                        search_result[0]))
                    self.logger.debug("search result - {0}".format(
                        search_result[1]))
            except IndexError as e:
                self.logger.debug("search result - {0}".format(search_result))
                self.logger.debug("error in list index {0}".format(e))

            read_pair_proceed = self.screen_read_pair_suitability(
                search_result)

            self.logger.debug(
                "proceed with read pair ? {0}".format(read_pair_proceed))

            if read_pair_proceed != 'failed':
                try:
                    sample_id = self.get_sample_id_from_primer_sequence(
                        sample_primer_dict, search_result[0].get('pattern'),
                        search_result[1].get('pattern'))
                    self.logger.debug(
                        "- R1 ID -> {0} & R2 ID -> {1} from sample {2}".format(
                            r1.id, r2.id, sample_id))
                except IndexError as e:
                    # sample is missing one or both the patterns keys
                    self.logger.debug(
                        "Sample seq is missing a pattern, {0}- discarding read"
                        .format(e))
                    output = self.record_buffer_and_writer(
                        {'discarded': pair_seq_dict})
                    self.unmapped_count += 2
                    continue
                try:
                    new_seq = self.clip_primers_from_seq(
                        search_result, self.primer_pattern_dict_list,
                        pair_seq_dict, sample_primer_dict, sample_id)
                    self.logger.debug(
                        "clipped read returned...{0} seqs".format(
                            len(new_seq)))
                    output = self.record_buffer_and_writer(new_seq)
                    self.both_primers_count += 2
                except Exception as e:
                    output = self.record_buffer_and_writer(
                        {'discarded': pair_seq_dict})
                    self.logger.debug(
                        "attempt to clip sequence failed - errmsg - {0} - discarding read {1}"
                        .format(e, output))
                    self.unmapped_count += 2
                    continue

                bar.update(self.processed_seqs)

                if output == "cleared":
                    self.record_buffer = {}
                    self.logger.debug("buffer check {0}".format(
                        self.record_buffer))

            elif read_pair_proceed == 'failed':
                self.unmapped_count += 2
                output = self.record_buffer_and_writer(
                    {'discarded': pair_seq_dict})
                bar.update(self.processed_seqs)

        self.logger.info("__________________________")
        self.logger.info("Samples successfully mapped (F+R found): {0}".format(
            self.both_primers_count))
        self.logger.info("Read pairs in alternate orientation - {0}".format(
            str(len(self.alternate_orientation))))
        self.logger.info("Sequences not mapped: {0}".format(
            self.unmapped_count))
        self.logger.info("Total sequences checked: {0}".format(
            self.processed_seqs))

        self.logger.info("writing alternate record IDs...")
        with open("alternate_orientation_records.txt", 'w') as f:
            for sequence_id in self.alternate_orientation:
                output_id = ''.join(sequence_id)
                f.write(output_id)

        self.logger.info("Run finished")
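The header sanitization above relies on Python 2 string translation; a standalone illustration of the same idea:

from string import maketrans  # Python 2; use str.maketrans on Python 3

intab = '.-+|=:;,&$'
outtab = '__________'
trantab = maketrans(intab, outtab)

print('sample-1.v2'.translate(trantab))  # sample_1_v2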
Example #17
    def test_process_id_map_added_demultiplex(self):
        """process_id_map handles added demultiplex fields"""
        s = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tDescription
#fake data
x\tAA\tACGT\t3\tsample_x
y\t"AC"\tACGT\t4\t"sample_y"
z\tGG\tACGT\t5\tsample_z"""
        f = StringIO(s)
        f.name = 'test.xls'

        # Should raise error since demultiplex field not in mapping data.
        self.assertRaises(ValueError, process_id_map, f,
                          added_demultiplex_field='Not_A_Field')

        # process_id_map should return correct results on a small test map
        # when the combinations of barcodes and added demultiplex fields
        # are unique.
        s = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tJanus\tDescription
#fake data
x\tAA\tACGT\t3\tDown\tsample_x
y\t"AC"\tACGT\t4\tDown\t"sample_y"
z\tAA\tACGT\t5\tNotUp\tsample_z"""
        f = StringIO(s)
        f.name = 'test.xls'
        headers, id_map, description_map, run_description, errors, warnings = \
            process_id_map(f, added_demultiplex_field='Janus')

        self.assertEqual(headers, ['BarcodeSequence', 'LinkerPrimerSequence',
                                   'X', 'Janus'])
        self.assertEqual(id_map, {
            'x': {'X': '3', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AA'},
            'y': {'X': '4', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AC'},
            'z': {'X': '5', 'Janus': 'NotUp', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AA'},
        })
        self.assertEqual(description_map, {
            'x': 'sample_x',
            'y': 'sample_y',
            'z': 'sample_z',
        })
        self.assertEqual(run_description, ['fake data'])
        self.assertEqual(errors, [])
        self.assertEqual(warnings, [])
        
        # Should get warnings with non-unique combinations of barcodes and 
        # added demultiplex.
        s = """#SampleID\tBarcodeSequence\tLinkerPrimerSequence\tX\tJanus\tDescription
#fake data
x\tAA\tACGT\t3\tDown\tsample_x
y\t"AC"\tACGT\t4\tDown\t"sample_y"
z\tAA\tACGT\t5\tDown\tsample_z"""
        f = StringIO(s)
        f.name = 'test.xls'
        headers, id_map, description_map, run_description, errors, warnings = \
            process_id_map(f, added_demultiplex_field='Janus')

        self.assertEqual(headers, ['BarcodeSequence', 'LinkerPrimerSequence',
                                   'X', 'Janus'])
        self.assertEqual(id_map, {
            'x': {'X': '3', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AA'},
            'y': {'X': '4', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AC'},
            'z': {'X': '5', 'Janus': 'Down', 'LinkerPrimerSequence': 'ACGT',
                  'BarcodeSequence': 'AA'},
        })
        self.assertEqual(description_map, {
            'x': 'sample_x',
            'y': 'sample_y',
            'z': 'sample_z',
        })
        self.assertEqual(run_description, ['fake data'])
        
        expected_errors = ["DupChecker 'BarcodeSequence' found the following possible duplicates. If these metadata should have the same name, please correct.:\nGroup\tOriginal names\nAA\tAA,Down, AA,Down\nRow, column for all possible duplicate descriptions:\nLocation (row, column):\t0,1\nLocation (row, column):\t0,4\nLocation (row, column):\t2,1\nLocation (row, column):\t2,4\n"]
                
        self.assertEqual(errors, expected_errors)
        
        self.assertEqual(warnings, [])