Ejemplo n.º 1
0
    def __call__(self,
                 seq_path,
                 result_path=None,
                 log_path=None,
                 failure_path=None):
        """Align the candidate sequences in seq_path with pynast_seqs.

        seq_path: path of the FASTA file holding the candidate sequences
        result_path: if not None, the aligned sequences are written there
         as FASTA and None is returned; otherwise the Alignment is returned
        log_path: handed to NastLogger
        failure_path: if not None, the sequences that failed to align are
         written there as FASTA

        The template alignment is read from self.Params['template_filepath'];
        the pairwise aligner is looked up in pairwise_alignment_methods via
        self.Params['pairwise_alignment_method'].
        """
        # The candidate file stays open until pynast_seqs has returned,
        # because parse_fasta may read it lazily; the context manager then
        # guarantees it is closed (the original never closed it).
        # NOTE(review): mode 'U' is kept for Python 2 universal newlines;
        # recent Python 3 releases reject it -- confirm target interpreter.
        with open(seq_path, 'U') as seq_file:
            candidate_sequences = parse_fasta(seq_file)

            # load template sequences, replacing '.' characters with '-'
            template_alignment_fp = self.Params['template_filepath']
            with open(template_alignment_fp) as template_file:
                template_records = [
                    (seq_id, seq.replace('.', '-').upper())
                    for seq_id, seq in parse_fasta(template_file)]
            template_alignment = Alignment.from_fasta_records(
                template_records,
                DNASequence,
                validate=True)

            # initialize_logger
            logger = NastLogger(log_path)

            # get function for pairwise alignment method
            pairwise_alignment_f = pairwise_alignment_methods[
                self.Params['pairwise_alignment_method']]

            pynast_aligned, pynast_failed = pynast_seqs(
                candidate_sequences,
                template_alignment,
                min_pct=self.Params['min_pct'],
                min_len=self.Params['min_len'],
                align_unaligned_seqs_f=pairwise_alignment_f,
                logger=logger,
                temp_dir=get_qiime_temp_dir())

        logger.record(str(self))

        # Re-wrap the pynast results as DNASequence objects.
        pynast_failed = SequenceCollection(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_failed])
        pynast_aligned = Alignment(
            [DNASequence(str(seq), id=seq.Name) for seq in pynast_aligned])

        if failure_path is not None:
            with open(failure_path, 'w') as fail_file:
                fail_file.write(pynast_failed.to_fasta())

        if result_path is not None:
            with open(result_path, 'w') as result_file:
                result_file.write(pynast_aligned.to_fasta())
            return None

        return pynast_aligned
Ejemplo n.º 2
0
def make_single_mutant(sequence, wt_res, res_num, mut_res, first_res=1):
    """
    Design forward and reverse primers introducing a point mutation.

    sequence (string) DNA sequence
    wt_res (char) single letter amino acid code of wildtype residue to be mutated
    res_num (int) residue id number of residue to be mutated
    mut_res (char) single letter amino acid code of mutant residue
    first_res (int) residue id number of first residue in sequence (default = 1)

    DNA sequence needs to start with the first residue of the protein (no promoter, etc)
    take DNA sequence, convert to AA, define AA point mutant, find corresponding codon of wt and mut, output forward and reverse primers
    DNA sequence should be only the kinase domain

    A mutant codon one nucleotide away from the wildtype codon is preferred;
    when none exists a message is printed and make_mutant() picks the codon.

    Returns (forward_primer, reverse_primer): the lower-cased forward primer
    and the sequence of its reverse complement. If no primer with an
    acceptable melting temperature is found before the window exceeds 45 nt,
    a message is printed and the last primer tried is returned.

    Raises IOError if res_num lies outside the translated sequence or the
    residue at that position is not wt_res.
    """
    orig_code = genetic_code(11)
    sequence = sequence.upper()
    aa_sequence = orig_code.translate(sequence).sequence

    # Zero-based index of the residue in the translated sequence.  A negative
    # value would silently wrap around to the end of the protein, so it is
    # rejected together with indices past the end.
    res_ix = res_num - first_res
    if (res_ix < 0 or res_ix >= len(aa_sequence) or
            str(wt_res) != aa_sequence[res_ix]):
        raise IOError("Desired residue not found -- check wildtype residue name and id, and first residue id")

    # The codon of the residue of interest spans [codon_start, codon_end).
    codon_start = res_ix * 3
    codon_end = codon_start + 3
    wt_codon = DNASequence(sequence[codon_start:codon_end])

    mut_codons = orig_code.synonyms[mut_res]
    mut_codon = None
    for codon in mut_codons:
        # distance() is scaled by 3 to count changed bases in the codon;
        # rounding keeps the comparison immune to float noise.  As before,
        # the last single-change codon wins.
        if int(round(wt_codon.distance(DNASequence(codon)) * 3)) == 1:
            mut_codon = codon

    if not mut_codon:
        print("Cannot make desired mutant with a single base change")
        mut_codon = make_mutant(wt_codon, mut_codons)

    # Start with 11 flanking nucleotides on each side, clipped to the
    # sequence; check_melting_temp returns the window to try next.
    good_melting_temp = False
    start_ix = max(0, codon_start - 11)
    end_ix = min(len(sequence), codon_end + 11)

    while not good_melting_temp:
        if end_ix - start_ix > 45:
            print("Acceptable melting temp was not found")
            break
        forward_primer = (sequence[start_ix:codon_start] + mut_codon +
                          sequence[codon_end:end_ix]).lower()
        good_melting_temp, start_ix, end_ix = check_melting_temp(
            forward_primer, start_ix, end_ix, len(sequence))

    reverse_primer = DNASequence(forward_primer).rc().sequence

    return forward_primer, reverse_primer
Ejemplo n.º 3
0
def dna_to_aa(sequence, try_frames=False):
    """
    Translate a DNA nucleotide sequence into an amino acid sequence.

    Arguments:
    ----------
        sequence : str
            DNA nucleotide sequence
        Optional:
        ---------
            try_frames : Bool
                when True, the sequence is translated with
                translate_six_frames and the translation containing the
                fewest '*' characters is returned (the first one on ties)
                default = False
    Returns:
    --------
        aa_sequence : str
            sequence of one-letter amino acid codes
    """
    code_table = genetic_code(11)

    if try_frames:
        frames = code_table.translate_six_frames(DNASequence(sequence))
        stop_counts = [frame.sequence.count('*') for frame in frames]
        fewest_stops = min(stop_counts)
        # list.index returns the first frame reaching the minimum
        best_frame = frames[stop_counts.index(fewest_stops)]
        return best_frame.sequence

    # Single-frame translation of the sequence exactly as given.
    return code_table.translate(sequence).sequence
Ejemplo n.º 4
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False
                            ):
    """ Checks for valid DNA characters in primer fields

    Empty cells in the checked columns are flagged as errors as well.  The
    ReversePrimer column is always checked; LinkerPrimerSequence is checked
    unless disable_primer_check is True.  Every cell is reported at most
    once per problem, however many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, disables tests for valid primer sequences.
    """

    # Work on a copy so that adding ',' can never alter a set owned by
    # DNASequence.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters.  The original ended its character loop
    # with a no-op `continue`, repeating the same message for every invalid
    # character; one report per cell is enough.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            cell = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, curr_ix))

    return errors
Ejemplo n.º 5
0
def check_dna_chars_primers(header,
                            mapping_data,
                            errors,
                            disable_primer_check=False):
    """ Checks for valid DNA characters in primer fields

    Empty cells in the checked columns are flagged as errors as well.  The
    ReversePrimer column is always checked; LinkerPrimerSequence is checked
    unless disable_primer_check is True.  Every cell is reported at most
    once per problem, however many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    disable_primer_check:  If True, disables tests for valid primer sequences.
    """

    # Work on a copy so that adding ',' can never alter a set owned by
    # DNASequence.
    valid_dna_chars = set(DNASequence.iupac_characters())
    valid_dna_chars.add(',')

    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = ["ReversePrimer"]
    if not disable_primer_check:
        header_fields_to_check.append("LinkerPrimerSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1

    # Check for missing data
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            if len(row[curr_ix]) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))

    # Check for non-DNA characters.  The original ended its character loop
    # with a no-op `continue`, repeating the same message for every invalid
    # character; one report per cell is enough.
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            cell = row[curr_ix]
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, curr_ix))

    return errors
Ejemplo n.º 6
0
def check_dna_chars_bcs(header,
                        mapping_data,
                        errors,
                        has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    Empty BarcodeSequence cells are flagged as missing; non-empty cells are
    flagged when they contain any character outside
    DNASequence.iupac_standard_characters().  Every cell is reported at most
    once, however many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    has_barcodes:  If True, the BarcodeSequence column is checked; if False
     no column is checked and errors is returned unchanged.
    """

    valid_dna_chars = DNASequence.iupac_standard_characters()
    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = []
    if has_barcodes:
        header_fields_to_check.append("BarcodeSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            cell = row[curr_ix]
            if len(cell) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))
                continue
            # The original ended its character loop with a no-op `continue`
            # and so repeated the message once per invalid character; report
            # each offending cell a single time instead.
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, curr_ix))

    return errors
Ejemplo n.º 7
0
def check_dna_chars_bcs(header, mapping_data, errors, has_barcodes=True):
    """ Checks for valid DNA characters in barcode field

    Empty BarcodeSequence cells are flagged as missing; non-empty cells are
    flagged when they contain any character outside
    DNASequence.iupac_standard_characters().  Every cell is reported at most
    once, however many invalid characters it contains.

    header:  list of header strings
    mapping_data:  list of lists of raw metadata mapping file data
    errors:  list of errors; appended to in place and also returned
    has_barcodes:  If True, the BarcodeSequence column is checked; if False
     no column is checked and errors is returned unchanged.
    """

    valid_dna_chars = DNASequence.iupac_standard_characters()
    # Detect fields directly, in case user does not have fields in proper
    # order in the mapping file (this will generate error separately)
    header_fields_to_check = []
    if has_barcodes:
        header_fields_to_check.append("BarcodeSequence")

    check_indices = [field_ix for field_ix, field in enumerate(header)
                     if field in header_fields_to_check]

    # Correction factor for header being the first line
    correction_ix = 1
    for row_ix, row in enumerate(mapping_data):
        for curr_ix in check_indices:
            cell = row[curr_ix]
            if len(cell) == 0:
                errors.append("Missing expected DNA sequence\t%d,%d" %
                              (row_ix + correction_ix, curr_ix))
                continue
            # The original ended its character loop with a no-op `continue`
            # and so repeated the message once per invalid character; report
            # each offending cell a single time instead.
            if any(curr_nt not in valid_dna_chars for curr_nt in cell):
                errors.append("Invalid DNA sequence detected: %s\t%d,%d" %
                              (cell, row_ix + correction_ix, curr_ix))

    return errors
Ejemplo n.º 8
0
    def _make_mutant(wt_codon, mut_codons):
        """
        Pick the mutant codon closest to the wild type codon and print how
        many base changes it needs.

        Arguments:
        ----------
            wt_codon : sequence object
                wild type codon of the residue to be mutated; it must expose
                a distance() method, which is called against each candidate
            mut_codons : list(str)
                all codons that translate to desired mutant residue
        Returns:
        --------
            mut_codon : str
                the candidate with the smallest distance to wt_codon (the
                first such candidate when several tie)
        """
        candidates = [DNASequence(candidate) for candidate in mut_codons]
        codon_distances = [wt_codon.distance(candidate)
                           for candidate in candidates]
        smallest = min(codon_distances)

        # distance is scaled by 3 to report a number of changed bases
        print("This mutant required "+str(int(smallest*3))+"bp modifications\n")

        closest = candidates[codon_distances.index(smallest)]
        return closest.sequence
Ejemplo n.º 9
0
def get_consensus(fasta_tempfile, min_consensus):
    """
    Returns consensus sequence from a set of sequences
    input: fasta file, min_consensus
    fasta_file should be in the following format:
    >random_bc|number
    seq
    >random_bc|number
    seq
    ....

    number = number of times this seq has appeared with this random_barcode

    At every position the base with the highest summed count is chosen.
    When several bases tie on that sum, the one backed by the single
    sequence with the largest count wins.

    Parameters
    ----------
    fasta_tempfile: object accepted by parse_fasta
        FASTA records labelled as described above
    min_consensus: float
        lowest acceptable per-position score
    Returns
    ----------
    consensus_seq: string
        consensus sequence for the given list of sequences
    Raises
    ----------
    ValueError
        a label does not contain "random_bc|number"
    IndexError
        the input holds no sequences
    SeqLengthMismatchError
        the sequences do not all have the same length
    LowConsensusScoreError
        the score of some position is below min_consensus
    """
    seqs = list()
    counts = list()

    for label, seq in parse_fasta(fasta_tempfile):
        RE_output = search(r'\w+\|(\d+)', label)
        if RE_output is None:
            # fail with a readable message instead of an AttributeError
            raise ValueError(
                "FASTA label %r is not of the form random_bc|number" % label)
        counts.append(int(RE_output.group(1)))
        seqs.append(seq)

    length = len(seqs[0])
    number_of_seqs = len(seqs)

    for seq in seqs:
        if len(seq) != length:
            raise SeqLengthMismatchError()

    iupac_chars = DNASequence.iupac_characters()
    # position -> base -> summed count of the sequences carrying that base
    freq_this_pos_this_base = dict()
    # position -> base -> largest single count among those sequences
    count_of_seq_with_max_count = dict()

    for x in range(length):
        freq_this_pos_this_base[x] = dict.fromkeys(iupac_chars, 0)
        count_of_seq_with_max_count[x] = dict.fromkeys(iupac_chars, 0)

        for seq, seq_count in zip(seqs, counts):
            base = seq[x]
            freq_this_pos_this_base[x][base] += seq_count
            if seq_count > count_of_seq_with_max_count[x][base]:
                count_of_seq_with_max_count[x][base] = seq_count

    consensus = list()
    for index in range(length):
        # .items() works on both Python 2 and 3 (.iteritems() is 2-only)
        sorted_bases = sorted(
            freq_this_pos_this_base[index].items(),
            key=lambda x: x[1])
        max_base, max_freq = sorted_bases[-1]

        # Tie-break among bases sharing the top frequency.  The lookup must
        # use the alignment position (index); the original used the
        # enumerate counter of sorted_bases and therefore consulted the
        # statistics of an unrelated position.
        max_counts = count_of_seq_with_max_count[index]
        for b, n in sorted_bases:
            if n == max_freq and max_counts[b] > max_counts[max_base]:
                max_base = b

        # NOTE(review): the score divides the summed count by the number of
        # FASTA records, not by the total of all counts -- confirm intended.
        score = 10.0 * max_freq / number_of_seqs
        if score < min_consensus:
            raise LowConsensusScoreError()
        consensus.append(max_base)

    consensus_seq = ''.join(map(str, consensus))
    return consensus_seq
Ejemplo n.º 10
0
def run_ampliconnoise(
    mapping_fp,
    output_dir,
    command_handler,
    params,
    qiime_config,
    logger=None,
    status_update_callback=print_to_stdout,
    chimera_alpha=-3.8228,
    chimera_beta=0.6200,
    sff_txt_fp=None,
    numnodes=2,
    suppress_perseus=True,
    output_filepath=None,
    platform="flx",
    seqnoise_resolution=None,
    truncate_len=None,
):
    """ Run the ampliconnoise pipeline

    The steps performed by this function are:

    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal), unless
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    The function builds the list of shell commands and hands it to
    command_handler; it does not run the programs itself.  As side effects
    it creates output_dir, changes the working directory to it (the
    previous directory is not restored) and writes map.csv there.

    output_filepath should be absolute
    seqnoise_resolution and truncate_len should be strings; they are
    concatenated into the command lines.  When left as None they default
    per platform: "30.0"/"220" for "flx", "25.0"/"400" for "titanium".
    sff_txt_fp may be relative to the directory the function is called from.
    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting

    Raises RuntimeError when a default is needed for an unknown platform,
    when the mapping file does not yield exactly one distinct primer, or
    when the platform is not supported.
    """
    default_seqnoise_resolution = {"flx": "30.0", "titanium": "25.0"}
    default_truncate_len = {"flx": "220", "titanium": "400"}
    clean_scripts = {"flx": "Clean360.pl", "titanium": "CleanMinMax.pl"}

    # Close the mapping file as soon as it is parsed (it used to be leaked).
    with open(mapping_fp, "U") as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)
    create_dir(output_dir)

    if seqnoise_resolution is None:
        if platform not in default_seqnoise_resolution:
            raise RuntimeError("seqnoise_resolution not set, and no" + " default for platform " + platform)
        seqnoise_resolution = default_seqnoise_resolution[platform]

    if truncate_len is None:
        if platform not in default_truncate_len:
            raise RuntimeError("truncate_len not set, and no" + " default for platform " + platform)
        truncate_len = default_truncate_len[platform]

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    sample_id_ix = headers.index("SampleID")
    barcode_ix = headers.index("BarcodeSequence")
    primer_ix = headers.index("LinkerPrimerSequence")
    degeneracies = DNASequence.iupac_degeneracies()
    for row in map_data:
        sample_names.append(row[sample_id_ix])
        bc_seqs.append(row[barcode_ix])
        primer = row[primer_ix]
        # expand every degenerate character into a bracketed character class
        # (.items() works on both Python 2 and 3)
        for char, bases in degeneracies.items():
            primer = primer.replace(char, "[" + "".join(bases) + "]")
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError("Error: only one primer per mapping file supported.")
    one_primer = primer_seqs[0]

    # Reject an unsupported platform before any side effect (logger, chdir,
    # map.csv).  The primer check above guarantees at least one sample, so
    # the original per-sample check would have raised the same error.
    if platform not in clean_scripts:
        raise RuntimeError("platform " + platform + " not supported")
    clean_script = clean_scripts[platform]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir), params=params, qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir.  The absolute path is resolved before
    # changing directory: a relative output_dir joined again after chdir
    # would point at a nested, non-existent directory.
    called_dir = os.getcwd()
    abs_output_dir = os.path.abspath(output_dir)
    os.chdir(abs_output_dir)
    with open(os.path.join(abs_output_dir, "map.csv"), "w") as fh:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            fh.write(sample_name + "," + bc_seq + "\n")

    post_pyro_tail = "_" + truncate_len
    mpirun = "mpirun -np " + str(numnodes)

    cmd = "cd " + abs_output_dir  # see also os.chdir above
    commands.append([("change to output dir", cmd)])

    cmd = "echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt"
    commands.append([("confirm pyro lookup filepath environment variable", cmd)])

    cmd = (
        "SplitKeys.pl "
        + one_primer
        + " map.csv < "
        + os.path.join(called_dir, sff_txt_fp)
        + " > splitkeys_log.txt 2> unassigned.fna"
    )
    commands.append([("split sff.txt via barcodes (keys)", cmd)])

    for sample_name, bc_seq in zip(sample_names, bc_seqs):
        # prefix shared by all post-PyroNoise files of this sample
        stem = sample_name + post_pyro_tail

        # clean the flows with the platform specific script
        cmd = clean_script + " " + one_primer + " " + sample_name + " < " + sample_name + ".raw"
        commands.append([("clean flows " + sample_name, cmd)])

        cmd = mpirun + " PyroDist -in " + sample_name + ".dat -out " + sample_name + " > " + sample_name + ".pdout"
        commands.append([("pyrodist " + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name + " > " + sample_name + ".fcout"
        commands.append([("fcluster pyrodist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = (
            mpirun
            + " PyroNoise -din "
            + sample_name
            + ".dat -out "
            + sample_name
            + "_pyronoise -lin "
            + sample_name
            + ".list -s 60.0 -c 0.01 > "
            + sample_name
            + "_pyronoise.pnout"
        )
        commands.append([("pyronoise " + sample_name, cmd)])

        cmd = (
            "Parse.pl "
            + bc_seq
            + one_primer
            + " "
            + truncate_len
            + " < "
            + sample_name
            + "_pyronoise_cd.fa > "
            + stem
            + ".fa"
        )
        commands.append([("truncate " + sample_name, cmd)])

        # from here on file names carry post_pyro_tail
        cmd = mpirun + " SeqDist -in " + stem + ".fa > " + stem + ".seqdist"
        commands.append([("seqdist " + sample_name, cmd)])

        cmd = "FCluster -in " + stem + ".seqdist -out " + stem + "fcl > " + stem + ".fcout"
        commands.append([("fcluster seqdist " + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 SeqNoise -in PC.354_pyronoise_cd.fa -din
        # PC.354_pyronoise_cd.seqdist -out PC.354_pyronoise_cd_seqnoise -lin
        # PC.354_pyronoise_cdfcl.list -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 >
        # PC.354_pyronoise_cd.snout
        cmd = (
            mpirun
            + " SeqNoise -in "
            + stem
            + ".fa -din "
            + stem
            + ".seqdist -out "
            + stem
            + "_seqnoise -lin "
            + stem
            + "fcl.list -min "
            + sample_name
            + "_pyronoise.mapping -s "
            + seqnoise_resolution
            + " -c 0.08 > "
            + stem
            + ".snout"
        )
        commands.append([("seqnoise " + sample_name, cmd)])

        # this is the fasta result, e.g. PC.636_Good.fa; it is unweighted
        # below and merged into the output file at the end
        if suppress_perseus:
            fasta_result_name = stem + "_seqnoise_cd.fa"
        else:
            fasta_result_name = sample_name + "_Good.fa"

            cmd = "Perseus -sin " + stem + "_seqnoise_cd.fa > " + sample_name + ".per"
            commands.append([("Perseus " + sample_name, cmd)])

            cmd = (
                "Class.pl "
                + sample_name
                + ".per "
                + str(chimera_alpha)
                + " "
                + str(chimera_beta)
                + " > "
                + sample_name
                + ".class"
            )
            commands.append([("Class.pl " + sample_name, cmd)])

            cmd = (
                "FilterGoodClass.pl "
                + stem
                + "_seqnoise_cd.fa "
                + sample_name
                + ".class 0.5 > "
                + sample_name
                + "_Chi.fa 2> "
                + sample_name
                + "_Good.fa"
            )
            commands.append([("FilterGoodClass " + sample_name, cmd)])

        cmd = "unweight_fasta.py -i %s -o %s -l %s" % (fasta_result_name, sample_name + "_unw.fna", sample_name)
        commands.append([("unweight fasta " + sample_name, cmd)])

    cmd = (
        "cat " + " ".join([sample_name + "_unw.fna" for sample_name in sample_names]) + " > " + output_filepath
    )  # this should be an abs filepath
    commands.append([("cat into one fasta file", cmd)])

    # Call the command handler on the list of commands
    command_handler(commands, status_update_callback, logger=logger, close_logger_on_success=close_logger_on_success)
Ejemplo n.º 11
0
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200, sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None, platform='flx',
                      seqnoise_resolution=None, truncate_len=None):
    """ Run the ampliconnoise pipeline

    Builds an ordered list of shell commands and passes it to
    command_handler. The steps covered by those commands are:

    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal); skipped when
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    mapping_fp: path to the mapping file; the SampleID, BarcodeSequence and
     LinkerPrimerSequence columns are read from it
    output_dir: working directory for all intermediate files; a relative
     path is resolved against the directory the function was called from
    command_handler: callable invoked as
     command_handler(commands, status_update_callback, logger=...,
     close_logger_on_success=...)
    params, qiime_config: only used to build a WorkflowLogger when no logger
     is passed in
    logger: existing logger to use; when None one is created and
     close_logger_on_success is passed as True
    chimera_alpha, chimera_beta: arguments inserted into the Class.pl command
    sff_txt_fp: path to the sff.txt input, joined onto the calling directory
     (has a None default but the function fails without it)
    numnodes: value given to "mpirun -np"
    output_filepath: destination of the final concatenated fasta; should be
     absolute (has a None default but the function fails without it)
    platform: 'flx' or 'titanium'; selects the flow-cleaning script and the
     defaults for seqnoise_resolution and truncate_len
    seqnoise_resolution, truncate_len: inserted into the commands as text;
     strings are expected, other values are converted with str()

    Raises RuntimeError when platform is not supported, or when the mapping
    file does not contain exactly one distinct primer.

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting

    Note: the process working directory is changed to output_dir and is not
    restored; the generated commands use file names relative to it.
    """
    # 'U' (universal newlines) is kept from the original call; the with
    # block closes the handle once the mapping file has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)

    # Anchor output_dir to the starting directory *before* any chdir so a
    # relative path keeps meaning the same place afterwards. os.path.join
    # returns an absolute output_dir unchanged.
    called_dir = os.getcwd()
    output_dir = os.path.join(called_dir, output_dir)
    create_dir(output_dir)

    default_seqnoise_resolution = {'flx': '30.0', 'titanium': '25.0'}
    default_truncate_len = {'flx': '220', 'titanium': '400'}
    clean_scripts = {'flx': 'Clean360.pl', 'titanium': 'CleanMinMax.pl'}

    if seqnoise_resolution is None:
        if platform not in default_seqnoise_resolution:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)
        seqnoise_resolution = default_seqnoise_resolution[platform]

    if truncate_len is None:
        if platform not in default_truncate_len:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)
        truncate_len = default_truncate_len[platform]

    # Both values are spliced into command strings below; accept numbers too.
    seqnoise_resolution = str(seqnoise_resolution)
    truncate_len = str(truncate_len)

    # Column positions are the same for every row, so look them up once.
    sample_id_ix = headers.index('SampleID')
    barcode_ix = headers.index('BarcodeSequence')
    primer_ix = headers.index('LinkerPrimerSequence')
    degeneracies = DNASequence.iupac_degeneracies()

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[sample_id_ix])
        bc_seqs.append(row[barcode_ix])
        primer = row[primer_ix]
        # expand each degenerate character into a bracketed character class
        for char, bases in degeneracies.iteritems():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    # Reject an unsupported platform before creating the logger, changing
    # directory or writing any file.
    if platform not in clean_scripts:
        raise RuntimeError("platform " + platform + " not supported")
    clean_script = clean_scripts[platform]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    os.chdir(output_dir)
    with open(os.path.join(output_dir, 'map.csv'), 'w') as map_csv:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            map_csv.write(sample_name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail + '_seqnoise_cd.fa'
                              for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa'
                              for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    mpirun = "mpirun -np " + str(numnodes)

    for sample_name, bc_seq, fasta_result_name in zip(sample_names,
                                                      bc_seqs,
                                                      fasta_result_names):
        # prefix shared by every file produced after truncation
        truncated = sample_name + post_pyro_tail

        # Clean the per-sample flows with the platform-specific script
        cmd = clean_script + ' ' + one_primer + ' ' + sample_name + ' < ' +\
            sample_name + '.raw'
        commands.append([('clean flows ' + sample_name, cmd)])

        cmd = mpirun + " PyroDist -in " +\
            sample_name + ".dat -out " +\
            sample_name + " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = mpirun + " PyroNoise -din " +\
            sample_name + ".dat -out " +\
            sample_name + "_pyronoise " + "-lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seq + one_primer + ' ' + truncate_len + ' < ' +\
            sample_name + '_pyronoise_cd.fa' + ' > ' + sample_name + '_' +\
            truncate_len + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # from here on the inputs are the truncated files
        cmd = mpirun + " SeqDist -in " +\
            truncated + ".fa > " + truncated + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + truncated + ".seqdist -out " +\
            truncated + "fcl > " +\
            truncated + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g. (sample PC.354, truncate_len 220, resolution 30.0):
        # mpirun -np 2 SeqNoise -in PC.354_220.fa -din PC.354_220.seqdist
        # -out PC.354_220_seqnoise -lin PC.354_220fcl.list
        # -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 > PC.354_220.snout
        cmd = mpirun + " SeqNoise -in " +\
            truncated + ".fa -din " + truncated + ".seqdist -out " +\
            truncated + "_seqnoise -lin " + truncated + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            truncated + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:

            cmd = 'Perseus -sin ' + truncated +\
                '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + truncated +\
                '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' %\
            (fasta_result_name, sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([sample_name + '_unw.fna' for sample_name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)
Ejemplo n.º 12
0
    def make_single_mutant(self, wt_res, res_num, mut_res):
        """
        Design forward and reverse primers for a single amino acid mutation.

        The wildtype codon is located in self.sequence, a mutant codon is
        chosen from the synonyms self.orig_code lists for mut_res, and a
        primer is built with the mutant codon in the middle. The primer
        starts at 25 nucleotides (the codon plus 11 on each side, fewer at the
        ends of the sequence) and is re-built with the bounds returned by
        self._check_melting_temp until that helper reports an acceptable
        melting temperature or the window would exceed 45 nucleotides
        (the original documentation cites a 78C minimum; the threshold itself
        lives in _check_melting_temp).

        self.sequence needs to start with the first residue of the protein
        (no promoter, etc), so that residue res_num starts at nucleotide
        (res_num - self.first_res) * 3.

        Desired mutation should require only a single nucleotide change; a
        message is printed and self._make_mutant picks the codon if more
        nucleotide changes are required. A message is also printed when no
        acceptable melting temperature is found; the last primer tried is
        still returned.

        Arguments:
        ----------
            wt_res : char
                single letter amino acid code of wildtype residue to be mutated
            res_num : int
                residue id number of residue to be mutated
            mut_res : char
                single letter amino acid code of mutant residue
        Returns:
        --------
            forward_primer : str
                nucleotide sequence (lower case)
            reverse_primer : str
                nucleotide sequence, reverse complement of forward_primer
        Raises:
        -------
            IOError
                if res_num lies outside self.aa_sequence or the residue at
                res_num is not wt_res
        """
        aa_sequence = self.aa_sequence
        sequence = self.sequence
        first_res = self.first_res
        orig_code = self.orig_code

        # Zero-based position of the residue. The explicit range check stops
        # a res_num below first_res from wrapping around to the end of the
        # sequence through negative indexing.
        res_ix = res_num - first_res
        if not 0 <= res_ix < len(aa_sequence) or str(wt_res) != aa_sequence[res_ix]:
            raise IOError("Desired residue not found -- check wildtype residue name and id, and first residue id")

        # nucleotide span of the codon of interest
        codon_start = res_ix * 3
        codon_end = codon_start + 3

        wt_codon = DNASequence(sequence[codon_start:codon_end])

        mut_codons = orig_code.synonyms[mut_res]
        mut_codon = None
        for codon in mut_codons:
            # presumably distance() is the fraction of mismatched positions,
            # so *3 gives the number of changed bases -- TODO confirm.
            # The last codon that qualifies is the one kept.
            if wt_codon.distance(DNASequence(codon))*3 == 1:
                mut_codon = codon

        if not mut_codon:
            print("Cannot make desired mutant with a single base change")
            mut_codon = self._make_mutant(wt_codon, mut_codons)

        # initial window: 11 nucleotides either side of the codon, clipped to
        # the sequence
        start_ix = max(0, codon_start - 11)
        end_ix = min(len(sequence), codon_end + 11)

        good_melting_temp = False
        while not good_melting_temp:
            if end_ix - start_ix > 45:
                print("Acceptable melting temp was not found")
                break
            forward_primer = sequence[start_ix:codon_start] + mut_codon + sequence[codon_end:end_ix]
            forward_primer = forward_primer.lower()
            # the helper reports success and supplies the next window bounds
            good_melting_temp, start_ix, end_ix = self._check_melting_temp(forward_primer, start_ix, end_ix, len(sequence))

        forward_sequence = DNASequence(forward_primer)
        reverse_sequence = forward_sequence.rc()

        reverse_primer = reverse_sequence.sequence

        return forward_primer, reverse_primer
Ejemplo n.º 13
0
def run_ampliconnoise(mapping_fp,
                      output_dir, command_handler, params, qiime_config,
                      logger=None, status_update_callback=print_to_stdout,
                      chimera_alpha=-3.8228, chimera_beta=0.6200, sff_txt_fp=None, numnodes=2,
                      suppress_perseus=True, output_filepath=None, platform='flx',
                      seqnoise_resolution=None, truncate_len=None):
    """ Run the ampliconnoise pipeline

    Builds an ordered list of shell commands and passes it to
    command_handler. The steps covered by those commands are:

    1. Split input sff.txt file into one file per sample
    2. Run scripts required for PyroNoise
    3. Run scripts required for SeqNoise
    4. Run scripts required for Perseus (chimera removal); skipped when
       suppress_perseus is True
    5. Merge output files into one file similar to the output of
       split_libraries.py

    mapping_fp: path to the mapping file; the SampleID, BarcodeSequence and
     LinkerPrimerSequence columns are read from it
    output_dir: working directory for all intermediate files; a relative
     path is resolved against the directory the function was called from
    command_handler: callable invoked as
     command_handler(commands, status_update_callback, logger=...,
     close_logger_on_success=...)
    params, qiime_config: only used to build a WorkflowLogger when no logger
     is passed in
    logger: existing logger to use; when None one is created and
     close_logger_on_success is passed as True
    chimera_alpha, chimera_beta: arguments inserted into the Class.pl command
    sff_txt_fp: path to the sff.txt input, joined onto the calling directory
     (has a None default but the function fails without it)
    numnodes: value given to "mpirun -np"
    output_filepath: destination of the final concatenated fasta; should be
     absolute (has a None default but the function fails without it)
    platform: 'flx' or 'titanium'; selects the flow-cleaning script and the
     defaults for seqnoise_resolution and truncate_len
    seqnoise_resolution, truncate_len: inserted into the commands as text;
     strings are expected, other values are converted with str()

    Raises RuntimeError when platform is not supported, or when the mapping
    file does not contain exactly one distinct primer.

    environment variable PYRO_LOOKUP_FILE must be set correctly. Thus be
    careful passing command handlers that don't spawn child processes, as they
    may not inherit the correct environment variable setting

    Note: the process working directory is changed to output_dir and is not
    restored; the generated commands use file names relative to it.
    """
    # 'U' (universal newlines) is kept from the original call; the with
    # block closes the handle once the mapping file has been parsed.
    with open(mapping_fp, 'U') as mapping_f:
        map_data, headers, comments = parse_mapping_file(mapping_f)

    # Anchor output_dir to the starting directory *before* any chdir so a
    # relative path keeps meaning the same place afterwards. os.path.join
    # returns an absolute output_dir unchanged.
    called_dir = os.getcwd()
    output_dir = os.path.join(called_dir, output_dir)
    create_dir(output_dir)

    default_seqnoise_resolution = {'flx': '30.0', 'titanium': '25.0'}
    default_truncate_len = {'flx': '220', 'titanium': '400'}
    clean_scripts = {'flx': 'Clean360.pl', 'titanium': 'CleanMinMax.pl'}

    if seqnoise_resolution is None:
        if platform not in default_seqnoise_resolution:
            raise RuntimeError('seqnoise_resolution not set, and no' +
                               ' default for platform ' + platform)
        seqnoise_resolution = default_seqnoise_resolution[platform]

    if truncate_len is None:
        if platform not in default_truncate_len:
            raise RuntimeError('truncate_len not set, and no' +
                               ' default for platform ' + platform)
        truncate_len = default_truncate_len[platform]

    # Both values are spliced into command strings below; accept numbers too.
    seqnoise_resolution = str(seqnoise_resolution)
    truncate_len = str(truncate_len)

    # Column positions are the same for every row, so look them up once.
    sample_id_ix = headers.index('SampleID')
    barcode_ix = headers.index('BarcodeSequence')
    primer_ix = headers.index('LinkerPrimerSequence')
    degeneracies = DNASequence.iupac_degeneracies()

    # these are filenames minus extension, and are sample IDs
    sample_names = []
    primer_seqs = []  # same order as sample_names
    bc_seqs = []  # same order as sample_names
    for row in map_data:
        sample_names.append(row[sample_id_ix])
        bc_seqs.append(row[barcode_ix])
        primer = row[primer_ix]
        # expand each degenerate character into a bracketed character class
        for char, bases in degeneracies.iteritems():
            primer = primer.replace(char, '[' + ''.join(bases) + ']')
        primer_seqs.append(primer)

    if len(set(primer_seqs)) != 1:
        raise RuntimeError(
            'Error: only one primer per mapping file supported.')
    one_primer = primer_seqs[0]

    # Reject an unsupported platform before creating the logger, changing
    # directory or writing any file.
    if platform not in clean_scripts:
        raise RuntimeError("platform " + platform + " not supported")
    clean_script = clean_scripts[platform]

    commands = []

    if logger is None:
        logger = WorkflowLogger(generate_log_fp(output_dir),
                                params=params,
                                qiime_config=qiime_config)
        close_logger_on_success = True
    else:
        close_logger_on_success = False
    log_input_md5s(logger, [mapping_fp, sff_txt_fp])

    # execute commands in output_dir
    os.chdir(output_dir)
    with open(os.path.join(output_dir, 'map.csv'), 'w') as map_csv:
        for sample_name, bc_seq in zip(sample_names, bc_seqs):
            map_csv.write(sample_name + ',' + bc_seq + '\n')

    # these are the fasta results, e.g. PC.636_Good.fa
    # later we merge them and copy to output file
    post_pyro_tail = '_' + truncate_len
    if suppress_perseus:
        fasta_result_names = [sample_name + post_pyro_tail + '_seqnoise_cd.fa'
                              for sample_name in sample_names]
    else:
        fasta_result_names = [sample_name + '_Good.fa'
                              for sample_name in sample_names]

    cmd = 'cd ' + output_dir  # see also os.chdir above
    commands.append([('change to output dir', cmd)])

    cmd = 'echo $PYRO_LOOKUP_FILE > pyro_lookup_filepath.txt'
    commands.append([('confirm pyro lookup filepath environment variable',
                      cmd)])

    cmd = 'SplitKeys.pl ' + one_primer + ' map.csv < ' +\
        os.path.join(called_dir, sff_txt_fp) +\
        ' > splitkeys_log.txt 2> unassigned.fna'
    commands.append([('split sff.txt via barcodes (keys)', cmd)])

    mpirun = "mpirun -np " + str(numnodes)

    for sample_name, bc_seq, fasta_result_name in zip(sample_names,
                                                      bc_seqs,
                                                      fasta_result_names):
        # prefix shared by every file produced after truncation
        truncated = sample_name + post_pyro_tail

        # Clean the per-sample flows with the platform-specific script
        cmd = clean_script + ' ' + one_primer + ' ' + sample_name + ' < ' +\
            sample_name + '.raw'
        commands.append([('clean flows ' + sample_name, cmd)])

        cmd = mpirun + " PyroDist -in " +\
            sample_name + ".dat -out " +\
            sample_name + " > " + sample_name + ".pdout"
        commands.append([('pyrodist ' + sample_name, cmd)])

        cmd = "FCluster -in " + sample_name + ".fdist -out " + sample_name +\
            " > " + sample_name + ".fcout"
        commands.append([('fcluster pyrodist ' + sample_name, cmd)])

        # e.g.:
        # mpirun -np 2 PyroNoise -din PC.354.dat -out PC.354_pyronoise -lin
        # PC.354.list -s 60.0 -c 0.01 > PC.354_pyronoise.pnout
        cmd = mpirun + " PyroNoise -din " +\
            sample_name + ".dat -out " +\
            sample_name + "_pyronoise " + "-lin " +\
            sample_name + ".list -s 60.0 -c 0.01 > " +\
            sample_name + "_pyronoise.pnout"
        commands.append([('pyronoise ' + sample_name, cmd)])

        cmd = 'Parse.pl ' + bc_seq + one_primer + ' ' + truncate_len + ' < ' +\
            sample_name + '_pyronoise_cd.fa' + ' > ' + sample_name + '_' +\
            truncate_len + '.fa'
        commands.append([('truncate ' + sample_name, cmd)])

        # from here on the inputs are the truncated files
        cmd = mpirun + " SeqDist -in " +\
            truncated + ".fa > " + truncated + ".seqdist"
        commands.append([('seqdist ' + sample_name, cmd)])

        cmd = "FCluster -in " + truncated + ".seqdist -out " +\
            truncated + "fcl > " +\
            truncated + ".fcout"
        commands.append([('fcluster seqdist ' + sample_name, cmd)])

        # e.g. (sample PC.354, truncate_len 220, resolution 30.0):
        # mpirun -np 2 SeqNoise -in PC.354_220.fa -din PC.354_220.seqdist
        # -out PC.354_220_seqnoise -lin PC.354_220fcl.list
        # -min PC.354_pyronoise.mapping -s 30.0 -c 0.08 > PC.354_220.snout
        cmd = mpirun + " SeqNoise -in " +\
            truncated + ".fa -din " + truncated + ".seqdist -out " +\
            truncated + "_seqnoise -lin " + truncated + 'fcl.list -min ' +\
            sample_name + '_pyronoise' +\
            '.mapping -s ' + seqnoise_resolution + ' -c 0.08 > ' +\
            truncated + '.snout'
        commands.append([('seqnoise ' + sample_name, cmd)])

        if not suppress_perseus:

            cmd = 'Perseus -sin ' + truncated +\
                '_seqnoise_cd.fa > ' +\
                sample_name + '.per'
            commands.append([('Perseus ' + sample_name, cmd)])

            cmd = 'Class.pl ' + sample_name + '.per ' +\
                str(chimera_alpha) + ' ' + str(chimera_beta) +\
                ' > ' + sample_name + '.class'
            commands.append([('Class.pl ' + sample_name, cmd)])

            cmd = 'FilterGoodClass.pl ' + truncated +\
                '_seqnoise_cd.fa ' +\
                sample_name + '.class 0.5 > ' + sample_name + '_Chi.fa 2> ' +\
                sample_name + '_Good.fa'
            commands.append([('FilterGoodClass ' + sample_name, cmd)])

        cmd = 'unweight_fasta.py -i %s -o %s -l %s' %\
            (fasta_result_name, sample_name + '_unw.fna', sample_name)
        commands.append([('unweight fasta ' + sample_name, cmd)])

    cmd = 'cat ' +\
        ' '.join([sample_name + '_unw.fna' for sample_name in sample_names]) +\
        ' > ' + output_filepath  # this should be an abs filepath
    commands.append([('cat into one fasta file', cmd)])

    # Call the command handler on the list of commands
    command_handler(commands,
                    status_update_callback,
                    logger=logger,
                    close_logger_on_success=close_logger_on_success)