Example 1
def main():
    from cogent.parse.fasta import MinimalFastaParser

    greengenes_filename = os.path.expanduser(
        '~/Data/greengenes/sequences_16S_gg_2011_1.sel4cni.inf.aln.masked.fasta'
    )

    logging.basicConfig(level='INFO',
                        format='%(levelname)s: %(message)s',
                        filename='log.log',
                        filemode='w')

    distributions = []
    with open(greengenes_filename) as greengenes:
        for label, seq in MinimalFastaParser(greengenes):
            d = distribution(seq, RNA.Alphabet)
            distributions.append([label, d, shannon(d)])

    print distributions
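The helpers `distribution` and `shannon` are not shown in this example. A minimal sketch of what they might look like, given how they are called above (hypothetical implementations, not part of the example's source):

import math

def distribution(seq, alphabet):
    # hypothetical helper: relative frequency of each alphabet character in seq
    counts = dict.fromkeys(alphabet, 0)
    for c in seq:
        if c in counts:
            counts[c] += 1
    total = max(float(sum(counts.values())), 1.0)
    return [counts[c] / total for c in alphabet]

def shannon(freqs):
    # hypothetical helper: Shannon entropy (in bits) of a frequency vector
    return -sum(f * math.log(f, 2) for f in freqs if f > 0)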
Example 2
def read_preprocessed_data(out_fp="/tmp/"):
    """Read data of a previous preprocessing run.

    out_fp: output directory of previous preprocess run.
            Supposed to contain two files:
              - prefix_dereplicated.fasta
              - prefix_mapping.txt
    """
    # read mapping, and extract seqs
    # mapping has fasta_header like this:
    #  > id:   count

    seqs = dict([(a.split(':')[0], b) for (a, b) in (
        MinimalFastaParser(open(out_fp + "/prefix_dereplicated.fasta")))])

    mapping = read_denoiser_mapping(open(out_fp + "/prefix_mapping.txt"))

    return (out_fp + "/prefix_dereplicated.sff.txt", len(mapping), mapping,
            seqs)
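A usage sketch (hypothetical output directory), assuming a previous preprocessing run left the two expected files under that directory:

sff_txt_fp, num_seqs, mapping, seqs = read_preprocessed_data("/tmp/run1")
print "%d dereplicated clusters read from %s" % (num_seqs, sff_txt_fp)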
Example 3
    def __call__(self, seq_path=None, result_path=None, log_path=None):
        """Returns a dict mapping {seq_id:(taxonomy, confidence)} for each seq

        Keep in mind, "confidence" is only done for consistency and in fact
        all assignments will have a score of 0 because a method for determining
        confidence is not currently implemented.

        Parameters:
        seq_path: path to file of sequences. The sequences themselves are
            never actually used, but they are needed for their ids.
        result_path: path to file of results. If specified, dumps the
            result to the desired path instead of returning it.
        log_path: path to log, which should include dump of params.
        """

        # initialize the logger
        logger = self._get_logger(log_path)
        logger.info(str(self))

        with open(seq_path, 'U') as f:
            seqs = dict(MinimalFastaParser(f))

        consensus_map = tax2tree.prep_consensus(
            open(self.Params['id_to_taxonomy_fp']), seqs.keys())
        seed_con = consensus_map[0].strip().split('\t')[1]
        determine_rank_order(seed_con)

        tipnames_map = load_consensus_map(consensus_map, False)

        tree = load_tree(open(self.Params['tree_fp']), tipnames_map)

        results = tax2tree.generate_constrings(tree, tipnames_map)
        results = tax2tree.clean_output(results, seqs.keys())

        if result_path:
            # if the user provided a result_path, write the
            # results to file
            with open(result_path, 'w') as f:
                for seq_id, (lineage, confidence) in results.iteritems():
                    f.write('%s\t%s\t%s\n' % (seq_id, lineage, confidence))
            logger.info('Result path: %s' % result_path)

        return results
Example 4
def main():
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    negate = opts.negate
    sample_ids = opts.sample_ids
    mapping_fp = opts.mapping_fp
    input_fasta_fp = opts.input_fasta_fp
    output_fasta_fp = opts.output_fasta_fp

    if not mapping_fp:
        sample_ids = sample_ids.split(',')
    else:
        map_data, map_header, map_comments = parse_mapping_file(mapping_fp)
        sample_ids = get_sample_ids(
            map_data, map_header,
            parse_metadata_state_descriptions(sample_ids))
        if len(sample_ids) == 0:
            raise ValueError,\
             "No samples match the search criteria: %s" % valid_states

    if opts.verbose:
        # This is useful when using the --valid_states feature so you can
        # find out if a search query didn't work as you expected before a
        # lot of time is spent
        print "Extracting samples: %s" % ', '.join(sample_ids)

    try:
        seqs = MinimalFastaParser(open(input_fasta_fp))
    except IOError:
        option_parser.error('Cannot open %s. Does it exist? Do you have read access?'%\
         input_fasta_fp)
        exit(1)

    try:
        output_fasta_f = open(output_fasta_fp, 'w')
    except IOError:
        option_parser.error("Cannot open %s. Does path exist? Do you have write access?" %\
         output_fasta_fp)
        exit(1)

    for r in extract_seqs_by_sample_id(seqs, sample_ids, negate):
        output_fasta_f.write('>%s\n%s\n' % r)
    output_fasta_f.close()
Example 5
def fix_abundance_labels(output_consensus_fp, filtered_consensus_fp):
    """ puts size= part of label as second component after white space

    output_consensus_fp: consensus filepath with abundance data
    filtered_consensus_fp: output filepath name
    """

    consensus_f = open(output_consensus_fp, "U")

    filtered_f = open(filtered_consensus_fp, "w")

    for label, seq in MinimalFastaParser(consensus_f):
        fasta_label = label.split()[0]
        size = "size=" + label.split('size=')[1].replace(';', '')
        final_label = "%s;%s" % (fasta_label, size)
        filtered_f.write(">%s\n%s\n" % (final_label, seq))

    consensus_f.close()
    filtered_f.close()
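A minimal sketch of the label rewriting performed above, on a hypothetical input label:

label = "denovo12 size=25;"
fasta_label = label.split()[0]                              # "denovo12"
size = "size=" + label.split('size=')[1].replace(';', '')   # "size=25"
print "%s;%s" % (fasta_label, size)                         # denovo12;size=25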
Example 6
def get_aligned_muscle(seq1, seq2):
    """Returns aligned sequences and frac_same using MUSCLE.

        This needs to be moved to the muscle app controller
    """
    outname = get_tmp_filename()
    res = muscle_seqs([seq1, seq2],
                      add_seq_names=True,
                      WorkingDir="/tmp",
                      out_filename=outname)
    seq1_aligned, seq2_aligned = list(
        MinimalFastaParser(res['MuscleOut'].read()))
    res.cleanUp()
    del (res)
    seq1_aligned = seq1_aligned[1][1:]
    seq2_aligned = seq2_aligned[1][1:]
    frac_same = (array(seq1_aligned, 'c') == array(seq2_aligned, 'c')).sum(0)\
            / min(len(seq1), len(seq2))

    return seq1_aligned, seq2_aligned, frac_same
Example 7
def write_combined_fasta(fasta_name_to_sample_id,
                         fasta_files,
                         output_dir=".",
                         counter=0):
    """ Writes combined, enumerated fasta file
    
    fasta_name_to_sample_id:  dict of fasta file name to SampleID
    fasta_files: list of filepaths to iterate through
    output_dir:  output directory to write combined file to
    counter:  Starting number to enumerate sequences with
    """
    
    combined_file_out = open(join(output_dir + "/", "combined_seqs.fna"), "w")
    
    for curr_fasta in fasta_files:
        for label, seq in MinimalFastaParser(open(curr_fasta, "U")):
            combined_file_out.write(">%s_%d %s\n" %\
             (fasta_name_to_sample_id[basename(curr_fasta)], counter, label))
            combined_file_out.write("%s\n" % seq)
            counter += 1
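A usage sketch with hypothetical file names and sample IDs, assuming the output directory already exists; each output record gets a new ">SampleID_counter" identifier while the original label is kept after a space:

fasta_name_to_sample_id = {"sampleA.fna": "S1", "sampleB.fna": "S2"}
write_combined_fasta(fasta_name_to_sample_id,
                     ["/tmp/sampleA.fna", "/tmp/sampleB.fna"],
                     output_dir="/tmp/combined")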
Example 8
    def test_call_alt_input_types(self):
        """BlastTaxonAssigner.__call__ functions w alt input types """
        p = BlastTaxonAssigner({\
         'reference_seqs_filepath':self.reference_seqs_fp,\
         'id_to_taxonomy_filepath':self.id_to_taxonomy_fp})

        # neither seqs or seq_fp passed results in AssertionError
        self.assertRaises(AssertionError, p)

        # Functions with a list of (seq_id, seq) pairs
        seqs = list(MinimalFastaParser(open(self.input_seqs_fp)))
        actual = p(seqs=seqs)
        self.assertEqual(actual, self.expected1)

        # Functions with input path
        actual = p(self.input_seqs_fp)
        self.assertEqual(actual, self.expected1)

        # same result when passing fp or seqs
        self.assertEqual(p(seqs=seqs), p(self.input_seqs_fp))
Example 9
def kegg_fasta_to_codon_list(lines):
    """Reads list of CodonUsage objects from KEGG-format FASTA file."""
    result = []
    for label, seq in MinimalFastaParser(lines):
        seq = seq.upper()
        curr_info = {}
        fields = label.split()
        curr_info['SpeciesAbbreviation'], curr_info['GeneId'] = \
            fields[0].split(':')
        if len(fields) > 1: #additional annotation
            first_word = fields[1]
            if first_word.endswith(';'):    #gene label
                curr_info['Gene'] = first_word[:-1]
                curr_info['Description'] = ' '.join(fields[2:])
            else:
                curr_info['Description'] = ' '.join(fields[1:])
        curr_codon_usage = CodonUsage(seq_to_codon_dict(seq), Info=curr_info)
        curr_codon_usage.__dict__.update(curr_info)
        result.append(curr_codon_usage)
    return result
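A sketch of just the label parsing above, on a hypothetical KEGG-style header (independent of the CodonUsage construction):

label = "eco:b0001 thrL; thr operon leader peptide"
fields = label.split()
species, gene_id = fields[0].split(':')   # "eco", "b0001"
gene = fields[1][:-1]                     # "thrL" (trailing ';' stripped)
description = ' '.join(fields[2:])        # "thr operon leader peptide"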
Example 10
def cdhit_from_seqs(seqs, moltype, params=None):
    """Returns the CD-HIT results given seqs

    seqs    : dict like collection of sequences
    moltype : cogent.core.moltype object
    params  : cd-hit parameters

    NOTE: This method will call CD_HIT if moltype is PROTEIN,
        CD_HIT_EST if moltype is RNA/DNA, and raise if any other
        moltype is passed.
    """
    # keys are not remapped. Tested against seq_ids of 100char length
    seqs = SequenceCollection(seqs, MolType=moltype)

    # setup params and make sure the output argument is set
    if params is None:
        params = {}
    if '-o' not in params:
        params['-o'] = get_tmp_filename()

    # call the correct version of cd-hit base on moltype
    working_dir = get_tmp_filename()
    if moltype is PROTEIN:
        app = CD_HIT(WorkingDir=working_dir, params=params)
    elif moltype is RNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    elif moltype is DNA:
        app = CD_HIT_EST(WorkingDir=working_dir, params=params)
    else:
        raise ValueError, "Moltype must be either PROTEIN, RNA, or DNA"

    # grab result
    res = app(seqs.toFasta())
    new_seqs = dict(MinimalFastaParser(res['FASTA'].readlines()))

    # perform cleanup
    res.cleanUp()
    shutil.rmtree(working_dir)
    remove(params['-o'] + '.bak.clstr')

    return SequenceCollection(new_seqs, MolType=moltype)
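A usage sketch with hypothetical sequences, clustering DNA at 97% identity through the wrapper above (-c is cd-hit's identity threshold flag; DNA is the cogent MolType object used in the snippet):

seqs = {'seq1': 'ACGGTGGCTACGGAACGGTGGCTACGGA',
        'seq2': 'ACGGTGGCTACGGAACGGTGGCTACGGT'}
clustered = cdhit_from_seqs(seqs, DNA, params={'-c': 0.97})
print clustered.toFasta()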
Example 11
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)
    suppress_errors = opts.suppress_errors

    input_fps = []
    for input_fp in opts.input_fps.split(','):
        input_fps.extend(glob(input_fp))

    for input_fp in input_fps:
        i = 0
        try:
            input_f = open(input_fp, 'U')
        except IOError, e:
            if not suppress_errors:
                print input_fp, e
            continue
        for s in MinimalFastaParser(input_f):
            i += 1
        print input_fp, i
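The per-file count could equivalently be written as a one-liner; a minimal sketch using the same parser interface:

def count_seqs(fasta_f):
    # count fasta records without keeping them in memory
    return sum(1 for _ in MinimalFastaParser(fasta_f))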
Example 12
def align_unaligned_seqs(seqs, moltype, params=None):
    """Returns an Alignment object from seqs.

    seqs: SequenceCollection object, or data that can be used to build one.
    
    moltype: a MolType object.  DNA, RNA, or PROTEIN.

    params: dict of parameters to pass in to the Muscle app controller.
    
    Result will be an Alignment object.
    """
    if not params:
        params = {}
    #create SequenceCollection object from seqs
    seq_collection = SequenceCollection(seqs, MolType=moltype)
    #Create mapping between abbreviated IDs and full IDs
    int_map, int_keys = seq_collection.getIntMap()
    #Create SequenceCollection from int_map.
    int_map = SequenceCollection(int_map, MolType=moltype)
    #get temporary filename
    params.update({'-out': get_tmp_filename()})
    #Create Muscle app.
    app = Muscle(InputHandler='_input_as_multiline_string',\
                 params=params)
    #Get results using int_map as input to app
    res = app(int_map.toFasta())
    #Get alignment as dict out of results
    alignment = dict(MinimalFastaParser(res['MuscleOut'].readlines()))
    #Make new dict mapping original IDs
    new_alignment = {}
    for k, v in alignment.items():
        new_alignment[int_keys[k]] = v
    #Create an Alignment object from alignment dict
    new_alignment = Alignment(new_alignment, MolType=moltype)
    #Clean up
    res.cleanUp()
    del (seq_collection, int_map, int_keys, app, res, alignment, params)

    return new_alignment
Example 13
    def test_main(self):
        """Denoiser should always give same result on test data"""

        expected = """>FS8APND01D3TW3 | cluster size: 94 
CTCCCGTAGGAGTCTGGGCCGTATCTCAGTCCCAATGTGGCCGGTCACCCTCTCAGGCCGGCTACCCGTCAAAGCCTTGGTAAGCCACTACCCCACCAACAAGCTGATAAGCCGCGAGTCCATCCCCAACCGCCGAAACTTTCCAACCCCCACCATGCAGCAGGAGCTCCTATCCGGTATTAGCCCCAGTTTCCTGAAGTTATCCCAAAGTCAAGGGCAGGTTACTCACGTGTTACTCACCCGTTCGCC
"""

        expected_map = """FS8APND01EWRS4:
FS8APND01DXG45:
FS8APND01D3TW3:\tFS8APND01CSXFN\tFS8APND01DQ8MX\tFS8APND01DY7QW\tFS8APND01B5QNI\tFS8APND01CQ6OG\tFS8APND01C7IGN\tFS8APND01DHSGH\tFS8APND01DJ17E\tFS8APND01CUXOA\tFS8APND01EUTYG\tFS8APND01EKK7T\tFS8APND01D582W\tFS8APND01B5GWU\tFS8APND01D7N2A\tFS8APND01BJGHZ\tFS8APND01D6DYZ\tFS8APND01C6ZIM\tFS8APND01D2X6Y\tFS8APND01BUYCE\tFS8APND01BNUEY\tFS8APND01DKLOE\tFS8APND01C24PP\tFS8APND01EBWQX\tFS8APND01ELDYW\tFS8APND01B0GCS\tFS8APND01D4QXI\tFS8APND01EMYD9\tFS8APND01EA2SK\tFS8APND01DZOSO\tFS8APND01DHYAZ\tFS8APND01C7UD9\tFS8APND01BTZFV\tFS8APND01CR78R\tFS8APND01B39IE\tFS8APND01ECVC0\tFS8APND01DM3PL\tFS8APND01DELWS\tFS8APND01CIEK8\tFS8APND01D7ZOZ\tFS8APND01CZSAI\tFS8APND01DYOVR\tFS8APND01BX9XY\tFS8APND01DEWJA\tFS8APND01BEKIW\tFS8APND01DCKB9\tFS8APND01EEYIS\tFS8APND01DDKEA\tFS8APND01DSZLO\tFS8APND01C6EBC\tFS8APND01EE15M\tFS8APND01ELO9B\tFS8APND01C58QY\tFS8APND01DONCG\tFS8APND01DVXX2\tFS8APND01BL5YT\tFS8APND01BIL2V\tFS8APND01EBSYQ\tFS8APND01CCX8R\tFS8APND01B2YCJ\tFS8APND01B1JG4\tFS8APND01DJ024\tFS8APND01BIJY0\tFS8APND01CIA4G\tFS8APND01DV74M\tFS8APND01ECAX5\tFS8APND01DC3TZ\tFS8APND01EJVO6\tFS8APND01D4VFG\tFS8APND01DYYYO\tFS8APND01D1EDD\tFS8APND01DQUOT\tFS8APND01A2NSJ\tFS8APND01DDC8I\tFS8APND01BP1T2\tFS8APND01DPY6U\tFS8APND01CIQGV\tFS8APND01BPUT8\tFS8APND01BDNH4\tFS8APND01DOZDN\tFS8APND01DS866\tFS8APND01DGS2J\tFS8APND01EDK32\tFS8APND01EPA0T\tFS8APND01CK3JM\tFS8APND01BKLWW\tFS8APND01DV0BO\tFS8APND01DPNXE\tFS8APND01B7LUA\tFS8APND01BTTE2\tFS8APND01CKO4X\tFS8APND01DGGBY\tFS8APND01C4NHX\tFS8APND01DYPQN
FS8APND01BSTVP:
FS8APND01EFK0W:
FS8APND01DCIOO:
FS8APND01CKOMZ:
"""

        command = " ".join([
            "%s/denoiser.py" % get_qiime_scripts_dir(), "--force", "-o",
            self.test_dir, "-i",
            "%s/qiime/support_files/denoiser/TestData/denoiser_test_set.sff.txt"
            % PROJECT_HOME
        ])

        result = Popen(command,shell=True,universal_newlines=True,\
                           stdout=PIPE,stderr=STDOUT).stdout.read()
        self.result_dir = self.test_dir

        observed = "".join(list(open(self.result_dir + "centroids.fasta")))
        self.assertEqual(observed, expected)

        self.assertEqual(
            len(
                list(
                    MinimalFastaParser(
                        open(self.result_dir + "singletons.fasta")))), 6)

        observed = "".join(list(open(self.result_dir +
                                     "denoiser_mapping.txt")))
        self.assertEqual(observed, expected_map)
Example 14
def split_fasta(infile, seqs_per_file, outfile_prefix, working_dir=''):
    """ Split infile into files with seqs_per_file sequences in each
    
        infile: list of fasta lines or open file object
        seqs_per_file: the number of sequences to include in each file
        outfile_prefix: string used to create output filepaths - output filepaths
         are <outfile_prefix>.<i>.fasta where i runs from 0 to number of output files
        working_dir: directory to prepend to temp filepaths (defaults to 
         empty string -- files written to cwd)
         
        List of output filepaths is returned.
    
    """
    if seqs_per_file <= 0:
        raise ValueError("seqs_per_file must be > 0!")

    seq_counter = 0
    out_files = []
    if working_dir and not working_dir.endswith('/'):
        working_dir += '/'
        create_dir(working_dir)
    
    for seq_id,seq in MinimalFastaParser(infile):
        if seq_counter == 0:
            current_out_fp = '%s%s.%d.fasta' \
              % (working_dir,outfile_prefix,len(out_files))
            current_out_file = open(current_out_fp, 'w')
            out_files.append(current_out_fp)
        current_out_file.write('>%s\n%s\n' % (seq_id, seq))
        seq_counter += 1
        
        if seq_counter == seqs_per_file:
            current_out_file.close()
            seq_counter = 0
    
    if not current_out_file.closed:
        current_out_file.close()

    return out_files
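A usage sketch with hypothetical paths, splitting a fasta file into chunks of 100 sequences; the working directory is created by the function if it does not end in '/':

# produces /tmp/chunks/seqs.0.fasta, /tmp/chunks/seqs.1.fasta, ...
infile = open('/tmp/input.fasta', 'U')
chunk_fps = split_fasta(infile, 100, 'seqs', working_dir='/tmp/chunks')
infile.close()
print chunk_fps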
Example 15
    def test_insert_sequences_into_tree(self):
        """Inserts sequences into Tree"""

        params = {}
        # generate temp filename for output
        params["-r"] = self.refseq_fname
        params["-t"] = self.tree_fname
        params["-s"] = self.stats_fname
        params["--out-dir"] = "/tmp"

        aln_ref_query = MinimalFastaParser(StringIO(QUERY_SEQS))
        aln = Alignment(aln_ref_query)
        seqs, align_map = aln.toPhylip()
        tree = insert_sequences_into_tree(seqs, DNA, params=params,
                                          write_log=False)

        # rename tips back to query names
        for node in tree.tips():
            if node.Name in align_map:
                node.Name = align_map[node.Name]

        self.assertEqual(tree.getNewick(with_distances=True), RESULT_TREE)
Example 16
    def test_gt_bracket_in_seq(self):
        """MinimalFastaParser handles alternate finder function
            
            this test also illustrates how to use the MinimalFastaParser
            to handle "sequences" that start with a > symbol, which can
            happen when we abuse the MinimalFastaParser to parse
            fasta-like sequence quality files.
        """
        oneseq_w_gt = '>abc\n>CAG\n'.split('\n')

        def get_two_line_records(infile):
            line1 = None
            for line in infile:
                if line1 == None:
                    line1 = line
                else:
                    yield (line1, line)
                    line1 = None

        f = list(MinimalFastaParser(oneseq_w_gt, finder=get_two_line_records))
        self.assertEqual(len(f), 1)
        a = f[0]
        self.assertEqual(a, ('abc', '>CAG'))
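The same trick extends to multi-record input: because the finder pairs every two lines, a second record whose "sequence" line also begins with > parses cleanly. A minimal sketch reusing get_two_line_records from the test above (expected pairing shown in the comment, under the same assumptions as the test):

lines = '>abc\n>CAG\n>def\n>TTG\n'.split('\n')
records = list(MinimalFastaParser(lines, finder=get_two_line_records))
# records == [('abc', '>CAG'), ('def', '>TTG')]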
Example 17
def compute_sample_novelty(table_fs, rep_set_f, verbose=False):
    """"""
    ref_otus = [seq_id.split()[0] for seq_id, _ in
                MinimalFastaParser(rep_set_f)]

    # {sample_id: [novel_count, known_count, [novel_obs_ids]]}
    sample_novelty = defaultdict(lambda: [0, 0, []])
    tables_processed = 0
    for table_f in table_fs:
        table = parse_biom_table(table_f)
        novel_obs = set(table.ObservationIds) - set(ref_otus)

        for counts, obs_id, _ in table.iterObservations():
            if obs_id in novel_obs:
                for sid, count in zip(table.SampleIds, counts):
                    if count > 0:
                        sample_novelty[sid][0] += count
                        sample_novelty[sid][2].append(obs_id)
            else:
                for sid, count in zip(table.SampleIds, counts):
                    sample_novelty[sid][1] += count

        tables_processed += 1
        if verbose:
            print "Processed %d table(s)." % tables_processed

    results = []
    for sid, (novel_count, known_count, novel_obs_ids) in \
            sample_novelty.items():
        percent_novel_seqs = (novel_count / (known_count + novel_count)) * 100

        # Create a set first in case a sample in multiple tables has the same
        # novel observations.
        num_new_obs = len(set(novel_obs_ids))
        results.append((sid, num_new_obs, percent_novel_seqs))

    return sorted(results, reverse=True, key=itemgetter(1))
Example 18
def parse_fasta(lines):
    """lightweight parser for KEGG FASTA format sequences"""
    for label, seq in MinimalFastaParser(lines):
        yield '\t'.join(list(kegg_label_fields(label)) \
          + [seq] + ["\n"])
Example 19
def usearch61_chimera_check(input_seqs_fp,
                            output_dir,
                            reference_seqs_fp=None,
                            suppress_usearch61_intermediates=False,
                            suppress_usearch61_ref=False,
                            suppress_usearch61_denovo=False,
                            split_by_sampleid=False,
                            non_chimeras_retention="union",
                            usearch61_minh=0.28,
                            usearch61_xn=8.0,
                            usearch61_dn=1.4,
                            usearch61_mindiffs=3,
                            usearch61_mindiv=0.8,
                            usearch61_abundance_skew=2.0,
                            percent_id_usearch61=0.97,
                            minlen=64,
                            word_length=8,
                            max_accepts=1,
                            max_rejects=8,
                            verbose=False,
                            threads=1.0,
                            HALT_EXEC=False):
    """ Main convenience function for usearch61 chimera checking
    
    input_seqs_fp:  filepath of input fasta file.
    output_dir:  output directory
    reference_seqs_fp: fasta filepath for reference chimera detection.
    suppress_usearch61_intermediates:  Suppress retention of .uc and log files.
    suppress_usearch61_ref:  Suppress usearch61 reference chimera detection.
    suppress_usearch61_denovo:  Suppress usearch61 de novo chimera detection.
    split_by_sampleid:  Split by sample ID for de novo chimera detection.
    non_chimeras_retention: Set to "union" or "intersection" to retain 
     non-chimeras between de novo and reference based results.
    usearch61_minh: Minimum score (h) to be classified as chimera.
     Increasing this value tends to reduce the number of false positives (and
     also sensitivity).
    usearch61_xn:  Weight of "no" vote.  Increasing this value tends to reduce
     the number of false positives (and also sensitivity).
    usearch61_dn:  Pseudo-count prior for "no" votes. (n). Increasing this
     value tends to reduce the number of false positives (and also sensitivity).
    usearch61_mindiffs:  Minimum number of diffs in a segment. Increasing this
     value tends to reduce the number of false positives while reducing 
     sensitivity to very low-divergence chimeras.
    usearch61_mindiv:  Minimum divergence, i.e. 100% - identity between the 
     query and closest reference database sequence. Expressed as a percentage,
     so the default is 0.8%, which allows chimeras that are up to 99.2% similar
     to a reference sequence.
    usearch61_abundance_skew: abundance skew for de novo chimera comparisons.
    percent_id_usearch61: identity to cluster sequences at
    minlen: minimum sequence length for use with usearch61
    word_length: length of nucleotide 'words' for usearch61
    max_accepts: max number of accepts for hits with usearch61
    max_rejects: max number of rejects for usearch61, increasing allows more
     sensitivity at a cost of speed
    threads: Specify number of threads used per core per CPU
    HALT_EXEC=application controller option to halt execution and print command
    """
    """ 
    Need to cluster sequences de novo first to get 1. abundance information
    and 2 consensus sequence for each cluster.  Using dereplication followed
    by clustering does not appear to automatically update complete cluster 
    size, will directly cluster raw seqs with the small_mem clustering option.
    
    This means without additional parsing steps to recalculate 
    actual cluster sizes, the sizeorder option can't be used for de novo
    clustering and downstream chimera detection."""

    files_to_remove = []

    # Get absolute paths to avoid issues with calling usearch
    input_seqs_fp = abspath(input_seqs_fp)
    output_dir = abspath(output_dir)
    if reference_seqs_fp:
        reference_seqs_fp = abspath(reference_seqs_fp)
    log_fp = join(output_dir, "identify_chimeric_seqs.log")
    chimeras_fp = join(output_dir, "chimeras.txt")
    non_chimeras_fp = join(output_dir, "non_chimeras.txt")

    non_chimeras = []
    chimeras = []
    log_lines = {
        'denovo_chimeras': 0,
        'denovo_non_chimeras': 0,
        'ref_chimeras': 0,
        'ref_non_chimeras': 0
    }

    if split_by_sampleid:
        if verbose:
            print "Splitting fasta according to SampleID..."
        full_seqs = open(input_seqs_fp, "U")
        sep_fastas =\
         split_fasta_on_sample_ids_to_files(MinimalFastaParser(full_seqs),
         output_dir)
        full_seqs.close()

        if suppress_usearch61_intermediates:
            files_to_remove += sep_fastas

        for curr_fasta in sep_fastas:
            curr_chimeras, curr_non_chimeras, files_to_remove, log_lines =\
             identify_chimeras_usearch61(curr_fasta, output_dir,
             reference_seqs_fp, suppress_usearch61_intermediates,
             suppress_usearch61_ref, suppress_usearch61_denovo,
             non_chimeras_retention, usearch61_minh, usearch61_xn,
             usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
             usearch61_abundance_skew, percent_id_usearch61, minlen,
             word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
             log_lines, verbose, threads)

            chimeras += curr_chimeras
            non_chimeras += curr_non_chimeras

    else:
        chimeras, non_chimeras, files_to_remove, log_lines =\
         identify_chimeras_usearch61(input_seqs_fp, output_dir,
         reference_seqs_fp, suppress_usearch61_intermediates,
         suppress_usearch61_ref, suppress_usearch61_denovo,
         non_chimeras_retention, usearch61_minh, usearch61_xn,
         usearch61_dn, usearch61_mindiffs, usearch61_mindiv,
         usearch61_abundance_skew, percent_id_usearch61, minlen,
         word_length, max_accepts, max_rejects, files_to_remove, HALT_EXEC,
         log_lines, verbose, threads)

    # write log, non chimeras, chimeras.
    write_usearch61_log(
        log_fp, input_seqs_fp, output_dir, reference_seqs_fp,
        suppress_usearch61_intermediates, suppress_usearch61_ref,
        suppress_usearch61_denovo, split_by_sampleid, non_chimeras_retention,
        usearch61_minh, usearch61_xn, usearch61_dn, usearch61_mindiffs,
        usearch61_mindiv, usearch61_abundance_skew, percent_id_usearch61,
        minlen, word_length, max_accepts, max_rejects, HALT_EXEC, log_lines)

    chimeras_f = open(chimeras_fp, "w")
    non_chimeras_f = open(non_chimeras_fp, "w")
    for curr_chimera in chimeras:
        chimeras_f.write("%s\n" % curr_chimera)
    for curr_non_chimera in non_chimeras:
        non_chimeras_f.write("%s\n" % curr_non_chimera)
    chimeras_f.close()
    non_chimeras_f.close()

    remove_files(files_to_remove)
Example 20
def get_chimeras_from_Nast_aligned(seqs_fp,
                                   ref_db_aligned_fp=None,
                                   ref_db_fasta_fp=None,
                                   HALT_EXEC=False,
                                   min_div_ratio=None,
                                   keep_intermediates=False):
    """remove chimeras from seqs_fp using chimeraSlayer.

    seqs_fp:  a filepath with the seqs to check in the file
    ref_db_aligned_fp: fp to (pynast) aligned reference sequences
    ref_db_fasta_fp: same seqs as above, just unaligned. Will be computed on the fly if not provided.
    HALT_EXEC: stop execution if true
    min_div_ratio: passed to ChimeraSlayer App
    """

    files_to_remove = []
    #might come in as FilePath object with quotes
    seqs_fp = str(seqs_fp)
    seqs_fp = seqs_fp.rstrip('"')
    seqs_fp = seqs_fp.lstrip('"')

    seqs_dir, new_seqs_fp = split(seqs_fp)

    #if fp is in current dir, we fake a dir change
    if seqs_dir == "":
        seqs_dir = "./"

    #Chimera Slayer puts some temp files in current dir and some in dir of input file
    #use exe_dir to change to dir of input file, so to have all tmp files in one place
    params = {'--query_NAST': new_seqs_fp, '--exec_dir': seqs_dir}

    if ref_db_aligned_fp == None and ref_db_fasta_fp == None:
        #use default db, whose relative position to the
        #ChimeraSlayer binary is hardcoded
        pass

    else:
        if not ref_db_fasta_fp:
            #make degapped reference file
            ref_db_fasta_fp = write_degapped_fasta_to_file(MinimalFastaParser( \
                    open(ref_db_aligned_fp)))
            files_to_remove.append(ref_db_fasta_fp)
        #use user db
        params.update({
            '--db_NAST': abspath(ref_db_aligned_fp),
            '--db_FASTA': abspath(ref_db_fasta_fp)
        })

    if min_div_ratio != None:
        params.update({'-R': min_div_ratio})

    app = ChimeraSlayer(params=params, HALT_EXEC=HALT_EXEC)
    app_results = app()

    #    this is a FilePath object in case of success.
    #    How can we test for failure here?
    #    if not exists(app_results['CPS']):
    #         raise ApplicationError, "ChimeraSlayer failed. No output file."

    chimeras = parse_CPS_file((app_results['CPS']))
    if not keep_intermediates:
        app.remove_intermediate_files()
        remove_files(files_to_remove)

    return chimeras
Example 21
def get_seqs_to_keep_lookup_from_prefix(fasta_f, prefix):
    seqs_to_keep = [
        seq_id for seq_id, seq in MinimalFastaParser(fasta_f)
        if seq_id.startswith(prefix)
    ]
    return {}.fromkeys(seqs_to_keep)
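A usage sketch with hypothetical fasta content; the returned dict is used purely as a membership lookup, so all values are None and the keys are the full labels (including any description after the ID):

fasta_f = '>S1_0 read one\nACGT\n>S2_0 read two\nGGCC\n'.split('\n')
lookup = get_seqs_to_keep_lookup_from_prefix(fasta_f, 'S1')
# lookup == {'S1_0 read one': None}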
Example 22
def assign_taxonomy(data,
                    min_confidence=0.80,
                    output_fp=None,
                    training_data_fp=None,
                    max_memory=None):
    """ Assign taxonomy to each sequence in data with the RDP classifier 
    
        data: open fasta file object or list of fasta lines
        min_confidence: minimum support threshold to assign taxonomy to a sequence
        output_fp: path to write output; if not provided, result will be 
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    
    """
    data = list(data)

    # build a map of seq identifiers as the RDP classifier doesn't
    # preserve these perfectly
    identifier_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        identifier_lookup[seq_id.split()[0]] = seq_id

    # build the classifier object
    app = RdpClassifier20()
    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)
    if training_data_fp is not None:
        app.Parameters['-training-data'].on(training_data_fp)

    # apply the rdp app controller
    rdp_result = app('\n'.join(data))
    # grab assignment output
    result_lines = rdp_result['Assignments']

    # start a list to store the assignments
    results = {}

    # ShortSequenceException messages are written to stdout
    # Tag these ID's as unassignable
    stdout_lines = rdp_result['StdOut']
    for line in stdout_lines:
        if line.startswith('ShortSequenceException'):
            matchobj = re.search('recordID=(\S+)', line)
            if matchobj:
                rdp_id = matchobj.group(1)
                orig_id = identifier_lookup[rdp_id]
                results[orig_id] = ('Unassignable', 1.0)

    # iterate over the identifier, assignment strings (this is a bit
    # of an abuse of the MinimalFastaParser, as these are not truely
    # fasta lines)
    for identifier, assignment_str in MinimalFastaParser(result_lines):
        # get the original identifier from the one in the rdp result
        identifier = identifier_lookup[\
         identifier[:identifier.index('reverse=')].strip()]
        # build a list to store the assignments we're confident in
        # (i.e., the ones that have a confidence greater than min_confidence)
        confident_assignments = []
        # keep track of the lowest acceptable confidence value that
        # has been encountered
        lowest_confidence = 0.0

        # split the taxonomy assignment string
        assignment_fields = assignment_str.split(';')
        # iterate over (assignment, assignment confidence) pairs
        for i in range(0, len(assignment_fields), 2):
            assignment = assignment_fields[i]
            try:
                assignment_confidence = float(assignment_fields[i + 1])
            except IndexError:
                break
            # check the confidence of the current assignment
            if assignment_confidence >= min_confidence:
                # if the current assignment confidence is greater than
                # the min, store the assignment and confidence value
                confident_assignments.append(assignment.strip())
                lowest_confidence = assignment_confidence
            else:
                # otherwise, we've made it to the lowest assignment that
                # met the confidence threshold, so bail out of the loop
                break

        # store the identifier, the semi-colon-separated assignments, and the
        # confidence for the last assignment
        results[identifier] = \
             (';'.join(confident_assignments),lowest_confidence)

    if output_fp:
        try:
            output_file = open(output_fp, 'w')
        except OSError:
            raise OSError, "Can't open output file for writing: %s" % output_fp

        for seq_id, values in results.items():
            output_file.write('%s\t%s\t%1.3f\n' %
                              (seq_id, values[0], values[1]))

        output_file.close()
        return None
    else:
        return results
Example 23
def process_silva(seqs, tax_out, seq_out):
    for label, seq in MinimalFastaParser(seqs):
        new_header, taxonomy = parse_label(label)
        fixed_seq = parse_seq(seq)
        tax_out.write(new_header + '\t' + taxonomy + '\n')
        seq_out.write('>' + new_header + '\n' + fixed_seq + '\n')
Example 24
def assign_dna_reads_to_protein_database(query_fasta_fp,
                                         database_fasta_fp,
                                         output_fp,
                                         temp_dir="/tmp",
                                         params=None):
    """Assign DNA reads to a database fasta of protein sequences.

    Wraps assign_reads_to_database, setting database and query types. All
    parameters are set to default unless params is passed. A temporary
    file must be written containing the translated sequences from the input
    query fasta file because BLAT cannot do this automatically.

    query_fasta_fp: absolute path to the query fasta file containing DNA
                   sequences.
    database_fasta_fp: absolute path to the database fasta file containing
                      protein sequences.
    output_fp: absolute path where the output file will be generated.
    temp_dir: optional. Change the location where the translated sequences
              will be written before being used as the query. Defaults to
              /tmp.
    params: optional. dict containing parameter settings to be used
                  instead of default values. Cannot change database or query
                  file types from protein and dna, respectively.

    This method returns an open file object. The output format
    defaults to blast9 and should be parsable by the PyCogent BLAST parsers.
    """
    if params is None:
        params = {}

    my_params = {'-t': 'prot', '-q': 'prot'}

    # make sure temp_dir specifies an absolute path
    if not isabs(temp_dir):
        raise ApplicationError("temp_dir must be an absolute path.")

    # if the user specified parameters other than default, then use them.
    # However, if they try to change the database or query types, raise an
    # application error.
    if '-t' in params or '-q' in params:
        raise ApplicationError(
            "Cannot change database or query types "
            "when using assign_dna_reads_to_protein_database. Use "
            "assign_reads_to_database instead.")

    if 'genetic_code' in params:
        my_genetic_code = GeneticCodes[params['genetic_code']]
        del params['genetic_code']
    else:
        my_genetic_code = GeneticCodes[1]

    my_params.update(params)

    # get six-frame translation of the input DNA sequences and write them to
    # temporary file.
    tmp = get_tmp_filename(tmp_dir=temp_dir, result_constructor=str)
    tmp_out = open(tmp, 'w')

    for label, sequence in MinimalFastaParser(open(query_fasta_fp)):
        seq_id = label.split()[0]

        s = DNA.makeSequence(sequence)
        translations = my_genetic_code.sixframes(s)
        frames = [1, 2, 3, -1, -2, -3]
        translations = dict(zip(frames, translations))

        for frame, translation in sorted(translations.iteritems()):
            entry = '>{seq_id}_frame_{frame}\n{trans}\n'
            entry = entry.format(seq_id=seq_id, frame=frame, trans=translation)
            tmp_out.write(entry)

    tmp_out.close()
    result = assign_reads_to_database(tmp,
                                      database_fasta_fp,
                                      output_fp,
                                      params=my_params)

    remove(tmp)

    return result
Example 25
def get_hits_data(primer, 
                  primer_id,
                  fasta_fp,
                  tp_len,
                  last_base_mm,
                  tp_mm,
                  non_tp_mm,
                  tp_gap,
                  non_tp_gap):
    """ Finds mismatches, gaps, scores for primer/seqs sets
    
    Returns a list of lines of hits data for writing to the output hits file,
    and a list of lists containing the mismatches, gaps, and weighted scores
    for writing a histogram file.
    
    primer: current primer (DNA.Sequence object)
    primer_id: current primer name
    fasta_fp: open fasta file object to test primers against
    seq_collection: tuple of (collection_id, seq_collection), with id based upon
     root name of fasta file, collection is degapped SequenceCollection object
    tp_len: three prime length
    last_base_mm: penalty for last base mismatch
    tp_mm: three prime mismatch penalty
    non_tp_mm: non three prime mismatch penalty
    tp_gap: penalty for three prime gaps
    non_tp_gap: penalty for non three prime gaps
    """
    
    
    
    
    # Contains header, parameters, comments for the output hits file
    hits_lines = ["# Primer: %s 5'-%s-3'" % (primer.Name, primer),
     '# Input fasta file: %s' % basename(fasta_fp.name),
     '# Parameters',
     '# 3\' length: %d' % tp_len,
     '# non 3\' mismatch penalty: %1.2f per mismatch' % non_tp_mm,
     '# 3\' mismatch penalty: %1.2f per mismatch' % tp_mm,
     '# last base mismatch penalty: %1.2f' % last_base_mm,
     '# non 3\' gap penalty: %1.2f per gap' % non_tp_gap,
     '# 3\' gap penalty: %1.2f per gap' % tp_gap,
     '# Note - seq hit and primer hit are the best local pairwise alignment '+\
     'results for a given sequence and primer pair.  A gap in seq hit '+\
     'represents a '+\
     'deletion in the sequence, whereas a gap in the primer hit signifies '+\
     'an insertion in the target sequence.\n#\n'
     '# seq ID, seq hit, primer hit, hit start position, non 3\' mismatches, '+\
     '3\' mismatches (except last base), last base mismatch, '+\
     'non 3\' gaps, 3\' gaps, overall weighted score, '+\
     'hits sequence end ']
     
     
    # Calculate range of GC content, accounting for degeneracies
    min_gc = sum([primer.count(c) for c in 'GCS']) / len(primer)
    max_gc = sum([primer.count(c) for c in 'GCSNRYKMBDHV']) / len(primer)
    
    
    # Put together strings for text in output summary graphs
    degen_gc_content = '%s; Degeneracy: %d; GC content %.2f - %.2f'%\
     (primer_id, primer.possibilities(), min_gc, max_gc)
    primer_title = '\n5\'-%s-3\'' % str(primer)
    seq_collection_title = '\nSequences tested: ' + basename(fasta_fp.name)
    figure_title = degen_gc_content + primer_title + seq_collection_title
    
    # Weighted score strings for the bottom of the histogram, following
    # weighted score results.
    tp_len_title = '3\' length: %d nucleotides' % tp_len
    weighted_score_info = "\nWeighted score = non-3' mismatches * "+\
     "%1.2f + 3' mismatches * %1.2f + non 3' gaps * %1.2f + 3\' gaps * %1.2f" %\
     (non_tp_mm, tp_mm, non_tp_gap, tp_gap)
    last_base_info = '\nAn additional %1.2f penalty is assigned if the ' %\
     last_base_mm + 'final 3\' base mismatches'
    rounded_clause = '\nWeighted score is rounded to the nearest whole '+\
     'number in this graphical display'
    weighted_score_subtext = tp_len_title + weighted_score_info +\
     last_base_info + rounded_clause
    
    
    # Set upper limit for purpose of displaying data on histograms
    max_mm = 5
    max_gaps = 5
    max_weighted_score = 5.0
    
    non_tp_mm_data = []
    tp_mm_data = []
    non_tp_gap_data = []
    tp_gap_data = []
    weighted_score_data = []
    last_base_mm_data = []
    
    
    # get primer length to test for hitting sequence end
    primer_len = len(primer)
    primer_seq = primer_to_match_query(primer)
    
    for label, seq in MinimalFastaParser(fasta_fp):
        primer_hit, target_hit, hit_start = \
         local_align_primer_seq(primer_seq, seq)
        # Get score, numbers of gaps/mismatches
        weighted_score, non_tp_gaps, tp_gaps, non_tp_mismatches,\
         tp_mismatches, last_base_mismatches = score_primer(primer, primer_hit,
         target_hit, tp_len, last_base_mm,
         tp_mm, non_tp_mm, tp_gap, non_tp_gap)
        
        # Append data to lists for generating histograms
        # Max value appended to this list capped for purposes of readability
        # in the output histogram
        if non_tp_mismatches <= max_mm:
            non_tp_mm_data.append(non_tp_mismatches)
        else:
            non_tp_mm_data.append(max_mm)
            
        if tp_mismatches <= max_mm:
            tp_mm_data.append(tp_mismatches)
        else:
            tp_mm_data.append(max_mm)
            
        if non_tp_gaps <= max_gaps:
            non_tp_gap_data.append(non_tp_gaps)
        else:
            non_tp_gap_data.append(max_gaps)
            
        if tp_gaps <= max_gaps:
            tp_gap_data.append(tp_gaps)
        else:
            tp_gap_data.append(max_gaps)
            
        if weighted_score <= max_weighted_score:
            weighted_score_data.append(float('%2.2f' % weighted_score))
        else:
            weighted_score_data.append(max_weighted_score)
            
        if last_base_mismatches:
            last_base_mm_data.append(1)
        else:
            last_base_mm_data.append(0)
         
        # Determine if primer hits sequence end
        # Difficult to use this in scoring, but can be parsed out if one wants
        # to determine if primer sequences were left in fasta sequences
        hits_sequence_end = hits_seq_end(seq, hit_start, primer_len)
        
        # Append hit info for output hits file data
        # Label is split to just contain fasta ID
        hits_lines.append(','.join(map(str,[label.split()[0], target_hit, 
                primer_hit, hit_start, non_tp_mismatches, tp_mismatches,
                bool(last_base_mismatches), non_tp_gaps, tp_gaps,
                weighted_score, hits_sequence_end])))
                
    
    # Make list of all histogram data lists so only one data item being
    # passed around
    hist_data = [non_tp_mm_data, tp_mm_data, non_tp_gap_data, tp_gap_data,
     weighted_score_data, last_base_mm_data, figure_title,
     weighted_score_subtext]

    return hits_lines, hist_data
Example 26
    def setUp(self):
        self.seqs = Alignment(dict(MinimalFastaParser(test_seqs.split())))
Example 27
def get_seqs_to_keep_lookup_from_fasta_file(fasta_f):
    """return the sequence ids within the fasta file"""
    return set(
        [seq_id.split()[0] for seq_id, seq in MinimalFastaParser(fasta_f)])
Example 28
def main():
    option_parser, opts, args =\
       parse_command_line_parameters(**script_info)

    input_fp = opts.input_fp
    output_fp = opts.output_fp

    min_count = opts.min_count
    max_count = opts.max_count
    min_count_fraction = opts.min_count_fraction
    if min_count_fraction < 0. or min_count_fraction > 1.:
        option_parser.error("min_count_fraction must be between 0 and 1")
    if min_count != 0 and min_count_fraction != 0:
        option_parser.error(
            "cannot specify both min_count and min_count_fraction")

    min_samples = opts.min_samples
    max_samples = opts.max_samples

    otu_ids_to_exclude_fp = opts.otu_ids_to_exclude_fp
    negate_ids_to_exclude = opts.negate_ids_to_exclude

    if not (min_count != 0 or \
            min_count_fraction != 0 or \
            not isinf(max_count) or \
            otu_ids_to_exclude_fp != None or \
            min_samples !=0 or not isinf(max_samples)):
        option_parser.error(
            "No filtering requested. Must provide either "
            "min counts, max counts, min samples, max samples, min_count_fraction, "
            "or exclude_fp (or some combination of those).")

    otu_table = parse_biom_table(open(opts.input_fp, 'U'))

    if min_count_fraction > 0:
        min_count = otu_table.sum() * min_count_fraction
        print otu_table.sum(), min_count

    output_f = open(opts.output_fp, 'w')

    otu_ids_to_keep = set(otu_table.ObservationIds)

    if otu_ids_to_exclude_fp:
        if otu_ids_to_exclude_fp.endswith('.fasta') or \
           otu_ids_to_exclude_fp.endswith('.fna'):
            otu_ids_to_exclude = set([
                id_.strip().split()[0] for id_, seq in MinimalFastaParser(
                    open(otu_ids_to_exclude_fp, 'U'))
            ])
        else:
            otu_ids_to_exclude = set([
                l.strip().split('\t')[0]
                for l in open(otu_ids_to_exclude_fp, 'U')
            ])

        otu_ids_to_keep -= otu_ids_to_exclude

    filtered_otu_table = filter_otus_from_otu_table(otu_table, otu_ids_to_keep,
                                                    min_count, max_count,
                                                    min_samples, max_samples,
                                                    negate_ids_to_exclude)
    output_f.write(format_biom_table(filtered_otu_table))
    output_f.close()
Example 29
def assign_taxonomy(
    data, min_confidence=0.80, output_fp=None, training_data_fp=None,
    fixrank=True, max_memory=None, tmp_dir=None):
    """Assign taxonomy to each sequence in data with the RDP classifier
    
        data: open fasta file object or list of fasta lines
        min_confidence: minimum support threshold to assign taxonomy to a sequence
        output_fp: path to write output; if not provided, result will be 
         returned in a dict of {seq_id:(taxonomy_assignment,confidence)}
    """
    # Going to iterate through this twice in succession, best to force
    # evaluation now
    data = list(data)

    # RDP classifier doesn't preserve identifiers with spaces
    # Use lookup table
    seq_id_lookup = {}
    for seq_id, seq in MinimalFastaParser(data):
        seq_id_lookup[seq_id.split()[0]] = seq_id

    app_kwargs = {}
    if tmp_dir is not None:
        app_kwargs['TmpDir'] = tmp_dir
    app = RdpClassifier(**app_kwargs)

    if max_memory is not None:
        app.Parameters['-Xmx'].on(max_memory)
    
    temp_output_file = tempfile.NamedTemporaryFile(
        prefix='RdpAssignments_', suffix='.txt', dir=tmp_dir)
    app.Parameters['-o'].on(temp_output_file.name)
    if training_data_fp is not None:
        app.Parameters['-t'].on(training_data_fp)

    if fixrank:
        app.Parameters['-f'].on('fixrank')
    else:
        app.Parameters['-f'].on('allrank')

    app_result = app(data)

    assignments = {}

    # ShortSequenceException messages are written to stdout
    # Tag these ID's as unassignable
    for line in app_result['StdOut']:
        excep = parse_rdp_exception(line)
        if excep is not None:
            _, rdp_id = excep
            orig_id = seq_id_lookup[rdp_id]
            assignments[orig_id] = ('Unassignable', 1.0)
    
    for line in app_result['Assignments']:
        rdp_id, direction, taxa = parse_rdp_assignment(line)
        if taxa[0][0] == "Root":
            taxa = taxa[1:]
        orig_id = seq_id_lookup[rdp_id]
        lineage, confidence = get_rdp_lineage(taxa, min_confidence)
        if lineage:
            assignments[orig_id] = (';'.join(lineage), confidence)
        else:
            assignments[orig_id] = ('Unclassified', 1.0)

    if output_fp:
        try:
            output_file = open(output_fp, 'w')
        except OSError:
            raise OSError("Can't open output file for writing: %s" % output_fp)
        for seq_id, assignment in assignments.items():
            lineage, confidence = assignment
            output_file.write(
                '%s\t%s\t%1.3f\n' % (seq_id, lineage, confidence))
        output_file.close()
        return None
    else:
        return assignments
Example 30
def filter_fasta_fp(input_seqs_fp, output_seqs_fp, seqs_to_keep, negate=False):
    """Filter a fasta file to include only sequences listed in seqs_to_keep """
    input_seqs = MinimalFastaParser(open(input_seqs_fp, 'U'))
    output_f = open(output_seqs_fp, 'w')
    return filter_fasta(input_seqs, output_f, seqs_to_keep, negate)
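A usage sketch with hypothetical paths and IDs, assuming seqs_to_keep is any collection of sequence identifiers supporting membership tests; passing negate=True would instead keep everything except the listed sequences:

ids_to_keep = set(['seq1', 'seq7'])
filter_fasta_fp('/tmp/all_seqs.fasta', '/tmp/kept_seqs.fasta', ids_to_keep)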