Esempio n. 1
0
def prefix_filter_flowgrams(flowgrams, squeeze=False):
    """Filter flowgrams by common prefixes.

    flowgrams: iterable source of flowgrams. Each item must provide a
    ``Name`` attribute and a ``toSeq(truncate=True)`` method.

    squeeze: if True, collapse all poly-X to X (via squeeze_seq) before
    the prefix map is built.

    Returns a tuple (l, orig_l, mapping) where
        mapping: the prefix map produced by build_prefix_map,
        l: number of keys in mapping,
        orig_l: l plus the summed length of all values in mapping
        (presumably the number of input flowgrams -- confirm against
        build_prefix_map).
    """
    # Collect (name, sequence) pairs. Generator expressions keep this lazy,
    # so the flowgrams are streamed into build_prefix_map one at a time
    # instead of being materialised as a list first.
    if squeeze:
        seqs = ((f.Name, squeeze_seq(str(f.toSeq(truncate=True))))
                for f in flowgrams)
    else:
        seqs = ((f.Name, str(f.toSeq(truncate=True))) for f in flowgrams)

    # get prefix mappings
    mapping = build_prefix_map(seqs)
    num_keys = len(mapping)
    num_total = sum(len(members) for members in mapping.values()) + num_keys

    return (num_keys, num_total, mapping)
Esempio n. 2
0
def prefix_filter_flowgrams(flowgrams, squeeze=False):
    """Filter flowgrams by common prefixes.

    flowgrams: iterable source of flowgrams. Each item must provide a
    ``Name`` attribute and a ``toSeq(truncate=True)`` method.

    squeeze: if True, collapse all poly-X to X (via squeeze_seq) before
    the prefix map is built.

    Returns a tuple (l, orig_l, mapping) where
        mapping: the prefix map produced by build_prefix_map,
        l: number of keys in mapping,
        orig_l: l plus the summed length of all values in mapping
        (presumably the number of input flowgrams -- confirm against
        build_prefix_map).
    """
    # Lazily pair every flowgram name with its truncated sequence string;
    # nothing is evaluated until build_prefix_map pulls from the generator.
    seqs = ((f.Name, str(f.toSeq(truncate=True))) for f in flowgrams)
    if squeeze:
        # Layer the homopolymer collapse on top of the same lazy stream.
        seqs = ((name, squeeze_seq(seq)) for name, seq in seqs)

    # get prefix mappings
    mapping = build_prefix_map(seqs)
    num_keys = len(mapping)
    num_total = num_keys + sum(len(members) for members in mapping.values())

    return (num_keys, num_total, mapping)
Esempio n. 3
0
    def __call__(self, seq_path, result_path=None, log_path=None):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences (read with MinimalFastaParser).
        result_path: path to file of results. If specified, the clusters
         are written there, one tab-separated line per cluster (running
         index, map key, then the members), and None is returned instead
         of the dict.
        log_path: path to log. If specified, str(self) followed by the
         log lines of this run is written there.

        If self.Params['Reverse'] is true every sequence is reversed before
        the prefix map is built, which effectively creates a suffix map.
        Descriptions are always stripped from the sequence identifiers
        (only the text before the first whitespace is kept).

        All files are opened in ``with`` blocks so they are closed even
        when parsing or writing fails.
        """
        log_lines = []

        # Read the option before touching the file system.
        reverse = self.Params['Reverse']

        # The records are streamed lazily, so the mapping has to be built
        # while the sequence file is still open.
        with open(seq_path) as seq_file:
            records = MinimalFastaParser(seq_file)
            if reverse:
                # Reverse the sequences prior to building the prefix map.
                # This effectively creates a suffix map.
                seqs = ((label.split()[0], seq[::-1])
                        for label, seq in records)
                log_lines.append(
                    'Seqs reversed for suffix mapping '
                    '(rather than prefix mapping).')
            else:
                seqs = ((label.split()[0], seq) for label, seq in records)

            # Build the mapping
            mapping = build_prefix_map(seqs)
        log_lines.append('Num OTUs: %d' % len(mapping))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, 'w') as result_file:
                for i, (otu_id, members) in enumerate(mapping.items()):
                    result_file.write(
                        '%s\t%s\n' % (i, '\t'.join([otu_id] + members)))
            result = None
            log_lines.append('Result path: %s' % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary. The map key is itself a member of its
            # cluster, so add it to the member list first.
            for key, members in mapping.items():
                members.append(key)
            result = dict(enumerate(mapping.values()))
            log_lines.append('Result path: None, returned as dict.')

        if log_path:
            # if the user provided a log file path, log the run
            with open(log_path, 'w') as log_file:
                log_file.write('\n'.join([str(self)] + log_lines))

        # return the result (note this is None if the data was
        # written to file)
        return result
Esempio n. 4
0
    def __call__(self, seq_path, result_path=None, log_path=None):
        """Returns dict mapping {otu_id:[seq_ids]} for each otu.

        Parameters:
        seq_path: path to file of sequences (read with MinimalFastaParser).
        result_path: path to file of results. If specified, the clusters
         are written there, one tab-separated line per cluster (running
         index, map key, then the members), and None is returned instead
         of the dict.
        log_path: path to log. If specified, str(self) followed by the
         log lines of this run is written there.

        If self.Params["Reverse"] is true every sequence is reversed before
        the prefix map is built, which effectively creates a suffix map.
        Descriptions are always stripped from the sequence identifiers
        (only the text before the first whitespace is kept).

        All files are opened in ``with`` blocks so they are closed even
        when parsing or writing fails.
        """
        log_lines = []

        # Read the option before touching the file system.
        reverse = self.Params["Reverse"]

        # The parser output is consumed lazily, so keep the sequence file
        # open until build_prefix_map has finished with it.
        with open(seq_path) as seq_file:
            records = MinimalFastaParser(seq_file)
            if reverse:
                # Reverse the sequences prior to building the prefix map.
                # This effectively creates a suffix map.
                seqs = ((label.split()[0], seq[::-1])
                        for label, seq in records)
                log_lines.append(
                    "Seqs reversed for suffix mapping "
                    "(rather than prefix mapping).")
            else:
                seqs = ((label.split()[0], seq) for label, seq in records)

            # Build the mapping
            mapping = build_prefix_map(seqs)
        log_lines.append("Num OTUs: %d" % len(mapping))

        if result_path:
            # if the user provided a result_path, write the
            # results to file with one tab-separated line per
            # cluster
            with open(result_path, "w") as result_file:
                for i, (otu_id, members) in enumerate(mapping.items()):
                    result_file.write(
                        "%s\t%s\n" % (i, "\t".join([otu_id] + members)))
            result = None
            log_lines.append("Result path: %s" % result_path)
        else:
            # if the user did not provide a result_path, store
            # the clusters in a dict of {otu_id:[seq_ids]}, where
            # otu_id is arbitrary. The map key is itself a member of its
            # cluster, so add it to the member list first.
            for key, members in mapping.items():
                members.append(key)
            result = dict(enumerate(mapping.values()))
            log_lines.append("Result path: None, returned as dict.")

        if log_path:
            # if the user provided a log file path, log the run
            with open(log_path, "w") as log_file:
                log_file.write("\n".join([str(self)] + log_lines))

        # return the result (note this is None if the data was
        # written to file)
        return result
Esempio n. 5
0
    def _prefilter_with_trie(self, seq_path):
        """Collapse the sequences in seq_path with a prefix map.

        seq_path: path to a sequence file readable by MinimalFastaParser.
        The file is read twice: once to build the prefix map and once to
        collect the retained records. Labels are truncated at the first
        whitespace before they are used as identifiers.

        Returns (filtered_seqs, mapping) where
            mapping: the result of build_prefix_map, with every key
            appended to its own list of members,
            filtered_seqs: list of (label, seq) tuples, in file order, for
            the records whose truncated label is a key of mapping.
        """
        # get the prefix map; the generator is consumed while the file is
        # still open
        with open(seq_path) as seq_file:
            mapping = build_prefix_map(
                (label.split()[0], seq)
                for label, seq in MinimalFastaParser(seq_file))

        # each key also belongs to the cluster it stands for
        for key, members in mapping.items():
            members.append(key)

        # collect the representative seqs
        filtered_seqs = []
        with open(seq_path) as seq_file:
            for label, seq in MinimalFastaParser(seq_file):
                label = label.split()[0]
                if label in mapping:
                    filtered_seqs.append((label, seq))
        return filtered_seqs, mapping
Esempio n. 6
0
    def _prefilter_with_trie(self, seq_path):
        """Collapse the sequences in seq_path with a prefix map.

        seq_path: path to a sequence file readable by MinimalFastaParser.
        The file is read twice: once to build the prefix map and once to
        collect the retained records. Labels are truncated at the first
        whitespace before they are used as identifiers.

        Returns (filtered_seqs, mapping) where
            mapping: the result of build_prefix_map, with every key
            appended to its own list of members,
            filtered_seqs: list of (label, seq) tuples, in file order, for
            the records whose truncated label is a key of mapping.
        """
        # First pass: build the prefix map from (truncated label, seq)
        # pairs. The pairs are produced lazily, so the map is built inside
        # the with block.
        with open(seq_path) as seq_file:
            trimmed = ((label.split()[0], seq)
                       for label, seq in MinimalFastaParser(seq_file))
            mapping = build_prefix_map(trimmed)

        # each key also belongs to the cluster it stands for
        for key, members in mapping.items():
            members.append(key)

        # Second pass: collect the representative seqs, i.e. the records
        # whose truncated label is a key of the map.
        with open(seq_path) as seq_file:
            trimmed = ((label.split()[0], seq)
                       for label, seq in MinimalFastaParser(seq_file))
            filtered_seqs = [(label, seq) for label, seq in trimmed
                             if label in mapping]
        return filtered_seqs, mapping