Exemple #1
0
    def label(self) :
        """Name the clusters via BlastN and rewrite the biom and fasta files.

        Steps, in order:
          1. read the 'cluster-fasta' file into self.seqdb
          2. if the 'label-missing' option is set, collect the ids of the
             rows in the 'cluster-biom' JSON whose label is empty or one of
             the placeholder values, and hand them to self.__fasta() along
             with the path 'missing.fasta' inside 'outdir'; the value it
             returns is used as the BlastN input instead of the full
             cluster fasta
          3. get the OTU names from BlastN.get_names()
          4. pass those names to BiomFile.change_otu_names() for the
             'cluster-biom' file
          5. read the label mapping back from the biom file and call
             self.__fasta() on 'cluster-fasta' with every key of self.seqdb

        Returns 0. Raises SystemExit(1) when 'label-missing' is set but no
        row has a missing label.
        """
        # exit() is injected by the site module for interactive use and is
        # not guaranteed to exist; sys.exit raises the same SystemExit
        import sys

        self.seqdb = self.__read_fasta(self.options['cluster-fasta'])
        blast_fname = self.options['cluster-fasta']

        # if we are only going to label the clusters without labels
        # then we need to find the names of those clusters and write a
        # fasta file containing only those sequences
        if self.options['label-missing'] :
            unlabelled = ("", "unknown", "error", "cannot label (matches multiple domains!)")

            # use a context manager so the biom file handle is always closed
            with open(self.options['cluster-biom']) as biom_file :
                biom_json = json.load(biom_file)

            missing = [ r['id'] for r in biom_json['rows'] if r['metadata']['label'] in unlabelled ]

            if not missing :
                self.log.error("there are no missing labels")
                sys.exit(1)

            self.log.info("%d clusters missing labels" % len(missing))
            blast_fname = self.__fasta(join(self.options['outdir'], 'missing.fasta'), missing)


        # parenthesised single-argument form prints the same text on python 2
        print("getting OTU names (this may take a while)...")
        otu_names = BlastN(self.options['verbose']).get_names(blast_fname, self.options['labels'], self.options['labels-similarity'], self.options['labels_db'])

        # rework the biom
        biom = BiomFile()
        biom.change_otu_names(self.options['cluster-biom'], otu_names)
        self.log.info("written %s" % self.options['cluster-biom'])

        # get the rest of the names and rewrite fasta
        otu_names = biom.get_label_mapping(self.options['cluster-biom'])
        self.__fasta(self.options['cluster-fasta'], self.seqdb.keys(), names=otu_names)

        return 0
Exemple #2
0
    def __biom(self, filename, samples, clustering, cluster_names) :
        """Fill a BiomFile with per-sample cluster counts and write it out.

        filename      -- passed to BiomFile.write_to()
        samples       -- iterable of sample objects; only those for which
                         contains(clustering.all()) is true are kept
        clustering    -- provides centroids(), all() and the clusters attribute
        cluster_names -- mapping from OTU id ("seance" + str(centroid key))
                         to a label; ids not present are labelled "unknown"

        For every kept sample and every cluster, the quantity stored is the
        sum of sample.seqcounts[read] over the reads in the cluster that are
        'in' the sample.
        """
        centroid_keys = clustering.centroids()
        clustered_keys = clustering.all()
        clusters = clustering.clusters

        kept_samples = []
        for candidate in samples :
            if candidate.contains(clustered_keys) :
                kept_samples.append(candidate)

        otus = []
        for key in centroid_keys :
            otu_id = "seance" + str(key)
            otus.append((otu_id, cluster_names.get(otu_id, "unknown")))

        table = BiomFile()
        table.set_samples(kept_samples)
        table.set_otus(otus)

        for sample_index, sample in enumerate(kept_samples) :
            for cluster_index, cluster in enumerate(clusters) :
                total = sum(sample.seqcounts[read] for read in cluster if read in sample)
                table.add_quantity(cluster_index, sample_index, total)

        table.write_to(filename)
        self.log.info("written %s" % filename)
Exemple #3
0
    def showlabels(self) :
        """Print the label mapping of the cluster biom file, one pair per line.

        The mapping is obtained from BiomFile.get_label_mapping() for the
        'cluster-biom' option. Each (key, value) pair it yields is joined
        with the 'delimiter' option and printed. Always returns 0.
        """
        separator = self.options['delimiter']
        mapping = BiomFile().get_label_mapping(self.options['cluster-biom'])

        for pair in mapping.iteritems() :
            # parenthesised single-argument form prints the same text on python 2
            print(separator.join(pair))

        return 0