def label(self) :
    """
    Name the clusters and write the names back to the biom and fasta files.

    Reads the cluster fasta (``self.options['cluster-fasta']``) into
    ``self.seqdb``, obtains OTU names from ``BlastN.get_names`` and hands
    them to ``BiomFile.change_otu_names`` for the cluster biom file
    (``self.options['cluster-biom']``).  The complete label mapping is then
    read back from the biom file and the cluster fasta is rewritten with it.

    When ``self.options['label-missing']`` is set, only clusters whose
    current label is empty or one of the known placeholder strings are
    looked up; their sequences are first written to ``missing.fasta`` in
    ``self.options['outdir']``.

    Returns 0 on success.  Raises SystemExit(1) when 'label-missing' is
    requested but every cluster already carries a label.
    """
    self.seqdb = self.__read_fasta(self.options['cluster-fasta'])
    blast_fname = self.options['cluster-fasta']

    # if we are only going to label the clusters without labels
    # then we need to find the names of those clusters and write a
    # fasta file containing only those sequences
    if self.options['label-missing'] :
        placeholder_labels = ("",
                              "unknown",
                              "error",
                              "cannot label (matches multiple domains!)")

        # context manager so the biom file handle is closed deterministically
        with open(self.options['cluster-biom']) as biom_file :
            rows = json.load(biom_file)['rows']

        missing = [r['id'] for r in rows
                   if r['metadata']['label'] in placeholder_labels]

        if not missing :
            self.log.error("there are no missing labels")
            # the bare exit() helper is injected by the site module for
            # interactive use; raising SystemExit yields the same exit status
            raise SystemExit(1)

        self.log.info("%d clusters missing labels" % len(missing))

        blast_fname = self.__fasta(join(self.options['outdir'], 'missing.fasta'), missing)

    # parenthesised single argument prints identically on Python 2 and 3
    print("getting OTU names (this may take a while)...")

    # NOTE(review): 'labels_db' uses an underscore while every other option
    # key here is hyphenated -- confirm this matches the option parser
    otu_names = BlastN(self.options['verbose']).get_names(
                        blast_fname,
                        self.options['labels'],
                        self.options['labels-similarity'],
                        self.options['labels_db'])

    # rework the biom
    biom = BiomFile()
    biom.change_otu_names(self.options['cluster-biom'], otu_names)

    self.log.info("written %s" % self.options['cluster-biom'])

    # get the rest of the names and rewrite fasta
    otu_names = biom.get_label_mapping(self.options['cluster-biom'])
    self.__fasta(self.options['cluster-fasta'], self.seqdb.keys(), names=otu_names)

    return 0
def showlabels(self) :
    """
    Print the label mapping stored in the cluster biom file.

    The mapping is obtained from ``BiomFile.get_label_mapping`` for
    ``self.options['cluster-biom']``.  Each (key, value) pair is written to
    stdout on its own line, joined with ``self.options['delimiter']``.
    Presumably the pairs are (cluster id, label) -- verify against BiomFile.

    Returns 0.
    """
    delim = self.options['delimiter']
    labels = BiomFile().get_label_mapping(self.options['cluster-biom'])

    # items() and print(<single string>) behave the same on Python 2 and 3,
    # unlike iteritems() and the print statement
    for pair in labels.items() :
        print(delim.join(pair))

    return 0