def unique(self, options):
    """Report sequences that are assigned to more than one genome.

    Logs a command banner, gathers the genome files selected by
    ``options``, and asks ``genome_tk.unique`` for duplicated sequences.
    The result is logged as a pass/fail line followed, on failure, by one
    section per genome pair listing the offending sequence identifiers.
    A time stamp is printed once reporting is complete.

    Parameters
    ----------
    options : object
        Command options; the ``genome_nt_dir`` and ``genome_ext``
        attributes are read and forwarded to ``self._genome_files``.

    Raises
    ------
    SystemExit
        With a non-zero status when ``self._check_nuclotide_seqs``
        rejects the genome files.
    """
    self.logger.info('')
    self.logger.info('*******************************************************************************')
    self.logger.info('[RefineM - unique] Ensuring sequences are assigned to a single genome.')
    self.logger.info('*******************************************************************************')

    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('[Warning] All files must contain nucleotide sequences.')
        # Invalid input is a failure: exit non-zero so that calling
        # scripts and pipelines can detect it.
        sys.exit(1)

    duplicates = genome_tk.unique(genome_files)

    self.logger.info('')
    if not duplicates:
        self.logger.info(' Pass: All sequences were identified exactly once.')
    else:
        self.logger.info(' Fail: One or more sequences were observed multiple times.')

        # Visit each unordered pair of genomes once (including a genome
        # paired with itself) in sorted order for a stable report.
        genome_ids = sorted(duplicates)
        for i, genome_idA in enumerate(genome_ids):
            for genome_idB in genome_ids[i:]:
                # .get() tolerates pairs with no recorded duplicates and
                # avoids inserting empty entries into a defaultdict.
                dup_seq_ids = duplicates[genome_idA].get(genome_idB)
                if not dup_seq_ids:
                    continue

                self.logger.info('')
                if genome_idA == genome_idB:
                    self.logger.info(' There are %d sequences present more than once in %s:' % (len(dup_seq_ids), genome_idA))
                else:
                    self.logger.info(' There are %d sequences shared between %s and %s:' % (len(dup_seq_ids), genome_idA, genome_idB))

                for seq_id in dup_seq_ids:
                    self.logger.info(' %s' % seq_id)

    self.time_keeper.print_time_stamp()
def test_unique(self):
    """Check that genome_tk.unique() reports the expected duplicates.

    Every file in the 'unique' test-data directory is passed to
    genome_tk.unique() and the result is compared with a hand-built
    mapping of genome id -> genome id -> duplicated sequence ids.
    """
    data_dir = os.path.join(self.test_data_dir, 'unique')

    genome_files = []
    for file_name in os.listdir(data_dir):
        genome_files.append(os.path.join(data_dir, file_name))

    # Sequences shared between genomes are expected as sets; the sequence
    # repeated within genome1 is expected as a list.
    expected = {}
    expected['genome1'] = {'genome1': ['b_dup'], 'genome2': set(['c_dup'])}
    expected['genome2'] = {'genome1': set(['c_dup'])}

    observed = genome_tk.unique(genome_files)
    assert_equals(observed, expected)
def unique(self, options):
    """Report sequences that are assigned to more than one genome.

    Gathers the genome files selected by ``options`` and asks
    ``genome_tk.unique`` for duplicated sequences. The pass/fail verdict
    goes to ``self.reporter``, the per-pair summaries to ``self.logger``,
    and the individual sequence identifiers to standard output.

    Parameters
    ----------
    options : object
        Command options; the ``genome_nt_dir`` and ``genome_ext``
        attributes are read and forwarded to ``self._genome_files``.

    Raises
    ------
    SystemExit
        With a non-zero status when ``self._check_nuclotide_seqs``
        rejects the genome files.
    """
    genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
    if not self._check_nuclotide_seqs(genome_files):
        self.logger.warning('All files must contain nucleotide sequences.')
        # Invalid input is a failure: exit non-zero so that calling
        # scripts and pipelines can detect it.
        sys.exit(1)

    duplicates = genome_tk.unique(genome_files)

    if not duplicates:
        self.reporter.info('Pass: All sequences were identified exactly once.')
    else:
        self.reporter.info('Fail: One or more sequences were observed multiple times.')

        # Visit each unordered pair of genomes once (including a genome
        # paired with itself) in sorted order for a stable report.
        genome_ids = sorted(duplicates)
        for i, genome_idA in enumerate(genome_ids):
            for genome_idB in genome_ids[i:]:
                # .get() tolerates pairs with no recorded duplicates and
                # avoids inserting empty entries into a defaultdict.
                dup_seq_ids = duplicates[genome_idA].get(genome_idB)
                if not dup_seq_ids:
                    continue

                if genome_idA == genome_idB:
                    self.logger.info('There are %d sequences present more than once in %s:' % (len(dup_seq_ids), genome_idA))
                else:
                    self.logger.info('There are %d sequences shared between %s and %s:' % (len(dup_seq_ids), genome_idA, genome_idB))

                for seq_id in dup_seq_ids:
                    # Single-argument parenthesised form prints the same
                    # text under the Python 2 print statement and is also
                    # valid Python 3.
                    print(' %s' % seq_id)