Beispiel #1
0
    def unique(self, options):
        """Report sequences that are assigned to more than one genome.

        Gathers the genome files selected by `options`, asks
        `genome_tk.unique` for duplicated sequences and logs a pass/fail
        summary followed by the identifier of every duplicated sequence.

        Parameters
        ----------
        options : object
            Must provide the attributes `genome_nt_dir` and `genome_ext`,
            which are forwarded to `self._genome_files`.

        Notes
        -----
        Terminates the program through `sys.exit()` (after logging a warning)
        when `self._check_nuclotide_seqs` rejects the genome files.
        """
        self.logger.info('')
        self.logger.info('*******************************************************************************')
        self.logger.info('[RefineM - unique] Ensuring sequences are assigned to a single genome.')
        self.logger.info('*******************************************************************************')

        genome_files = self._genome_files(options.genome_nt_dir, options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('[Warning] All files must contain nucleotide sequences.')
            sys.exit()

        duplicates = genome_tk.unique(genome_files)

        self.logger.info('')
        if not duplicates:
            self.logger.info('  Pass: All sequences were identified exactly once.')
        else:
            self.logger.info('  Fail: One or more sequences were observed multiple times.')

            # Walk every unordered pair (A, B) with A <= B exactly once; the
            # pair (A, A) covers sequences repeated within a single genome.
            genome_ids = sorted(duplicates)
            for i, genome_idA in enumerate(genome_ids):
                for genome_idB in genome_ids[i:]:
                    # Not every pair has an entry, so look it up with get():
                    # plain indexing raises KeyError on a regular dict and
                    # inserts empty entries into an auto-creating mapping.
                    dup_seq_ids = duplicates[genome_idA].get(genome_idB)
                    if not dup_seq_ids:
                        continue

                    self.logger.info('')
                    if genome_idA == genome_idB:
                        self.logger.info('  There are %d sequences present more than once in %s:' % (len(dup_seq_ids), genome_idA))
                    else:
                        self.logger.info('  There are %d sequences shared between %s and %s:' % (len(dup_seq_ids), genome_idA, genome_idB))

                    for seq_id in dup_seq_ids:
                        self.logger.info('    %s' % seq_id)

        self.time_keeper.print_time_stamp()
Beispiel #2
0
    def test_unique(self):
        """Check GenomeTk.unique() against the 'unique' test data set.

        Every file found in the 'unique' sub-directory of
        `self.test_data_dir` is passed to `genome_tk.unique`, and the returned
        mapping must equal the hand-written expectation below.
        """
        data_dir = os.path.join(self.test_data_dir, 'unique')

        genome_files = []
        for file_name in os.listdir(data_dir):
            genome_files.append(os.path.join(data_dir, file_name))

        observed = genome_tk.unique(genome_files)

        # Expected result, keyed as [genome id][genome id] -> sequence ids.
        # NOTE(review): the same-genome entry is a list while the
        # cross-genome entries are sets; presumably this mirrors what
        # genome_tk.unique returns -- confirm against its implementation.
        expected = {
            'genome1': {
                'genome1': ['b_dup'],
                'genome2': set(['c_dup']),
            },
            'genome2': {
                'genome1': set(['c_dup']),
            },
        }
        assert_equals(observed, expected)
Beispiel #3
0
    def unique(self, options):
        """Report sequences that are assigned to more than one genome.

        Gathers the genome files selected by `options`, asks
        `genome_tk.unique` for duplicated sequences, reports a pass/fail
        summary through `self.reporter`, logs a count for every affected
        genome pair through `self.logger` and prints the identifier of each
        duplicated sequence to standard output.

        Parameters
        ----------
        options : object
            Must provide the attributes `genome_nt_dir` and `genome_ext`,
            which are forwarded to `self._genome_files`.

        Notes
        -----
        Terminates the program through `sys.exit()` (after logging a warning)
        when `self._check_nuclotide_seqs` rejects the genome files.
        """

        genome_files = self._genome_files(options.genome_nt_dir,
                                          options.genome_ext)
        if not self._check_nuclotide_seqs(genome_files):
            self.logger.warning('All files must contain nucleotide sequences.')
            sys.exit()

        duplicates = genome_tk.unique(genome_files)

        if not duplicates:
            self.reporter.info(
                'Pass: All sequences were identified exactly once.')
        else:
            self.reporter.info(
                'Fail: One or more sequences were observed multiple times.')

            # Walk every unordered pair (A, B) with A <= B exactly once; the
            # pair (A, A) covers sequences repeated within a single genome.
            genome_ids = sorted(duplicates)
            for i, genome_idA in enumerate(genome_ids):
                for genome_idB in genome_ids[i:]:
                    # Not every pair has an entry, so look it up with get():
                    # plain indexing raises KeyError on a regular dict and
                    # inserts empty entries into an auto-creating mapping.
                    dup_seq_ids = duplicates[genome_idA].get(genome_idB)
                    if not dup_seq_ids:
                        continue

                    if genome_idA == genome_idB:
                        self.logger.info(
                            'There are %d sequences present more than once in %s:'
                            % (len(dup_seq_ids), genome_idA))
                    else:
                        self.logger.info(
                            'There are %d sequences shared between %s and %s:'
                            % (len(dup_seq_ids), genome_idA, genome_idB))

                    for seq_id in dup_seq_ids:
                        # A parenthesised single argument prints the same text
                        # as a Python 2 statement and as a Python 3 function.
                        print('    %s' % seq_id)