예제 #1
0
    def cluster(self, input_files, args, work_dir, threads=1):
        """Run the binning program and collect the bins it writes to `work_dir`.

        Parameters:
            input_files: object exposing `contigs_fasta` and `contig_coverages`,
                which are passed to the program via `-i` and `-a`.
            args: additional program arguments; serialized onto the command
                line by `utils.serialize_args`.
            work_dir: directory where the output files (prefixed `METABAT_`)
                and the log file `logs.txt` are written.
            threads: only shown in the progress message; it is not forwarded
                on the command line by this method.

        Returns:
            dict mapping a bin name (the output file's basename with every '.'
            replaced by '_') to the list of whitespace-stripped lines read
            from that file.

        Raises:
            ConfigError: if no file starting with the output prefix exists
                after the program has run.
        """
        J = lambda p: os.path.join(work_dir, p)

        bin_prefix = J('METABAT_')
        log_path = J('logs.txt')

        cmd_line = [self.program_name,
            '-i', input_files.contigs_fasta,
            '-a', input_files.contig_coverages,
            '-o', bin_prefix,
            '--cvExt',
            '-l',
            *utils.serialize_args(args)]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_path)
        self.progress.end()

        # `bin_prefix` already includes `work_dir`; joining it a second time
        # would double the directory whenever `work_dir` is a relative path.
        output_file_paths = glob.glob(bin_prefix + '*')
        if not output_file_paths:
            raise ConfigError(
                "Some critical output files are missing. Please take a look at the "
                "log file: %s" % (log_path))

        clusters = {}
        for bin_file in output_file_paths:
            pretty_bin_name = os.path.basename(bin_file).replace('.', '_')
            with open(bin_file, 'r') as f:
                clusters[pretty_bin_name] = [line.strip() for line in f]

        return clusters
예제 #2
0
    def cluster(self,
                input_files,
                args,
                work_dir,
                threads=1,
                log_file_path=None):
        """Run the configured program and gather bins from its `*.fna` outputs.

        Parameters:
            input_files: object exposing `contig_coverages_log_norm` and
                `contigs_fasta`; the FASTA path is split into its directory
                (`-f`) and file name (`-l`).
            args: additional program arguments, serialized with single-dash
                flags and the long-to-short option translation below.
            work_dir: output directory handed to the program via `-o`.
            threads: only shown in the progress message.
            log_file_path: log destination; defaults to `logs.txt` inside
                `work_dir` when not given.

        Returns:
            dict mapping a bin name (derived from each `.fna` file name) to
            the list of FASTA header names found in that file.

        Raises:
            ConfigError: if no `*.fna` file is present in `work_dir` after
                the run.
        """
        if not log_file_path:
            log_file_path = os.path.join(work_dir, 'logs.txt')

        # Long option name -> single-letter flag used on the command line.
        short_flags = dict(preference='p',
                           maxiter='m',
                           conviter='v',
                           damp='d',
                           contigsize='x')

        fasta_dir, fasta_name = os.path.split(input_files.contigs_fasta)

        cmd_line = [self.program_name]
        cmd_line += ['-c', input_files.contig_coverages_log_norm]
        cmd_line += ['-f', fasta_dir]
        cmd_line += ['-l', fasta_name]
        cmd_line += ['-o', work_dir]
        cmd_line.extend(
            utils.serialize_args(args, single_dash=True, translate=short_flags))

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_file_path)
        self.progress.end()

        fna_paths = glob.glob(os.path.join(work_dir, '*.fna'))
        if not fna_paths:
            raise ConfigError(
                "Some critical output files are missing. Please take a look at the "
                "log file: %s" % (log_file_path))

        # Applied in order to turn an output file name into a bin name.
        name_rewrites = (('sequence_', ''), ('.fna', ''), ('-', '_'))

        clusters = {}
        for fna_path in fna_paths:
            with open(fna_path, 'r') as handle:
                bin_name = os.path.basename(fna_path)
                for old, new in name_rewrites:
                    bin_name = bin_name.replace(old, new)

                members = []
                for line in handle:
                    if line.startswith('>'):
                        members.append(line.strip().replace('>', ''))

                clusters[bin_name] = members

        return clusters
예제 #3
0
파일: maxbin2.py 프로젝트: simatei/anvio
    def cluster(self,
                input_files,
                args,
                work_dir,
                threads=1,
                log_file_path=None):
        """Run the binning program and collect bins from its `.fasta` outputs.

        Parameters:
            input_files: object exposing `contigs_fasta` and `contig_coverages`,
                passed to the program via `-contig` and `-abund`.
            args: additional program arguments, serialized with single-dash,
                underscore-style flags.
            work_dir: directory receiving the outputs (prefixed `MAXBIN_`).
            threads: forwarded to the program via `-thread`.
            log_file_path: log destination; defaults to `logs.txt` inside
                `work_dir` when not given.

        Returns:
            dict mapping a bin name (output file basename without `.fasta`,
            remaining '.' replaced by '_') to the list of FASTA header names
            found in that file.

        Raises:
            ConfigError: if no `MAXBIN_*.fasta` file exists after the run.
        """
        J = lambda p: os.path.join(work_dir, p)

        output_file_prefix = J('MAXBIN_')

        if not log_file_path:
            log_file_path = J('logs.txt')

        cmd_line = [
            self.program_name, '-contig', input_files.contigs_fasta, '-abund',
            input_files.contig_coverages, '-out', output_file_prefix,
            '-thread',
            str(threads),
            *utils.serialize_args(args, single_dash=True, use_underscore=True)
        ]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_file_path)
        self.progress.end()

        # The prefix already contains `work_dir`; joining it again would point
        # at `work_dir/work_dir/...` whenever `work_dir` is a relative path.
        output_file_paths = glob.glob(output_file_prefix + '*.fasta')
        if not output_file_paths:
            raise ConfigError(
                "Some critical output files are missing. Please take a look at the "
                "log file: %s" % (log_file_path))

        clusters = {}

        for bin_file in output_file_paths:
            bin_name = os.path.basename(bin_file).replace('.fasta', '')
            bin_name = bin_name.replace('.', '_')

            with open(bin_file, 'r') as f:
                # Only header lines name contigs; sequence lines are ignored.
                clusters[bin_name] = [
                    line[1:].strip() for line in f if line.startswith('>')
                ]

        return clusters
예제 #4
0
    def cluster(self, input_files, args, work_dir, threads=1, log_file_path=None):
        """Run the clustering program and parse its contig-to-bin CSV output.

        Parameters:
            input_files: object exposing `contig_coverages` and `contigs_fasta`,
                passed via `--coverage_file` and `--composition_file`.
            args: additional program arguments, serialized with underscore-style
                flags; `args.length_threshold` also selects which output file
                is read ('1000' is used when it is unset or falsy).
            work_dir: passed to the program as `--basename`; the output CSV is
                expected inside it.
            threads: forwarded to the program via `--threads`.
            log_file_path: log destination; defaults to `logs.txt` inside
                `work_dir` when not given.

        Returns:
            dict mapping 'Bin_<id>' to the list of contig names assigned to it.

        Raises:
            ConfigError: if `clustering_gt<threshold>.csv` is missing from
                `work_dir` after the run.
        """
        J = lambda p: os.path.join(work_dir, p)

        if not log_file_path:
            log_file_path = J('logs.txt')

        # Every argv element is a string, matching the sibling drivers;
        # subprocess-style APIs reject non-string arguments.
        cmd_line = [self.program_name,
            '--coverage_file', input_files.contig_coverages,
            '--composition_file', input_files.contigs_fasta,
            '--basename', work_dir,
            '--threads', str(threads),
            *utils.serialize_args(args, use_underscore=True)]

        self.progress.new(self.program_name)
        self.progress.update('Running using %d threads...' % threads)
        utils.run_command(cmd_line, log_file_path)
        self.progress.end()

        threshold = args.length_threshold or '1000'

        output_file_name = 'clustering_gt%s.csv' % threshold
        output_file_path = J(output_file_name)
        if not os.path.exists(output_file_path):
            raise ConfigError("One of the critical output files is missing ('%s'). Please take a look at the "
                              "log file: %s" % (output_file_name, log_file_path))

        clusters = {}
        with open(output_file_path, 'r') as f:
            # The first line is skipped (treated as a header row).
            next(f, None)

            for entry in f:
                # Tolerate blank lines instead of failing on tuple unpacking.
                if not entry.strip():
                    continue

                contig, bin_name = map(str.strip, entry.split(','))
                clusters.setdefault('Bin_' + bin_name, []).append(contig)

        return clusters
예제 #5
0
    def cluster(self, input_files, args, work_dir, threads=1):
        """Export the requested collections, run the program on them, and parse
        the resulting scaffold-to-bin table.

        The process temporarily changes its working directory to `work_dir`;
        the previous directory is restored on exit, including when an error is
        raised.

        Parameters:
            input_files: object exposing `profile_db` (used to look up and
                export collections) and `splits_fasta` (passed via `-c`).
            args: additional program arguments; `args.source_collections` is a
                comma-separated list of collection names and is consumed here
                rather than forwarded to the program.
            work_dir: directory for exported collections, program output and
                the log file `logs.txt`.
            threads: forwarded to the program via `--threads`.

        Returns:
            dict mapping 'Bin_<name>' (with '.' replaced by '_') to the list of
            contig names assigned to that bin.

        Raises:
            ConfigError: if a requested collection is not in the database, or
                if `OUTPUT_DASTool_scaffolds2bin.txt` is missing after the run.
        """
        # Resolve before chdir: paths joined onto a relative `work_dir` would
        # otherwise be interpreted relative to `work_dir` itself.
        work_dir = os.path.abspath(work_dir)
        J = lambda p: os.path.join(work_dir, p)
        log_path = J('logs.txt')

        cwd_backup = os.getcwd()
        os.chdir(work_dir)
        try:
            c = ccollections.Collections(r=run, p=progress)
            c.populate_collections_dict(input_files.profile_db)

            source_collections = set(
                map(str.strip, args.source_collections.split(',')))

            missing_collections = source_collections - set(c.collections_dict)

            if missing_collections:
                raise ConfigError(
                    "Some of the collections you wanted are missing in the database. "
                    "Here is the list of missing collections: %s" %
                    (", ".join(sorted(missing_collections))))

            c_names = []
            c_files = []

            # Sorted so the generated command line is reproducible across runs.
            for collection_name in sorted(source_collections):
                prefix = J(collection_name)

                c_names.append(collection_name)
                c_files.append(prefix + '.txt')

                c.export_collection(collection_name,
                                    output_file_prefix=prefix,
                                    include_unbinned=False)

            cmd_line = [
                self.program_name, '-c', input_files.splits_fasta, '-i',
                ','.join(c_files), '-l', ','.join(c_names), '-o',
                J('OUTPUT'), '--threads',
                str(threads), *utils.serialize_args(
                    args, use_underscore=True, skip_keys=['source_collections'])
            ]

            self.progress.new(self.program_name)
            self.progress.update('Running using %d threads...' % threads)
            utils.run_command(cmd_line, log_path)
            self.progress.end()

            output_file_name = 'OUTPUT_DASTool_scaffolds2bin.txt'
            output_file_path = J(output_file_name)
            if not os.path.exists(output_file_path):
                raise ConfigError(
                    "One of the critical output files is missing ('%s'). Please take a look at the "
                    "log file: %s" % (output_file_name, log_path))

            clusters = {}
            with open(output_file_path, 'r') as f:
                for entry in f:
                    # Tolerate blank lines instead of failing on unpacking.
                    if not entry.strip():
                        continue

                    contig, bin_name = entry.split()

                    pretty_bin_name = 'Bin_' + bin_name.replace('.', '_')
                    clusters.setdefault(pretty_bin_name, []).append(contig)
        finally:
            # Restore cwd even when one of the steps above raised.
            os.chdir(cwd_backup)

        return clusters