Example 1
    def run(self, infile_handle):
        ''' Run CD-HIT in parallel on a list of fasta files. Each file is clustered separately.
        
        Other flags used:
        -d 0   --> No limit on description written to cluster file (goes to first space in seq ID). 
        -r 0   --> Only do +/+ (not -/+) alignment comparisons, as reads are sequenced in both directions but on different strands. 
        -s 0.8 --> If the shorter sequence is less than 80% of the length of the representative sequence, don't cluster it. 
        
        infile_handle -- Takes a file object or a string of the file path/filename.
        output        -- Defines the output location and file name (self.args.output).
        
        CD-HIT's stdout is written to the console.
        A Counter dictionary and a summary logfile are generated after each run.
        
        '''

        # input checks    
        infile_handle = inputfile_check(infile_handle)
        
        logfile_path = os.path.join(os.path.split(self.args.output)[0], 'clusterfile.log')
        infile_path = os.path.abspath(infile_handle.name)
            
        logfile = outputfile_check(logfile_path)

        # setup internal vars        
        start_time = time.time()
    
        #=======================================================================
        # Run CDHIT
        #=======================================================================
    
        cmd = ('cd-hit-est -i {0} -o {1} -c {2} -n {3} -d 0 -r 0 -s 0.8 -M {4} '
            '-T {5}').format(infile_path, self.args.output, 
                             self.args.similarity, 
                             self.args.n_gram, 
                             self.args.maxmemory, 
                             self.args.threads)   
    
        if self.args.maskN:
            cmd = cmd + ' -mask N'
        if self.args.allvall:
            cmd = cmd + ' -g 1'
        
        cdhitpath = os.path.expanduser(self.args.cdhitpath)
        
        # Spawn Process to run CD-HIT
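        # os.path.join prefixes the 'cd-hit-est ...' command string with the (expanded) install directory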
        subprocess.check_call(shlex.split(os.path.join(cdhitpath, cmd)))
        
        finish_time = time.time()
        
        
        #=======================================================================
        # Generate a summary log file 
        #=======================================================================
        
        # Get cluster size summary counter 
        total_counter, by_seqlen_counter = self.cluster_summary_counter(infile_path=self.args.output,
                                                                        mode='both', report=True)    
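        # Keep only the parameter portion of the command (from '-c ' onwards) for the log header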
        st_idx = cmd.find('-c ')
        CDHIT_parameters = cmd[st_idx:]
        
        # Write summary logfile 
        with logfile as f:
            program_name = os.path.join(self.args.cdhitpath, cmd).split(' -i ')[0]
            f.write('=========================================================\n')
            f.write('Program     : {0}\n'.format(program_name))
            f.write('Input File  : {0}\n'.format(infile_path))
            f.write('Output File : {0}\n'.format(self.args.output))
            f.write('Commands    : {0}\n'.format(CDHIT_parameters))
            f.write('\n')
            f.write('Started     : {0}\n'.format(time.strftime('%a, %d %b %Y, %H:%M:%S', 
                                                    time.gmtime(start_time))))
            f.write('=========================================================\n')
            f.write('\n')
            f.write('                       Report Log\n')
            f.write('---------------------------------------------------------\n')
            
            reads_per_cluster = {key: int(key)*value for key, value in total_counter.iteritems()}
            total_reads = sum(reads_per_cluster.values())
            total_clusters = sum(total_counter.values())
            f.write('Total number of reads      : {0}\n'.format(total_reads))
            f.write('Total number of clusters   : {0}\n'.format(total_clusters))
            read_lengths = [int(key) for key in by_seqlen_counter.keys()]
            f.write('Read length Min and Max    : {0} and {1}\n'.format(min(read_lengths), max(read_lengths)))
            f.write('Time taken                 : {0}\n'.format(time.strftime('%H:%M:%S', 
                                                    time.gmtime(finish_time - start_time))))
            f.write('\n')
            f.write('Top 20 cluster sizes by total reads \n')
            f.write('---------------------------------------------------------\n')
            f.write('Cluster Size    No. Clusters    Total Reads         %    \n')
            f.write('---------------------------------------------------------\n')
            top_reads_per_cluster = sorted(reads_per_cluster.iteritems(), 
                                           key=lambda tup: int(tup[1]), reverse=True)[:20]
            for tup in top_reads_per_cluster:
                if total_reads == 0:
                    perc = 0.0
                else:
                    perc = float(tup[1]) / total_reads
                
                f.write("{clust_size: <16}{num_clust: <16}{total_reads: <18d}{percentage:.2%}\n".format(
                      clust_size=tup[0], num_clust=total_counter[tup[0]], total_reads=tup[1], 
                      percentage=perc))

        cluster_file_handle = open(self.args.output, 'rb')
        
        return cluster_file_handle, total_counter
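
A minimal usage sketch for the method above. The ClusterFilter class name and the
argparse Namespace fields below are assumptions inferred from the self.args attributes
that run() reads; they are not names confirmed by the source.

# Hypothetical driver code: only the attribute names read by run() above are assumed.
import argparse

args = argparse.Namespace(
    output='out/clustered.fasta',   # -o : CD-HIT output file (the '.clstr' file is written alongside it)
    similarity=0.95,                # -c : sequence identity threshold
    n_gram=8,                       # -n : word (k-mer) size
    maxmemory=4000,                 # -M : memory limit in MB
    threads=4,                      # -T : number of threads
    maskN=True,                     # appends '-mask N' to the command
    allvall=False,                  # appends '-g 1' (CD-HIT's slower, more accurate clustering mode)
    cdhitpath='~/apps/cd-hit/')     # directory containing the cd-hit-est binary

clusterer = ClusterFilter(args)     # hypothetical wrapper class holding self.args and defining run()
cluster_fh, size_counter = clusterer.run('reads.fasta')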
Example 2
    def load_cluster_file(self, cluster_file_handle, table_prefix=None,
                          overwrite=False, fmin=2, fmax=None, skipsort=False, buffer_max=1000000):
        ''' Load a clustering file into the database.
        
        By default singletons are not loaded, as fmin defaults to 2.
        An fmin and fmax threshold can also be set if only clusters of a 
        certain size are to be added.
        
        '''
        # define names
        if table_prefix is None:
            members_table_name = 'members'
            cluster_table_name = 'clusters'
            index_name = 'clustersizeIndex'
        else:
            members_table_name = table_prefix + '_members'
            cluster_table_name = table_prefix + '_clusters'
            index_name = table_prefix + '_clustersizeIndex'

        # input checks
        if isinstance(cluster_file_handle, str):
            if not cluster_file_handle.endswith('.clstr'):
                cluster_file_handle = cluster_file_handle + '.clstr'

        cluster_file_handle = inputfile_check(cluster_file_handle)

        # Sort file
        if not skipsort:
            # Filter out clusters smaller than fmin and sort clusters by size in descending order
            print >> sys.stderr, 'Sorting cluster file %s ...' % (cluster_file_handle.name)
            sorted_cluster_file = sortby(cluster_file_handle, reverse=True,
                                         mode='cluster_size', outfile_postfix='-subset', cutoff=fmin)
            cluster_file_handle = inputfile_check(sorted_cluster_file)

        print >> sys.stderr, 'Importing cluster file %s  to database...' % (cluster_file_handle.name)

        # Overwrite/ make tables if necessary
        if overwrite:
            with self.con as con:
                con.execute(''' DROP TABLE IF EXISTS {0} '''.format(cluster_table_name))
                con.execute(''' DROP TABLE IF EXISTS {0} '''.format(members_table_name))

        self.create_cluster_table(cluster_table_name)
        self.create_members_table(members_table_name)

        # Make cluster generator. Returns all cluster info
        cluster_gen = parse(cluster_file_handle)

        # Buffer to hold clusters in memory then write all at once
        cluster_info_list = []
        cumulative_cluster_size = 0

        # Find starting cluster id 
        c = self.con.execute(''' SELECT COUNT(*) FROM {0}'''.format(cluster_table_name))
        clusterid = c.fetchone()['count(*)'] + 1

        # Drop any index on Cluster size 
        with self.con as con:
            con.execute(''' DROP INDEX IF EXISTS {0} '''.format(index_name))

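        # With fmax set, apply the [fmin, fmax] size window per cluster; otherwise load
        # everything (the sort step above already drops clusters below fmin unless skipsort is True)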
        if fmax:
            for cluster in cluster_gen:

                if fmin <= cluster.size <= fmax:

                    cluster_info_list.append(( clusterid, cluster.rep_seq_id, cluster.size, cluster.members_id))
                    clusterid += 1
                    cumulative_cluster_size += cluster.size

                    if cumulative_cluster_size > buffer_max:
                        self.load_batch_clusterdata(cluster_info_list, table_prefix)
                        cluster_info_list = []
        else:
            for cluster in cluster_gen:
                cluster_info_list.append(( clusterid, cluster.rep_seq_id, cluster.size, cluster.members_id))
                clusterid += 1
                cumulative_cluster_size += cluster.size

                if cumulative_cluster_size > buffer_max:
                    self.load_batch_clusterdata(cluster_info_list, table_prefix)
                    cluster_info_list = []

        # Final flush of data
        if cluster_info_list:
            self.load_batch_clusterdata(cluster_info_list, table_prefix)

        # Rebuild index on Cluster size 
        with self.con as con:
            con.execute('''CREATE INDEX {indexname} ON {tablename}(size)'''.format(
                indexname=index_name, tablename=cluster_table_name))
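
A minimal usage sketch for loading the '.clstr' file produced by the run() call in
Example 1. ClusterDB is a placeholder for whatever class defines load_cluster_file
and holds the sqlite connection in self.con; its constructor is an assumption.

# Hypothetical driver code: ClusterDB and its constructor are placeholders.
db = ClusterDB('clusters.db')

# CD-HIT writes cluster membership next to the -o output as '<output>.clstr';
# load_cluster_file() appends the '.clstr' suffix itself when given a bare path.
db.load_cluster_file('out/clustered.fasta',
                     table_prefix='run1',   # tables become run1_clusters / run1_members
                     overwrite=True,        # drop and recreate the tables first
                     fmin=2,                # skip singletons (the default)
                     fmax=None,             # no upper bound on cluster size
                     skipsort=False)        # sort/filter the file before importing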