Example #1
    def write_reads(self, out_file_handle, output_format='fasta', filter_expression=None,
                    startidx=0, rowbuffer=100000, overwrite=False):
        """ Write records returned by the querry to one large fasta or fastq 
        
        Defaults is to search by GLOBing the individual descriptions with the search_query.
        
            If sql_query = True, search_query is passed as a full sql statment.
            If use_type_column=True, search is done by GLOBing the individual type column instead.
        
        
        out_file_handle -- A file object or string specifying a filename. 
        
        startidx -- starting base index of DNA sequence that is written, used to miss out 
        cutsite if necessary.        
        """

        # Output check
        out_file_handle = outputfile_check(out_file_handle, mode='a', overwrite=overwrite)

        query = '''SELECT seqid, seq, phred  
                    FROM seqs INNER JOIN samples ON seqs.sampleId=samples.sampleId'''

        if filter_expression:
            query += ' WHERE {0}'.format(filter_expression)

        with self.con as con:

            toc = time.time()
            print >> sys.stderr, 'Writing records to {0} format....'.format(output_format),

            c = con.execute(query)
            returned_records = c.fetchmany(rowbuffer)
            rec_count = 0
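            # Stream results in batches of rowbuffer rows to keep memory
            # usage bounded on large result sets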
            while returned_records:

                for rec in returned_records:
                    seq_rec = SeqRecord(Seq(rec['seq'][startidx:]), id=str(rec['seqid']), description='')

                    if output_format == 'fastq':
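                        # Decode the Phred+33 ASCII quality string back into a
                        # list of integer scores, as Biopython expects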
                        seq_rec.letter_annotations['phred_quality'] = [ord(x) - 33 for x in rec['phred']]

                    SeqIO.write(seq_rec, out_file_handle, format=output_format)
                    rec_count += 1

                # Fetch the next batch of records from the cursor
                returned_records = c.fetchmany(rowbuffer)

            print >> sys.stderr, ' Done!'
            print >> sys.stderr, '\n{0} records written successfully to {1}\nin {2}'.format(rec_count,
                        out_file_handle.name, time.strftime('%H:%M:%S', time.gmtime(time.time() - toc)))

            if out_file_handle.name not in ['<stdout>', '<stderr>']:
                out_file_handle.close()

        return out_file_handle
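A minimal usage sketch for the method above (hypothetical: ReadsDB stands in for whatever class defines write_reads, and the samples.description column is an assumption based on the schema implied by the query):

    db = ReadsDB('reads.db')   # hypothetical wrapper class holding self.con
    out = db.write_reads('sample1.fastq',
                         output_format='fastq',
                         filter_expression="samples.description GLOB 'sample1*'",
                         startidx=6,      # skip a 6 bp cut site at the read start
                         overwrite=True)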
Example #2
    def run(self, infile_handle):
        ''' Run CD-HIT to cluster the reads in a fasta file.

        Other flags used:
        -d 0   --> No limit on the description written to the cluster file
                   (it runs up to the first space in the seq ID).
        -r 0   --> Do only +/+ and not -/+ alignment comparisons, as reads are
                   sequenced in both directions but on different strands.
        -s 0.8 --> If the shorter sequence is less than 80% of the length of
                   the representative sequence, don't cluster it.

        infile_handle -- A file object or a string giving the file path/filename.
        self.args.output -- Defines the output location and file name.

        Writes stdout to the console. A Counter dictionary and a summary
        logfile are generated after each run.
        '''

        # input checks    
        infile_handle = inputfile_check(infile_handle)
        
        logfile_path = os.path.join(os.path.split(self.args.output)[0], 'clusterfile.log')
        infile_path = os.path.abspath(infile_handle.name)
            
        logfile = outputfile_check(logfile_path)

        # setup internal vars        
        start_time = time.time()
    
        #=======================================================================
        # Run CDHIT
        #=======================================================================
    
        cmd = ('cd-hit-est -i {0} -o {1} -c {2} -n {3} -d 0 -r 0 -s 0.8 -M {4} '
            '-T {5}').format(infile_path, self.args.output, 
                             self.args.similarity, 
                             self.args.n_gram, 
                             self.args.maxmemory, 
                             self.args.threads)   
    
        if self.args.maskN:
            cmd = cmd + ' -mask N'
        if self.args.allvall:
            cmd = cmd + ' -g 1'
        
        cdhitpath = os.path.expanduser(self.args.cdhitpath)
        
        # Spawn a process to run CD-HIT; the executable's directory is
        # prefixed onto the command string before splitting it into an argv list
        subprocess.check_call(shlex.split(os.path.join(cdhitpath, cmd)))
        
        finish_time = time.time()
        
        
        #=======================================================================
        # Generate a summary log file 
        #=======================================================================
        
        # Get cluster size summary counter 
        total_counter, by_seqlen_counter = self.cluster_summary_counter(infile_path=self.args.output,
                                                                        mode='both', report=True)    
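        # Keep the parameter portion of the command (from '-c' onwards) for
        # inclusion in the summary log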
        st_idx = cmd.find('-c ')
        CDHIT_parameters = cmd[st_idx:]
        
        # Write summary logfile 
        with logfile as f:
            program_name = os.path.join(self.args.cdhitpath, cmd).split(' -i ')[0]
            f.write('=========================================================\n')
            f.write('Program     : {0}\n'.format(program_name))
            f.write('Input File  : {0}\n'.format(infile_path))
            f.write('Output File : {0}\n'.format(self.args.output))
            f.write('Commands    : {0}\n'.format(CDHIT_parameters))
            f.write('\n')
            f.write('Started     : {0}\n'.format(time.strftime('%a, %d %b %Y, %H:%M:%S', 
                                                    time.gmtime(start_time))))
            f.write('=========================================================\n')
            f.write('\n')
            f.write('                       Report Log\n')
            f.write('---------------------------------------------------------\n')
            
            # A cluster of size k contains k reads, so multiply each cluster
            # size by the number of clusters of that size
            reads_per_cluster = {key: int(key)*value for key, value in total_counter.iteritems()}
            total_reads = sum(reads_per_cluster.values())
            total_clusters = sum(total_counter.values())
            f.write('Total number of reads     : {0}\n'.format(total_reads))
            f.write('Total number of clusters  : {0}\n'.format(total_clusters))
            read_lengths = [int(key) for key in by_seqlen_counter.keys()]
            f.write('Read length Min and Max   : {0} and {1}\n'.format(min(read_lengths), max(read_lengths)))
            f.write('Time taken                : {0}\n'.format(time.strftime('%H:%M:%S', 
                                                    time.gmtime(finish_time - start_time))))
            f.write('\n')
            f.write('Top 20 Percentage Reads per cluster \n')
            f.write('---------------------------------------------------------\n')
            f.write('Cluster Size    No. Clusters    Total Reads         %    \n')
            f.write('---------------------------------------------------------\n')
            top_reads_per_cluster = sorted(reads_per_cluster.iteritems(), 
                                           key=lambda tup: int(tup[1]), reverse=True)[:20]
            for tup in top_reads_per_cluster:
                if total_reads == 0:
                    perc = 0.0
                else:
                    perc = float(tup[1]) / total_reads
                
                f.write("{clust_size: <16}{num_clust: <16}{total_reads: <18d}{percentage:.2%}\n".format(
                      clust_size=tup[0], num_clust=total_counter[tup[0]], total_reads=tup[1], 
                      percentage=perc))

        cluster_file_handle = open(self.args.output, 'rb')
        
        return cluster_file_handle, total_counter
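A minimal usage sketch for the method above (hypothetical: ClusterClass stands in for whatever class defines run, and the Namespace fields simply mirror the self.args attributes the method reads):

    import argparse

    args = argparse.Namespace(output='clusters/run1', similarity=0.95, n_gram=8,
                              maxmemory=4000, threads=4, maskN=True, allvall=False,
                              cdhitpath='~/bin/cd-hit')
    clusterer = ClusterClass(args)   # hypothetical wrapper class
    cluster_fh, counter = clusterer.run('reads.fasta')
    # Command built for these values (prefixed with the expanded cdhitpath):
    #   cd-hit-est -i /abs/path/reads.fasta -o clusters/run1 -c 0.95 -n 8
    #              -d 0 -r 0 -s 0.8 -M 4000 -T 4 -mask N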