예제 #1
0
def countMotifs( infile, motifs ):
    '''Locate regular expression *motifs* in the sequences of *infile*.

    *infile* is handed to ``FastaIterator.FastaIterator``; *motifs* is
    iterated once per sequence and must yield ``(name, pattern)`` pairs
    where ``pattern`` offers ``finditer``.

    Returns a list with one ``(title, hits)`` tuple per sequence, where
    ``hits`` is a list of ``(name, strand, start, end)`` tuples.  Hits on
    the sequence returned by ``Genomics.complement`` are reported with
    strand "-" and their coordinates mirrored against the sequence length
    (presumably because that helper returns the reverse complement -
    TODO confirm).
    '''

    iterator = FastaIterator.FastaIterator( infile )
    results = []
    while True:
        # the iterator may signal exhaustion either by raising
        # StopIteration or by returning a false value
        try:
            record = iterator.next()
        except StopIteration:
            break
        if not record:
            break

        forward = record.sequence
        reverse = Genomics.complement( forward )
        length = len( forward )

        hits = []
        for motif, pattern in motifs:
            hits.extend( ( motif, "+", m.start(), m.end() )
                         for m in pattern.finditer( forward ) )
            # mirror the coordinates back onto the forward sequence
            hits.extend( ( motif, "-", length - m.end(), length - m.start() )
                         for m in pattern.finditer( reverse ) )

        results.append( ( record.title, hits ) )

    return results
예제 #2
0
def buildPFAMDomains( infiles, outfile ):
    '''map PFAM domains onto current sequence collection.

    The mapping is done by ID lookup: the ``repid`` -> ``nid`` table is
    read from ``nrdb50.fasta.tsv`` and every fasta title in
    ``infiles[0]`` is parsed as ``<pid>/<start>-<end> <pfam_id>;<description>;``.

    For each entry whose ``pid`` is in the table one line
    ``nid<TAB>start-1<TAB>end<TAB>pfam_id`` is written to *outfile*;
    entries with an unknown ``pid`` are counted as ``missed``.  The
    counter (input / output / missed) is logged via ``E.info``.

    A title that does not match the expected pattern raises
    ``AttributeError`` (``rx.match`` returns None).
    '''

    infile = infiles[0]

    # NOTE(review): the lookup table name is hard-coded and resolved
    # relative to the working directory.
    with IOTools.openFile( "nrdb50.fasta.tsv" ) as inf:
        reader = csv.DictReader( inf, dialect='excel-tab' )
        map_id2nid = dict( (row['repid'], row['nid']) for row in reader )

    # raw string so that the backslashes reach the regex engine untouched
    rx = re.compile( r"(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);" )

    c = E.Counter()
    # context managers guarantee both handles are closed even if a
    # title fails to parse half-way through the file
    with IOTools.openFile( outfile, "w" ) as outf:
        with IOTools.openFile( infile ) as inf:
            for entry in FastaIterator.iterate( inf ):
                c.input += 1
                pid, start, end, pfam_id, description = rx.match( entry.title ).groups()
                # only the lookup can raise KeyError - keep the try minimal
                try:
                    nid = map_id2nid[pid]
                except KeyError:
                    c.missed += 1
                    continue
                outf.write( "%s\t%i\t%i\t%s\n" % (nid, int(start)-1, int(end), pfam_id ) )
                c.output += 1

    E.info( c )
예제 #3
0
파일: Masker.py 프로젝트: yangjl/cgat
    def maskSequences(self, sequences):
        '''mask a collection of sequences.

        The sequences are written as fasta records (titled by their
        index) to a temporary file, ``self.mCommand`` is expanded with
        the local variables and run through the shell, and the fasta
        records found on its standard output are parsed.

        Returns the list of sequences parsed from the command output.

        Raises RuntimeError if the command exits with a non-zero status.
        The temporary file is removed in every case.
        '''

        # mkstemp returns (file descriptor, path).  The names ``outfile``
        # and ``infile`` must not be changed: the command template is
        # expanded with locals() below (presumably it refers to
        # %(infile)s - confirm against mCommand).
        outfile, infile = tempfile.mkstemp()

        try:
            try:
                for x, s in enumerate(sequences):
                    os.write(outfile, ">%i\n%s\n" % (x, s))
            finally:
                # never leak the descriptor, even if writing fails
                os.close(outfile)

            statement = self.mCommand % locals()

            E.debug("statement: %s" % statement)

            s = subprocess.Popen(statement,
                                 shell=True,
                                 stdout=subprocess.PIPE,
                                 stderr=subprocess.PIPE,
                                 close_fds=True)

            (out, err) = s.communicate()

            if s.returncode != 0:
                raise RuntimeError(
                    "Error in running %s \n%s\nTemporary directory" %
                    (statement, err))

            result = [
                x.sequence
                for x in FastaIterator.iterate(StringIO.StringIO(out))
            ]
        finally:
            # previously the file was left behind whenever the command
            # failed or its output could not be parsed
            os.remove(infile)

        return result
예제 #4
0
    def maskSequences( self, sequences ):
        '''mask a collection of sequences.

        Each sequence is written as a fasta record (title = its index)
        to a temporary file; ``self.mCommand`` is then filled in from
        the local variables, executed through the shell, and its
        standard output is parsed as fasta.

        Returns the list of sequences read from the command output.

        Raises RuntimeError when the command returns a non-zero exit
        status.  The temporary file is deleted on success and on error.
        '''

        # tempfile.mkstemp() yields (descriptor, path).  Keep the local
        # names ``outfile`` / ``infile``: ``self.mCommand % locals()``
        # looks names up in this scope (presumably %(infile)s - confirm).
        outfile, infile = tempfile.mkstemp()

        try:
            try:
                for x,s in enumerate(sequences):
                    os.write(outfile, ">%i\n%s\n" % (x,s) )
            finally:
                # close the descriptor even when a write fails
                os.close(outfile)

            statement = self.mCommand % locals()

            E.debug( "statement: %s" % statement )

            s = subprocess.Popen( statement,
                                  shell = True,
                                  stdout = subprocess.PIPE,
                                  stderr = subprocess.PIPE,
                                  close_fds = True)

            (out, err) = s.communicate()

            if s.returncode != 0:
                raise RuntimeError("Error in running %s \n%s\nTemporary directory" % (statement, err))

            result = [ x.sequence for x in FastaIterator.iterate( StringIO.StringIO( out) ) ]
        finally:
            # the original only cleaned up on the success path
            os.remove( infile )

        return result
예제 #5
0
def index(filename, k):
    '''Index the fasta file *filename* with word size *k*.

    Every record yielded by ``FastaIterator.parse`` is passed to
    ``dna2int.update_lookup`` together with the running offset (the summed
    length of all previous records).  The lookup table and the list of
    ``(record id, length)`` pairs are then handed to ``store_index``; the
    index name is *filename* with its last dot-separated component
    replaced by ``mfe_index``.

    Progress and timing information is printed to standard output.
    '''
    start = time.time()
    # single-argument print(...) behaves identically under Python 2 and 3
    print('indexing %s' % filename)

    # NOTE(review): a filename without any '.' yields the bare name
    # '.mfe_index' here - confirm that this is intended.
    dbname = '.'.join(filename.split('.')[:-1]) + '.mfe_index'

    kmer_lookup = collections.defaultdict(list)
    contig_lengths = []
    total_offset = 0

    # close the input deterministically instead of leaking the handle
    with open(filename) as handle:
        for record in FastaIterator.parse(handle):
            print(record.id)
            start_time = time.time()
            fasta_seq = record.seq
            dna2int.update_lookup(kmer_lookup, fasta_seq, total_offset, k)
            contig_lengths.append((record.id, len(fasta_seq)))
            total_offset += len(fasta_seq)
            print('%i bp took %.2f seconds' % (len(fasta_seq),
                                               time.time() - start_time))

    store_index(dbname, kmer_lookup, contig_lengths, k)

    print("Time used: %s" % str(time.time() - start))
    print('Done.')
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome

    Writes a tab-separated table with the header ``genome``/``length`` to
    *outfile*.  One row is written per fasta entry in *infile*; the
    genome name is the base name of *infile* with the ``.fna`` suffix
    removed by ``P.snip``.
    '''
    # NOTE(review): unused in this function; looks like a pipeline job
    # flag kept for convention - confirm before removing.
    to_cluster = True
    # both handles are closed even if reading or writing fails
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        # assume single fasta entry
        with iotools.openFile(infile) as inf:
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # len() of the sequence directly - no need to copy it
                # into a list first
                length = len(fasta.sequence)
                outf.write("%s\t%s\n" % (name, str(length)))
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome

    *outfile* receives a tab-separated table (header ``genome``,
    ``length``) with one row per fasta entry of *infile*.  The genome
    name is derived from the base name of *infile* by stripping the
    ``.fna`` suffix with ``P.snip``.
    '''
    # NOTE(review): not referenced anywhere in this function; presumably
    # a pipeline convention flag - confirm before dropping it.
    to_cluster = True
    # context managers replace the manual close() so that the handles
    # are released on errors as well
    with open(outfile, "w") as outf:
        outf.write("genome\tlength\n")
        # assume single fasta entry
        with IOTools.openFile(infile) as inf:
            for fasta in FastaIterator.iterate(inf):
                name = P.snip(os.path.basename(infile), ".fna")
                # measure the sequence in place instead of via list()
                length = len(fasta.sequence)
                outf.write("%s\t%s\n" % (name, str(length)))
예제 #8
0
def buildNrdb50( infile, outfile ):
    '''build nrdb50

    Renumber sequences.

    Every fasta entry of *infile* is assigned a running number ``nid``
    (starting at 1) and written to *outfile* under that number.  The
    title is parsed as
    ``<cluster> <description> n=<size> Tax=<taxon> RepID=<repid>`` and a
    row ``nid, pid, hid, description, cluster_size, taxon, repid`` is
    written to ``<outfile>.tsv``; ``hid`` is ``computeHID`` of the
    sequence.

    A title that does not match the pattern raises ``AttributeError``
    (``rx.match`` returns None).
    '''

    # raw string: keep the regex escapes out of Python's string parser
    rx = re.compile( r"(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)" )

    # all three handles are closed even when a title fails to parse
    with IOTools.openFile( outfile, "w" ) as outf_fasta, \
            IOTools.openFile( outfile + ".tsv", "w" ) as outf_table, \
            IOTools.openFile( infile ) as inf:

        outf_table.write("nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n" )

        for nid, entry in enumerate( FastaIterator.iterate( inf ), 1 ):
            # parse the title first so that a malformed entry cannot
            # leave the fasta file one record ahead of the table
            cluster_name, description, cluster_size, taxon, repid = rx.match( entry.title ).groups()
            hid = computeHID( entry.sequence )
            outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence ) )
            outf_table.write( "\t".join( (str(nid), cluster_name, hid, description, cluster_size, taxon, repid)) + "\n" )
예제 #9
0
def checkBlastRun( infiles, outfile ):
    '''build summary stats on file.

    *infiles* is ``(pairsdbfile, seqfile)``.  The integer fasta titles of
    *seqfile* form the set of expected ids.  *pairsdbfile* is read line
    by line; lines starting with ``#//`` and blank lines are skipped and
    the first two tab-separated columns are taken as integer query and
    subject id.

    Three files are written:

    * *outfile* - table ``category``/``counts`` with the number of ids,
      links, self links, distinct queries and distinct subjects,
    * ``<outfile>.missing_queries.gz`` - ids that never occur as query,
    * ``<outfile>.missing_sbjcts.gz`` - ids that never occur as subject.
    '''

    pairsdbfile, seqfile = infiles

    nids = set()
    with IOTools.openFile( seqfile ) as inf:
        for r in FastaIterator.iterate( inf ):
            nids.add( int(r.title) )

    query_ids, sbjct_ids = set(), set()
    total_results, self_links = 0, 0
    with IOTools.openFile( pairsdbfile ) as inf:
        # Iterate over the file only.  The previous version additionally
        # called inf.readline() inside the loop, which dropped every
        # other line (or raised, as iteration and readline() must not be
        # mixed on Python 2 file objects) and failed on the empty string
        # returned at end of file.
        for l in inf:
            if l.startswith("#//"): continue
            if not l.strip(): continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add( int(query_id) )
            sbjct_ids.add( int(sbjct_id) )
            if query_id == sbjct_id: self_links += 1
            total_results += 1

    summary = ( ('nids', len(nids)),
                ('links', total_results),
                ('self', self_links),
                ('queries', len(query_ids)),
                ('sbjcts', len(sbjct_ids)) )

    with IOTools.openFile( outfile, "w" ) as outf:
        outf.write( "category\tcounts\n")
        for row in summary:
            outf.write( "\t".join( map(str, row) ) + "\n" )

    # ids present in the sequence file but absent from the results
    missing = ( ('.missing_queries.gz', query_ids),
                ('.missing_sbjcts.gz', sbjct_ids) )
    for suffix, seen in missing:
        with IOTools.openFile( outfile + suffix, 'w' ) as outf:
            outf.write( 'nid\n' )
            outf.write( "\n".join( map(str, sorted( nids.difference( seen ) ) ) ) + "\n" )
예제 #10
0
    def RunOnFile(self, infile, outfile, errfile):
        '''Run the configured executable once per fasta entry of *infile*.

        For every entry the record is written to the temporary input
        file, ``<executable> <temp input> <temp output>`` is run through
        the shell inside ``self.mTempDirectory``, and the third
        space-separated field of the third line of the temporary output
        file is written next to the entry title to *outfile* (a
        tab-separated table with the header ``GENE``/``Bl2Seq``).  The
        standard error of each run is appended to *errfile*.

        Raises Bl2SeqError if the executable exits with a non-zero
        status.  The temporary files are deleted in every case.
        '''

        self.CreateTemporaryFiles()

        try:
            statement = " ".join((self.mExecutable, self.mFilenameTempInput,
                                  self.mFilenameTempOutput))

            i = FastaIterator.FastaIterator(infile)

            outfile.write("GENE\tBl2Seq\n")

            while 1:
                # accept both end-of-input conventions: StopIteration or
                # a None return value
                try:
                    f = i.next()
                except StopIteration:
                    break
                if f is None:
                    break

                with open(self.mFilenameTempInput, "w") as tmp:
                    tmp.write(">%s\n%s" % (f.title, f.sequence))

                s = subprocess.Popen(statement,
                                     shell=True,
                                     stdout=subprocess.PIPE,
                                     stderr=subprocess.PIPE,
                                     cwd=self.mTempDirectory,
                                     close_fds=True)

                (out, err) = s.communicate()

                if s.returncode != 0:
                    raise Bl2SeqError("Error in calculating Bl2Seq\n%s" % err)

                # third line of the output, without its trailing newline
                with open(self.mFilenameTempOutput) as result:
                    d = result.readlines()[2][:-1]
                enc = d.split(" ")[2]

                outfile.write("\t".join((f.title, enc)) + "\n")

                errfile.write(err)
        finally:
            # previously skipped whenever a run failed
            self.DeleteTemporaryFiles()
예제 #11
0
def buildSCOPDomains( infiles, outfile ):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.

    *infiles* is ``(linksfile, fastafile)``.  The fasta titles
    (``<id> <classification> <description>``) provide classification and
    length of each domain.  Every link line must consist of twelve
    whitespace-separated fields: domain id, nid, evalue, domain start/end,
    sbjct start/end, three block fields, bitscore and pid.

    *outfile* receives the table ``nid, start, end, family`` where
    ``family`` is the superfamily code ``00<class><fold><superfamily>``.
    '''

    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile( r'(\S+)\s(\S+)\s(.*)' )
    id2class = {}
    with IOTools.openFile( fastafile ) as inf:
        for x in FastaIterator.iterate( inf ):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence) )

    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    nid2domains = collections.defaultdict( list )
    with IOTools.openFile( linksfile ) as inf:
        for line in inf:
            if line.startswith('query_nid'): continue
            if line.startswith('#'): continue
            counter.links += 1

            domain_id, nid, evalue, domain_start, domain_end, sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()

            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, ( nid, domain_start, domain_end, sbjct_start, sbjct_end ))

            classification, length = id2class[domain_id]

            cls, fold, superfamily, family = classification.split('.')
            if cls not in classes: continue
            if float(domain_end - domain_start) / length < min_coverage: continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append( (superfamily, sbjct_start, sbjct_end ) )

        counter.sequences = len(nid2domains)

    E.info( 'merging %i domains in %i sequences' % (counter.unmerged_domains, counter.sequences))

    with IOTools.openFile( outfile, 'w' ) as outf:
        outf.write('nid\tstart\tend\tfamily\n')
        for nid, dd in sorted(nid2domains.items()):
            # groupby only groups *consecutive* items: without sorting,
            # domains of the same superfamily that are separated by
            # another superfamily in the input were never merged.
            for family, domains in itertools.groupby( sorted( dd ), key = lambda x: x[0] ):
                unmerged_domains = [ (x[1],x[2]) for x in domains ]
                merged_domains = Intervals.combine( unmerged_domains )
                for start, end in merged_domains:
                    counter.domains += 1
                    outf.write( '%i\t%i\t%i\t%s\n' % (nid, start, end, family ) )

    E.info( counter )
예제 #12
0
def index(filename, k):
    '''Index the k-mers of the fasta file *filename* in a sqlite database.

    The database is named after *filename* with its last dot-separated
    component replaced by ``sqlite3.db``; the table ``pos`` is dropped
    and re-created.  For every record, each k-mer is converted with
    ``DNA2int_2`` into a plus and a minus id.  Under the plus id the
    position of the last base of the k-mer is recorded, under the minus
    id the position of its first base.  Positions are stored as
    ``<record id>:<pos>,<pos>,...`` strings, joined with ``;`` across
    records.

    After each record the accumulated strings are flushed with
    ``insert_db`` (first flush) or ``update_db`` once
    ``get_memory_percent()`` exceeds 50; whatever remains is flushed at
    the end.  Progress is printed to standard output.
    '''
    start = time()

    mer_count = 4**k

    dbname = '.'.join(filename.split('.')[:-1]) + '.sqlite3.db'

    conn = sqlite3.connect(dbname)
    cur = conn.cursor()
    cur.executescript('''
    drop table if exists pos;
    create table pos(
    mer_id integer primary key, 
    plus text,
    minus text
    );''')

    plus = ['']*mer_count
    minus = ['']*mer_count

    is_empty = False
    is_db_new = True

    # close the input deterministically instead of leaking the handle
    with open(filename) as handle:
        for record in FastaIterator.parse(handle):
            is_empty = False
            print(record.id)

            fasta_seq = record.seq

            plus_mer_list = [''] * mer_count
            minus_mer_list = [''] * mer_count

            # NOTE(review): with ``i < len - k`` the k-mer starting at
            # position len - k (the last one) is never indexed.  This
            # looks like an off-by-one, but is kept as the stored
            # positions may be relied upon elsewhere - confirm.
            i_max = len(fasta_seq) - k
            i = 0
            kmer = fasta_seq[:k]
            while i < i_max:
                try:
                    plus_mer_id, minus_mer_id = DNA2int_2(kmer)
                except Exception:
                    # Skip the unrecognized base, such as 'N'.  Catching
                    # Exception rather than everything keeps Ctrl-C
                    # working during long runs.
                    i += 1
                    kmer = kmer[1:] + fasta_seq[i+k-1]
                    continue

                if plus_mer_list[plus_mer_id]:
                    plus_mer_list[plus_mer_id] += ',%i' % (i+k-1)
                else:
                    plus_mer_list[plus_mer_id] = str(i+k-1)

                if minus_mer_list[minus_mer_id]:
                    minus_mer_list[minus_mer_id] += ',%i' % (i)
                else:
                    minus_mer_list[minus_mer_id] = str(i)

                # slide the window by one base
                i += 1
                kmer = kmer[1:] + fasta_seq[i+k-1]
                if not i % 100000:
                    # 100.0 forces float arithmetic; the integer division
                    # i/i_max always reported 0% under Python 2
                    print("%s: %.2f%%, %s" % (record.id, 100.0 * i / i_max, str(datetime.timedelta(seconds=(time() - start)))))

            # fold the positions of this record into the global strings
            for mer_id in xrange(mer_count):
                if plus_mer_list[mer_id]:
                    if plus[mer_id]:
                        plus[mer_id] += ';%s:%s' % (record.id, plus_mer_list[mer_id])
                    else:
                        plus[mer_id] = '%s:%s' % (record.id, plus_mer_list[mer_id])

                if minus_mer_list[mer_id]:
                    if minus[mer_id]:
                        minus[mer_id] += ';%s:%s' % (record.id, minus_mer_list[mer_id])
                    else:
                        minus[mer_id] = '%s:%s' % (record.id, minus_mer_list[mer_id])

            memory_percent = get_memory_percent()
            if memory_percent > 50:
                if is_db_new:
                    insert_db(conn, mer_count, plus, minus)
                    is_db_new = False
                else:
                    update_db(conn, mer_count, plus, minus)

                # Empty the container
                plus = ['']*mer_count
                minus = ['']*mer_count
                is_empty = True

                print('Empty plus and minus due to the memory: %s.' % memory_percent)

    # flush whatever has accumulated since the last memory-triggered flush
    if not is_empty:
        if is_db_new:
            insert_db(conn, mer_count, plus, minus)
        else:
            update_db(conn, mer_count, plus, minus)

    print("Time used: %s" % str(datetime.timedelta(seconds=(time() - start))))
    print('Done.')