Example #1
0
def initDatabase(dbFilename, iFilename, clobber=False):
    """Open the gene database, loading it from a text file if required.

    Declares the Gene table, maps it onto the Gene class and starts a
    session on the SQLite file dbFilename.  The genes in iFilename are
    loaded when clobber is true or when dbFilename did not exist before the
    session was started.

    @param dbFilename: Path of the SQLite database file
    @param iFilename: Tab-separated gene file whose first line is a header;
        columns 0, 5, 6, 7 and 8 of every other line are passed to Gene
        (presumably as geneId, chrom, start, end and strand)
    @keyword clobber: Load the genes even if dbFilename exists (default=False)
    @return: The session returned by db.startSession
    @raise KeyError: If column 8 (strand) is neither '1' nor '-1'
    """
    db = Alchemy()
    gene_table = Table("Gene", db.metadata,
        Column('id', Integer, primary_key=True),
        Column('geneId', Text),
        Column('chrom', Text, index=True),
        Column('start', Integer, index=True),
        Column('end', Integer, index=True),
        Column('strand', String(1))
    )
    mapper(Gene, gene_table)

    # Decide this before the session is started (presumably starting the
    # session creates the file -- TODO confirm).
    # NOTE(review): with clobber=True nothing here removes rows that are
    # already stored in an existing database -- confirm that is handled
    # elsewhere.
    populateDb = clobber or not os.path.exists(dbFilename)
    session = db.startSession('sqlite:///%s' % dbFilename)
    if populateDb:
        strandTable = {'1': '+', '-1': '-'}
        i = 0  # keeps the final report valid when there are no data rows
        iFile = open(iFilename)
        try:
            iFile.readline()  # skip the header line
            for i, line in enumerate(iFile):
                # Report once every 1000 genes.
                if (i % 1000) == 0:
                    progressMessage('# genes %s', i)
                tokens = line.strip().split('\t')
                g = Gene(None, tokens[0], tokens[5], int(tokens[6]),
                    int(tokens[7]), strandTable[tokens[8]])
                session.save(g)
        finally:
            iFile.close()
        progressMessage('# genes %s', i)
        session.commit()

    return session
Example #2
0
def initDatabase(dbFilename, iFilename, clobber=False):
    """Open the gene database, loading it from a text file if required.

    Declares the Gene table, maps it onto the Gene class and starts a
    session on the SQLite file dbFilename.  The genes in iFilename are
    loaded when clobber is true or when dbFilename did not exist before the
    session was started.

    @param dbFilename: Path of the SQLite database file
    @param iFilename: Tab-separated gene file whose first line is a header;
        columns 0, 5, 6, 7 and 8 of every other line are passed to Gene
        (presumably as geneId, chrom, start, end and strand)
    @keyword clobber: Load the genes even if dbFilename exists (default=False)
    @return: The session returned by db.startSession
    @raise KeyError: If column 8 (strand) is neither "1" nor "-1"
    """
    db = Alchemy()
    gene_table = Table(
        "Gene",
        db.metadata,
        Column("id", Integer, primary_key=True),
        Column("geneId", Text),
        Column("chrom", Text, index=True),
        Column("start", Integer, index=True),
        Column("end", Integer, index=True),
        Column("strand", String(1)),
    )
    mapper(Gene, gene_table)

    # Decide this before the session is started (presumably starting the
    # session creates the file -- TODO confirm).
    # NOTE(review): with clobber=True nothing here removes rows that are
    # already stored in an existing database -- confirm that is handled
    # elsewhere.
    populateDb = clobber or not os.path.exists(dbFilename)
    session = db.startSession("sqlite:///%s" % dbFilename)
    if populateDb:
        strandTable = {"1": "+", "-1": "-"}
        i = 0  # keeps the final report valid when there are no data rows
        iFile = open(iFilename)
        try:
            iFile.readline()  # skip the header line
            for i, line in enumerate(iFile):
                # Report once every 1000 genes.
                if i % 1000 == 0:
                    progressMessage("# genes %s", i)
                tokens = line.strip().split("\t")
                g = Gene(None, tokens[0], tokens[5], int(tokens[6]), int(tokens[7]), strandTable[tokens[8]])
                session.save(g)
        finally:
            iFile.close()
        progressMessage("# genes %s", i)
        session.commit()

    return session
def initDatabase(dbFilename, iFilename, clobber=False):
    """Open the gene database, loading it from a text file if required.

    Declares the Gene table, maps it onto the Gene class and starts a
    session on the SQLite file dbFilename.  The genes in iFilename are
    loaded when clobber is true or when dbFilename did not exist before the
    session was started.

    @param dbFilename: Path of the SQLite database file
    @param iFilename: Tab-separated gene file whose first line is a header;
        columns 0, 5, 6, 7 and 8 of every other line are passed to Gene
        (presumably as geneId, chrom, start, end and strand)
    @keyword clobber: Load the genes even if dbFilename exists (default=False)
    @return: The session returned by db.startSession
    @raise KeyError: If column 8 (strand) is neither '1' nor '-1'
    """
    db = Alchemy()
    gene_table = Table("Gene", db.metadata,
        Column('id', Integer, primary_key=True),
        Column('geneId', Text),
        Column('chrom', Text, index=True),
        Column('start', Integer, index=True),
        Column('end', Integer, index=True),
        Column('strand', String(1))
    )
    mapper(Gene, gene_table)

    # Decide this before the session is started (presumably starting the
    # session creates the file -- TODO confirm).
    # NOTE(review): with clobber=True nothing here removes rows that are
    # already stored in an existing database -- confirm that is handled
    # elsewhere.
    populateDb = clobber or not os.path.exists(dbFilename)
    session = db.startSession('sqlite:///%s' % dbFilename)
    if populateDb:
        strandTable = {'1': '+', '-1': '-'}
        i = 0  # keeps the final report valid when there are no data rows
        iFile = open(iFilename)
        try:
            iFile.readline()  # skip the header line
            for i, line in enumerate(iFile):
                # Report once every 1000 genes.
                if (i % 1000) == 0:
                    progressMessage('# genes %s', i)
                tokens = line.strip().split('\t')
                g = Gene(None, tokens[0], tokens[5], int(tokens[6]),
                    int(tokens[7]), strandTable[tokens[8]])
                session.save(g)
        finally:
            iFile.close()
        progressMessage('# genes %s', i)
        session.commit()

    return session
Example #4
0
 def build(self, clobber=False, separator='!'):
     """Build index file.

     Creates the Offsets table (id, accession, offset) with its two
     indexes, writes one separator-delimited row per item yielded by
     self._byteOffsetGenerator() to a temporary file and loads that file
     into self.idxFilename with the sqlite3 command-line shell.  Nothing
     is built when self.isIndexed() is already true.

     @keyword clobber: Overwrite existing index file (default=False)
     @keyword separator: Field separator of the temporary import file
         (default='!'); presumably it must not occur in any accession
     """
     if clobber:
         try:
             self.cursor.execute('drop table Offsets;')
         except Exception:
             # Best effort: the drop is allowed to fail (presumably because
             # the table does not exist yet), but KeyboardInterrupt and
             # SystemExit are no longer swallowed.
             pass

     if not self.isIndexed():
         schema = """
             CREATE TABLE Offsets (
                 id INTEGER,
                 accession TEXT,
                 offset INTEGER
                 );
                 create index idx_offsets_id on Offsets (id);
                 create index idx_offsets_accession on Offsets (accession);
         """
         self.connection.executescript(schema)
         tmpFile = tempfile.NamedTemporaryFile()
         try:
             i = -1  # so that i+1 reports 0 when nothing is yielded
             for i,(accession,offset) in enumerate(self._byteOffsetGenerator()):
                 print >> tmpFile, "%i%s%s%s%i" % (i,separator,accession,separator,offset)
                 if i % 1000==0:
                     progressMessage("Sequences: %s", i+1)
             progressMessage("Sequences: %s\n", i+1)
             # The sqlite3 shell reads the file by name, so the buffered
             # rows must be on disk before it runs.
             tmpFile.flush()

             cmd = """sqlite3 -separator '%s' %s '.import "%s" Offsets'""" \
                 % (separator, self.idxFilename, tmpFile.name)
             os.system(cmd)
         finally:
             # Always release the temporary file, even if the loop or the
             # import raised.
             tmpFile.close()
Example #5
0
    def build(self, clobber=False, separator='!'):
        """Build index file.

        Creates the Offsets table (id, accession, offset) with its two
        indexes, writes one separator-delimited row per item yielded by
        self._byteOffsetGenerator() to a temporary file and loads that file
        into self.idxFilename with the sqlite3 command-line shell.  Nothing
        is built when self.isIndexed() is already true.

        @keyword clobber: Overwrite existing index file (default=False)
        @keyword separator: Field separator of the temporary import file
            (default='!'); presumably it must not occur in any accession
        """
        if clobber:
            try:
                self.cursor.execute('drop table Offsets;')
            except Exception:
                # Best effort: the drop is allowed to fail (presumably
                # because the table does not exist yet), but
                # KeyboardInterrupt and SystemExit are no longer swallowed.
                pass

        if not self.isIndexed():
            schema = """
                CREATE TABLE Offsets (
                    id INTEGER,
                    accession TEXT,
                    offset INTEGER
                    );
                    create index idx_offsets_id on Offsets (id);
                    create index idx_offsets_accession on Offsets (accession);
            """
            self.connection.executescript(schema)
            tmpFile = tempfile.NamedTemporaryFile()
            try:
                i = -1  # so that i + 1 reports 0 when nothing is yielded
                for i, (accession,
                        offset) in enumerate(self._byteOffsetGenerator()):
                    print >> tmpFile, "%i%s%s%s%i" % (i, separator, accession,
                                                      separator, offset)
                    if i % 1000 == 0:
                        progressMessage("Sequences: %s", i + 1)
                progressMessage("Sequences: %s\n", i + 1)
                # The sqlite3 shell reads the file by name, so the buffered
                # rows must be on disk before it runs.
                tmpFile.flush()

                cmd = """sqlite3 -separator '%s' %s '.import "%s" Offsets'""" \
                    % (separator, self.idxFilename, tmpFile.name)
                os.system(cmd)
            finally:
                # Always release the temporary file, even if the loop or
                # the import raised.
                tmpFile.close()
Example #6
0
#         contigData[chrom].add_interval(contig)
#     except KeyError:
#         contigData[chrom] = Intersecter()
#         contigData[chrom].add_interval(contig)



# Walk the gene table and collect, in `annotated`, the first element of the
# value of every contig found within 500 bases of a gene.
print('Parse genes')
iFilename = '/Users/papenfuss/databases/platypus/ensembl/Release50/mart_names_locations.txt'
iFile = open(iFilename)
headers = iFile.readline()  # first line is a header, not a gene

annotated = set()
for i,line in enumerate(iFile):
    if (i % 1000)==0:
        progressMessage('# genes %s', i)

    tokens = line.strip().split('\t')
    geneId = tokens[0]
    transId = tokens[1]
    name = tokens[3]
    chrom = tokens[5]
    start = int(tokens[6])
    end = int(tokens[7])
    strand = {'1': '+', '-1': '-'}[tokens[8]]

    # Only a chromosome missing from contigData is an expected failure; any
    # other error is allowed to surface instead of being silently dropped.
    try:
        chromContigs = contigData[chrom]
    except KeyError:
        pass
    else:
        for contig in chromContigs.find(start-500, end+500):
            annotated.add(contig.value[0])
Example #7
0
# Gene table (text) and the SQLite database that initDatabase is given for it.
iFilename = os.path.join(iDir, 'mart_names_locations.txt')
dbFilename = os.path.join(iDir, 'mart_names_locations.sqlite')
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)


# 2a. Parse read alignment file and hash results
# 2b. Write out unannotated reads
flankSize = 1000
maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt'
data = {}
# Output files: reads without any gene hit, reads hitting several distinct
# genes, and (read name, gene ids) lines for those multi-gene reads.
unannFile = open('unannotated.txt', 'w')
multFile = open('multiple.txt', 'w')
multFile2 = open('multiple_gene.txt', 'w')
for i,m in enumerate(MaqViewFile(maqFilename)):
    if (i % 1000)==0: progressMessage('# maq alns %s', i)
    # Genes on the read's chromosome with start < m.start + flankSize and
    # end > m.start + 32 - flankSize, i.e. genes within roughly flankSize of
    # the read.  The constant 32 is presumably the read length -- TODO confirm.
    q = session.query(Gene).filter(Gene.chrom==m.chrom) \
        .filter(Gene.start<m.start+flankSize) \
        .filter(Gene.end>m.start+32-flankSize).all()
    
    if len(q)==0:
        # No gene matched: record the read as unannotated and move on.
        print >> unannFile, m
        continue
    elif len(q)>1:
        # Several rows matched; the read is only set aside when the rows
        # carry more than one distinct geneId.
        x = set([r.geneId for r in q])
        if len(x)>1:
            print >> multFile, m
            print >> multFile2, "%s\t%s" % (m.name, ','.join(x))
            continue
    
    try:
Example #8
0
# SQLite database that initDatabase is given for the gene table in iFilename.
dbFilename = os.path.join(iDir, "mart_names_locations.sqlite")
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)


# 2a. Parse read alignment file and hash results
# 2b. Write out unannotated reads
flankSize = 1000
maqFilename = "/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt"
data = {}
# Output files: unannFile receives reads without any gene hit and multFile
# receives reads hitting several distinct genes.
unannFile = open("unannotated.txt", "w")
multFile = open("multiple.txt", "w")
multFile2 = open("multiple_gene.txt", "w")
for i, m in enumerate(MaqViewFile(maqFilename)):
    if (i % 1000) == 0:
        progressMessage("# maq alns %s", i)
    # Genes on the read's chromosome with start < m.start + flankSize and
    # end > m.start + 32 - flankSize, i.e. genes within roughly flankSize of
    # the read.  The constant 32 is presumably the read length -- TODO confirm.
    q = (
        session.query(Gene)
        .filter(Gene.chrom == m.chrom)
        .filter(Gene.start < m.start + flankSize)
        .filter(Gene.end > m.start + 32 - flankSize)
        .all()
    )

    if len(q) == 0:
        # No gene matched: record the read as unannotated and move on.
        print >> unannFile, m
        continue
    elif len(q) > 1:
        # Several rows matched; the read is only set aside when the rows
        # carry more than one distinct geneId.
        x = set([r.geneId for r in q])
        if len(x) > 1:
            print >> multFile, m
Example #9
0
# Declare the HSP table from the attributes/converters of `h`, passing
# subjectId, sStart and sEnd as indexedAttributes, and map it onto HSP.
hsps_table = createTable(tableName, metadata, h.attributes, h.converters, 
    indexedAttributes=['subjectId', 'sStart', 'sEnd'])
mapper(HSP, hsps_table)

# Start a session & initialize database
session = createSession(dsn, metadata)

# Pre-set so the final progress report also works when the input is empty.
i = 0
if case in [1,2]:
    # Devil 454 reads
    iFile = open(iFilename)
    try:
        for i,line in enumerate(iFile):
            tokens = line.strip().split('\t')
            # The last two columns are not passed to HSP.
            h = HSP(tokens[0:-2])
            h.convertBlockToGenomeCoords()
            session.save(h)
            # Commit in batches of 5000 rather than once at the very end.
            if (i % 5000)==0:
                progressMessage("# HSPs %s", i, n)
                session.commit()
    finally:
        iFile.close()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
elif case==3:
    # Platypus 454 reads
    for i,h in enumerate(BlastFile(iFilename)):
        # Keep only the second '|'-separated field of the subject id.
        h.subjectId = h.subjectId.split('|')[1]
        h.convertBlockToGenomeCoords()
        session.save(h)
        # Commit in batches of 5000 rather than once at the very end.
        if (i % 5000)==0:
            progressMessage("# HSPs %s", i, n)
            session.commit()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
Example #10
0
    sys.exit(__doc__)

# Input and output paths are hard-coded; the trailing comments name the
# command-line arguments they stand in for.
iFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview_filtered.txt'  # sys.argv[1]
oFilename = '/Users/papenfuss/platy/venom/gbrowse/solexa250.gff'  # sys.argv[2]
windowSize = 250

iFile = open(iFilename)
# The first line holds the tab-separated column names.
headers = iFile.readline().strip().split('\t')
oFile = open(oFilename, 'w')

chrom = None
lastChrom = None
countDict = {}
for i, line in enumerate(iFile):
    if (i % 1000) == 0:
        # 28000000 is presumably the expected total number of reads -- TODO confirm.
        progressMessage('# reads %s', i, 28000000)
    tokens = line.strip().split('\t')
    # Column name -> value for this row.
    d = dict(zip(headers, tokens))
    chrom = "%s" % d['chrom']
    start = int(d['start'])
    # print chrom, start, lastChrom, (chrom!=lastChrom and len(countDict)!=0)

    if chrom != lastChrom and len(countDict) != 0:
        countData = countDict.items()
        countData.sort(key=lambda x: x[0])
        for (_chrom, _wStart), _counts in countData:
            g = Feature(reference=_chrom,
                        source='solexa250',
                        type='tlevel',
                        start=_wStart,
                        end=_wStart + windowSize - 1,
Example #11
0
#!/usr/bin/env python
"""
loadSolexa.py

Author: Tony Papenfuss
Date: Tue Jun 24 14:27:34 EST 2008

"""

import os, sys
from maq import *
from useful import progressMessage

# Dump the maq alignments to a '|'-separated text file (unless that file
# already exists) and import it into the PlatySolexa table of alignedReads.db.
oFilename = 'tmp/PlatySolexa.txt'
# Third progressMessage argument; presumably the expected number of reads.
nReads = 28395347
if not os.path.exists(oFilename):
    # NOTE(review): an interrupted run leaves a partial file behind, and the
    # existence check above then skips regeneration -- confirm acceptable.
    dataDir = '/Users/papenfuss/databases/platypus/venom/solexa/'
    i = 0  # keeps the final report valid when no read is yielded
    oFile = open(oFilename, 'w')
    try:
        for i, read in enumerate(
                MaqViewFile(os.path.join(dataDir, 'mapview.txt'), mQ_cutoff=40)):
            if (i % 1000) == 0:
                progressMessage("# maq %s", i, nReads)
            # One output row: the read's tab-separated fields followed by
            # its running index.
            tokens = str(read).split('\t')
            tokens.append(i)
            print >> oFile, "|".join([str(x) for x in tokens])
    finally:
        oFile.close()
    progressMessage("# maq %s\n", i, nReads)

# Import the same file that was written above.
os.system(
    """sqlite3 alignedReads.db '.import "%s" PlatySolexa'""" % oFilename)
Example #12
0
import os, sys
from useful import progressMessage


# Copy the rows of iFilename whose mapping quality (column 7) is at least
# mQ_cutoff to oFilename, keeping a fixed subset of the columns.
iFilename = sys.argv[1]
oFilename = sys.argv[2]

mQ_cutoff = 40
# Third progressMessage argument; presumably the expected number of input
# lines -- TODO confirm.
nSeqs = 280000000

headers = ['name','chrom','start','strand','mQ','numTied','score','numZeroMismatches']
# One output row; named rowFormat so the builtin format() is not shadowed.
rowFormat = '\t'.join(['%s','%s','%i','%s','%i','%i','%i','%i'])

oFile = open(oFilename, 'w')
try:
    print >> oFile, '\t'.join(headers)

    iFile = open(iFilename)
    try:
        for i,line in enumerate(iFile):
            tokens = line.strip().split('\t')
            mQ = int(tokens[7])
            if mQ>=mQ_cutoff:
                name = tokens[0]
                chrom = tokens[1]
                start = int(tokens[2])
                strand = tokens[3]
                numTied = int(tokens[10])
                score = int(tokens[11])
                numZeroMismatches = int(tokens[12])
                print >> oFile, rowFormat % (name,chrom,start,strand,mQ,numTied,score,numZeroMismatches)
                # NOTE(review): progress is only reported for rows that pass
                # the quality filter, so updates are irregular -- confirm
                # this is intended.
                if (i % 1000)==0: progressMessage('# maq hits %s', i, nSeqs)
    finally:
        iFile.close()
finally:
    oFile.close()
Example #13
0
                         h.converters,
                         indexedAttributes=['subjectId', 'sStart', 'sEnd'])
mapper(HSP, hsps_table)

# Start a session & initialize database
session = createSession(dsn, metadata)

# Pre-set so the final progress report also works when the input is empty.
i = 0
if case in [1, 2]:
    # Devil 454 reads
    iFile = open(iFilename)
    try:
        for i, line in enumerate(iFile):
            tokens = line.strip().split('\t')
            # The last two columns are not passed to HSP.
            h = HSP(tokens[0:-2])
            h.convertBlockToGenomeCoords()
            session.save(h)
            # Commit in batches of 5000 rather than once at the very end.
            if (i % 5000) == 0:
                progressMessage("# HSPs %s", i, n)
                session.commit()
    finally:
        iFile.close()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
elif case == 3:
    # Platypus 454 reads
    for i, h in enumerate(BlastFile(iFilename)):
        # Keep only the second '|'-separated field of the subject id.
        h.subjectId = h.subjectId.split('|')[1]
        h.convertBlockToGenomeCoords()
        session.save(h)
        # Commit in batches of 5000 rather than once at the very end.
        if (i % 5000) == 0:
            progressMessage("# HSPs %s", i, n)
            session.commit()
    progressMessage("# HSPs %s\n", i, n)
    session.commit()
Example #14
0
"""
loadSolexa.py

Author: Tony Papenfuss
Date: Tue Jun 24 14:27:34 EST 2008

"""

import os, sys
from maq import *
from useful import progressMessage


# Dump the maq alignments to a '|'-separated text file (unless that file
# already exists) and import it into the PlatySolexa table of alignedReads.db.
oFilename = 'tmp/PlatySolexa.txt'
# Third progressMessage argument; presumably the expected number of reads.
nReads = 28395347
if not os.path.exists(oFilename):
    # NOTE(review): an interrupted run leaves a partial file behind, and the
    # existence check above then skips regeneration -- confirm acceptable.
    dataDir = '/Users/papenfuss/databases/platypus/venom/solexa/'
    i = 0  # keeps the final report valid when no read is yielded
    oFile = open(oFilename, 'w')
    try:
        for i,read in enumerate(MaqViewFile(os.path.join(dataDir, 'mapview.txt'), mQ_cutoff=40)):
            if (i % 1000)==0:
                progressMessage("# maq %s", i, nReads)
            # One output row: the read's tab-separated fields followed by
            # its running index.
            tokens = str(read).split('\t')
            tokens.append(i)
            print >> oFile, "|".join([str(x) for x in tokens])
    finally:
        oFile.close()
    progressMessage("# maq %s\n", i, nReads)


# Import the same file that was written above.
os.system("""sqlite3 alignedReads.db '.import "%s" PlatySolexa'""" % oFilename)
Example #15
0
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)


# 2a. Parse the read alignment file and hash the results
# 2b. Write out the unannotated reads
flankSize = 1000
maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt'
data = {}
unannFile = open('test_unannotated.txt', 'w')
multFile = open('test_multiple.txt', 'w')
multFile2 = open('test_multiple_gene.txt', 'w')
for i, m in enumerate(MaqViewFile(maqFilename)):
    # Test run: stop once 10000 alignments have been examined.
    if i == 10000:
        break
    if not (i % 1000):
        progressMessage('# maq alns %s', i)

    # Genes on the read's chromosome lying within roughly flankSize of it.
    q = (
        session.query(Gene)
        .filter(Gene.chrom == m.chrom)
        .filter(Gene.start < m.start + flankSize)
        .filter(Gene.end > m.start + 32 - flankSize)
        .all()
    )

    if not q:
        # Nothing matched: the read is unannotated.
        print >> unannFile, m
        continue

    if len(q) > 1:
        # Collect the distinct gene ids among the matching rows.
        x = set()
        for r in q:
            x.add(r.geneId)
        if len(x) > 1:
            # The read hits several different genes: set it aside.
            print >> multFile, m
            print >> multFile2, "%s\t%s" % (m.name, ','.join(x))
            continue
    
Example #16
0
tileSize = 35
chrSizeFilename = '/Users/papenfuss/databases/chromSizes/ornAna5.txt'

chrSizes = loadChrSizes(chrSizeFilename)

iFile = open(iFilename)
# The first line holds the tab-separated column names.
headers = iFile.readline().strip().split('\t')
oFile = open(oFilename, 'w')
format = "%s\t%i\t%i"

chrom = None
lastChrom = None
countDict = {}
for i,line in enumerate(iFile):
    if (i % 1000)==0:
        # 28000000 is presumably the expected total number of reads -- TODO confirm.
        progressMessage('# reads %s', i, 28000000)
    tokens = line.strip().split('\t')
    # Column name -> value for this row.
    d = dict(zip(headers, tokens))
    chrom = d['chrom']
    # Normalise the chromosome name: 'MT' rows are skipped, names containing
    # 'Ultra' or 'Contig' are kept as they are, everything else gets a 'chr'
    # prefix.
    if chrom=='MT':
        continue
    elif 'Ultra' in chrom or 'Contig' in chrom:
        pass
    else:
        chrom = 'chr%s' % chrom
    start = int(d['start'])
    
    if chrom!=lastChrom and countDict:
        print chrom
        for _wStart in xrange(1, chrSizes[lastChrom], tileSize):
            counts = countDict.get((lastChrom, _wStart), 0)
Example #17
0
dbFilename = os.path.join(iDir, 'mart_names_locations.sqlite')
# dbFilename = ':memory:'
session = initDatabase(dbFilename, iFilename)

# 2a. Parse the read alignment file and hash the results
# 2b. Write out the unannotated reads
flankSize = 1000
maqFilename = '/Users/papenfuss/databases/platypus/venom/solexa/mapview.txt'
data = {}
unannFile = open('test_unannotated.txt', 'w')
multFile = open('test_multiple.txt', 'w')
multFile2 = open('test_multiple_gene.txt', 'w')
for i, m in enumerate(MaqViewFile(maqFilename)):
    # Test run: stop once 10000 alignments have been examined.
    if i == 10000:
        break
    if not (i % 1000):
        progressMessage('# maq alns %s', i)

    # Narrow the gene query one condition at a time: same chromosome, then
    # the two coordinate bounds that keep genes within roughly flankSize of
    # the read.
    q = session.query(Gene)
    q = q.filter(Gene.chrom == m.chrom)
    q = q.filter(Gene.start < m.start + flankSize)
    q = q.filter(Gene.end > m.start + 32 - flankSize)
    q = q.all()

    if not q:
        # Nothing matched: the read is unannotated.
        print >> unannFile, m
        continue

    if len(q) > 1:
        # Collect the distinct gene ids among the matching rows.
        x = set()
        for r in q:
            x.add(r.geneId)
        if len(x) > 1:
            # The read hits several different genes: set it aside.
            print >> multFile, m
            print >> multFile2, "%s\t%s" % (m.name, ','.join(x))
            continue