Ejemplo n.º 1
0
 def get_annot_db(self,
                  table,
                  primaryKey='name',
                  sliceAttrDict=dict(id='chrom',
                                     start='chromStart',
                                     stop='chromEnd')):
     '''Return an AnnotationDB for one annotation table, e.g. snp130.

     The table is opened as "<self.ucsc_db>.<table>" on self.ucsc_server
     and wrapped in an AnnotationDB over self.genome_seq.  For tables
     whose name, start or end columns are non-standard, pass the real
     column names through primaryKey and sliceAttrDict.

     The result is stored on this object under the attribute called
     `table`.  If an attribute of that name already exists, its value
     is returned as is and nothing is built.'''
     # Sentinel so that any stored value (even None) counts as cached.
     _absent = object()
     cached = getattr(self, table, _absent)
     if cached is not _absent:
         return cached
     table_path = self.ucsc_db + '.' + table
     rows = sqlgraph.SQLTable(table_path,
                              primaryKey=primaryKey,
                              serverInfo=self.ucsc_server,
                              itemClass=UCSCSeqIntervalRow)
     built = annotation.AnnotationDB(rows,
                                     self.genome_seq,
                                     checkFirstID=False,
                                     sliceAttrDict=sliceAttrDict)
     # Remember the new db under the table's name for later calls.
     setattr(self, table, built)
     return built
Ejemplo n.º 2
0
    def test_slice_descr(self):
        """Check the sequence exposed by a slice of a translation annotation.

        Annotation 'bar' covers 0..12 of self.FLIM and must render as
        'FLIM'; its [1:3] slice must report 'CTAATT' as its sequence.
        """
        db_options = dict(itemClass=annotation.TranslationAnnot,
                          itemSliceClass=annotation.TranslationAnnotSlice,
                          sliceAttrDict=dict(id=0, start=1, stop=2))
        aa_db = annotation.AnnotationDB({}, self.db, **db_options)

        flim = aa_db.new_annotation('bar', (self.FLIM.id, 0, 12))
        assert str(flim) == 'FLIM'

        sub_sequence = flim[1:3].sequence
        assert str(sub_sequence) == 'CTAATT'
Ejemplo n.º 3
0
def makeResourceFromBed(fileLines, genome, docstring='Temp Resource From BED', dataPath='memory'):
    """Build a sqlite table, an AnnotationDB and an NLMSA from BED lines.

    fileLines -- BED input handed to readBedLines
    genome    -- sequence database the annotations are mapped onto
    docstring -- accepted but not used inside this function
    dataPath  -- where the data lives; 'memory' selects an in-memory table

    Returns the tuple (dataTable, annotDB, annotMap).
    """
    bedLines = readBedLines(fileLines)
    bedDict = makeDictFromBed(bedLines)
    tableName = os.path.split(dataPath)[1]
    # SQLite has its own spelling for an in-memory database.
    if dataPath == 'memory':
        sqlDataPath = ':memory:'
    else:
        sqlDataPath = dataPath
    dataTable = convertDictToSQLite(bedDict, tableName, sqlDataPath)
    # NOTE(review): defaultSliceAttrs is eval'ed as Python source.  It is
    # defined outside this function -- confirm it is a trusted constant.
    sliceAttrs = eval(defaultSliceAttrs)
    annotDB = annotation.AnnotationDB(dataTable, genome,
                                      sliceAttrDict=sliceAttrs)
    annotMap = makeNLMSA([annotDB], dataPath)
    return dataTable, annotDB, annotMap
Ejemplo n.º 4
0
def bed2pygr(dbprefix, referencefile, bedfile, indir):
    """Build a pairwise NLMSA of BED annotations over a reference genome.

    dbprefix      -- path prefix of the NLMSA (opened in 'w' mode); the
                     companion file '<dbprefix>.genome' is written too
    referencefile -- sequence file opened with seqdb.SequenceFileDB
    bedfile       -- BED input, handed to load_bed
    indir         -- not used by this function; kept so existing callers
                     keep working

    After the alignment is built, the reference file's base name (minus
    its last extension) is written as a single line to
    '<dbprefix>.genome'.
    """
    collision_counter = defaultdict(int)
    chrdb = seqdb.SequenceFileDB(referencefile)
    annodb = annotation.AnnotationDB({}, chrdb)

    al = cnestedlist.NLMSA(dbprefix, 'w', pairwiseMode=True)

    load_bed(al, annodb, bedfile, collision_counter)

    al.build(saveSeqDict=True)

    genomeprefix = os.path.basename(referencefile).rsplit('.', 1)[0]
    # Write through a named handle and close it explicitly, so the data
    # is flushed deterministically instead of relying on the garbage
    # collector to close an anonymous file object.
    genome_file = open(dbprefix + '.genome', 'w')
    try:
        genome_file.write(genomeprefix + '\n')
    finally:
        genome_file.close()
Ejemplo n.º 5
0
    def test_translation_db(self):
        """Annotations added to a translation db can be fetched back by key.

        'foo' (0..3 of self.M) must render as 'M' and 'bar' (0..12 of
        self.FLIM) as 'FLIM' when looked up through the db.
        """
        db_options = dict(itemClass=annotation.TranslationAnnot,
                          itemSliceClass=annotation.TranslationAnnotSlice,
                          sliceAttrDict=dict(id=0, start=1, stop=2))
        aa_db = annotation.AnnotationDB({}, self.db, **db_options)

        # The created objects stay bound to local names, as before, even
        # though only the lookups through aa_db are asserted on.
        created_foo = aa_db.new_annotation('foo', (self.M.id, 0, 3))
        fetched = aa_db['foo']
        assert str(fetched) == 'M'

        created_bar = aa_db.new_annotation('bar', (self.FLIM.id, 0, 12))
        fetched = aa_db['bar']
        assert str(fetched) == 'FLIM'
Ejemplo n.º 6
0
def read_genbank_annots(gbfile, fastafile=None, featureType='CDS',
                        geneQualifier='gene'):
    '''Build an annotation DB of gene intervals from a GenBank file.

    NB: each gene is assumed to consist of ONE interval, so this
    cannot be used for multi-exon genes!

    gbfile        -- GenBank file to parse
    fastafile     -- matching sequence file; defaults to the text of
                     gbfile before its first '.', plus '.fna'
    featureType   -- only features of this type are annotated
    geneQualifier -- qualifier whose first value names the annotation

    Returns (annodb, al, genome): the AnnotationDB, an in-memory
    pairwise NLMSA holding every annotation, and the sequence db.'''
    try:
        records = SeqIO.parse(gbfile, 'genbank')
    except TypeError:  # SeqIO changed its interface?
        handle = open(gbfile)
        try:
            records = list(SeqIO.parse(handle, 'genbank'))
        finally:
            handle.close()
    else:
        records = list(records)

    if fastafile is None:
        fastafile = gbfile.split('.')[0] + '.fna'
    genome = seqdb.SequenceFileDB(fastafile)
    genomeIndex = blast.BlastIDIndex(genome)  # handle NCBI ID blobs properly
    slice_layout = dict(id=0, start=1, stop=2, orientation=3)
    annodb = annotation.AnnotationDB({}, genome, sliceAttrDict=slice_layout)

    unlabeled = 0  # numbers the annotations that lack a gene label
    for record in records:
        # find the right seq and get its actual ID
        seqID = genomeIndex[record.id].id
        for feature in record.features:
            if feature.type != featureType:
                continue
            try:
                name = feature.qualifiers[geneQualifier][0]
            except KeyError:  # keep the annotation even if label missing
                warnings.warn('Missing gene qualifier "%s" on %s annotation'
                              % (geneQualifier, featureType))
                name = 'unlabeled_%s_%d' % (featureType, unlabeled)
                unlabeled += 1
            interval = (seqID, feature.location.start.position,
                        feature.location.end.position, feature.strand)
            annodb.new_annotation(name, interval)

    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True)
    for annot in annodb.itervalues():
        al.addAnnotation(annot)
    al.build()
    return annodb, al, genome
Ejemplo n.º 7
0
    def test_negative_frames(self):
        """Annotations with orientation -1 report the expected negative frames."""
        db_options = dict(itemClass=annotation.TranslationAnnot,
                          itemSliceClass=annotation.TranslationAnnotSlice,
                          sliceAttrDict=dict(id=0, start=1, stop=2,
                                             orientation=3))
        aa_db = annotation.AnnotationDB({}, self.db, **db_options)

        # (annotation name, start, stop, expected translation, expected frame)
        cases = (
            ('f1', 0, 12, 'HN*K', -2),
            ('f2', 1, 10, '*LE', -1),
            ('f3', 2, 11, 'IIR', -3),
        )
        for name, start, stop, peptide, frame in cases:
            annot = aa_db.new_annotation(name,
                                         (self.FLIM.id, start, stop, -1))
            assert str(annot) == peptide
            assert annot.frame == frame
Ejemplo n.º 8
0
    def test_positive_frames(self):
        """Annotations given without an orientation report frames +1 to +3."""
        db_options = dict(itemClass=annotation.TranslationAnnot,
                          itemSliceClass=annotation.TranslationAnnotSlice,
                          sliceAttrDict=dict(id=0, start=1, stop=2))
        aa_db = annotation.AnnotationDB({}, self.db, **db_options)

        # (annotation name, start, stop, expected translation, expected frame)
        cases = (
            ('f1', 0, 12, 'FLIM', +1),
            ('f2', 1, 10, 'F*L', +2),
            ('f3', 2, 11, 'SNY', +3),
        )
        for name, start, stop, peptide, frame in cases:
            annot = aa_db.new_annotation(name, (self.FLIM.id, start, stop))
            assert str(annot) == peptide
            assert annot.frame == frame
Ejemplo n.º 9
0
def read_exon_annots(genome, genesFile='knownGene.txt'):
    '''Read a multi-exon transcript set and build an exon annotation db
    plus an exon-to-gene mapping.

    genome    -- sequence database the exons are annotated on
    genesFile -- file handed to read_known_genes

    Each key of the exon dictionary is used as the slice info of one
    annotation (fields read as id, orientation, start, stop) and is
    numbered 0, 1, 2, ... in the dictionary's iteration order.

    Returns (annodb, al, exonGene, totalSize, geneLengths), where
    exonGene maps each exon number to its gene ID and totalSize is
    the sum of all values in geneLengths.'''
    exonDict, genes, trLen = read_known_genes(genesFile)
    geneLengths = get_gene_maxlengths(genes, trLen)
    totalSize = sum(geneLengths.values())

    exon_layout = dict(id=0, orientation=1, start=2, stop=3)
    annodb = annotation.AnnotationDB({}, genome, sliceAttrDict=exon_layout)
    al = cnestedlist.NLMSA('tmp', 'memory', pairwiseMode=True,
                           maxlen=1000000000)

    exonGene = {}
    for exon_id, (exon_info, geneID) in enumerate(exonDict.iteritems()):
        exon = annodb.new_annotation(exon_id, exon_info)
        exonGene[exon_id] = geneID
        al.addAnnotation(exon)
    al.build()
    return annodb, al, exonGene, totalSize, geneLengths
Ejemplo n.º 10
0
    def __init__(self, O):
        self.species = O
        dir(worldbase.Bio.Seq)
        dir(worldbase.Bio.Seq.Genome)
        Genome = eval('dir(worldbase.Bio.Seq.Genome.' + O.genome_name +
                      ')[-1]')
        O.genome_build = Genome

        serverInfo = sqlgraph.DBServerInfo(host='genome-mysql.cse.ucsc.edu',
                                           user='******')
        txInfo = sqlgraph.SQLTable(O.genome_build + '.ensGene',
                                   serverInfo=serverInfo,
                                   itemClass=UCSCSeqIntervalRow,
                                   primaryKey='name')
        self.chromosome = eval('worldbase.Bio.Seq.Genome.' + O.genome_name +
                               '.' + Genome + '(download=True)')
        for i in self.chromosome:
            print i, len(self.chromosome[i])
        self.annodb = annotation.AnnotationDB(txInfo,
                                              self.chromosome,
                                              sliceAttrDict=dict(
                                                  id='chrom',
                                                  start='txStart',
                                                  stop='txEnd'))
Ejemplo n.º 11
0
def bedToNLMSA(bedlines,
               genome,
               field_locations=dict(id=0,
                                    start=1,
                                    stop=2,
                                    name=3,
                                    score=4,
                                    orientation=-1)):
    """Build an in-memory AnnotationDB and NLMSA from BED-formatted lines.

    bedlines        -- iterable of tab-separated BED lines; empty and
                       whitespace-only lines are skipped
    genome          -- sequence database the annotations are mapped onto
    field_locations -- maps annotation attributes to column indexes of
                       each row.  The computed orientation is appended to
                       the row, which is why the default uses index -1.

    Orientation is -1 only when a sixth column is present and equals '-';
    a missing strand column, '+' and '.' (no strand) all give +1.

    A row whose annotation raises KeyError (e.g. no matching chromosome in
    `genome`) is reported on stdout and skipped; its index is reused for
    the next row.

    Returns (annotDB, nlmsa) with the NLMSA already built.
    """
    annotDB = annotation.AnnotationDB(None,
                                      genome,
                                      verbose=False,
                                      sliceAttrDict=field_locations)
    nlmsa = cnestedlist.NLMSA('tmp_bed',
                              mode='memory',
                              pairwiseMode=True,
                              bidirectional=False)
    index = 0
    for line in bedlines:
        if not line or not line.strip():
            continue
        fields = line.strip().split('\t')
        # Only an explicit '-' marks the reverse strand; BED also allows
        # '.' for "no strand", which must not be read as reverse.
        if len(fields) >= 6 and fields[5] == '-':
            orientation = -1
        else:
            orientation = 1
        try:
            curAnnot = annotDB.new_annotation(index, fields + [orientation])
            nlmsa.addAnnotation(curAnnot)
            index += 1
        except KeyError as e:
            # Report the row's first column (the chromosome/sequence ID);
            # the handler previously referenced an undefined name and
            # crashed instead of skipping the row.
            print('Skipping row without matching chromosome: %s,'
                  'message: %s' % (fields[0], e))
    nlmsa.build()
    return annotDB, nlmsa
Ejemplo n.º 12
0
    def __init__(self,
                 ucsc_genome_name,
                 ens_species=None,
                 ucsc_serverInfo=None,
                 ens_serverInfo=None,
                 ens_db=None,
                 trackVersion='hgFixed.trackVersion'):
        '''Construct interfaces to UCSC/Ensembl annotation databases.
        ucsc_genome_name must be a worldbase ID specifying a UCSC genome;
        the part after its last '.' is used as the UCSC database name.
        ens_species should be the Ensembl database name (generally
        the name of the species).  If not specified, we will try
        to autodetect it based on ucsc_genome_name.
        The interface uses the standard UCSC and Ensembl mysql servers
        by default, unless you provide serverInfo argument(s).  Each
        serverInfo argument may be a server-info object, or a string,
        which is treated as a worldbase ID and loaded from worldbase.
        ens_db, if given, is used as the Ensembl database name as is;
        otherwise the name is obtained from get_ensembl_db_name().
        trackVersion must be the fully qualified MySQL table name
        of the trackVersion table containing information about the
        Ensembl version that each genome dataset connects to.'''
        # Connect to both servers and prepare database names.
        if ucsc_serverInfo is not None:
            if isinstance(ucsc_serverInfo, str):  # treat as worldbase ID
                self.ucsc_server = worldbase(ucsc_serverInfo)
            else:
                self.ucsc_server = ucsc_serverInfo
        else:
            self.ucsc_server = sqlgraph.DBServerInfo(
                host='genome-mysql.cse.ucsc.edu', user='******')
        if ens_serverInfo is not None:
            if isinstance(ens_serverInfo, str):  # treat as worldbase ID
                self.ens_server = worldbase(ens_serverInfo)
            else:
                self.ens_server = ens_serverInfo
        else:
            self.ens_server = sqlgraph.DBServerInfo(
                host='ensembldb.ensembl.org', port=5306, user='******')
        # The UCSC database name is the last dotted component of the ID.
        self.ucsc_db = ucsc_genome_name.split('.')[-1]
        if ens_db is None:  # auto-set ensembl database name
            self.ens_db = self.get_ensembl_db_name(ens_species, trackVersion)
        else:
            self.ens_db = ens_db
        # Connect to all the necessary tables.
        # ensGene keyed by 'name' (one row per key).
        self.ucsc_ensGene_trans = sqlgraph.SQLTable(
            '%s.ensGene' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            primaryKey='name',
            itemClass=UCSCSeqIntervalRow)
        # The same ensGene table keyed by 'name2'; keys may repeat, and
        # min(txStart)/max(txEnd) are exposed through attribute aliases.
        self.ucsc_ensGene_gene = sqlgraph.SQLTable(
            '%s.ensGene' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            primaryKey='name2',
            allowNonUniqueID=True,
            itemClass=UCSCSeqIntervalRow,
            attrAlias=dict(minTxStart='min(txStart)', maxTxEnd='max(txEnd)'))
        # ensGtp keyed by gene (non-unique) ...
        self.ucsc_ensGtp_gene = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                                  serverInfo=self.ucsc_server,
                                                  primaryKey='gene',
                                                  allowNonUniqueID=True)
        # ... and the same ensGtp table keyed by protein.
        self.prot_db = sqlgraph.SQLTable('%s.ensGtp' % self.ucsc_db,
                                         serverInfo=self.ucsc_server,
                                         primaryKey='protein',
                                         itemClass=EnsemblProteinRow)
        # Give the protein table a back-reference to this object.
        self.prot_db.gRes = self
        self.ucsc_ensPep = sqlgraph.SQLTable(
            '%s.ensPep' % self.ucsc_db,
            serverInfo=self.ucsc_server,
            itemClass=sqlgraph.ProteinSQLSequenceCached,
            itemSliceClass=seqdb.SeqDBSlice)
        # Ensembl-side stable-ID tables, both keyed by 'stable_id'.
        self.ens_exon_stable_id = sqlgraph.SQLTable('%s.exon_stable_id' %
                                                    self.ens_db,
                                                    serverInfo=self.ens_server,
                                                    primaryKey='stable_id')
        self.ens_transcript_stable_id = sqlgraph.SQLTable(
            '%s.transcript_stable_id' % self.ens_db,
            serverInfo=self.ens_server,
            primaryKey='stable_id')
        # We will need this too.
        self.genome_seq = worldbase(ucsc_genome_name)
        # Finally, initialise all UCSC-Ensembl databases.
        # Transcript and gene annotations use chrom/txStart/txEnd columns.
        self.trans_db = annotation.AnnotationDB(
            self.ucsc_ensGene_trans,
            self.genome_seq,
            checkFirstID=False,
            sliceAttrDict=dict(id='chrom', start='txStart', stop='txEnd'),
            itemClass=EnsemblTranscriptAnnotationSeq)
        self.gene_db = annotation.AnnotationDB(self.ucsc_ensGene_gene,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(
                                                   id='chrom',
                                                   start='txStart',
                                                   stop='txEnd'))
        # Exon slices come from EnsemblExonOnDemandSliceDB and are read
        # positionally: (id, start, stop, orientation).
        exon_slicedb = EnsemblExonOnDemandSliceDB(self)
        self.exon_db = annotation.AnnotationDB(exon_slicedb,
                                               self.genome_seq,
                                               checkFirstID=False,
                                               sliceAttrDict=dict(
                                                   id=0,
                                                   start=1,
                                                   stop=2,
                                                   orientation=3))
        # Mappings.
        # protein <-> transcript, looked up in the UCSC ensGtp table.
        self.protein_transcript_id_map = sqlgraph.MapView(
            self.prot_db,
            self.trans_db,
            'select transcript from %s.ensGtp \
            where protein=%%s' % self.ucsc_db,
            inverseSQL='select protein \
            from %s.ensGtp where transcript=%%s' % self.ucsc_db,
            serverInfo=self.ucsc_server)
        # gene <-> transcripts, also looked up in the UCSC ensGtp table.
        self.transcripts_in_genes_map = sqlgraph.GraphView(
            self.gene_db,
            self.trans_db,
            "select transcript from %s.ensGtp where gene=%%s" % self.ucsc_db,
            inverseSQL="select gene from %s.ensGtp where transcript=%%s" %
            self.ucsc_db,
            serverInfo=self.ucsc_server)
        # exon -> transcripts on the Ensembl server.  The two views below
        # run the same query and differ only in their source database
        # (exon_db versus the exon stable-ID table).
        self.ens_transcripts_of_exons_map = sqlgraph.GraphView(
            self.exon_db,
            self.trans_db,
            """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_transcripts_of_exons_map2 = sqlgraph.GraphView(
            self.ens_exon_stable_id,
            self.trans_db,
            """\
select trans.stable_id from %s.exon_stable_id exon, \
%s.transcript_stable_id trans, %s.exon_transcript et where \
exon.exon_id=et.exon_id and trans.transcript_id=et.transcript_id and \
exon.stable_id=%%s""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        # transcript -> exons, ordered by et.rank.  Again two views with
        # the same query, differing only in their target database.
        self.ens_exons_in_transcripts_map = sqlgraph.GraphView(
            self.trans_db,
            self.exon_db,
            """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        self.ens_exons_in_transcripts_map2 = sqlgraph.GraphView(
            self.trans_db,
            self.ens_exon_stable_id,
            """\
select exon.stable_id from %s.exon_stable_id exon, %s.transcript_stable_id \
trans, %s.exon_transcript et where exon.exon_id=et.exon_id and \
trans.transcript_id=et.transcript_id and trans.stable_id=%%s order by \
et.rank""" % (self.ens_db, self.ens_db, self.ens_db),
            serverInfo=self.ens_server)
        # Attach the stable-ID based exon mapping to the transcript db.
        self.trans_db.exons_map = self.ens_exons_in_transcripts_map2
Ejemplo n.º 13
0
def main():
    """ Load the given csv file into an sqlite table, saving an
        annotationDB and an NLMSA version of the original file """

    parser = optparse.OptionParser("%prog [options] infile.csv\n"+main.__doc__)
    parser.add_option("--datapath", '-p', dest="datapath", type="string",
                      default='/home/shared/pygrdata/annotations/HUMAN/hg18',
                      help="""Sets the datafile path.  Default=%default""")
    parser.add_option("--table_name", '-t', dest="table_name", type="string",
                      help="""The resource table's name and data stem, e.g.,
                      refGene => datapath/refGene.sqlite """)
    parser.add_option("--genome", '-g', dest="genome_resource", type="string", default='hg18',
                      help="""The pygr resource for the genome, default=%default""")
    parser.add_option("--save_resource", '-r', dest="save_resource", type="string",
                      help="""Where to save the created annotationDB and NLMSA. eg, 
                      Bio.Annotation.HUMAN.hg18.MotifMap.M0001""")
    parser.add_option("--bind_attribute", '-b', dest="bind_attribute", type="string", 
                      help="""The attribute to access annotationDB from genome region, eg, 
                      'officialGenes' would be accessible via triCas3['ChLG2'][100:200].officialGenes 
                      Default is not to bind an attribute to genome""")
    parser.add_option("--slice_attrs", '-s', dest="slice_attrs", type="string",
                      default='dict(id="chromosome", start="start", stop="stop", orientation="orientation")',
                      help="""dictionary providing aliases in csv file for id, start, stop, etc. 
                      default=%default'""")
    parser.add_option("--bed_format", dest="bed_format", action='store_true',
                      help="""csv file is in BED file format, without headers.""")
    opts, args = parser.parse_args()
    if len(args) < 1: 
        parser.print_help()
        print 'Please specify at least one csv file to read'
        sys.exit(-1)
    if None in [opts.save_resource, opts.table_name]:
        parser.print_help()
        print 'Required options: save_resource, table_name'
        sys.exit(-1)
    
    fileIn = open(args[0])
    if not opts.bed_format:
        reader = csv.DictReader(fileIn, delimiter='\t')
    else:
        fileIn = itertools.ifilter(bedCommentFilter, fileIn)
        reader = csv.DictReader(fileIn, delimiter='\t', fieldnames=['chromosome', 'start', 'stop'], restkey='junkData')
    fieldnames = reader.fieldnames
    print fieldnames
    
    print '# Loading genome %s' % opts.genome_resource
    genome = getGenome(opts.genome_resource)
    
    opts.table_name = opts.table_name.replace('.','_')      # SQL interprets . as membership
    tablePath = os.path.join(opts.datapath,opts.table_name + '.sqlite')
    print '# Creating sqlite table for %s at %s' % (opts.table_name, tablePath)
    dataTable = convertBedToSQLite(reader, opts.table_name, fieldNames=fieldnames)
 
 
    
    print '# Making AnnotationDB and NLMSA...'
    annotDB = annotation.AnnotationDB(dataTable, genome, annotationType=opts.table_name+':',
                                      sliceAttrDict=eval(opts.slice_attrs))
    annotDB.__doc__ = 'AnnotationDB for %s on %s' % (opts.table_name, opts.genome_resource)
    
    msaName = os.path.join(opts.datapath, opts.table_name + '_')
    annotMap = makeNLMSA([annotDB], dataPath=msaName)

    print '# Saving results to worldbase as %s and %s...' % (opts.save_resource,
                                                             opts.save_resource+'_db')
    worldbase.add_resource(opts.save_resource, annotMap)
    worldbase.add_resource(opts.save_resource+'_db', annotDB)
    worldbase.commit()