コード例 #1
0
def kmerhomology(k, kmerlist, fastadict, max_attempts=3):
    """Count total and mouse/human-conserved occurrences of selected kmers.

    Every window of length ``k`` in every sequence of ``fastadict`` is checked
    against ``kmerlist``.  For each hit, the corresponding mouse genomic
    interval is queried through ``compara.getSyntenicRegions`` and the
    occurrence is counted as conserved for every returned region that has
    exactly two members whose aligned sequences are identical strings.

    Args:
        k: kmer length (anything ``int()`` accepts).
        kmerlist: kmers of interest.
        fastadict: maps a header to the UTR sequence.  The header is split on
            ``;`` and tabs and must provide, in order: ID, chromosome, start,
            stop, strand.
        max_attempts: number of times one lookup is tried when the database
            raises ``OE``/``mOE`` before that occurrence is given up on.

    Returns:
        ``(homologydict, sqlalchemyfails)``: ``homologydict`` maps
        ``kmer -> [conserved_occurrences, total_occurrences]`` and
        ``sqlalchemyfails`` counts the failed database lookups.
    """
    k = int(k)
    homologydict = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    sqlalchemyfails = 0
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)
    attempts = max(1, int(max_attempts))

    # An empty kmerlist simply yields an empty result instead of an IndexError.
    if kmerlist and k != len(kmerlist[0]):
        sys.stderr.write('Warning! Provided value of k does not match length of given kmer!\n')
    wanted = set(kmerlist)  # constant-time membership test per window
    total_utrs = len(fastadict)

    for utr_number, header in enumerate(fastadict, 1):
        if utr_number % 50 == 0:
            sys.stderr.write('Determining motif conservation in UTR {0} of {1}...\n'.format(utr_number, total_utrs))
        sequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]
        if strand not in ('+', '-'):
            # Without a valid strand the genomic coordinates cannot be derived.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(fields[0], strand))
            continue

        for i in range(len(sequence) - k + 1):
            mousekmer = sequence[i:i + k]
            if mousekmer not in wanted:
                continue

            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Minus strand: windows are counted back from the UTR end.
                # (Original author's note: the last 50 nt and the stop codon
                # were removed from the UTR upstream of this function.)
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            if mousekmer in homologydict:
                homologydict[mousekmer][1] += 1
            else:
                homologydict[mousekmer] = [0, 1]

            # Database timeouts are retried a bounded number of times.  Hits
            # are accumulated locally so that an attempt interrupted half-way
            # does not get counted twice when it is repeated.
            for _attempt in range(attempts):
                conserved = 0
                try:
                    for synt_region in compara.getSyntenicRegions(
                            Species='mouse', CoordName=chrm,
                            Start=mousekmerstart, End=mousekmerstop,
                            Strand=strand, ensembl_coord=True,
                            align_method='PECAN',
                            align_clade='19 amniota vertebrates Pecan'):
                        membs = synt_region.Members
                        if len(membs) != 2:
                            # no aligned human sequence: nothing to compare
                            continue
                        # Original author's note: strand sometimes is not
                        # picked up by compara, so on the minus strand this
                        # may be the reverse complement of the motif.
                        mouseseq = str(membs[0].AlignedSeq)
                        humanseq = str(membs[1].AlignedSeq)
                        if mouseseq == humanseq:
                            conserved += 1
                except (OE, mOE):
                    sys.stderr.write('Genome mysql error!!!\n')
                    sqlalchemyfails += 1
                    continue
                homologydict[mousekmer][0] += conserved
                break

    return homologydict, sqlalchemyfails
コード例 #2
0
def kmerhomologydict(k, fastadict): #for multiple kmers at once, homologydict of every kmer in every UTR sequence
    """Count total and mouse/human-conserved occurrences of every kmer.

    Every window of length ``k`` in every sequence of ``fastadict`` is
    recorded, and its mouse genomic interval is queried through
    ``compara.getSyntenicRegions``.  An occurrence is counted as conserved
    for every returned region that has exactly two members whose aligned
    sequences are identical strings.

    Args:
        k: kmer length (anything ``int()`` accepts).
        fastadict: maps a header to the UTR sequence.  The header is split on
            ``;`` and tabs and must provide, in order: ID, chromosome, start,
            stop, strand.

    Returns:
        ``(homologydict, sqlalchemyfails)``: ``homologydict`` maps
        ``kmer -> [conserved_occurrences, total_occurrences]``.
        ``sqlalchemyfails`` is never incremented in this function, so it is
        always 0.
    """
    k = int(k)
    homologydict = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    analyzedUTRs = 0
    sqlalchemyfails = 0
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)
    total_utrs = len(fastadict)

    for utr_number, header in enumerate(fastadict, 1):
        sequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        utr_id = fields[0]
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]
        sys.stderr.write('Determining motif conservation in UTR {0}, number {1} of {2}, (interrogated {3} so far)...\n'.format(utr_id, utr_number, total_utrs, analyzedUTRs))
        if strand not in ('+', '-'):
            # Without a valid strand the genomic coordinates cannot be derived.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(utr_id, strand))
            continue

        utr_counted = False  # analyzedUTRs must grow at most once per UTR
        for i in range(len(sequence) - k + 1):
            mousekmer = sequence[i:i + k]
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Minus strand: windows are counted back from the UTR end.
                # (Original author's note: the last 50 nt and the stop codon
                # were removed from the UTR upstream of this function.)
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            if mousekmer in homologydict:
                homologydict[mousekmer][1] += 1
            else:
                homologydict[mousekmer] = [0, 1]

            for synt_region in compara.getSyntenicRegions(
                    Species='mouse', CoordName=chrm,
                    Start=mousekmerstart, End=mousekmerstop,
                    Strand=strand, ensembl_coord=True,
                    align_method='PECAN',
                    align_clade='19 amniota vertebrates Pecan'):
                # The first window (i == 0) is the one touching the UTR
                # boundary; the UTR counts as analyzed as soon as that window
                # returns a region, however many regions it returns.
                if i == 0 and not utr_counted:
                    analyzedUTRs += 1
                    utr_counted = True
                membs = synt_region.Members
                if len(membs) != 2:
                    # no aligned human sequence: nothing to compare
                    continue
                # Original author's note: strand sometimes is not picked up
                # by compara, so on the minus strand this may be the reverse
                # complement of the motif.
                mouseseq = str(membs[0].AlignedSeq)
                humanseq = str(membs[1].AlignedSeq)
                if mouseseq == humanseq:
                    homologydict[mousekmer][0] += 1

    sys.stderr.write('Analyzed {0} of {1} UTRs.\n'.format(analyzedUTRs, total_utrs))

    return homologydict, sqlalchemyfails
コード例 #3
0
def test():
    """Request a syntenic alignment against 'cow' for two fixed locations.

    Relies on the module-level ``species``, ``account`` and ``release``
    settings to open the Compara connection.  Returns None.
    """
    compara_db = Compara(species, account=account, Release=release)
    # (chromosome, start, end, strand) tuples handed to getSyntenicAlignment
    sample_locations = (
        ('15', 85273455, 85273507, '+'),
        ('18', 12423976, 12424060, '+'),
    )
    for location in sample_locations:
        getSyntenicAlignment(compara_db, 'cow', location)
    return None
コード例 #4
0
def getsynteny(gcoords):
    """Collect mouse/human alignment matches for the coordinates of each gene.

    For every gene one ``compara.getSyntenicRegions`` query is made that
    spans ``min(coords)`` to ``max(coords)``; the aligned mouse and human
    sequences are passed through ``checkstrand`` and, when both come back
    non-empty, through ``checkifmatch``.

    Args:
        gcoords: maps gene -> list whose first three items are the
            chromosome name, the strand and the coordinates.

    Returns:
        dict mapping gene -> the *same* list object taken from ``gcoords``
        with the result of ``checkifmatch`` appended (once per usable
        syntenic region).  Genes without coordinates or without a usable
        region are absent.  Note that the entries of ``gcoords`` are
        therefore modified in place.
    """
    # Single-argument print(...) behaves identically under Python 2 and 3.
    print('Looking for synteny...')
    compara = Compara(['mouse', 'human'], Release=83, account=None)
    gcoords_matches = {}  # {gene: [chrm, strand, [coords], [matches]]}
    total_genes = len(gcoords)

    for genecounter, gene in enumerate(gcoords, 1):
        if genecounter % 100 == 0:
            print('Analyzing gene {0} of {1}...'.format(genecounter, total_genes))
        info = gcoords[gene]
        chrm = info[0].replace('chr', '')  # ensembl style chromosome names
        # Original author's note: strand is not used by
        # compara.getSyntenicRegions (it is passed along regardless).
        strand = info[1]
        coords = info[2]
        if not coords:
            # no coordinates recorded for this gene
            continue

        # One query for the whole span keeps the number of database calls
        # down; the individual positions are picked out by checkifmatch.
        region_start = min(coords)
        region_end = max(coords)
        for synt_region in compara.getSyntenicRegions(
                Species='mouse', CoordName=chrm,
                Start=region_start, End=region_end,
                Strand=strand, ensembl_coord=True,
                align_method='PECAN',
                align_clade='23 amniota vertebrates Pecan'):
            membs = synt_region.Members
            if len(membs) != 2:
                # Original author's note: unclear how this happens; usually
                # a missing human alignment yields no region at all.
                continue
            mouseseq = str(membs[0].AlignedSeq)
            humanseq = str(membs[1].AlignedSeq)
            # NOTE(review): seq_dict is not defined in this function; it is
            # presumably a module-level genome sequence lookup - confirm.
            mouseseq, humanseq = checkstrand(mouseseq, humanseq, 'chr' + chrm, coords, seq_dict)
            if mouseseq and humanseq:
                matches = checkifmatch(mouseseq, humanseq, coords)
                info.append(matches)
                # Not every gene gets this far; some yield no region.
                gcoords_matches[gene] = info
            else:
                print('Sequence problems with {0}.  Alignment and genome sequence did not match.'.format(gene))

    return gcoords_matches
コード例 #5
0
def get_host_genes(df, ref='cow'):
    """Get all genes containing the given miRNAs using ensembl.

    Args:
        df: DataFrame with a ``'#miRNA'`` column and a
            ``'precursor coordinate'`` column formatted as
            ``chrom:start..end:strand``.
        ref: species name handed to ``get_genes_from_location``.

    Returns:
        DataFrame with the columns ``#miRNA, gene, location, biotype, tu,
        ensid`` holding one row per overlapping gene whose BioType is not
        ``'miRNA'``, or None when there is no such gene.
    """

    results = []
    for _, row in df.iterrows():
        name = row['#miRNA']
        chrom, locs, strand = row['precursor coordinate'].split(':')
        # Convert once so that the location query and findinGene both
        # receive integers rather than the raw coordinate strings.
        start, end = (int(pos) for pos in locs.split('..'))
        coords = (chrom, start, end, strand)
        for gene in get_genes_from_location(ref, coords):
            if gene.BioType == 'miRNA':
                # the miRNA's own annotation is not a host gene
                continue
            tu = findinGene(gene, start, end)
            results.append(
                (name, gene.Symbol, gene.Location, gene.BioType, tu,
                 gene.StableId))
    if not results:
        return None
    return pd.DataFrame(
        results,
        columns=['#miRNA', 'gene', 'location', 'biotype', 'tu', 'ensid'])
コード例 #6
0
def get_mirna_orthologs(df, comp=None, ref='cow'):
    """Get all possible orthologs/conservation for miRNAs using ensembl.

    For every row of ``df`` a syntenic alignment is requested for the
    precursor coordinate.  When an alignment is returned, a table is built
    from it with ``base.cogentAlignment2DataFrame`` and annotated with
    identity, seed conservation, overlapping gene details, location and
    RNAfold energy.  When no alignment is returned, the row falls back to a
    host-gene lookup.  All per-row tables are concatenated and written to
    ``novel_orthologs.csv`` in the current working directory.

    Args:
        df: DataFrame with the columns ``'#miRNA'``,
            ``'consensus mature sequence'``, ``'seed'`` and
            ``'precursor coordinate'`` (``chrom:start..end:strand``).
        comp: Compara instance to query; one is created for release 79 when
            omitted.
        ref: reference species name passed to ``getSyntenicAlignment``.

    Returns:
        None.  Nothing is written when no row produced a result.
    """

    if comp is None:
        comp = Compara(species, account=None, Release='79')
    results = []
    for _, r in df.iterrows():
        mature = r['consensus mature sequence'].replace('u', 't').upper()
        seed = r['seed'].replace('u', 't').upper()
        c, locs, strand = r['precursor coordinate'].split(':')
        start, end = locs.split('..')
        print(r['#miRNA'], seed, mature, locs, strand)
        coords = (c, int(start), int(end), strand)
        regions, aln = getSyntenicAlignment(comp,
                                            ref,
                                            coords,
                                            fname=r['#miRNA'] + '.aln.fa')
        if aln is None:
            # No alignment: fall back to the host genes of this one miRNA.
            x = pd.DataFrame([r])
            # NOTE(review): only get_host_genes is visible in this file;
            # confirm that getHostGenes is defined or aliased elsewhere.
            a = getHostGenes(x)
            if a is not None:
                results.append(a)
            continue
        region = regions[0]
        a = base.cogentAlignment2DataFrame(aln.degap())
        a['#miRNA'] = r['#miRNA']
        a['ident'] = getIdentities(aln)
        print('max identity: %s' % a.ident.max())
        a['seedcons'] = getSeqConservation(aln, seed)
        orthgenes = getGenesinRegion(region)

        # Each entry of orthgenes is a (possibly empty) sequence of genes;
        # only the first gene is reported, NaN when there is none.
        a['gene'] = [g[0].Symbol if len(g) > 0 else np.nan for g in orthgenes]
        a['gene_loc'] = [
            g[0].Location if len(g) > 0 else np.nan for g in orthgenes
        ]
        locations = getLocations(region)
        # keep everything after the second ':' of the location string
        a['location'] = [':'.join(str(l).split(':')[2:]) for l in locations]
        a['biotype'] = [
            g[0].BioType if len(g) > 0 else np.nan for g in orthgenes
        ]
        a['ensid'] = [
            g[0].StableId if len(g) > 0 else np.nan for g in orthgenes
        ]
        # find where in gene the miRNA is, usually introns
        trpts = []
        for l, g in zip(locations, orthgenes):
            if len(g) == 0:
                trpts.append(np.nan)
            else:
                trpts.append(findinGene(g[0], l.Start, l.End))
        a['tu'] = trpts

        # get RNAfold energy for each sequence
        a['energy'] = a.apply(lambda x: base.RNAfold(x.seq)[1], axis=1)
        results.append(a)
        print(a)
        print('--------------------------------------------------------')
    if not results:
        # pd.concat raises ValueError on an empty list
        print('no orthologs or host genes found, nothing written')
        return
    results = pd.concat(results).reset_index(drop=True)
    results.to_csv('novel_orthologs.csv')
    return
コード例 #7
0
def ComparaLiftover(gff):
    """Map mouse 3'UTRs onto rat, human, cow and dog via Compara alignments.

    A gffutils database named ``<basename of gff>.db`` is created in the
    current working directory if that file does not exist yet, and is
    removed again before returning.  Every ``3'UTR`` feature is trimmed (3 nt
    at the stop-codon end, 50 nt at the far end) and queried with
    ``compara.getSyntenicRegions``; each aligned member region of one of the
    four target species becomes one GFF-like record.

    Args:
        gff: path to the mouse GFF file.

    Returns:
        ``(ratgff, humangff, cowgff, doggff)``; each is a list of
        ``[chrm, 'ALE', "3'UTR", start, stop, '.', strand, '.', ID]`` records
        where ``ID`` is ``<UTR id>_<species>``.
    """
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'rat', 'human', 'cow', 'dog'],
                      Release=61,
                      account=account)
    ratgff = []
    humangff = []
    cowgff = []
    doggff = []
    # species label found in the location string -> list collecting its records
    records_by_species = {
        'Rattus_norvegicus': ratgff,
        'Homo_sapiens': humangff,
        'Bos_taurus': cowgff,
        'Canis_familiaris': doggff,
    }

    # Make gff database
    db_fn = os.path.basename(gff) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff, db_fn)
    db = gffutils.FeatureDB(db_fn)

    for UTR in db.features_of_type('3\'UTR'):
        # Remove stop codons and last 50 nt of UTR
        if UTR.strand == '+':
            UTRstart = int(UTR.start) + 3
            UTRstop = int(UTR.stop) - 50
        elif UTR.strand == '-':
            UTRstart = int(UTR.start) + 50
            UTRstop = int(UTR.stop) - 3
        else:
            # e.g. '.' for unstranded features: the trimmed interval is
            # undefined, so do not reuse the previous UTR's coordinates.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(UTR.id, UTR.strand))
            continue

        for synt_region in compara.getSyntenicRegions(
                Species='mouse',
                CoordName=UTR.chrom.replace('chr', ''),
                Start=UTRstart,
                End=UTRstop,
                Strand=UTR.strand,
                ensembl_coord=True,
                align_method='PECAN',
                align_clade='19 amniota vertebrates Pecan'):
            for region in synt_region.Members:
                if not region.Region:
                    continue
                # Turn the first '-' into ':' and spaces into '_' so the
                # location string splits on ':' into fields; fields 0, 2, 3,
                # 4 and 5 are read as species, chromosome, start, stop and
                # strand.
                locdata = str(region.Region.Location).replace(
                    '-', ':', 1).replace(' ', '_').split(':')
                species = str(locdata[0])
                chrm = 'chr' + str(locdata[2])
                start = locdata[3]
                stop = locdata[4]
                aligned_strand = str(locdata[5])
                if aligned_strand not in ('1', '-1'):
                    # No usable orientation; never fall back to the strand
                    # computed for a previous member.
                    continue
                # '+' when the aligned region has the same orientation as
                # the mouse UTR, '-' otherwise.
                if (aligned_strand == '1') == (UTR.strand == '+'):
                    strand = '+'
                else:
                    strand = '-'
                ID = (str(UTR.id) + '_' + species)
                target = records_by_species.get(species)
                if target is not None:
                    target.append([
                        chrm, 'ALE', '3\'UTR', start, stop, '.', strand,
                        '.', ID
                    ])

    os.remove(db_fn)
    sys.stderr.write(
        'Succesfully found matches in {0} rat regions, {1} human regions, {2} cow regions and {3} dog regions.\n'
        .format(len(ratgff), len(humangff), len(cowgff), len(doggff)))
    return ratgff, humangff, cowgff, doggff
コード例 #8
0
ファイル: ensembl_pycogent.py プロジェクト: ajm/glutton
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False):
    """Download protein-coding genes of a species grouped with their paralogs.

    Every protein-coding gene returned by the genome is visited once; genes
    already collected as a paralog of an earlier gene are skipped.  For each
    remaining gene the within-species paralogs are requested from Compara
    and the canonical transcript sequence of every member is stored.

    Args:
        species: species name understood by pycogent (it is registered
            manually when pycogent does not know it).
        release: Ensembl release, completed by ``get_missing_info``.
        database_name: database family name handed to ``get_missing_info``.
        nucleotide: store the CDS when true, the protein sequence otherwise.

    Returns:
        list of families; each family is a list of
        ``(stable_id, sequence_string)`` tuples.
    """
    log = get_log()

    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1,5,3):
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(db_details['hostname'],
                          db_details['username'],
                          db_details['password'],
                          port=db_details['port'])

    # getSpeciesName returns the *string* "None" for unknown species
    species_is_unknown = Species.getSpeciesName(species) == 'None'
    if species_is_unknown:
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # Workaround kept from the original author: several compara databases
    # are found and pycogent connects to the first one (reported to always
    # be compara_bacteria), so the connection inside the Compara object is
    # swapped for one pointing at the intended database.
    if db_name not in ('ensembl', 'bacteria'):
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    seen_ids = set()
    families = []
    sequence_kind = "CDS" if nucleotide else "protein"

    def show_progress():
        # carriage return keeps the counter on a single terminal line
        stderr.write("\r[downloading %s] got %d sequences " % (sequence_kind, len(seen_ids)))

    def sequence_of(entity):
        # canonical transcript as CDS or protein text, depending on the mode
        transcript = entity.CanonicalTranscript
        if nucleotide:
            return str(transcript.Cds)
        return str(transcript.ProteinSeq)

    show_progress()

    for gene in genome.getGenesMatching(BioType='protein_coding'):
        stableid = gene.StableId

        # already collected as a member of an earlier gene family
        if stableid in seen_ids:
            continue
        seen_ids.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')
        family = []

        if paralogs is None:
            # no paralogs: the gene forms a family on its own
            show_progress()
            family.append((stableid, sequence_of(gene)))
        else:
            for member in paralogs.Members:
                member_id = member.StableId
                seen_ids.add(member_id)
                show_progress()

                try:
                    family.append((member_id, sequence_of(member)))
                except AttributeError:
                    log.fatal("pycogent did not find a canonical transcript for %s" % member_id)
                    exit(1)

        families.append(family)

    stderr.write("\r[downloading %s] got %d sequences\n" % (sequence_kind, len(seen_ids)))

    return families