def kmerhomology(k, kmerlist, fastadict):
    """Count total and mouse/human-conserved occurrences of selected kmers.

    Every window of length ``k`` in every UTR sequence is examined.  Windows
    whose sequence is one of ``kmerlist`` are counted, and their genomic
    interval is looked up in the Ensembl Compara PECAN alignment.  An
    occurrence is counted as conserved once for every syntenic region that
    has exactly two members whose aligned sequences are identical strings.

    Args:
        k: kmer length (anything ``int()`` accepts).
        kmerlist: kmers of interest.
        fastadict: mapping of UTR header -> UTR sequence.  The header is
            split on ';' and tab; fields 1-4 are read as chromosome, start,
            stop and strand.

    Returns:
        ``(homologydict, sqlalchemyfails)`` where ``homologydict`` maps
        kmer -> ``[conserved_occurences, total_occurences]`` and
        ``sqlalchemyfails`` is the number of failed database queries.

    Notes:
        A failed query is retried up to ``max_attempts`` times.  Conserved
        counts are only committed when a query completes, so a retry cannot
        double count regions seen before the failure.
    """
    k = int(k)
    homologydict = {}  # {kmer: [conserved_occurences, total_occurences]}
    sqlalchemyfails = 0
    max_attempts = 3  # bounded, so a dead server cannot hang the run forever
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)

    # Guard against an empty kmerlist before peeking at its first element.
    if kmerlist and k != len(kmerlist[0]):
        sys.stderr.write('Warning! Provided value of k does not match length of given kmer!')

    # Membership is tested once per window; build the set a single time.
    wanted = set(kmerlist)

    for UTRcounter, (header, UTRsequence) in enumerate(fastadict.items(), 1):
        if UTRcounter % 50 == 0:
            sys.stderr.write('Determining motif conservation in UTR {0} of {1}...\n'.format(UTRcounter, len(fastadict)))

        fields = header.replace(';', '\t').split('\t')
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]
        if strand not in ('+', '-'):
            # Without a usable strand the window coordinates are undefined.
            sys.stderr.write('Skipping UTR {0}: unrecognized strand {1!r}\n'.format(fields[0], strand))
            continue

        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if mousekmer not in wanted:
                continue

            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = mousekmerstart + k - 1
            else:
                # Minus strand: counting back from the end of the interval.
                # (Original author's note: the last 50 nt and the stop codon
                # have already been removed from these UTRs.)
                mousekmerstop = stop - i
                mousekmerstart = mousekmerstop - k + 1

            if mousekmer in homologydict:
                homologydict[mousekmer][1] += 1
            else:
                homologydict[mousekmer] = [0, 1]

            for _attempt in range(max_attempts):
                conserved = 0
                try:
                    for synt_region in compara.getSyntenicRegions(
                            Species='mouse', CoordName=chrm,
                            Start=mousekmerstart, End=mousekmerstop,
                            Strand=strand, ensembl_coord=True,
                            align_method='PECAN',
                            align_clade='19 amniota vertebrates Pecan'):
                        membs = synt_region.Members
                        if len(membs) != 2:
                            # No aligned human sequence: nothing to compare.
                            continue
                        # Original author's note: strand sometimes seems to
                        # not get picked up by compara.  If on the minus
                        # strand, this seq may be the rev comp of the motif.
                        if str(membs[0].AlignedSeq) == str(membs[1].AlignedSeq):
                            conserved += 1
                except (OE, mOE):
                    sys.stderr.write('Genome mysql error!!!')
                    sqlalchemyfails += 1
                    continue  # retry the same window
                homologydict[mousekmer][0] += conserved
                break

    return homologydict, sqlalchemyfails
def kmerhomologydict(k, fastadict):
    """Count total and mouse/human-conserved occurrences of every kmer.

    Like a multi-kmer version of the single-list variant: every window of
    length ``k`` in every UTR sequence is counted and looked up in the
    Ensembl Compara PECAN alignment.  An occurrence is counted as conserved
    once for every syntenic region that has exactly two members whose
    aligned sequences are identical strings.

    Args:
        k: kmer length (anything ``int()`` accepts).
        fastadict: mapping of UTR header -> UTR sequence.  The header is
            split on ';' and tab; fields 0-4 are read as ID, chromosome,
            start, stop and strand.

    Returns:
        ``(homology, sqlalchemyfails)`` where ``homology`` maps
        kmer -> ``[conserved_occurences, total_occurences]``.
        ``sqlalchemyfails`` is never incremented in this function and is
        always 0; it is returned to keep the same shape as the sibling
        function.
    """
    k = int(k)
    homology = {}  # {kmer: [conserved_occurences, total_occurences]}
    analyzedUTRs = 0
    sqlalchemyfails = 0
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)

    for UTRcounter, (header, UTRsequence) in enumerate(fastadict.items(), 1):
        fields = header.replace(';', '\t').split('\t')
        ID = fields[0]
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]

        # Progress is reported for every UTR.
        sys.stderr.write('Determining motif conservation in UTR {0}, number {1} of {2}, (interrogated {3} so far)...\n'.format(ID, UTRcounter, len(fastadict), analyzedUTRs))

        if strand not in ('+', '-'):
            # Without a usable strand the window coordinates are undefined.
            sys.stderr.write('Skipping UTR {0}: unrecognized strand {1!r}\n'.format(ID, strand))
            continue

        utr_counted = False  # ensures analyzedUTRs grows at most once per UTR
        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = mousekmerstart + k - 1
            else:
                # Minus strand: counting back from the end of the interval.
                # (Original author's note: the last 50 nt and the stop codon
                # have already been removed from these UTRs.)
                mousekmerstop = stop - i
                mousekmerstart = mousekmerstop - k + 1

            if mousekmer in homology:
                homology[mousekmer][1] += 1
            else:
                homology[mousekmer] = [0, 1]

            for synt_region in compara.getSyntenicRegions(
                    Species='mouse', CoordName=chrm,
                    Start=mousekmerstart, End=mousekmerstop,
                    Strand=strand, ensembl_coord=True,
                    align_method='PECAN',
                    align_clade='19 amniota vertebrates Pecan'):
                # The first window (i == 0) is the one touching `start` on
                # '+' or `stop` on '-'.  A UTR counts as analyzed when that
                # window yields at least one syntenic region; the flag stops
                # several regions from counting the same UTR repeatedly.
                if i == 0 and not utr_counted:
                    analyzedUTRs += 1
                    utr_counted = True

                membs = synt_region.Members
                if len(membs) != 2:
                    # No aligned human sequence: nothing to compare.
                    continue
                # Original author's note: strand sometimes seems to not get
                # picked up by compara.  If on the minus strand, this seq may
                # be the rev comp of the motif.
                if str(membs[0].AlignedSeq) == str(membs[1].AlignedSeq):
                    homology[mousekmer][0] += 1

    sys.stderr.write('Analyzed {0} of {1} UTRs.\n'.format(analyzedUTRs, len(fastadict)))
    return homology, sqlalchemyfails
def test():
    """Smoke-test syntenic alignment retrieval for two fixed cow regions.

    Relies on the module-level ``species``, ``account`` and ``release``
    settings and on ``getSyntenicAlignment``; returns nothing.
    """
    comp = Compara(species, account=account, Release=release)
    # (chromosome, start, end, strand) tuples queried against 'cow'.
    for region in (
        ('15', 85273455, 85273507, '+'),
        ('18', 12423976, 12424060, '+'),
    ):
        getSyntenicAlignment(comp, 'cow', region)
def getsynteny(gcoords):
    """Collect mouse/human alignment matches for each gene's coordinates.

    For every gene one Compara query is made spanning ``min(coords)`` to
    ``max(coords)`` to minimise the number of calls; the per-coordinate
    results are parsed out of that alignment by ``checkifmatch``.

    Args:
        gcoords: mapping gene -> ``[chrm, strand, [coords]]``.  Entries are
            not modified; the result holds extended copies.

    Returns:
        dict gene -> ``[chrm, strand, [coords], matches, ...]`` with one
        ``matches`` element appended per usable syntenic region.  Genes
        with no coordinates, or for which no region passed the sequence
        check, are absent.
    """
    print('Looking for synteny...')
    compara = Compara(['mouse', 'human'], Release=83, account=None)
    gcoords_matches = {}

    for genecounter, gene in enumerate(gcoords, 1):
        if genecounter % 100 == 0:
            print('Analyzing gene {0} of {1}...'.format(genecounter, len(gcoords)))

        info = gcoords[gene]
        chrm = info[0].replace('chr', '')  # ensembl style chromosome names
        # Original author's note: strand is not used by
        # compara.getSyntenicRegions.
        strand = info[1]
        coords = info[2]
        if not coords:
            # There were no coordinates for this gene.
            continue

        # Work on a copy so the caller's gcoords entry is left untouched and
        # repeated calls do not pile up matches on the same list.
        entry = list(info)

        for synt_region in compara.getSyntenicRegions(
                Species='mouse', CoordName=chrm,
                Start=min(coords), End=max(coords),
                Strand=strand, ensembl_coord=True,
                align_method='PECAN',
                align_clade='23 amniota vertebrates Pecan'):
            membs = synt_region.Members
            if len(membs) != 2:
                # Original author's note: not really sure how this can
                # happen; usually a missing human alignment yields None.
                continue

            mouseseq = str(membs[0].AlignedSeq)
            humanseq = str(membs[1].AlignedSeq)
            # seq_dict is a module-level name, not a parameter.
            mouseseq, humanseq = checkstrand(mouseseq, humanseq, 'chr' + chrm, coords, seq_dict)
            if mouseseq and humanseq:
                entry.append(checkifmatch(mouseseq, humanseq, coords))
                # Not every gene makes it this far; some appear to have no
                # syntenic regions.
                gcoords_matches[gene] = entry
            else:
                print('Sequence problems with {0}. Alignment and genome sequence did not match.'.format(gene))

    return gcoords_matches
def get_host_genes(df, ref='cow'):
    """Get all genes containing the given miRNAs using ensembl.

    Args:
        df: DataFrame with a '#miRNA' column and a 'precursor coordinate'
            column formatted as ``chrom:start..end:strand``.
        ref: reference species passed to ``get_genes_from_location``.

    Returns:
        DataFrame with columns '#miRNA', 'gene', 'location', 'biotype',
        'tu' and 'ensid', one row per overlapping non-miRNA gene, or
        ``None`` when no such gene was found.
    """
    # NOTE(review): `comp` is not used below; the construction is kept in
    # case it has side effects callers rely on -- confirm and remove.
    comp = Compara(species, account=None, Release='79')

    results = []
    for _, r in df.iterrows():
        name = r['#miRNA']
        c, locs, strand = r['precursor coordinate'].split(':')
        start, end = locs.split('..')
        # Parse once and use numeric coordinates everywhere, including the
        # findinGene call, which previously received the raw strings.
        start = int(start)
        end = int(end)
        coords = (c, start, end, strand)

        for g in get_genes_from_location(ref, coords):
            if g.BioType == 'miRNA':
                continue
            tu = findinGene(g, start, end)
            results.append(
                (name, g.Symbol, g.Location, g.BioType, tu, g.StableId))

    if not results:
        return None
    return pd.DataFrame(
        results,
        columns=['#miRNA', 'gene', 'location', 'biotype', 'tu', 'ensid'])
def get_mirna_orthologs(df, comp=None, ref='cow'):
    """Get all possible orthologs/conservation for miRNAs using ensembl.

    For each row the syntenic alignment of the precursor region is fetched
    and turned into a table annotated with identity, seed conservation,
    overlapping gene information and RNAfold energy.  Rows without an
    alignment fall back to a host-gene lookup.  The concatenated result is
    written to 'novel_orthologs.csv'.

    Args:
        df: DataFrame with '#miRNA', 'consensus mature sequence', 'seed'
            and 'precursor coordinate' (``chrom:start..end:strand``).
        comp: Compara instance; one is created from the module-level
            ``species`` when omitted.
        ref: reference species name.

    Returns:
        None.  When nothing was collected no file is written.
    """
    if comp is None:
        comp = Compara(species, account=None, Release='79')

    results = []
    for _, r in df.iterrows():
        mature = r['consensus mature sequence'].replace('u', 't').upper()
        seed = r['seed'].replace('u', 't').upper()
        c, locs, strand = r['precursor coordinate'].split(':')
        start, end = locs.split('..')
        print(r['#miRNA'], seed, mature, locs, strand)
        coords = (c, int(start), int(end), strand)

        regions, aln = getSyntenicAlignment(comp, ref, coords,
                                            fname=r['#miRNA'] + '.aln.fa')
        # Identity test: `== None` on an alignment object may dispatch to a
        # custom __eq__.
        if aln is None:
            # NOTE(review): getHostGenes is not defined in the visible code;
            # it may be a stale name for get_host_genes -- confirm.
            a = getHostGenes(pd.DataFrame([r]))
            if a is not None:
                results.append(a)
            continue

        region = regions[0]
        a = base.cogentAlignment2DataFrame(aln.degap())
        a['#miRNA'] = r['#miRNA']
        a['ident'] = getIdentities(aln)
        print('max identity: %s' % a.ident.max())
        a['seedcons'] = getSeqConservation(aln, seed)

        orthgenes = getGenesinRegion(region)
        a['gene'] = [g[0].Symbol if len(g) > 0 else np.nan for g in orthgenes]
        a['gene_loc'] = [
            g[0].Location if len(g) > 0 else np.nan for g in orthgenes
        ]
        region_locs = getLocations(region)
        a['location'] = [
            ':'.join(str(loc).split(':')[2:]) for loc in region_locs
        ]
        a['biotype'] = [
            g[0].BioType if len(g) > 0 else np.nan for g in orthgenes
        ]
        a['ensid'] = [
            g[0].StableId if len(g) > 0 else np.nan for g in orthgenes
        ]

        # Find where in the gene the miRNA is (original note: usually
        # introns).
        trpts = []
        for loc, g in zip(region_locs, orthgenes):
            if len(g) == 0:
                trpts.append(np.nan)
            else:
                trpts.append(findinGene(g[0], loc.Start, loc.End))
        a['tu'] = trpts

        # RNAfold energy for each sequence.
        a['energy'] = a.apply(lambda x: base.RNAfold(x.seq)[1], axis=1)
        results.append(a)
        print(a)
        print('--------------------------------------------------------')

    if not results:
        # pd.concat raises ValueError on an empty list.
        print('no orthologs or host genes found')
        return

    results = pd.concat(results).reset_index(drop=True)
    results.to_csv('novel_orthologs.csv')
    return
def ComparaLiftover(gff):
    """Lift mouse 3'UTRs over to rat, human, cow and dog via Compara.

    A temporary gffutils database named ``<basename of gff>.db`` is used in
    the current directory (created if absent) and removed before returning,
    including when an error occurs.  Each 3'UTR is trimmed by 3 nt at one
    end and 50 nt at the other, depending on strand, before its syntenic
    regions are queried.

    Args:
        gff: path to a GFF file containing "3'UTR" features on mouse
            coordinates.

    Returns:
        ``(ratgff, humangff, cowgff, doggff)``; each is a list of
        GFF-like rows
        ``[chrm, 'ALE', "3'UTR", start, stop, '.', strand, '.', ID]``.
    """
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'rat', 'human', 'cow', 'dog'], Release=61,
                      account=account)

    # Rows are bucketed by the species name parsed from the region location.
    hits = {
        'Rattus_norvegicus': [],
        'Homo_sapiens': [],
        'Bos_taurus': [],
        'Canis_familiaris': [],
    }

    db_fn = os.path.basename(gff) + '.db'
    try:
        if not os.path.isfile(db_fn):
            gffutils.create_db(gff, db_fn)
        db = gffutils.FeatureDB(db_fn)

        for UTR in db.features_of_type('3\'UTR'):
            # Remove stop codons and last 50 nt of UTR.
            if UTR.strand == '+':
                UTRstart = int(UTR.start) + 3
                UTRstop = int(UTR.stop) - 50
            elif UTR.strand == '-':
                UTRstart = int(UTR.start) + 50
                UTRstop = int(UTR.stop) - 3
            else:
                # Without a strand the trimmed interval is undefined.
                continue

            for synt_region in compara.getSyntenicRegions(
                    Species='mouse',
                    CoordName=UTR.chrom.replace('chr', ''),
                    Start=UTRstart, End=UTRstop, Strand=UTR.strand,
                    ensembl_coord=True, align_method='PECAN',
                    align_clade='19 amniota vertebrates Pecan'):
                for region in synt_region.Members:
                    if not region.Region:
                        continue

                    # Only the first '-' (the start-end separator) becomes
                    # ':' so a trailing "-1" orientation survives the split.
                    locdata = str(region.Region.Location).replace(
                        '-', ':', 1).replace(' ', '_').split(':')
                    species = str(locdata[0])
                    if species not in hits:
                        continue

                    orientation = str(locdata[5])
                    if orientation not in ('1', '-1'):
                        # Unknown orientation: no strand can be assigned.
                        continue
                    # Same orientation as the mouse UTR -> '+', else '-'.
                    same = (orientation == '1') == (UTR.strand == '+')
                    strand = '+' if same else '-'

                    hits[species].append([
                        'chr' + str(locdata[2]), 'ALE', '3\'UTR',
                        locdata[3], locdata[4], '.', strand, '.',
                        str(UTR.id) + '_' + species,
                    ])
    finally:
        # Always clean up the temporary database, even on failure.
        if os.path.isfile(db_fn):
            os.remove(db_fn)

    ratgff = hits['Rattus_norvegicus']
    humangff = hits['Homo_sapiens']
    cowgff = hits['Bos_taurus']
    doggff = hits['Canis_familiaris']
    sys.stderr.write(
        'Succesfully found matches in {0} rat regions, {1} human regions, {2} cow regions and {3} dog regions.\n'
        .format(len(ratgff), len(humangff), len(cowgff), len(doggff)))
    return ratgff, humangff, cowgff, doggff
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False):
    """Download protein-coding gene families for a species via pycogent.

    Each protein-coding gene not already seen is grouped with its
    within-species paralogs as reported by Compara; a gene without paralogs
    forms a family of its own.

    Args:
        species: species name understood by ``get_missing_info``.
        release: Ensembl release, resolved by ``get_missing_info``.
        database_name: database group name, resolved by
            ``get_missing_info``.
        nucleotide: when true collect CDS sequences, otherwise protein
            sequences.

    Returns:
        list of families; each family is a list of
        ``(stable_id, sequence_string)`` tuples.

    Exits the process with status 1 when a paralog has no usable canonical
    transcript.
    """
    log = get_log()

    # Imported here, inside the function, as in the original.
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1, 5, 3):
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(
        db_details['hostname'],
        db_details['username'],
        db_details['password'],
        port=db_details['port'])

    # Not an error: getSpeciesName returns the *string* "None".
    if Species.getSpeciesName(species) == 'None':
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # HACK (original author's warning: "DON'T TRY THIS AT HOME!").
    # pycogent searches for compara databases, finds more than one and
    # connects to the first, which is always compara_bacteria.  The
    # workaround reaches into the Compara object's internals and swaps in a
    # connection to the correct database.
    if db_name not in ('ensembl', 'bacteria'):
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")
        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    genes = set()
    families = []
    seq_kind = "CDS" if nucleotide else "protein"

    def show_progress():
        # Carriage return keeps the counter on a single terminal line.
        stderr.write("\r[downloading %s] got %d sequences " % (seq_kind, len(genes)))

    def sequence_of(member):
        transcript = member.CanonicalTranscript
        return str(transcript.Cds) if nucleotide else str(transcript.ProteinSeq)

    show_progress()

    for gene in genome.getGenesMatching(BioType='protein_coding'):
        stableid = gene.StableId

        # Ignore genes already seen as members of other gene families.
        if stableid in genes:
            continue
        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')
        current = []

        if paralogs is not None:
            for paralog in paralogs.Members:
                paralogid = paralog.StableId
                genes.add(paralogid)
                show_progress()
                try:
                    current.append((paralogid, sequence_of(paralog)))
                except AttributeError:
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)
        else:
            show_progress()
            current.append((stableid, sequence_of(gene)))

        families.append(current)

    stderr.write("\r[downloading %s] got %d sequences\n" % (seq_kind, len(genes)))
    return families