def main():
    """Write the chromosome sequences of one species/release as FASTA.

    Options come from ``parse_command_line_parameters(**script_info)``.

    * With ``by_chrom`` each chromosome is written to
      ``<outdir>/<chrom.Name>.fasta``; otherwise every chromosome is written
      to the single file ``<outdir>/<species>-<release>.fasta``.
    * With ``test_run`` nothing is written: the account and the target
      path(s) are printed and the loop stops at the first chromosome.

    The database account is taken from the command line when given, else
    from the ``ENSEMBL_ACCOUNT`` environment variable ("host user passwd"),
    else ``None`` is passed through.

    Raises:
        AssertionError: only some of hostname/user/passwd were supplied.
        SystemExit: status -1 when the output directory does not exist.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    credentials = (opts.hostname, opts.user, opts.passwd)
    # Raised explicitly (rather than via ``assert``) so the check still runs
    # under ``python -O``; the exception type is unchanged.
    if None in credentials and len(set(credentials)) != 1:
        raise AssertionError(
            'You must provide all MySQL options, or none at all.')

    if opts.hostname is not None:
        account = HostAccount(opts.hostname, opts.user, opts.passwd)
    elif 'ENSEMBL_ACCOUNT' in os.environ:
        h, u, p = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(h, u, p)
    else:
        account = None

    if opts.test_run:
        print(account)

    outdir = os.path.abspath(opts.outdir)
    if not os.path.exists(outdir):
        print('FAIL: %s directory does not exist' % outdir)
        raise SystemExit(-1)

    # Shared handle for the single-file mode; stays None in by_chrom mode
    # and during a test run.
    outfile = None
    if not opts.by_chrom:
        outfile_name = os.path.join(
            outdir, '%s-%s.fasta' % (opts.species, opts.release))
        if not opts.test_run:
            outfile = open(outfile_name, 'w')

    try:
        if opts.test_run:
            print('Will write to: %s' % outdir)
            if not opts.by_chrom:
                print(outfile_name)

        for chrom in get_chrom_seqs(opts.species, opts.release, account,
                                    debug=opts.test_run):
            fasta = chrom.toFasta()

            if opts.by_chrom:
                outfile_name = os.path.join(outdir, '%s.fasta' % chrom.Name)

            if opts.test_run:
                print('Will write to: %s' % outfile_name)
                break

            if opts.by_chrom:
                with open(outfile_name, 'w') as chrom_file:
                    chrom_file.write(fasta + '\n')
            else:
                outfile.write(fasta + '\n')
    finally:
        # The original never closed the single-file handle.
        if outfile is not None:
            outfile.close()
def _conserved_hits_for_kmer(compara, chrm, start, stop, strand):
    """Count syntenic regions whose mouse and human aligned sequences match."""
    hits = 0
    regions = compara.getSyntenicRegions(
        Species='mouse', CoordName=chrm, Start=start, End=stop,
        Strand=strand, ensembl_coord=True, align_method='PECAN',
        align_clade='19 amniota vertebrates Pecan')
    for synt_region in regions:
        membs = synt_region.Members
        if len(membs) != 2:
            # No aligned human sequence for this region; nothing to compare.
            continue
        mouse, human = membs[0], membs[1]
        # Original author's note: strand sometimes seems not to get picked up
        # by compara, so on the minus strand this may be the reverse
        # complement of the motif.
        if str(mouse.AlignedSeq) == str(human.AlignedSeq):
            hits += 1
    return hits


def kmerhomology(k, kmerlist, fastadict, max_retries=3):
    """Count total and mouse/human-conserved occurrences of selected kmers.

    Args:
        k: kmer length (converted with ``int``).
        kmerlist: sequence of kmers of interest.
        fastadict: mapping of header -> UTR sequence. After replacing ';' by
            a tab and splitting on tabs, header fields 1-4 are read as
            chromosome, start, stop and strand ('+' or '-').
        max_retries: extra attempts made for one occurrence after a database
            error (``OE`` / ``mOE``). The original code intended to retry
            (``i = i-1``) but rebinding a ``for`` loop variable has no
            effect, so it never did.

    Returns:
        ``(homologydict, sqlalchemyfails)`` where ``homologydict`` maps
        kmer -> ``[conserved_occurrences, total_occurrences]`` and
        ``sqlalchemyfails`` is the number of failed database attempts.
        Conserved hits of an occurrence are only added once a query for it
        completes, so a retried occurrence is never counted twice.
    """
    k = int(k)
    homologydict = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)
    sqlalchemyfails = 0

    if kmerlist and k != len(kmerlist[0]):
        sys.stderr.write('Warning! Provided value of k does not match length of given kmer!')

    wanted = set(kmerlist)  # O(1) membership inside the per-position loop
    total_utrs = len(fastadict)
    attempts = max(0, max_retries) + 1

    for UTRcounter, header in enumerate(fastadict, 1):
        if UTRcounter % 50 == 0:
            sys.stderr.write('Determining motif conservation in UTR {0} of {1}...\n'.format(UTRcounter, total_utrs))
        UTRsequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]
        # NOTE(review): the '+'/'-' string is passed straight through as
        # Strand; confirm that getSyntenicRegions accepts this form.
        if strand not in ('+', '-'):
            # Previously this fell through with undefined or stale
            # coordinates from the preceding UTR.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(fields[0], strand))
            continue

        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if mousekmer not in wanted:
                continue
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Counting back from the end. Original author's reminder:
                # the last 50 nt and the stop codon are already removed.
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            counts = homologydict.setdefault(mousekmer, [0, 0])
            counts[1] += 1

            # The database connection is prone to timeouts; retry a bounded
            # number of times.
            for _attempt in range(attempts):
                try:
                    conserved = _conserved_hits_for_kmer(
                        compara, chrm, mousekmerstart, mousekmerstop, strand)
                except (OE, mOE):
                    sys.stderr.write('Genome mysql error!!!')
                    sqlalchemyfails += 1
                else:
                    counts[0] += conserved
                    break

    return homologydict, sqlalchemyfails
def kmerhomologydict(k, fastadict):
    """Count total and mouse/human-conserved occurrences of every kmer.

    Unlike a targeted lookup, every length-``k`` window of every UTR
    sequence is tallied and queried.

    Args:
        k: kmer length (converted with ``int``).
        fastadict: mapping of header -> UTR sequence. After replacing ';' by
            a tab and splitting on tabs, header fields 0-4 are read as ID,
            chromosome, start, stop and strand ('+' or '-').

    Returns:
        ``(counts, sqlalchemyfails)`` where ``counts`` maps
        kmer -> ``[conserved_occurrences, total_occurrences]``.
        ``sqlalchemyfails`` is never incremented in this function and is
        always 0; it is kept so the return shape is unchanged.
    """
    k = int(k)
    kmer_counts = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    analyzedUTRs = 0
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)
    sqlalchemyfails = 0
    total_utrs = len(fastadict)

    for UTRcounter, header in enumerate(fastadict, 1):
        UTRsequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        ID = fields[0]
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]

        # The original guard was ``UTRcounter % 1 == 0``, i.e. always true:
        # progress is reported for every UTR.
        sys.stderr.write('Determining motif conservation in UTR {0}, number {1} of {2}, (interrogated {3} so far)...\n'.format(ID, UTRcounter, total_utrs, analyzedUTRs))

        if strand not in ('+', '-'):
            # Previously this fell through with undefined or stale
            # coordinates from the preceding UTR.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(ID, strand))
            continue

        utr_counted = False
        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Counting back from the end. Original author's reminder:
                # the last 50 nt and the stop codon are already removed.
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            counts = kmer_counts.setdefault(mousekmer, [0, 0])
            counts[1] += 1

            regions = compara.getSyntenicRegions(
                Species='mouse', CoordName=chrm, Start=mousekmerstart,
                End=mousekmerstop, Strand=strand, ensembl_coord=True,
                align_method='PECAN',
                align_clade='19 amniota vertebrates Pecan')
            for synt_region in regions:
                # A UTR counts as analysed when its first window (i == 0,
                # the same condition as the original start/stop test) yields
                # a syntenic region. The flag keeps it to once per UTR even
                # when several regions come back for that window.
                if i == 0 and not utr_counted:
                    analyzedUTRs += 1
                    utr_counted = True
                membs = synt_region.Members
                if len(membs) != 2:
                    # No aligned human sequence; nothing to compare.
                    continue
                mouse, human = membs[0], membs[1]
                # Original author's note: strand sometimes seems not to get
                # picked up by compara, so on the minus strand this may be
                # the reverse complement of the motif.
                if str(mouse.AlignedSeq) == str(human.AlignedSeq):
                    counts[0] += 1

    sys.stderr.write('Analyzed {0} of {1} UTRs.\n'.format(analyzedUTRs, total_utrs))
    return kmer_counts, sqlalchemyfails
def get_genes_from_Ensembl_multiple(args, sourcedata, targetdata):
    """Fetch genes from Ensembl and append their CDS and gene data.

    Two input modes, selected by ``args.inputType``:

    * ``"id"``: ``args.geneIdListFile`` names a file whose lines are
      ``<species> <geneid>`` (space separated); each gene is fetched by
      stable id. Lines with fewer than two fields are ignored.
    * ``"name"``: ``args.gene`` is a gene name and ``args.specieslistfile``
      names a file with one species per line (first space-separated field);
      the gene is looked up by name in every listed species.

    A missing required argument is reported on stdout and that mode does
    nothing.

    Args:
        args: parsed command-line namespace (attributes as described above).
        sourcedata: accumulator extended in place with ``get_cds_data(gene)``.
        targetdata: accumulator extended in place with ``get_gene_data(gene)``.

    Returns:
        ``(sourcedata, targetdata)``.
    """
    inputtype = args.inputType

    if inputtype == "id":
        geneidlistfile = args.geneIdListFile
        if geneidlistfile is None:
            print("Argument -gidlf <geneidlistfilename> is required")
        else:
            account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
            with open(geneidlistfile, "r") as handle:
                rows = [line.split("\n")[0].split(" ") for line in handle]
            for parse in rows:
                if len(parse) < 2:
                    continue
                species, geneid = parse[0], parse[1]
                print("Retreving Gene %s" % geneid)
                genome = Genome(species, ENSEMBL_VERSION, account)
                gene = genome.getGeneByStableId(StableId=geneid)
                sourcedata += get_cds_data(gene)
                targetdata += get_gene_data(gene)

    if inputtype == "name":
        gene_name = args.gene
        if gene_name is None:
            print("Argument -g <genename> is required")
        specieslistfile = args.specieslistfile
        if specieslistfile is None:
            print("Argument -slf <specieslistfilename> is required")
        if gene_name is not None and specieslistfile is not None:
            account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
            with open(specieslistfile, "r") as handle:
                rows = [line.split("\n")[0].split(" ") for line in handle]
            for parse in rows:
                species = parse[0]
                if not species:
                    # Blank line: there is no species to query.
                    continue
                print("Retreving Gene %s from species %s" % (gene_name, species))
                genome = Genome(species, ENSEMBL_VERSION, account)
                # Keep the name and the fetched object in separate variables:
                # the original rebound ``gene`` here, so every species after
                # the first was queried with a gene object instead of the name.
                gene = get_gene_from_Ensembl_by_name(gene_name, genome)
                sourcedata += get_cds_data(gene)
                targetdata += get_gene_data(gene)

    return sourcedata, targetdata
def ensembl_to_hgnc(gene_list, account=None, release=73):
    """Map Ensembl stable gene ids to their gene symbols.

    Args:
        gene_list: iterable of Ensembl stable ids.
        account: optional ``HostAccount`` to connect with. When omitted the
            previous built-in account is used, so existing callers behave
            exactly as before.
        release: Ensembl release to query (default 73, as before).

    Returns:
        A ``set`` of the ``Symbol`` attribute of each gene looked up.
    """
    if account is None:
        # NOTE(review): credentials are embedded in source; prefer passing
        # ``account`` explicitly and moving these out of the code base.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human = Genome('human', Release=release, account=account)
    # NOTE(review): presumably getGeneByStableId can return None for an
    # unknown id, which would raise AttributeError here - confirm.
    return set(human.getGeneByStableId(StableId=gene).Symbol
               for gene in gene_list)
def get_genes_from_Ensembl_pairwise(args, sourcedata, targetdata):
    """Fetch one gene from each of two species and append their data.

    ``args.firstspecies`` and ``args.secondspecies`` select the genomes.
    ``args.inputType`` selects how the genes are identified:

    * ``"id"``: ``args.firstgeneid`` / ``args.secondgeneid`` are stable ids.
    * ``"name"``: ``args.gene`` is looked up by name in both genomes.

    A missing required argument is reported on stdout and nothing is
    appended.

    Args:
        args: parsed command-line namespace (attributes as described above).
        sourcedata: accumulator extended in place with the CDS data of both
            genes.
        targetdata: accumulator extended in place with the gene data of both
            genes.

    Returns:
        ``(sourcedata, targetdata)``.
    """
    inputtype = args.inputType
    firstspecies = args.firstspecies
    secondspecies = args.secondspecies

    if firstspecies is None:
        print("Argument -s1 <firstspeciesname> is required")
        return sourcedata, targetdata
    if secondspecies is None:
        print("Argument -s2 <secondspeciesname> is required")
        return sourcedata, targetdata

    account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
    firstgenome = Genome(firstspecies, ENSEMBL_VERSION, account)
    secondgenome = Genome(secondspecies, ENSEMBL_VERSION, account)

    # Set only when a lookup was actually performed, so a missing argument
    # no longer ends in a NameError on ``firstgene`` below.
    gene_pair = None

    if inputtype == "id":
        firstgeneid = args.firstgeneid
        if firstgeneid is None:
            print("Argument -gid1 <firstgeneid> is required")
        secondgeneid = args.secondgeneid
        if secondgeneid is None:
            print("Argument -gid2 <secondgeneid> is required")
        if firstgeneid is not None and secondgeneid is not None:
            print("Retreving Genes %s %s" % (firstgeneid, secondgeneid))
            gene_pair = (
                firstgenome.getGeneByStableId(StableId=firstgeneid),
                secondgenome.getGeneByStableId(StableId=secondgeneid),
            )

    if inputtype == "name":
        gene = args.gene
        if gene is None:
            print("Argument -g <genename> is required")
        else:
            # The original printed the undefined name ``name`` here, which
            # raised NameError before any lookup happened.
            print("Retreving Genes %s from species %s %s"
                  % (gene, firstspecies, secondspecies))
            gene_pair = (
                get_gene_from_Ensembl_by_name(gene, firstgenome),
                get_gene_from_Ensembl_by_name(gene, secondgenome),
            )

    if gene_pair is not None:
        firstgene, secondgene = gene_pair
        sourcedata += get_cds_data(firstgene) + get_cds_data(secondgene)
        targetdata += get_gene_data(firstgene) + get_gene_data(secondgene)

    return sourcedata, targetdata
def main():
    """Create a new Chippy database populated with Ensembl gene data.

    The database file is named
    ``<save_db_prefix>_chippy_<ensembl_release>_<species>.db`` inside
    ``save_db_dir``. If that file already exists an error is recorded and
    nothing is created. Otherwise a session is opened, Ensembl gene data
    and a dummy expression entry are added, and the real path of the new
    database is printed. Events are collected in a ``RunRecord`` which is
    displayed when ``show_log`` is set.
    """
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)
    args = script_info['args'].parse()

    db_dir = args.save_db_dir
    create_path(db_dir)
    if not os.path.isdir(db_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    name_parts = [
        args.save_db_prefix,
        '_chippy_',
        str(args.ensembl_release),
        '_',
        args.species,
        '.db',
    ]
    db_path = os.path.join(db_dir, ''.join(name_parts))

    if os.path.exists(db_path):
        rr.addError('Chippy DB with this name already exists', db_path)
    else:
        session = make_session(db_path)
        account = HostAccount(args.hostname, args.username, args.password,
                              port=args.port)
        add_ensembl_gene_data(session, args.species,
                              ensembl_release=args.ensembl_release,
                              account=account)

        if create_dummy_expr(session):
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                        'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        print(os.path.realpath(db_path))

    if args.show_log:
        rr.display()
def main():
    """Print the transcripts and exons of one zebrafish gene.

    Looks up gene ``ENSDARG00000027279`` in Ensembl release 81 and prints
    each transcript's ``StableId`` followed by the ``Symbol`` attribute of
    each of its exons.

    The database account is read from the ``ENSEMBL_ACCOUNT`` environment
    variable ("host username password") when it is set; otherwise ``None``
    is passed to ``Genome``.
    """
    import os

    from cogent.db.ensembl import Genome, HostAccount

    # Credentials needed to log into the Ensembl database.
    if 'ENSEMBL_ACCOUNT' in os.environ:
        host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(host, username, password)
    else:
        account = None

    sp = "zebrafish"
    gn = "ENSDARG00000027279"

    # The original built ``account`` above but then passed ``account=None``,
    # silently ignoring ENSEMBL_ACCOUNT.
    specie = Genome(Species=sp, Release="81", account=account)
    gene = specie.getGeneByStableId(StableId=gn)

    for tr in gene.Transcripts:
        print(tr.StableId)
        for ex in tr.Exons:
            # NOTE(review): confirm exon objects expose ``Symbol``.
            print(ex.Symbol)
def hgnc_to_ensembl_id(hgnc_list, account=None, release=73):
    """Map gene symbols to Ensembl gene stable ids.

    For each symbol, genes matching it are queried and only those whose
    ``Symbol`` equals the query exactly are kept. Duplicates are removed
    and only ids starting with ``'ENSG'`` are returned.

    Args:
        hgnc_list: iterable of gene symbols.
        account: optional ``HostAccount`` to connect with. When omitted the
            previous built-in account is used, so existing callers behave
            exactly as before.
        release: Ensembl release to query (default 73, as before).

    Returns:
        A sorted list of unique stable ids. The original returned the same
        elements in arbitrary set-iteration order.
    """
    if account is None:
        # NOTE(review): credentials are embedded in source; prefer passing
        # ``account`` explicitly and moving these out of the code base.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human = Genome('human', Release=release, account=account)

    stable_ids = set()
    for symbol in hgnc_list:
        for gene_obj in human.getGenesMatching(Symbol=symbol):
            # Keep exact symbol matches only.
            if gene_obj.Symbol == symbol:
                stable_ids.add(gene_obj.StableId)

    return sorted(sid for sid in stable_ids if sid.startswith('ENSG'))
print codon_B #get chaging index if codon_B[0] != codon_A[0]: codon_index = 1 if codon_B[1] != codon_A[1]: codon_index = 2 if codon_B[2] != codon_A[2]: codon_index = 3 result['codon_index'] = codon_index result['codon_B'] = codon_B return result if 'ENSEMBL_ACCOUNT' in os.environ: host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None protein_mutation = 'A203T' protein_A = protein_mutation[0] protein_B = protein_mutation[-1] codon_index = int(protein_mutation[1:-1]) #result = find_codon_index(protein_A, 'GCA', protein_B) #print result human = Genome(Species='human', Release=Release, account=account) print human #seqs = {'original' : 'A',
# Code to merge ensembl gene information with annotation of probes and junctions to get an idea of the gene structure import pandas as pd import re from collections import Counter from itertools import permutations import csv import string from cogent.db.ensembl import Genome, HostAccount from difflib import SequenceMatcher #account = HostAccount('ensembldb.ensembl.org', 'anonymous', '', port=5306) account = HostAccount('127.0.0.1', 'root', 'ensembl', port=3306) Release = 89 HumanDB = Genome(Species='human', Release=Release, account=account) class SettingTCID_GeneIDError(Exception): def __init__(self, msg): self.msg = msg def __str__(self): return self.msg def GeneStructure(TCID, GeneID, Probesets=None, MappingFile="HTA_2_0_Probeset_SequenceIndices.txt", SequenceFile="HTA_2_0_Probeset_Sequences.txt", location="Output",
def ComparaLiftover(gff):
    """Lift mouse 3'UTR coordinates over to rat, human, cow and dog.

    A gffutils database ``<basename(gff)>.db`` is created in the current
    directory when it is not already present, and is removed at the end.
    For every ``3'UTR`` feature the stop codon (3 nt) and the last 50 nt are
    trimmed, then the syntenic regions of the trimmed interval are queried
    via Compara. Each member that has a region and belongs to one of the
    four target species yields one GFF-style row.

    Args:
        gff: path to the GFF file holding the mouse 3'UTR features.

    Returns:
        ``(ratgff, humangff, cowgff, doggff)``: four lists of rows
        ``[chrm, 'ALE', "3'UTR", start, stop, '.', strand, '.', ID]`` where
        ``ID`` is ``<UTR.id>_<species>``.
    """
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'rat', 'human', 'cow', 'dog'],
                      Release=61, account=account)

    ratgff = []
    humangff = []
    cowgff = []
    doggff = []
    # Species label (as it appears in the location string) -> output list.
    rows_by_species = {
        'Rattus_norvegicus': ratgff,
        'Homo_sapiens': humangff,
        'Bos_taurus': cowgff,
        'Canis_familiaris': doggff,
    }

    # Make gff database
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn)
    db = gffutils.FeatureDB(db_fn)

    for UTR in db.features_of_type('3\'UTR'):
        # Remove stop codons and last 50 nt of UTR
        if UTR.strand == '+':
            UTRstart = int(UTR.start) + 3
            UTRstop = int(UTR.stop) - 50
        elif UTR.strand == '-':
            UTRstart = int(UTR.start) + 50
            UTRstop = int(UTR.stop) - 3
        else:
            # Previously an unstranded feature silently reused the trimmed
            # coordinates of the preceding UTR.
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(UTR.id, UTR.strand))
            continue

        regions = compara.getSyntenicRegions(
            Species='mouse',
            CoordName=UTR.chrom.replace('chr', ''),
            Start=UTRstart,
            End=UTRstop,
            Strand=UTR.strand,
            ensembl_coord=True,
            align_method='PECAN',
            align_clade='19 amniota vertebrates Pecan')

        for synt_region in regions:
            for region in synt_region.Members:
                if not region.Region:
                    continue
                # The location string is split on ':' after turning the
                # first '-' into ':' and spaces into '_'. Fields 0, 2, 3, 4
                # and 5 are used as species, chromosome, start, stop and
                # strand.
                locdata = str(region.Region.Location).replace(
                    '-', ':', 1).replace(' ', '_').split(':')
                species = str(locdata[0])
                target_rows = rows_by_species.get(species)
                if target_rows is None:
                    # Not one of the four target species.
                    continue

                region_strand = str(locdata[5])
                if region_strand not in ('1', '-1'):
                    # Previously this reused the strand of an earlier member.
                    sys.stderr.write('Skipping {0} region for UTR {1}: unrecognised strand {2!r}\n'.format(species, UTR.id, region_strand))
                    continue
                # '+' when the aligned region lies on the strand the UTR's
                # own strand maps to, '-' otherwise (same table as the
                # original four-way if/elif).
                same_orientation = (region_strand == '1') == (UTR.strand == '+')
                strand = '+' if same_orientation else '-'

                chrm = 'chr' + str(locdata[2])
                start = locdata[3]
                stop = locdata[4]
                ID = str(UTR.id) + '_' + species
                target_rows.append(
                    [chrm, 'ALE', '3\'UTR', start, stop, '.', strand, '.', ID])

    # NOTE(review): this also deletes a database that existed before the
    # call; confirm that is intended.
    os.remove(db_fn)
    sys.stderr.write(
        'Succesfully found matches in {0} rat regions, {1} human regions, {2} cow regions and {3} dog regions.\n'
        .format(len(ratgff), len(humangff), len(cowgff), len(doggff)))
    return ratgff, humangff, cowgff, doggff
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# Requires the cogent and sqlalchemy packages:
#   pip2 install cogent
#   pip2 install sqlalchemy

from cogent.db.ensembl import HostAccount, Species, Genome
from pypath import mapping

Release = 78
account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
human = Genome(Species='human', Release=Release, account=account)

# Each query is (UniProt accession, sequence offset, residue, isoform).
positions = [('P00533', 40, 'Q', 1), ('P60520', 30, 'P', 1)]

# Bidirectional UniProt <-> Ensembl gene id mapping.
m = mapping.Mapper()
m.load_uniprot_mappings(['ensg'], bi=True)

# For every query, one record per matching Ensembl gene:
# (ensg, gene location, exons of the canonical transcript) + the query itself.
positions_ens = []
for p in positions:
    ensgs = m.map_name(p[0], 'uniprot', 'ensg')
    for ensg in ensgs:
        genes = human.getGenesMatching(StableId=ensg)
        for gene in genes:
            record = (ensg, gene.Location, gene.CanonicalTranscript.Exons)
            positions_ens.append(record + tuple(p))

# another attempts with biopython --
reference files. It uses the genomic coordinates to pull the sequence from the genome and reverse complement if on the negative strand. The sequence currently in the reference file may refer to the sequence synthesized on the chip, in which case it is the version with the lowest A's (easier to synthesize). It could also be the sequence without strand consideration. To be sure, let's just pull from the genome. ''' import MySQLdb # use cogent to extract sequences from the genome from cogent.db.ensembl import HostAccount, Genome import pandas as pd # set up connection # host, user, password, port pycog = HostAccount('ensembldb.ensembl.org', 'anonymous', '', 3306) hs37 = Genome('human', Release=78, account=pycog) # read in reference data = pd.read_table('../produced_data/splicemod_data_clean.txt', sep='\t') # reformat so intron/exon length columns are int and not float, convert NA to 0 so we can use int data[['intron1_len', 'exon_len', 'intron2_len']] = data[['intron1_len', 'exon_len', 'intron2_len']].fillna(0.0).astype(int) # grab sequences with chr, start and end information lib = data[data.chr.notnull() & data.start.notnull() & data.end.notnull()] def grab_region(chr, start, end, strand, genome):