def get_sequence_from_location(species, coords, release='87', account=None):
    """Get sequence from a genomic location in an ensembl species genome.

    Args:
        species: species name handed to ``cogent.db.ensembl.Genome``.
        coords: a 4-item sequence ``(chrom, start, end, strand)``.
        release: Ensembl release to query. Defaults to ``'87'``, the value
            that was previously hard-coded.
        account: optional account object forwarded to ``Genome``; ``None``
            (the default) is forwarded unchanged.

    Returns:
        The ``Seq`` attribute of the region returned by ``getRegion``.

    Raises:
        ValueError: if ``coords`` does not unpack into exactly four items.
    """
    # Function-scope import: only Genome is used in this function.
    from cogent.db.ensembl import Genome

    genome = Genome(Species=species, Release=release, account=account)
    chrom, start, end, strand = coords
    # The chromosome name is coerced to a string before querying.
    region = genome.getRegion(CoordName=str(chrom), Start=start, End=end,
                              Strand=strand)
    return region.Seq
def ensembl_to_hgnc(gene_list, account=None, release=73):
    """Map Ensembl stable gene ids to the set of their gene symbols.

    Args:
        gene_list: iterable of Ensembl stable ids, each looked up with
            ``getGeneByStableId``.
        account: optional ``HostAccount`` to connect with. When ``None`` the
            legacy built-in account is used, so existing callers are
            unaffected.
        release: Ensembl release to query (default 73, the former
            hard-coded value).

    Returns:
        A ``set`` with the ``Symbol`` of every gene that was looked up.
    """
    if account is None:
        # NOTE(review): legacy default kept only for backward compatibility.
        # Credentials are hard-coded in source; prefer passing ``account``
        # and moving these values to configuration.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#',
                              port=3306)
    human = Genome('human', Release=release, account=account)
    # Building the set directly removes duplicates as symbols are collected.
    return set(human.getGeneByStableId(StableId=gene).Symbol
               for gene in gene_list)
def get_genes_from_location(ref, coords, pad=0):
    """Return the gene features found in a window of genome coordinates.

    ``coords`` is unpacked as ``(chrom, start, end, strand)``; ``pad`` bases
    are subtracted from the start and added to the end to widen the window.
    The Ensembl release and account come from the module-level names
    ``release`` and ``account``.
    """
    genome = Genome(Species=ref, Release=release, account=account)
    chrom, start, end, strand = coords
    window_start = start - pad
    window_end = end + pad
    features = genome.getFeatures(CoordName=chrom,
                                  Start=window_start,
                                  End=window_end,
                                  feature_types='gene')
    # Materialise the query result so callers get a plain list.
    return list(features)
def get_chrom_seqs(species, release, account=None, debug=False): """yields sequence objects for the indicated chromosomes from Ensembl""" genome = Genome(species, Release=release, account=account) for chrom in chroms[species]: region = genome.getRegion(CoordName=chrom) seq = region.Seq name = 'chr_%s' % chrom seq.Name = name if debug: print name print repr(seq) yield seq
def Main():
    """Annotate every record of the input table and write a per-type summary.

    Sets the module globals ``args`` and ``out``. Output goes to
    ``sys.stdout`` when ``args.output`` is ``"stdout"`` or when the requested
    file cannot be opened. One tab-separated line is written per input
    record, followed by one ``#<type>\\t<count>`` line per annotation type.
    """
    global args, out
    args = ParseArg()

    # Default to stdout; only replaced when the requested file opens cleanly.
    out = sys.stdout
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"

    count = {}
    # the DBI init file for bed6 file of all kinds of RNA
    dbi1 = DBI.init(args.db, "bed")
    # the DBI init file for bed12 file of lincRNA and mRNA with intron, exon, UTR
    dbi2 = DBI.init(args.db_detail, "bed")
    genome = Genome('mouse', Release=67, account=None)

    for bed in TableIO.parse(args.input, args.format):
        typ, name, subtype = annotation(bed, dbi1, dbi2, genome)
        # Tally how many records fall into each annotation type.
        count[typ] = count.get(typ, 0) + 1
        fields = [bed.chr, bed.start, bed.stop, bed.id, name, bed.strand,
                  typ, subtype]
        print >>out, "\t".join(str(f) for f in fields)

    summary = ["#" + key + "\t%d" % (count[key]) for key in count.keys()]
    print >>out, "\n".join(summary)
def hgnc_to_ensembl_id(hgnc_list, account=None, release=73):
    """Map gene symbols to their Ensembl gene stable ids.

    Args:
        hgnc_list: iterable of gene symbols to look up.
        account: optional ``HostAccount`` to connect with. When ``None`` the
            legacy built-in account is used, so existing callers are
            unaffected.
        release: Ensembl release to query (default 73, the former
            hard-coded value).

    Returns:
        A sorted list of the distinct stable ids starting with ``'ENSG'``
        whose gene ``Symbol`` equals one of the requested symbols.
    """
    if account is None:
        # NOTE(review): legacy default kept only for backward compatibility.
        # Credentials are hard-coded in source; prefer passing ``account``
        # and moving these values to configuration.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#',
                              port=3306)
    human = Genome('human', Release=release, account=account)

    stable_ids = set()  # a set removes duplicates as ids are collected
    for symbol in hgnc_list:
        for gene_obj in human.getGenesMatching(Symbol=symbol):
            # Keep only query results whose symbol is exactly the one asked for.
            if gene_obj.Symbol == symbol:
                stable_ids.add(gene_obj.StableId)

    # Keep ids starting with 'ENSG'; sorting makes the order deterministic
    # instead of depending on set iteration order.
    return sorted(sid for sid in stable_ids if sid.startswith('ENSG'))
def get_genes_from_Ensembl_multiple(args, sourcedata, targetdata): inputtype = args.inputType if (inputtype == "id"): geneidlistfile = args.geneIdListFile if (geneidlistfile == None): print "Argument -gidlf <geneidlistfilename> is required" else: for line in open(geneidlistfile, "r").readlines(): parse = line.split("\n")[0].split(" ") if (len(parse) > 1): species = parse[0] geneid = parse[1] print "Retreving Gene", geneid account = HostAccount('ensembldb.ensembl.org', 'anonymous', '') genome = Genome(species, ENSEMBL_VERSION, account) gene = genome.getGeneByStableId(StableId=geneid) sourcedata += get_cds_data(gene) targetdata += get_gene_data(gene) if (inputtype == "name"): gene = args.gene if (gene == None): print "Argument -g <genename> is required" specieslistfile = args.specieslistfile if (specieslistfile == None): print "Argument -slf <specieslistfilename> is required" if (gene != None and specieslistfile != None): for line in open(specieslistfile, "r").readlines(): parse = line.split("\n")[0].split(" ") if (len(parse) > 0): species = parse[0] print "Retreving Gene", gene, "from species", species account = HostAccount('ensembldb.ensembl.org', 'anonymous', '') genome = Genome(species, ENSEMBL_VERSION, account) gene = get_gene_from_Ensembl_by_name(gene, genome) sourcedata += get_cds_data(gene) targetdata += get_gene_data(gene) return sourcedata, targetdata
def hgnc_to_ensembl_id(hgnc_list, account=None, release=73):
    """Map gene symbols to their Ensembl gene stable ids.

    Args:
        hgnc_list: iterable of gene symbols to look up.
        account: optional ``HostAccount`` to connect with. When ``None`` the
            legacy built-in account is used, so existing callers are
            unaffected.
        release: Ensembl release to query (default 73, the former
            hard-coded value).

    Returns:
        A sorted list of the distinct stable ids starting with ``'ENSG'``
        whose gene ``Symbol`` equals one of the requested symbols.
    """
    if account is None:
        # NOTE(review): legacy default kept only for backward compatibility.
        # Credentials are hard-coded in source; prefer passing ``account``
        # and moving these values to configuration.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#',
                              port=3306)
    human = Genome('human', Release=release, account=account)

    stable_ids = set()  # a set removes duplicates as ids are collected
    for symbol in hgnc_list:
        for gene_obj in human.getGenesMatching(Symbol=symbol):
            # Keep only query results whose symbol is exactly the one asked for.
            if gene_obj.Symbol == symbol:
                stable_ids.add(gene_obj.StableId)

    # Keep ids starting with 'ENSG'; sorting makes the order deterministic
    # instead of depending on set iteration order.
    return sorted(sid for sid in stable_ids if sid.startswith('ENSG'))
def main():
    """Print the transcript ids and exon symbols of one zebrafish gene.

    If the ``ENSEMBL_ACCOUNT`` environment variable is set it must contain
    ``"<host> <username> <password>"`` separated by whitespace; those values
    are used to log into the Ensembl database. Otherwise ``account`` is
    ``None``.

    Raises:
        ValueError: if ``ENSEMBL_ACCOUNT`` does not split into three fields.
    """
    import os
    from cogent.db.ensembl import Genome, HostAccount

    # Necessary to log into the ensembl database
    if 'ENSEMBL_ACCOUNT' in os.environ:
        host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(host, username, password)
    else:
        account = None

    sp = "zebrafish"
    gn = "ENSDARG00000027279"
    # Pass the account built above so the ENSEMBL_ACCOUNT setting is honoured.
    specie = Genome(Species=sp, Release="81", account=account)
    gene = specie.getGeneByStableId(StableId=gn)

    # For every transcript print its stable id, then each exon's symbol.
    for tr in gene.Transcripts:
        print(tr.StableId)
        for ex in tr.Exons:
            print(ex.Symbol)
def get_genes_from_Ensembl_pairwise(args, sourcedata, targetdata): genelist = [] inputtype = args.inputType firstspecies = args.firstspecies secondspecies = args.secondspecies if (firstspecies == None): print "Argument -s1 <firstspeciesname> is required" elif (secondspecies == None): print "Argument -s2 <secondspeciesname> is required" else: account = HostAccount('ensembldb.ensembl.org', 'anonymous', '') firstgenome = Genome(firstspecies, ENSEMBL_VERSION, account) secondgenome = Genome(secondspecies, ENSEMBL_VERSION, account) if (inputtype == "id"): firstgeneid = args.firstgeneid if (firstgeneid == None): print "Argument -gid1 <firstgeneid> is required" secondgeneid = args.secondgeneid if (secondgeneid == None): print "Argument -gid2 <secondgeneid> is required" if (firstgeneid != None and secondgeneid != None): print "Retreving Genes", firstgeneid, secondgeneid firstgene = firstgenome.getGeneByStableId(StableId=firstgeneid) secondgene = secondgenome.getGeneByStableId( StableId=secondgeneid) if (inputtype == "name"): gene = args.gene if (gene == None): print "Argument -g <genename> is required" else: print "Retreving Genes", name, "from species", firstspecies, secondspecies firstgene = get_gene_from_Ensembl_by_name(gene, firstgenome) secondgene = get_gene_from_Ensembl_by_name(gene, secondgenome) sourcedata += get_cds_data(firstgene) + get_cds_data(secondgene) targetdata += get_gene_data(firstgene) + get_gene_data(secondgene) return sourcedata, targetdata
# Code to merge ensembl gene information with annotation of probes and junctions to get an idea of the gene structure import pandas as pd import re from collections import Counter from itertools import permutations import csv import string from cogent.db.ensembl import Genome, HostAccount from difflib import SequenceMatcher #account = HostAccount('ensembldb.ensembl.org', 'anonymous', '', port=5306) account = HostAccount('127.0.0.1', 'root', 'ensembl', port=3306) Release = 89 HumanDB = Genome(Species='human', Release=Release, account=account) class SettingTCID_GeneIDError(Exception): def __init__(self, msg): self.msg = msg def __str__(self): return self.msg def GeneStructure(TCID, GeneID, Probesets=None, MappingFile="HTA_2_0_Probeset_SequenceIndices.txt", SequenceFile="HTA_2_0_Probeset_Sequences.txt", location="Output",
genome and reverse complement if on the negative strand. The sequence currently in the reference file may refer to the sequence synthesized on the chip, in which case it is the version with the lowest A's (easier to synthesize). It could also be the sequence without strand consideration. To be sure, let's just pull from the genome. ''' import MySQLdb # use cogent to extract sequences from the genome from cogent.db.ensembl import HostAccount, Genome import pandas as pd # set up connection # host, user, password, port pycog = HostAccount('ensembldb.ensembl.org', 'anonymous', '', 3306) hs37 = Genome('human', Release=78, account=pycog) # read in reference data = pd.read_table('../produced_data/splicemod_data_clean.txt', sep='\t') # reformat so intron/exon length columns are int and not float, convert NA to 0 so we can use int data[['intron1_len', 'exon_len', 'intron2_len']] = data[['intron1_len', 'exon_len', 'intron2_len']].fillna(0.0).astype(int) # grab sequences with chr, start and end information lib = data[data.chr.notnull() & data.start.notnull() & data.end.notnull()] def grab_region(chr, start, end, strand, genome): start = int(start)
Release=80 from cogent.db.ensembl import Species, Genome human = Genome(Species='human', Release=Release, account=None) gene = human.getGeneByStableId(StableId='ENSG00000205274') print gene.Symbol
host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None protein_mutation = 'A203T' protein_A = protein_mutation[0] protein_B = protein_mutation[-1] codon_index = int(protein_mutation[1:-1]) #result = find_codon_index(protein_A, 'GCA', protein_B) #print result human = Genome(Species='human', Release=Release, account=account) print human #seqs = {'original' : 'A', #'mutation' : 'T' #} #protein = LoadSeqs(data = seqs, moltype = PROTEIN) #print protein.getTranslation() #protein = 'METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPSTDGGLARGAYSPRPPSISENVAMEAVAAGVDGAGPEADLGLPEDDLVLPDDVVQYIKAHASGALDEGTGQVYPTESTGFSDNPRLPSPGLHGQRRMVAADSNVGPSAPMLGGCQLGFGAPSSLNKNNMPVQWNEVSSGTVDALASQVKPPPFPQGNLAVVQQKPAFGQYPGYSPQGLQASPGGLDSTQPHLQPRSGAPSQGIPRVNYMQQLRQPVAGSQCPGMTTTMSPHACYGQVHPQLSPSTISGALNQFPQSCSNMPAKPGHLGHPQQTEVAPDPTTMGNRHRELGVPDSALAGVPPPHPVQSYPQQSHHLAASMSQEGYHQVPSLLPARQPGFMEPQTGPMGVATAGFGLVQPRPPLEPSPTGRHRGVRAVQQQLAYARATGHAMAAMPSSQETAEAVPKGAMGNMGSVPPQPPPQDAGGAP
DHSMLYYYGQIHMYEQDGGLENLGSCQVMRSQPPQPQACQDSIQPQPLPSPGVNQVSSTVDSQLLEAPQIDFDAIMDDGDHSSLFSGALSPSLLHSLSQNSSRLTTPRNSLTLPSIPAGISNMAVGDMSSMLTSLAEESKFLNMMT' #print len(protein) #cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCG
CTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGCAGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGC
CAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTCCATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG' #my_seq = DNA.makeSequence(cds,'gli2') #seq = my_seq.withoutTerminalStopCodon() #pep = seq.getTranslation() #print pep.toFasta()
hitsOrganisms[items[7]] = True print "> Number of loaded hits: " + str(len(hits)) outfile.close() print "> SPECIES <" for key in hitsOrganisms: print key exit(1) ##### SET CONNECTION TO THE ENSEMBL DATABASE ########################## Release = 67 account = None yeast = Genome(Species='Neosartorya fischeris', Release=Release, account=account) outfile = open("dataset_fungal_homologs_sequences.csv", "w") print Species ##### GET SEQUENCES FROM ENSEMBL ###################################### i = 0 for hit in hits: i += 1 print "i: " + str(i) identifier = hit print "\n***** ZPRACOVANI ZAZNAMU " + identifier + " *****" # Selection of species genes = yeast.getGenesMatching(StableId="CADNFIAP00000001")
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """Download the protein-coding genes of *species* grouped into families.

    Every protein-coding gene is visited once. Its within-species paralogs
    (as reported by Compara) form one family; a gene without paralogs forms
    a family of its own. Progress is written to ``stderr``.

    Parameters:
        species: species name; added to pycogent's ``Species`` table when it
            is not known there.
        release, database_name: resolved through ``get_missing_info`` into
            the release, database name and connection details to use.
        nucleotide: when true the canonical transcript's CDS is collected,
            otherwise its protein sequence.

    Returns:
        A list of families, each a list of ``(stable_id, sequence_string)``
        tuples. The process exits with status 1 if a paralog has no usable
        canonical transcript.
    """
    log = get_log()

    #try :
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database
    #except ImportError :
    #    log.fatal("pycogent import failed, exiting...")
    #    exit(1)

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(db_details['hostname'],
                          db_details['username'],
                          db_details['password'],
                          port=db_details['port'])

    # this is not an error, it returns the string "None"
    if Species.getSpeciesName(species) == 'None' :
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # DON'T TRY THIS AT HOME!
    #
    # what happens is it searches for compara databases, but unfortunately finds more than one
    # in this situation pycogent just connects to the first one, which is always compara_bacteria
    # so one solution is to dig through all the compara objects internals to provide a connection
    # to the correct database ... obviously not the best solution, but at 6 lines of code definitely
    # the shortest ;-P
    #
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)
    # end of DON'T TRY THIS AT HOME!

    genes = set()
    families = []
    seq_kind = "CDS" if nucleotide else "protein"

    def show_progress() :
        # Rewrites the same terminal line with the running sequence count.
        stderr.write("\r[downloading %s] got %d sequences " % (seq_kind, len(genes)))

    def sequence_of(member) :
        # String form of the canonical transcript's CDS or protein sequence.
        transcript = member.CanonicalTranscript
        if nucleotide :
            return str(transcript.Cds)
        return str(transcript.ProteinSeq)

    show_progress()

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        stableid = gene.StableId

        # ignore genes that have already been seen as members of other gene families
        if stableid in genes :
            continue

        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')

        current = []

        if paralogs is None :
            # No paralogs: the gene is a family of one.
            show_progress()
            current.append((stableid, sequence_of(gene)))
        else :
            for paralog in paralogs.Members :
                paralogid = paralog.StableId
                genes.add(paralogid)

                show_progress()

                try :
                    current.append((paralogid, sequence_of(paralog)))
                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)

        families.append(current)

    stderr.write("\r[downloading %s] got %d sequences\n" % (seq_kind, len(genes)))

    return families
''' import sys import csv import os from cogent.db.ensembl import HostAccount, Species, Genome import sqlalchemy as sql Release = 70 if 'ENSEMBL_ACCOUNT' in os.environ: host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None human = Genome(Species='human', Release=Release, account=account) def Getexons(ENSG,start,end): gene = human.getGeneByStableId(StableId = ENSG) print gene.BioType print len(gene.Transcripts) #print dir(gene.Location) #print "\t\t\tstart\t\tEnd" #print "from csv\t\t",start,"\t",end #print "from location\t",gene.Location.Start,"\t",gene.Location.End #print "from ensembl\t",gene.Location.EnsemblStart,"\t", gene.Location.EnsemblEnd Not_Found = True for transcript in gene.Transcripts: #print dir(transcript.Location) #print start,end #if int(start)-1 == transcript.Location.Start and int(end) == transcript.Location.End:
import os

# Ensembl release queried by this script.
Release = 93

from cogent.db.ensembl import HostAccount

# ENSEMBL_ACCOUNT, when set, holds "<host> <username> <password>".
if 'ENSEMBL_ACCOUNT' not in os.environ:
    account = None
else:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)

from cogent.db.ensembl import HostAccount, Genome

human = Genome(Species='human', Release=Release, account=account)

# Look up one variation by its symbol and print every match.
variants = human.getVariation(Symbol='rs369202065')
for variant in variants:
    print(variant)

# die()
# The bare list below is an expression statement with no effect;
# presumably a pasted attribute listing of a variation object (TODO confirm).
['AlleleFreqs', 'Alleles', 'Ancestral', 'Effect', 'FlankingSeq', 'Location',
 'MapWeight', 'NULL_VALUE', 'NumAlleles', 'PeptideAlleles', 'Seq', 'Somatic',
 'Symbol', 'TranslationLocation', 'Type', 'Validation', 'Variants',
 '__class__', '__cmp__', '__delattr__', '__dict__', '__doc__', '__format__',
 '__getattribute__', '__hash__', '__init__', '__len__', '__module__',
 '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__',
 '__sizeof__', '__str__', '__subclasshook__', '__weakref__',
 '_attr_ensembl_table_map', '_cached', '_get_allele_freqs',
 '_get_allele_table_record', '_get_alleles', '_get_ancestral',
 '_get_ancestral_data', '_get_cached_value', '_get_effect',
 '_get_flanking_seq', '_get_flanking_seq_data',
 '_get_flanking_seq_data_ge_70', '_get_flanking_seq_data_lt_70',
 '_get_location', '_get_location_record', '_get_map_weight',
 '_get_number_alleles', '_get_peptide_variation', '_get_seq_region_record',
 '_get_sequence', '_get_somatic', '_get_symbol', '_get_transcript_record',
 '_get_translation_location', '_get_validation', '_get_variants',
 '_get_variation_table_record', '_location_column_prefix', '_make_location',
 '_populate_cache_from_record', '_set_null_values', '_split_alleles',
 '_table_rows', 'allele_code_table', 'allele_table', 'db', 'featureData',
 'genome', 'getAnnotatedSeq', 'getFeatures', 'transcript_variation_table',
 'variation_feature_table', 'variation_table']
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# cogent and sqlalchemy modules need to be installed:
# pip2 install cogent
# pip2 install sqlalchemy

from cogent.db.ensembl import HostAccount, Species, Genome
from pypath import mapping

Release = 78
account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
human = Genome(Species='human', Release=Release, account=account)

# UniProt, seq offset, residue, isoform
positions = [('P00533', 40, 'Q', 1), ('P60520', 30, 'P', 1)]

# Mapper used to translate UniProt names into 'ensg' names.
m = mapping.Mapper()
m.load_uniprot_mappings(['ensg'], bi=True)

# One tuple per (position, matching gene):
# (ensg, gene location, canonical transcript exons) followed by the
# original fields of the position.
positions_ens = []
for p in positions:
    ensgs = m.map_name(p[0], 'uniprot', 'ensg')
    for ensg in ensgs:
        genes = human.getGenesMatching(StableId=ensg)
        for gene in genes:
            positions_ens.append(
                (ensg, gene.Location, gene.CanonicalTranscript.Exons)
                + tuple(p))

# another attempts with biopython --
def add_ensembl_gene_data(session, species, ensembl_release, account=None, debug=False): """add Ensembl genes and their transcripts to the db session""" rr = RunRecord('add_ensembl_gene_data') genome = Genome(species, Release=ensembl_release, account=account) skip = set(['processed_transcript', 'pseudogene']) biotypes = [b for b in genome.getDistinct('BioType') if b not in skip] data = [] unique_gene_ids = set() unique_exon_ids = set() chromSet = set() n = 0 total_objects = 0 for biotype in biotypes: for gene in genome.getGenesMatching(BioType=biotype): # gene.Location.CoordName is the chromosome name min_chrom_length = 5 # likely an unconfirmed scaffold if len(gene.Location.CoordName) > min_chrom_length: rr.addWarning('Skipping chrom', gene.Location.CoordName) continue chromSet.add(gene.Location.CoordName) if gene.StableId not in unique_gene_ids: db_gene = Gene(ensembl_id=gene.StableId, symbol=gene.Symbol, biotype=gene.BioType, description=gene.Description, status=gene.Status, chrom=gene.Location.CoordName, start=gene.Location.Start, end=gene.Location.End, strand=gene.Location.Strand) unique_gene_ids.add(gene.StableId) data.append(db_gene) else: rr.addWarning('Duplicate gene', gene.StableId) for exon in gene.CanonicalTranscript.Exons: if exon.StableId not in unique_exon_ids: db_exon = Exon(exon.StableId, exon.Rank, exon.Location.Start, exon.Location.End) db_exon.gene = db_gene unique_exon_ids.add(exon.StableId) data.append(db_exon) else: rr.addWarning('Duplicate exon', exon.StableId) n += 1 if n % 100 == 0: print 'Genes processed:', n, '; Db objects created:', len(data) if debug: session.add_all(data) session.commit() return rr.addInfo('Instantiating chromosomes', chromSet) chroms = Chroms(species, chromSet) data.append(chroms) rr.addInfo('Writing objects into db', len(data)) session.add_all(data) session.commit() return chroms
if 'ENSEMBL_ACCOUNT' in os.environ: host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None protein_mutation = 'A203T' protein_A = protein_mutation[0] protein_B = protein_mutation[-1] codon_index = int(protein_mutation[1:-1]) #result = find_codon_index(protein_A, 'GCA', protein_B) #print result human = Genome(Species='human', Release=Release, account=account) print human #seqs = {'original' : 'A', #'mutation' : 'T' #} #protein = LoadSeqs(data = seqs, moltype = PROTEIN) #print protein.getTranslation() #protein = 'METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPSTDGGLARGAYSPRPPSISENVAMEAVAAGVDGAGPEADLGLPEDDLVLPDDVVQYIKAHASGALDEGTGQVYPTESTGFSDNPRLPSPGLHGQRRMVAADSNVGPSAPMLGGCQLGFGAPSSLNKNNMPVQWNEVSSGTVDALASQVKPPPFPQGNLAVVQQKPAFGQYPGYSPQGLQASPGGLDSTQPHLQPRSGAPSQGIPRVNYMQQLRQPVAGSQCPGMTTTMSPHACYGQVHPQLSPSTISGALNQFPQSCSNMPAKPGHLGHPQQTEVAPDPTTMGNRHRELGVPDSALAGVPPPHPVQSYPQQSHHLAASMSQEGYHQVPSLLPARQPGFMEPQTGPMGVATAGFGLVQPRPPLEPSPTGRHRGVRAVQQQLAYARATGHAMA
AMPSSQETAEAVPKGAMGNMGSVPPQPPPQDAGGAPDHSMLYYYGQIHMYEQDGGLENLGSCQVMRSQPPQPQACQDSIQPQPLPSPGVNQVSSTVDSQLLEAPQIDFDAIMDDGDHSSLFSGALSPSLLHSLSQNSSRLTTPRNSLTLPSIPAGISNMAVGDMSSMLTSLAEESKFLNMMT' #print len(protein) #cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACC
AAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGCAGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCT
CAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTCCATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG' #my_seq = DNA.makeSequence(cds,'gli2') #seq = my_seq.withoutTerminalStopCodon() #pep = seq.getTranslation() #print pep.toFasta()
if 'ENSEMBL_ACCOUNT' in os.environ: host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None print account ## What Species Are Available? #from cogent.db.ensembl import Species #print Species ## Interrogating a Genome from cogent.db.ensembl import HostAccount, Genome human = Genome(Species='human', Release=Release, account=account) print human ''' A Note on Coordinate Systems The positions employed on Ensembls web-site, and in their MySQL database differ from those used internally by cogent.db.ensembl.In all cases where you are querying cogent.db.ensembl objects directly inputting nucleotide positions you can indicate you are using Ensembl coordinates by setting ensembl_coord=True. If you are explicitly passing in a cogent.db.ensembl region, that argument has no effect. ''' ## Selecting Gene #Via StableID brca1 = human.getGeneByStableId(StableId='ENSG00000012048')
except IndexError: print "No gene entered" gene=False sys.exit() print "<html>" print "<head>" print "<style>\ntab1 { padding-left: 4em; }\ntab2 { padding-left: 8em; }\n" print "body{font-family:helvetica} \ntab3 { padding-left: 12em; }\n p{color:#AACCFF}\n" print "button {padding: 15px 32px;text-align: center; text-decoration: none;display: inline-block;font-size: 16px;}\n li {margin-top: 0px; margin-right: 5px;}\n</style>" print "</head>" ##AACCFF print "<body style=\"background-color:#222233;\" text=\"#FFFFA8\">" mouse=Genome(Species='mouse',Release=87,account=None) #coding=mouse.getGenesMatching(StableID=geneID) #print coding coding=mouse.getGenesMatching(Symbol=genename) #print mouse.getGenesMatching() #print dir(mouse.getGenesMatching()) #print mouse.getGenesMatching().__dict__ #print coding #sys.exit() if not coding: print "<p style=\"font-size: 55px;\">This is a fatal error. Can't find your gene<br>" print "</body>" sys.exit()
# For each gene symbol, look the gene up in Ensembl (human, release 75) and
# print a tabix command that would extract the gene's region from the
# per-chromosome 1000 Genomes phase 3 VCF. The command is only printed; the
# os.system call that would run it is left commented out.
import os

from cogent.db.ensembl import HostAccount, Genome

# ENSEMBL_ACCOUNT, when set, is split on whitespace into exactly three fields
# (host, username, password); otherwise None is passed on as the account.
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None

human = Genome('human', Release=75, account=account)
gene_symbols = ['brca1', 'brca2']
genomes1k_url = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502'

for gene_symbol in gene_symbols:
    print(gene_symbol)
    genes = human.getGenesMatching(Symbol=gene_symbol)
    for gene in genes:
        location = gene.Location
        print(location.CoordName)
        print('%s %s' % (location.Start, location.End))
        # Built from adjacent literals so that no backslash continuation or
        # source indentation ends up inside the shell command.
        # NOTE(review): tabix region strings are 1-based; confirm whether
        # Location.Start/End need converting before this command is run.
        # NOTE(review): every match for a symbol redirects to the same
        # breastcancer/<symbol>.vcf, so later matches would overwrite it.
        command = (
            './breastcancer/programs/htslib/tabix -h '
            '%s/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a'
            '.20130502.genotypes.vcf.gz '
            '%s:%s-%s > breastcancer/%s.vcf'
        ) % (
            genomes1k_url,
            location.CoordName,
            location.CoordName,
            location.Start,
            location.End,
            gene_symbol,
        )
        print(command)
        #os.system(command)
        size = location.End - location.Start
        print('Size %s' % size)
        print(location.Strand)
import os import sqlalchemy as sql from cogent.db.ensembl import HostAccount, Genome #account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split()) if 'ENSEMBL_ACCOUNT' in os.environ: host, username, password = os.environ['ENSEMBL_ACCOUNT'].split() account = HostAccount(host, username, password) else: account = None human = Genome('human', Release=69, account=account) # BRCA1 gene = human.getGeneByStableId(StableId="ENSG00000167131") # get the db tables we need external_db = human.CoreDb.getTable("external_db") object_xref = human.CoreDb.getTable("object_xref") xref = human.CoreDb.getTable("xref") # get the external db ID for refseq mrna refseq_mrna_id = sql.select([external_db.c.external_db_id], external_db.c.db_name.like('RefSeq_mRNA')).execute().fetchone() # query for a specific transcript ID print "Querying for mRNA REFSEQ entries for one transcript" query = sql.select([object_xref, xref], sql.and_(xref.c.xref_id==object_xref.c.xref_id, object_xref.c.ensembl_id == 1345831,
+ " Position: " + str(range.start) + "-" + str(range.end-1)\ + " Length: " + str(range.end - range.start) snp_count = seq.count("/") if snp_count > 0: header = header + " SNP count: " + str(snp_count) else: header = header + " no SNP" output.write("\n" + header) output.write("\n" + str(seq)) output.close #Setting ensembl parameters release = 81 species = 'Mus musculus' mouse = Genome(Species=species, Release=release) input = raw_input('\nEnter ensembl mouse transcript ID or txt file: ') if input.strip().split('.')[-1] == 'txt': try: fh = open(input.strip(), 'r') for transID in fh: transID = transID.strip().split('.')[0] print "\n" + "============ Start to manipulate", transID, "============" gene, transcript, range_seqs = specific_region(transID) if transcript is None: continue seq_output(gene, transcript, range_seqs) fh.close() except IOError: print "File", input, "not found!!"