def hgnc_to_ensembl_id(hgnc_list, account=None, release=73):
    """Map HGNC gene symbols to human Ensembl gene stable IDs.

    Each symbol is looked up with ``Genome.getGenesMatching(Symbol=...)``.
    Only hits whose ``Symbol`` equals the query exactly are kept, and of
    those only stable IDs starting with ``'ENSG'`` are returned.

    Parameters
    ----------
    hgnc_list : iterable of str
        Gene symbols to look up.
    account : HostAccount, optional
        Database account handed to ``Genome``. When omitted, the legacy
        built-in account is used so existing callers keep working.
    release : int, optional
        Ensembl release handed to ``Genome`` (default 73).

    Returns
    -------
    list of str
        The unique matching stable IDs, in order of first appearance.
    """
    if account is None:
        # SECURITY NOTE(review): this password is committed in source.
        # Prefer passing ``account`` explicitly and rotate this credential;
        # the literal is retained only to stay backward-compatible.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#',
                              port=3306)
    human = Genome('human', Release=release, account=account)

    seen = set()
    ensembl_stable_ids = []
    for symbol in hgnc_list:
        for gene_obj in human.getGenesMatching(Symbol=symbol):
            # The query may return genes whose symbol is not an exact match.
            if gene_obj.Symbol != symbol:
                continue
            stable_id = gene_obj.StableId
            # Remove duplicates and keep only IDs starting with 'ENSG'.
            if stable_id in seen or not stable_id.startswith('ENSG'):
                continue
            seen.add(stable_id)
            ensembl_stable_ids.append(stable_id)
    return ensembl_stable_ids
def hgnc_to_ensembl_id(hgnc_list, account=None, release=73):
    """Map HGNC gene symbols to human Ensembl gene stable IDs.

    Each symbol is looked up with ``Genome.getGenesMatching(Symbol=...)``.
    Only hits whose ``Symbol`` equals the query exactly are kept, and of
    those only stable IDs starting with ``'ENSG'`` are returned.

    Parameters
    ----------
    hgnc_list : iterable of str
        Gene symbols to look up.
    account : HostAccount, optional
        Database account handed to ``Genome``. When omitted, the legacy
        built-in account is used so existing callers keep working.
    release : int, optional
        Ensembl release handed to ``Genome`` (default 73).

    Returns
    -------
    list of str
        The unique matching stable IDs, in order of first appearance.
    """
    if account is None:
        # SECURITY NOTE(review): this password is committed in source.
        # Prefer passing ``account`` explicitly and rotate this credential;
        # the literal is retained only to stay backward-compatible.
        account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#',
                              port=3306)
    human = Genome('human', Release=release, account=account)

    seen = set()
    ensembl_stable_ids = []
    for symbol in hgnc_list:
        for gene_obj in human.getGenesMatching(Symbol=symbol):
            # The query may return genes whose symbol is not an exact match.
            if gene_obj.Symbol != symbol:
                continue
            stable_id = gene_obj.StableId
            # Remove duplicates and keep only IDs starting with 'ENSG'.
            if stable_id in seen or not stable_id.startswith('ENSG'):
                continue
            seen.add(stable_id)
            ensembl_stable_ids.append(stable_id)
    return ensembl_stable_ids
sys.exit() print "<html>" print "<head>" print "<style>\ntab1 { padding-left: 4em; }\ntab2 { padding-left: 8em; }\n" print "body{font-family:helvetica} \ntab3 { padding-left: 12em; }\n p{color:#AACCFF}\n" print "button {padding: 15px 32px;text-align: center; text-decoration: none;display: inline-block;font-size: 16px;}\n li {margin-top: 0px; margin-right: 5px;}\n</style>" print "</head>" ##AACCFF print "<body style=\"background-color:#222233;\" text=\"#FFFFA8\">" mouse=Genome(Species='mouse',Release=87,account=None) #coding=mouse.getGenesMatching(StableID=geneID) #print coding coding=mouse.getGenesMatching(Symbol=genename) #print mouse.getGenesMatching() #print dir(mouse.getGenesMatching()) #print mouse.getGenesMatching().__dict__ #print coding #sys.exit() if not coding: print "<p style=\"font-size: 55px;\">This is a fatal error. Can't find your gene<br>" print "</body>" sys.exit() completed=[] Mexons=[] for g in coding:
#print len(protein) #cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCC
CCGGGGCCCAGTCGTCCTGCAGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAG
AGCCATCACCTGGCAGCCTCCATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG' #my_seq = DNA.makeSequence(cds,'gli2') #seq = my_seq.withoutTerminalStopCodon() #pep = seq.getTranslation() #print pep.toFasta() #codons = my_seq.getInMotifSize(3) #print codons ##pep = my_seq.getTranslation() #print pep #print len(pep) #brca1 = human.getGeneByStableId(StableId='ENSG00000012048') #print brca1.Description genes = human.getGenesMatching(Symbol='gli2') for gene in genes: if gene.Symbol.lower() == 'gli2': break #print gene #print len(gene) #print brca2.Symbol #print brca2.Description #print brca2 location = str(gene.Location).split(':') chromossome = location[-3] start = location[-2].split('-')[0] end = location[-2].split('-')[1] print chromossome, start, end #print 'protein seq'
#cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGC
AGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTC
CATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG' #my_seq = DNA.makeSequence(cds,'gli2') #seq = my_seq.withoutTerminalStopCodon() #pep = seq.getTranslation() #print pep.toFasta() #codons = my_seq.getInMotifSize(3) #print codons ##pep = my_seq.getTranslation() #print pep #print len(pep) #brca1 = human.getGeneByStableId(StableId='ENSG00000012048') #print brca1.Description genes = human.getGenesMatching(Symbol='gli2') for gene in genes: if gene.Symbol.lower() == 'gli2': break #print gene #print len(gene) #print brca2.Symbol #print brca2.Description #print brca2 location = str(gene.Location).split(':') chromossome = location[-3] start = location[-2].split('-')[0] end = location[-2].split('-')[1] print chromossome, start, end
Release=Release, account=account) outfile = open("dataset_fungal_homologs_sequences.csv", "w") print Species ##### GET SEQUENCES FROM ENSEMBL ###################################### i = 0 for hit in hits: i += 1 print "i: " + str(i) identifier = hit print "\n***** ZPRACOVANI ZAZNAMU " + identifier + " *****" # Selection of species genes = yeast.getGenesMatching(StableId="CADNFIAP00000001") if (genes == None): print "> skip" continue for gene in genes: try: print "\n\n=================== GENE ====================" print "> gene.symbol: " + str(gene.Symbol) print "> gene.description: " + str(gene.Description) print "> gene.location: " + str(gene.Location) print "> gene.length: " + str(len(gene)) print "> gene.full_info: " + str(gene) print "> gene.bio_type: " + str(gene.BioType) dnaSequence = str(gene.Seq).strip()
# Optional credentials: ENSEMBL_ACCOUNT must hold exactly three
# whitespace-separated fields, "<host> <username> <password>".
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None

human = Genome('human', Release=75, account=account)
gene_symbols = ['brca1', 'brca2']
genomes1k_url = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502'

# For every gene matching each symbol, report its location and print the
# tabix command that would extract that region from the per-chromosome VCF.
for gene_symbol in gene_symbols:
    print(gene_symbol)
    genes = human.getGenesMatching(Symbol=gene_symbol)
    for gene in genes:
        print(gene.Location.CoordName)
        print('%s %s' % (gene.Location.Start, gene.Location.End))
        # Adjacent literals are joined by the parser, so no source
        # indentation leaks into the command (unlike a backslash
        # continuation placed inside the string).
        command = (
            './breastcancer/programs/htslib/tabix -h '
            '%s/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz '
            '%s:%s-%s > breastcancer/%s.vcf'
        ) % (
            genomes1k_url,
            gene.Location.CoordName,
            gene.Location.CoordName,
            gene.Location.Start,
            gene.Location.End,
            gene_symbol)
        print(command)
        #os.system(command)
        size = gene.Location.End - gene.Location.Start
        print('Size %s' % size)
        print(gene.Location.Strand)
#brca2 = human.getGeneByStableId(StableId='ENSG00000139618')
#transcript = brca2.CanonicalTranscript
spamreader = csv.reader(csvfile, delimiter=',', quotechar='|') for row in spamreader: print row[7] if row[7] != "Ensembl Gene ID": Getexons(row[7],row[1],row[2]) sys.exit() candidate_gene_list = {{}} print Species #print human genes = human.getGenesMatching(Symbol='INPP5E') for gene in genes: print gene.Symbol if gene.Symbol.lower() == 'inpp5e': break inpp5e = gene #annot_brca2 = brca2.getAnnotatedSeq(feature_types='gene') #print len(annot_brca2) transcripts = inpp5e.Transcripts for transcript in transcripts: print transcript
Release = 78 account = HostAccount('ensembldb.ensembl.org', 'anonymous', '') human = Genome(Species='human', Release=Release, account=account) # UniProt, seq offset, residue, isoform positions = [('P00533', 40, 'Q', 1), ('P60520', 30, 'P', 1)] m = mapping.Mapper() m.load_uniprot_mappings(['ensg'], bi=True) positions_ens = [] for p in positions: ensgs = m.map_name(p[0], 'uniprot', 'ensg') for ensg in ensgs: genes = human.getGenesMatching(StableId=ensg) for gene in genes: positions_ens.append( tuple([ensg, gene.Location, gene.CanonicalTranscript.Exons] + list(p))) # another attempts with biopython -- # (it works, if you can map all proteins to RefSeq Gene IDs) from Bio.SeqUtils.Mapper import CoordinateMapper from Bio import SeqIO from Bio import Entrez from pypath import mapping, dataio Entrez.email = '*****@*****.**'
def add_ensembl_gene_data(session, species, ensembl_release, account=None, debug=False):
    """Add Ensembl genes and their canonical-transcript exons to the db session.

    Genes are fetched per distinct ``BioType`` of the genome, excluding
    ``'processed_transcript'`` and ``'pseudogene'``. Genes whose coordinate
    name is longer than 5 characters are skipped with a warning.

    Arguments:
        session: db session; receives ``add_all(...)`` and ``commit()``.
        species: species name passed to ``Genome`` and ``Chroms``.
        ensembl_release: passed to ``Genome`` as ``Release``.
        account: optional account passed to ``Genome``.
        debug: if True, commit what has been collected at the first
            progress report and return early.

    Returns the ``Chroms`` instance that was added, or None when the debug
    early-exit is taken.
    """
    rr = RunRecord('add_ensembl_gene_data')
    genome = Genome(species, Release=ensembl_release, account=account)

    skip = set(['processed_transcript', 'pseudogene'])
    biotypes = [b for b in genome.getDistinct('BioType') if b not in skip]

    # A longer coordinate name is likely an unconfirmed scaffold.
    max_chrom_name_length = 5

    data = []
    # StableId -> Gene row, so a repeated gene resolves to its own row
    # rather than to whichever gene happened to be processed last.
    db_genes = {}
    unique_exon_ids = set()
    chromSet = set()
    n = 0
    for biotype in biotypes:
        for gene in genome.getGenesMatching(BioType=biotype):
            # gene.Location.CoordName is the chromosome name
            location = gene.Location
            chrom = location.CoordName
            if len(chrom) > max_chrom_name_length:
                rr.addWarning('Skipping chrom', chrom)
                continue
            chromSet.add(chrom)

            gene_id = gene.StableId
            if gene_id in db_genes:
                rr.addWarning('Duplicate gene', gene_id)
                db_gene = db_genes[gene_id]
            else:
                db_gene = Gene(ensembl_id=gene_id, symbol=gene.Symbol,
                               biotype=gene.BioType,
                               description=gene.Description,
                               status=gene.Status, chrom=chrom,
                               start=location.Start, end=location.End,
                               strand=location.Strand)
                db_genes[gene_id] = db_gene
                data.append(db_gene)

            for exon in gene.CanonicalTranscript.Exons:
                if exon.StableId in unique_exon_ids:
                    rr.addWarning('Duplicate exon', exon.StableId)
                    continue
                db_exon = Exon(exon.StableId, exon.Rank,
                               exon.Location.Start, exon.Location.End)
                db_exon.gene = db_gene
                unique_exon_ids.add(exon.StableId)
                data.append(db_exon)

            n += 1
            if n % 100 == 0:
                print('Genes processed: %s ; Db objects created: %s'
                      % (n, len(data)))
                # NOTE(review): the nesting of this debug early-exit under
                # the progress check was reconstructed from
                # whitespace-collapsed source - confirm against the original.
                if debug:
                    session.add_all(data)
                    session.commit()
                    return

    rr.addInfo('Instantiating chromosomes', chromSet)
    chroms = Chroms(species, chromSet)
    data.append(chroms)

    rr.addInfo('Writing objects into db', len(data))
    session.add_all(data)
    session.commit()
    return chroms
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """Download protein-coding gene families for *species* via pycogent.

    Every protein-coding gene not already collected is grouped with its
    within-species paralogs as reported by Compara; a gene with no paralogs
    forms a family of one.

    Returns a list of families, each a list of ``(stable_id, sequence)``
    tuples. The sequence is the canonical transcript's CDS when
    *nucleotide* is true, otherwise its protein sequence. Progress is
    written to stderr. If a paralog's sequence lookup raises
    AttributeError, a fatal message is logged and the process exits with
    status 1.
    """
    log = get_log()

    # Imported here rather than at module level; an ImportError is not
    # caught and propagates to the caller.
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    hostname = db_details['hostname']
    username = db_details['username']
    password = db_details['password']
    account = HostAccount(hostname, username, password, port=db_details['port'])

    # getSpeciesName returns the *string* "None" for an unknown species,
    # so this comparison is intentional.
    if Species.getSpeciesName(species) == 'None' :
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # HACK: pycogent finds several compara databases and connects to the
    # first one, which is always compara_bacteria. For any other division
    # the connection and metadata inside the Compara object are replaced by
    # hand so that it talks to the right database. Not pretty, but short.
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    seq_kind = "CDS" if nucleotide else "protein"
    seen_ids = set()
    families = []

    def show_progress() :
        # rewrite the running counter in place on stderr
        stderr.write("\r[downloading %s] got %d sequences " % (seq_kind, len(seen_ids)))

    def sequence_of(member) :
        # CDS or protein sequence of the member's canonical transcript
        transcript = member.CanonicalTranscript
        if nucleotide :
            return str(transcript.Cds)
        return str(transcript.ProteinSeq)

    show_progress()

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        gene_id = gene.StableId

        # already collected as a member of an earlier family
        if gene_id in seen_ids :
            continue
        seen_ids.add(gene_id)

        paralogs = compara.getRelatedGenes(StableId=gene_id, Relationship='within_species_paralog')
        family = []

        if paralogs is not None :
            for paralog in paralogs.Members :
                paralog_id = paralog.StableId
                seen_ids.add(paralog_id)
                show_progress()

                try :
                    family.append((paralog_id, sequence_of(paralog)))
                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralog_id)
                    exit(1)
        else :
            # no paralogs: the gene is a family on its own
            show_progress()
            family.append((gene_id, sequence_of(gene)))

        families.append(family)

    stderr.write("\r[downloading %s] got %d sequences\n" % (seq_kind, len(seen_ids)))

    return families
A Note on Coordinate Systems The positions employed on Ensembl's web site and in their MySQL database differ from those used internally by cogent.db.ensembl. In all cases where you are querying cogent.db.ensembl objects directly by inputting nucleotide positions, you can indicate you are using Ensembl coordinates by setting ensembl_coord=True. If you are explicitly passing in a cogent.db.ensembl region, that argument has no effect. ''' ## Selecting Gene #Via StableID brca1 = human.getGeneByStableId(StableId='ENSG00000012048') print brca1.Description #Or gene symbol genes = human.getGenesMatching(Symbol='brca2') for gene in genes: if gene.Symbol.lower() == 'brca2': break brca2 = gene # so we keep track of this reference for later on print "Symbol\t", brca2.Symbol print "Descr.\t", brca2.Description print "gene\t", brca2 print "loc.\t", brca2.Location print "length\t" ,len(brca2) ''' Each location is directly tied to the parent genome and the coordinate above also shows the coordinates' type (chromosome in this case), name (13), start, end and strand. The start and end positions are Python indices and will differ from the Ensembl indices