def hgnc_to_ensembl_id(hgnc_list):
    """Translate HGNC gene symbols into Ensembl gene stable IDs.

    Every symbol in ``hgnc_list`` is queried against the human genome
    (Ensembl release 73).  Only genes whose ``Symbol`` equals the queried
    symbol exactly contribute their ``StableId``.

    Returns a list of the distinct stable IDs that start with ``'ENSG'``;
    the order is that of set iteration and therefore arbitrary.
    """
    # NOTE(review): the database credentials are hard-coded in this call;
    # consider moving them to configuration.
    db_account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    genome = Genome('human', Release=73, account=db_account)

    matched_ids = []
    for symbol in hgnc_list:
        # Keep only the hits whose symbol is exactly the one asked for.
        matched_ids.extend(
            candidate.StableId
            for candidate in genome.getGenesMatching(Symbol=symbol)
            if candidate.Symbol == symbol
        )

    # Collapse duplicates first, then keep the IDs carrying the 'ENSG' prefix.
    unique_ids = set(matched_ids)
    return [stable_id for stable_id in unique_ids
            if stable_id.startswith('ENSG')]
Beispiel #2
0
def hgnc_to_ensembl_id(hgnc_list):
    """Return the Ensembl gene IDs for a collection of HGNC symbols.

    The human genome (Ensembl release 73) is searched once per symbol in
    ``hgnc_list``; a gene counts as a hit only when its ``Symbol`` is
    identical to the symbol searched for.  The result is a duplicate-free
    list of the hits' ``StableId`` values restricted to those beginning
    with ``'ENSG'``.  The list order is arbitrary (set iteration order).
    """
    # NOTE(review): connection credentials are embedded in the source.
    account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human = Genome('human', Release=73, account=account)

    # Stable IDs of all exact symbol matches, duplicates included.
    hits = [
        gene_obj.StableId
        for symbol in hgnc_list
        for gene_obj in human.getGenesMatching(Symbol=symbol)
        if gene_obj.Symbol == symbol
    ]

    # De-duplicate, then filter on the 'ENSG' prefix.
    return [x for x in set(hits) if x.startswith('ENSG')]
	sys.exit()



# Emit the opening of the HTML page: the document head with an inline
# stylesheet, followed by the opening <body> tag.
print "<html>"
print "<head>"
print "<style>\ntab1 { padding-left: 4em; }\ntab2 { padding-left: 8em; }\n"
print "body{font-family:helvetica} \ntab3 { padding-left: 12em; }\n p{color:#AACCFF}\n"
print "button {padding: 15px 32px;text-align: center; text-decoration: none;display: inline-block;font-size: 16px;}\n li {margin-top: 0px; margin-right: 5px;}\n</style>"
print "</head>"		##AACCFF
print "<body style=\"background-color:#222233;\" text=\"#FFFFA8\">"

# Query the mouse genome (Ensembl release 87) for genes matching the
# requested symbol.
# NOTE(review): `genename` is not defined in the visible code -- presumably
# set earlier in the script; confirm.
mouse=Genome(Species='mouse',Release=87,account=None)
#coding=mouse.getGenesMatching(StableID=geneID)
#print coding
coding=mouse.getGenesMatching(Symbol=genename)

#print mouse.getGenesMatching()
#print dir(mouse.getGenesMatching())
#print mouse.getGenesMatching().__dict__

#print coding
#sys.exit()

# When the query result is falsy, print an error into the page, close the
# body and stop the script.
# NOTE(review): if getGenesMatching returns a generator, this test can never
# be true because generator objects are always truthy -- confirm.
if not coding:
	print "<p style=\"font-size: 55px;\">This is a fatal error. Can't find your gene<br>"
	print "</body>"
	sys.exit()
# Accumulators initialised empty here; their use is outside the visible code.
completed=[]
Mexons=[]
for g in coding:
#print len(protein)
#cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGC
AGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTC
CATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG'
#my_seq = DNA.makeSequence(cds,'gli2')
#seq = my_seq.withoutTerminalStopCodon()
#pep = seq.getTranslation()
#print pep.toFasta()
#codons = my_seq.getInMotifSize(3)
#print codons
##pep = my_seq.getTranslation()
#print pep
#print len(pep)

#brca1 = human.getGeneByStableId(StableId='ENSG00000012048')
#print brca1.Description

# Look up the gene whose symbol is 'gli2' (compared case-insensitively) and
# stop at the first exact match.
# NOTE(review): `human` is not defined in the visible code -- presumably a
# Genome created earlier; confirm.
genes = human.getGenesMatching(Symbol='gli2')
for gene in genes:
    if gene.Symbol.lower() == 'gli2':
        break
# NOTE(review): if no candidate matched, `gene` is simply the last candidate
# iterated (or undefined when the query yielded nothing).
#print gene
#print len(gene)
#print brca2.Symbol
#print brca2.Description
#print brca2
# Split the textual form of the location on ':' and read the fields by
# position from the end: third-from-last is taken as the chromosome and
# second-from-last as a 'start-end' range.
location = str(gene.Location).split(':')
chromossome = location[-3]
start = location[-2].split('-')[0]
end = location[-2].split('-')[1]
print chromossome, start, end

#print 'protein seq'
#cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGC
AGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTC
CATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG'
#my_seq = DNA.makeSequence(cds,'gli2')
#seq = my_seq.withoutTerminalStopCodon()
#pep = seq.getTranslation()
#print pep.toFasta()
#codons = my_seq.getInMotifSize(3)
#print codons
##pep = my_seq.getTranslation()
#print pep
#print len(pep)


#brca1 = human.getGeneByStableId(StableId='ENSG00000012048')
#print brca1.Description

# Look up the gene whose symbol is 'gli2' (compared case-insensitively) and
# stop at the first exact match.
# NOTE(review): `human` is not defined in the visible code -- presumably a
# Genome created earlier; confirm.
genes = human.getGenesMatching(Symbol='gli2')
for gene in genes:
  if gene.Symbol.lower() == 'gli2':
    break
# NOTE(review): if no candidate matched, `gene` is simply the last candidate
# iterated (or undefined when the query yielded nothing).
#print gene
#print len(gene)
#print brca2.Symbol
#print brca2.Description
#print brca2
# Split the textual form of the location on ':' and read the fields by
# position from the end: third-from-last is taken as the chromosome and
# second-from-last as a 'start-end' range.
location = str(gene.Location).split(':')
chromossome = location[-3]
start = location[-2].split('-')[0]
end = location[-2].split('-')[1]
print chromossome, start, end

               Release=Release,
               account=account)
outfile = open("dataset_fungal_homologs_sequences.csv", "w")
print Species

##### GET SEQUENCES FROM ENSEMBL ######################################
i = 0
for hit in hits:
    i += 1
    print "i: " + str(i)

    identifier = hit
    print "\n***** ZPRACOVANI ZAZNAMU " + identifier + " *****"

    # Selection of species
    genes = yeast.getGenesMatching(StableId="CADNFIAP00000001")

    if (genes == None):
        print "> skip"
        continue

    for gene in genes:
        try:
            print "\n\n=================== GENE ===================="
            print "> gene.symbol: " + str(gene.Symbol)
            print "> gene.description: " + str(gene.Description)
            print "> gene.location: " + str(gene.Location)
            print "> gene.length: " + str(len(gene))
            print "> gene.full_info: " + str(gene)
            print "> gene.bio_type: " + str(gene.BioType)
            dnaSequence = str(gene.Seq).strip()
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None

human = Genome('human', Release=75, account=account)

gene_symbols = ['brca1', 'brca2']

genomes1k_url = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502'

for gene_symbol in gene_symbols:
    print gene_symbol
    genes = human.getGenesMatching(Symbol=gene_symbol)
    for gene in genes:
        print gene.Location.CoordName
        print gene.Location.Start, gene.Location.End
        command = './breastcancer/programs/htslib/tabix -h \
        %s/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz \
        %s:%s-%s > breastcancer/%s.vcf' % (
            genomes1k_url, gene.Location.CoordName, gene.Location.CoordName,
            gene.Location.Start, gene.Location.End, gene_symbol)
        print command
        #os.system(command)
        size = gene.Location.End - gene.Location.Start
        print 'Size', size
        print gene.Location.Strand
#brca2 = human.getGeneByStableId(StableId='ENSG00000139618')
#transcript = brca2.CanonicalTranscript
Beispiel #8
0
    spamreader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for row in spamreader:
        print row[7]
        if row[7] != "Ensembl Gene ID":
            Getexons(row[7],row[1],row[2])

sys.exit()


# Container for candidate genes; starts out as an empty dict.
# (Writing `{{}}` here would build a set containing a dict, which raises
# "TypeError: unhashable type: 'dict'" as soon as the line executes.)
candidate_gene_list = {}

# NOTE(review): `Species` and `human` are not defined in the visible code --
# presumably imported/created earlier; confirm.
print Species


#print human
# Search the genome for INPP5E, printing each candidate's symbol and stopping
# at the first one whose symbol is 'inpp5e' (compared case-insensitively).
genes = human.getGenesMatching(Symbol='INPP5E')

for gene in genes:
    print gene.Symbol
    if gene.Symbol.lower() == 'inpp5e':
        break

# Keep a reference to the gene the loop stopped on.
# NOTE(review): if nothing matched, this is just the last candidate iterated.
inpp5e = gene
#annot_brca2 = brca2.getAnnotatedSeq(feature_types='gene')

#print len(annot_brca2)

# Print every transcript of the selected gene.
transcripts = inpp5e.Transcripts

for transcript in transcripts:
    print transcript
Beispiel #9
0
# Query the human genome (Ensembl release 78) through the anonymous account
# on ensembldb.ensembl.org.
Release = 78
account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
human = Genome(Species='human', Release=Release, account=account)

# Each entry: (UniProt accession, sequence offset, residue, isoform).
positions = [('P00533', 40, 'Q', 1), ('P60520', 30, 'P', 1)]

# Mapper used below to translate UniProt accessions ('uniprot') into
# Ensembl gene IDs ('ensg').
m = mapping.Mapper()
m.load_uniprot_mappings(['ensg'], bi=True)

# For every gene reached from a position, record
# (ensg, gene location, canonical-transcript exons) followed by the fields
# of the originating position tuple.
positions_ens = []
for p in positions:
    ensgs = m.map_name(p[0], 'uniprot', 'ensg')
    for ensg in ensgs:
        genes = human.getGenesMatching(StableId=ensg)
        for gene in genes:
            gene_fields = (ensg, gene.Location,
                           gene.CanonicalTranscript.Exons)
            positions_ens.append(gene_fields + tuple(p))

# Another attempt, this time with Biopython
# (it works if you can map all proteins to RefSeq Gene IDs).

from Bio.SeqUtils.Mapper import CoordinateMapper
from Bio import SeqIO
from Bio import Entrez
from pypath import mapping, dataio

Entrez.email = '*****@*****.**'
Beispiel #10
0
def add_ensembl_gene_data(session,
                          species,
                          ensembl_release,
                          account=None,
                          debug=False):
    """add Ensembl genes and their transcripts to the db session"""
    rr = RunRecord('add_ensembl_gene_data')
    genome = Genome(species, Release=ensembl_release, account=account)

    skip = set(['processed_transcript', 'pseudogene'])
    biotypes = [b for b in genome.getDistinct('BioType') if b not in skip]

    data = []
    unique_gene_ids = set()
    unique_exon_ids = set()
    chromSet = set()
    n = 0
    total_objects = 0
    for biotype in biotypes:
        for gene in genome.getGenesMatching(BioType=biotype):
            # gene.Location.CoordName is the chromosome name
            min_chrom_length = 5  # likely an unconfirmed scaffold
            if len(gene.Location.CoordName) > min_chrom_length:
                rr.addWarning('Skipping chrom', gene.Location.CoordName)
                continue
            chromSet.add(gene.Location.CoordName)

            if gene.StableId not in unique_gene_ids:

                db_gene = Gene(ensembl_id=gene.StableId,
                               symbol=gene.Symbol,
                               biotype=gene.BioType,
                               description=gene.Description,
                               status=gene.Status,
                               chrom=gene.Location.CoordName,
                               start=gene.Location.Start,
                               end=gene.Location.End,
                               strand=gene.Location.Strand)

                unique_gene_ids.add(gene.StableId)
                data.append(db_gene)
            else:
                rr.addWarning('Duplicate gene', gene.StableId)

            for exon in gene.CanonicalTranscript.Exons:
                if exon.StableId not in unique_exon_ids:
                    db_exon = Exon(exon.StableId, exon.Rank,
                                   exon.Location.Start, exon.Location.End)
                    db_exon.gene = db_gene
                    unique_exon_ids.add(exon.StableId)
                    data.append(db_exon)

                else:
                    rr.addWarning('Duplicate exon', exon.StableId)
            n += 1
            if n % 100 == 0:
                print 'Genes processed:', n, '; Db objects created:', len(data)
                if debug:
                    session.add_all(data)
                    session.commit()
                    return
    rr.addInfo('Instantiating chromosomes', chromSet)
    chroms = Chroms(species, chromSet)
    data.append(chroms)

    rr.addInfo('Writing objects into db', len(data))
    session.add_all(data)
    session.commit()
    return chroms
Beispiel #11
0
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False) :
    """Fetch protein-coding gene sequences from Ensembl, grouped by paralogy.

    Connects through pycogent using the connection details returned by
    ``get_missing_info`` and walks every ``protein_coding`` gene of
    *species*.  A gene that was already collected (as itself or as a
    paralog of an earlier gene) is skipped.  For each remaining gene the
    within-species paralogs are requested from Compara: when there are
    none the gene forms a family on its own, otherwise the family consists
    of the members Compara returns.

    Parameters:
        species: species name handed to pycogent.
        release: Ensembl release; replaced by the value that
            ``get_missing_info`` returns.
        database_name: forwarded to ``get_missing_info``.
        nucleotide: when true collect the canonical transcript's CDS,
            otherwise its protein sequence.

    Returns:
        A list of families, each a list of ``(stable_id, sequence)`` tuples
        with the sequence converted to ``str``.

    Progress is written to ``stderr``.  The process exits with status 1
    when reading a paralog's canonical-transcript sequence raises
    ``AttributeError``.
    """
    log = get_log()

    # The pycogent imports are deliberately unguarded: a failed import
    # propagates as ImportError.
    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1,5,3) :
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(
                db_details['hostname'],
                db_details['username'],
                db_details['password'],
                port=db_details['port'])

    # getSpeciesName signals an unknown species with the *string* "None",
    # not with the None object.
    species_is_unknown = Species.getSpeciesName(species) == 'None'
    if species_is_unknown :
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # Workaround: several compara databases may be found and pycogent
    # connects to the first one, which is always compara_bacteria.  For any
    # other division the compara object's internals are re-pointed at the
    # database of the correct division.
    if db_name not in ('ensembl', 'bacteria') :
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = compara.ComparaDb.db_name.Name.replace('bacteria', db_name)
        new_db_name = EnsemblDbName(patched_name)
        compara.ComparaDb._db = DbConnection(account=account, db_name=new_db_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    seq_kind = "CDS" if nucleotide else "protein"

    def show_progress(count) :
        # Rewrites the same terminal line via the leading carriage return.
        stderr.write("\r[downloading %s] got %d sequences " % (seq_kind, count))

    def sequence_of(member) :
        # Sequence of the member's canonical transcript, as a plain string.
        transcript = member.CanonicalTranscript
        if nucleotide :
            return str(transcript.Cds)
        return str(transcript.ProteinSeq)

    genes = set()
    families = []

    show_progress(len(genes))

    for gene in genome.getGenesMatching(BioType='protein_coding') :
        stableid = gene.StableId

        # Already collected as a member of an earlier family.
        if stableid in genes :
            continue

        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')

        current = []

        if paralogs is not None :
            for paralog in paralogs.Members :
                paralogid = paralog.StableId
                genes.add(paralogid)

                show_progress(len(genes))

                try :
                    current.append((paralogid, sequence_of(paralog)))

                except AttributeError :
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)

        else :
            # No paralogs: the gene is a family of one.
            show_progress(len(genes))
            current.append((stableid, sequence_of(gene)))

        families.append(current)

    stderr.write("\r[downloading %s] got %d sequences\n" % (seq_kind, len(genes)))

    return families
Beispiel #12
0
A Note on Coordinate Systems

The positions employed on Ensembl's website, and in their MySQL database, differ
from those used internally by cogent.db.ensembl. In all cases where you are querying
cogent.db.ensembl objects directly by inputting nucleotide positions, you can indicate
that you are using Ensembl coordinates by setting ensembl_coord=True.
If you are explicitly passing in a cogent.db.ensembl region, that argument has no effect.
'''

## Selecting a gene
# Via stable ID: fetch a single gene directly by its Ensembl stable identifier.
# NOTE(review): `human` is not defined in the visible code -- presumably a
# Genome created earlier; confirm.
brca1 = human.getGeneByStableId(StableId='ENSG00000012048')
print brca1.Description

# Or via gene symbol: iterate over the query results and stop at the first
# gene whose symbol is 'brca2' (compared case-insensitively).
genes = human.getGenesMatching(Symbol='brca2')
for gene in genes:
     if gene.Symbol.lower() == 'brca2':
         break

# After the loop `gene` is the gene it stopped on.
# NOTE(review): if nothing matched, it is just the last candidate iterated.
brca2 = gene # so we keep track of this reference for later on
# Print the gene's basic attributes, tab-separated from their labels.
print "Symbol\t", brca2.Symbol
print "Descr.\t", brca2.Description
print "gene\t", brca2
print "loc.\t", brca2.Location
print "length\t" ,len(brca2)

'''
Each location is directly tied to the parent genome and the coordinate above also 
shows the coordinates' type (chromosome in this case), name (13), start, end and strand. 
The start and end positions are python indices and will differ from the Ensembl indices