예제 #1
0
def get_sequence_from_location(species, coords):
    """Fetch the sequence of one genomic interval from an Ensembl genome.

    ``coords`` is unpacked as ``(chrom, start, end, strand)``; the chromosome
    is converted to a string before the lookup. The genome is opened for
    Ensembl release 87 with ``account=None`` and the ``Seq`` attribute of the
    requested region is returned.
    """
    from cogent.db.ensembl import HostAccount, Genome, Compara, Species

    ensembl_genome = Genome(Species=species, Release='87', account=None)
    chrom, start, end, strand = coords
    region = ensembl_genome.getRegion(
        CoordName=str(chrom), Start=start, End=end, Strand=strand)
    return region.Seq
예제 #2
0
def ensembl_to_hgnc(gene_list):
    """Return the set of gene symbols for the given Ensembl stable IDs.

    Every ID in ``gene_list`` is looked up in the human genome (release 73)
    and the ``Symbol`` of the gene found is collected. The result is a set,
    so repeated symbols collapse into one entry.
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them into configuration.
    db_account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human_genome = Genome('human', Release=73, account=db_account)

    return set(
        human_genome.getGeneByStableId(StableId=stable_id).Symbol
        for stable_id in gene_list)
예제 #3
0
def get_genes_from_location(ref, coords, pad=0):
    """Return the gene features found in a genomic interval, as a list.

    ``coords`` is unpacked as ``(chrom, start, end, strand)``; the strand is
    not passed to the query. ``pad`` widens the interval by that many bases
    on each side. The names ``release`` and ``account`` are taken from the
    enclosing scope.
    """
    ensembl_genome = Genome(Species=ref, Release=release, account=account)
    chrom, start, end, strand = coords

    padded_start = start - pad
    padded_end = end + pad
    features = ensembl_genome.getFeatures(
        CoordName=chrom,
        Start=padded_start,
        End=padded_end,
        feature_types='gene')
    return list(features)
예제 #4
0
def ensembl_to_hgnc(gene_list):
    """Return the set of gene symbols for the given Ensembl stable IDs.

    Every ID in ``gene_list`` is looked up in the human genome (release 73)
    and the ``Symbol`` of the gene found is collected. The result is a set,
    so repeated symbols collapse into one entry.
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them into configuration.
    db_account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human_genome = Genome('human', Release=73, account=db_account)

    return set(
        human_genome.getGeneByStableId(StableId=stable_id).Symbol
        for stable_id in gene_list)
예제 #5
0
def get_chrom_seqs(species, release, account=None, debug=False):
    """Yield one named sequence per chromosome of ``species`` from Ensembl.

    The chromosome names are read from ``chroms[species]``, where ``chroms``
    is defined outside this function. Each yielded sequence has its ``Name``
    set to ``chr_<chrom>``. With ``debug`` on, that name and the sequence
    repr are printed before the sequence is yielded.
    """
    ensembl_genome = Genome(species, Release=release, account=account)
    for chrom in chroms[species]:
        chrom_seq = ensembl_genome.getRegion(CoordName=chrom).Seq
        label = 'chr_%s' % chrom
        chrom_seq.Name = label
        if debug:
            print(label)
            print(repr(chrom_seq))

        yield chrom_seq
예제 #6
0
def Main():
    """Annotate every input record and write a tab-separated report.

    Parses the command line into the global ``args`` and binds the global
    ``out`` to the output stream: stdout when ``args.output`` is "stdout" or
    when the named file cannot be opened, otherwise that file. One line is
    written per input record, followed by one ``#<type>\\t<count>`` line per
    annotation type seen.
    """
    global args, out
    args = ParseArg()

    # Start from stdout; only a successfully opened file replaces it.
    out = sys.stdout
    if args.output != "stdout":
        try:
            out = open(args.output, "w")
        except IOError:
            print >>sys.stderr, "can't open file ", args.output, "to write. Using stdout instead"

    tally = {}
    # Original author's notes: the first index is a bed6 file of all kinds of
    # RNA, the second a bed12 file of lincRNA and mRNA with intron, exon, UTR.
    dbi1 = DBI.init(args.db, "bed")
    dbi2 = DBI.init(args.db_detail, "bed")
    genome = Genome('mouse', Release=67, account=None)

    for bed in TableIO.parse(args.input, args.format):
        typ, name, subtype = annotation(bed, dbi1, dbi2, genome)
        tally[typ] = tally.get(typ, 0) + 1
        fields = [bed.chr, bed.start, bed.stop, bed.id, name, bed.strand, typ, subtype]
        print >>out, "\t".join(str(field) for field in fields)

    # Per-type totals, one "#type<TAB>count" line each.
    summary = ["#" + key + "\t%d" % tally[key] for key in tally]
    print >>out, "\n".join(summary)
예제 #7
0
def hgnc_to_ensembl_id(hgnc_list):
    """Map gene symbols to Ensembl gene stable IDs (human, release 73).

    For each symbol in ``hgnc_list`` the matching genes are queried and only
    those whose ``Symbol`` equals the query exactly are kept. Returns a list
    of the distinct stable IDs that start with ``'ENSG'``.
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them into configuration.
    db_account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human_genome = Genome('human', Release=73, account=db_account)

    # A set drops duplicate IDs as they are collected.
    unique_ids = set()
    for symbol in hgnc_list:
        for candidate in human_genome.getGenesMatching(Symbol=symbol):
            if candidate.Symbol == symbol:
                unique_ids.add(candidate.StableId)

    # Only IDs with the 'ENSG' prefix are returned.
    return [stable_id for stable_id in unique_ids
            if stable_id.startswith('ENSG')]
예제 #8
0
def get_genes_from_Ensembl_multiple(args, sourcedata, targetdata):

    inputtype = args.inputType

    if (inputtype == "id"):
        geneidlistfile = args.geneIdListFile
        if (geneidlistfile == None):
            print "Argument -gidlf <geneidlistfilename> is required"
        else:
            for line in open(geneidlistfile, "r").readlines():
                parse = line.split("\n")[0].split(" ")
                if (len(parse) > 1):
                    species = parse[0]
                    geneid = parse[1]
                    print "Retreving Gene", geneid
                    account = HostAccount('ensembldb.ensembl.org', 'anonymous',
                                          '')
                    genome = Genome(species, ENSEMBL_VERSION, account)
                    gene = genome.getGeneByStableId(StableId=geneid)
                    sourcedata += get_cds_data(gene)
                    targetdata += get_gene_data(gene)

    if (inputtype == "name"):
        gene = args.gene
        if (gene == None):
            print "Argument -g <genename> is required"
        specieslistfile = args.specieslistfile
        if (specieslistfile == None):
            print "Argument -slf <specieslistfilename> is required"
        if (gene != None and specieslistfile != None):
            for line in open(specieslistfile, "r").readlines():
                parse = line.split("\n")[0].split(" ")
                if (len(parse) > 0):
                    species = parse[0]
                    print "Retreving Gene", gene, "from species", species
                    account = HostAccount('ensembldb.ensembl.org', 'anonymous',
                                          '')
                    genome = Genome(species, ENSEMBL_VERSION, account)
                    gene = get_gene_from_Ensembl_by_name(gene, genome)
                    sourcedata += get_cds_data(gene)
                    targetdata += get_gene_data(gene)
    return sourcedata, targetdata
예제 #9
0
def hgnc_to_ensembl_id(hgnc_list):
    """Map gene symbols to Ensembl gene stable IDs (human, release 73).

    For each symbol in ``hgnc_list`` the matching genes are queried and only
    those whose ``Symbol`` equals the query exactly are kept. Returns a list
    of the distinct stable IDs that start with ``'ENSG'``.
    """
    # NOTE(review): the database credentials are hard-coded here; consider
    # moving them into configuration.
    db_account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human_genome = Genome('human', Release=73, account=db_account)

    # A set drops duplicate IDs as they are collected.
    unique_ids = set()
    for symbol in hgnc_list:
        for candidate in human_genome.getGenesMatching(Symbol=symbol):
            if candidate.Symbol == symbol:
                unique_ids.add(candidate.StableId)

    # Only IDs with the 'ENSG' prefix are returned.
    return [stable_id for stable_id in unique_ids
            if stable_id.startswith('ENSG')]
def main():
    """Print transcript IDs and exon symbols for one zebrafish gene.

    Looks up gene ``ENSDARG00000027279`` in the zebrafish genome (Ensembl
    release 81) and prints the stable ID of each transcript, followed by the
    ``Symbol`` of each of that transcript's exons.

    If the ``ENSEMBL_ACCOUNT`` environment variable is set, it must hold
    ``"<host> <username> <password>"`` separated by whitespace, and that
    account is used for the connection. Otherwise ``account=None`` is passed.
    """
    import os
    from cogent.db.ensembl import Genome, HostAccount

    # Log into the Ensembl database with a custom account when one is given.
    if 'ENSEMBL_ACCOUNT' in os.environ:
        host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(host, username, password)
    else:
        account = None

    sp = "zebrafish"
    gn = "ENSDARG00000027279"
    # Pass the account resolved above; it used to be computed and then
    # ignored in favour of a literal None.
    specie = Genome(Species=sp, Release="81", account=account)
    gene = specie.getGeneByStableId(StableId=gn)
    for tr in gene.Transcripts:
        print(tr.StableId)
        for ex in tr.Exons:
            print(ex.Symbol)
예제 #11
0
def get_genes_from_Ensembl_pairwise(args, sourcedata, targetdata):
    genelist = []

    inputtype = args.inputType

    firstspecies = args.firstspecies
    secondspecies = args.secondspecies
    if (firstspecies == None):
        print "Argument -s1 <firstspeciesname> is required"
    elif (secondspecies == None):
        print "Argument -s2 <secondspeciesname> is required"
    else:
        account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
        firstgenome = Genome(firstspecies, ENSEMBL_VERSION, account)
        secondgenome = Genome(secondspecies, ENSEMBL_VERSION, account)

        if (inputtype == "id"):
            firstgeneid = args.firstgeneid
            if (firstgeneid == None):
                print "Argument -gid1 <firstgeneid> is required"
            secondgeneid = args.secondgeneid
            if (secondgeneid == None):
                print "Argument -gid2 <secondgeneid> is required"

            if (firstgeneid != None and secondgeneid != None):
                print "Retreving Genes", firstgeneid, secondgeneid

                firstgene = firstgenome.getGeneByStableId(StableId=firstgeneid)
                secondgene = secondgenome.getGeneByStableId(
                    StableId=secondgeneid)

        if (inputtype == "name"):
            gene = args.gene
            if (gene == None):
                print "Argument -g <genename> is required"
            else:
                print "Retreving Genes", name, "from species", firstspecies, secondspecies

                firstgene = get_gene_from_Ensembl_by_name(gene, firstgenome)
                secondgene = get_gene_from_Ensembl_by_name(gene, secondgenome)

        sourcedata += get_cds_data(firstgene) + get_cds_data(secondgene)
        targetdata += get_gene_data(firstgene) + get_gene_data(secondgene)

    return sourcedata, targetdata
예제 #12
0
# Code to merge ensembl gene information with annotation of probes and junctions to get an idea of the gene structure

import pandas as pd
import re
from collections import Counter
from itertools import permutations
import csv
import string
from cogent.db.ensembl import Genome, HostAccount
from difflib import SequenceMatcher

# Alternative connection (disabled): the public Ensembl server on port 5306.
#account = HostAccount('ensembldb.ensembl.org', 'anonymous', '', port=5306)
# Active connection: a database server on this machine (127.0.0.1, port 3306).
# NOTE(review): the root credentials are hard-coded -- confirm this is intended.
account = HostAccount('127.0.0.1', 'root', 'ensembl', port=3306)
# Ensembl release passed to the Genome constructor below.
Release = 89
# Module-level handle on the human genome; it is created when the module loads.
HumanDB = Genome(Species='human', Release=Release, account=account)


class SettingTCID_GeneIDError(Exception):
    """Exception that carries a human-readable message.

    The message is available as the ``msg`` attribute and is what ``str()``
    returns for the exception.
    """

    def __init__(self, msg):
        # Forward the message to Exception as well. Under Python 2 the base
        # class only fills ``args`` in __init__, so without this call
        # ``args`` stays empty and pickling loses the message.
        super(SettingTCID_GeneIDError, self).__init__(msg)
        self.msg = msg

    def __str__(self):
        return self.msg


def GeneStructure(TCID,
                  GeneID,
                  Probesets=None,
                  MappingFile="HTA_2_0_Probeset_SequenceIndices.txt",
                  SequenceFile="HTA_2_0_Probeset_Sequences.txt",
                  location="Output",
예제 #13
0
genome and reverse complement if on the negative strand. The sequence currently
in the reference file may refer to the sequence synthesized on the chip, in which
case it is the version with the lowest A's (easier to synthesize). It could also
be the sequence without strand consideration. To be sure, let's just pull from 
the genome.
'''

import MySQLdb
# use cogent to extract sequences from the genome
from cogent.db.ensembl import HostAccount, Genome
import pandas as pd

# Set up the Ensembl connection.
# HostAccount arguments: host, user, password, port
pycog = HostAccount('ensembldb.ensembl.org', 'anonymous', '', 3306)
# NOTE(review): the name hs37 suggests the GRCh37 assembly -- confirm which
# assembly Ensembl release 78 actually serves.
hs37 = Genome('human', Release=78, account=pycog)

# Read in the reference table (tab-separated).
data = pd.read_table('../produced_data/splicemod_data_clean.txt', sep='\t')
# Make the intron/exon length columns int rather than float. Missing values
# are replaced by 0 first so that the integer cast is possible.
data[['intron1_len', 'exon_len',
      'intron2_len']] = data[['intron1_len', 'exon_len',
                              'intron2_len']].fillna(0.0).astype(int)

# Keep only the rows that have chr, start and end information.
lib = data[data.chr.notnull() & data.start.notnull() & data.end.notnull()]


def grab_region(chr, start, end, strand, genome):

    start = int(start)
예제 #14
0
from cogent.db.ensembl import Species, Genome

# Ensembl release to query.
Release = 80

# Look up one human gene by its stable ID and print its symbol.
human = Genome(Species='human', Release=Release, account=None)
gene = human.getGeneByStableId(StableId='ENSG00000205274')
print(gene.Symbol)
예제 #15
0
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None

# Split the mutation label into its first character, its last character and
# the integer in between (presumably original residue, new residue and
# position -- confirm).
protein_mutation = 'A203T'
protein_A, protein_B = protein_mutation[0], protein_mutation[-1]
codon_index = int(protein_mutation[1:-1])

#result = find_codon_index(protein_A, 'GCA', protein_B)
#print result

# Open the human genome and print the Genome object.
human = Genome(Species='human', Release=Release, account=account)
print(human)

#seqs = {'original' : 'A',
	#'mutation' : 'T'
	#}
#protein =  LoadSeqs(data = seqs, moltype = PROTEIN)
#print protein.getTranslation()

#protein = 'METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPSTDGGLARGAYSPRPPSISENVAMEAVAAGVDGAGPEADLGLPEDDLVLPDDVVQYIKAHASGALDEGTGQVYPTESTGFSDNPRLPSPGLHGQRRMVAADSNVGPSAPMLGGCQLGFGAPSSLNKNNMPVQWNEVSSGTVDALASQVKPPPFPQGNLAVVQQKPAFGQYPGYSPQGLQASPGGLDSTQPHLQPRSGAPSQGIPRVNYMQQLRQPVAGSQCPGMTTTMSPHACYGQVHPQLSPSTISGALNQFPQSCSNMPAKPGHLGHPQQTEVAPDPTTMGNRHRELGVPDSALAGVPPPHPVQSYPQQSHHLAASMSQEGYHQVPSLLPARQPGFMEPQTGPMGVATAGFGLVQPRPPLEPSPTGRHRGVRAVQQQLAYARATGHAMAAMPSSQETAEAVPKGAMGNMGSVPPQPPPQDAGGAPDHSMLYYYGQIHMYEQDGGLENLGSCQVMRSQPPQPQACQDSIQPQPLPSPGVNQVSSTVDSQLLEAPQIDFDAIMDDGDHSSLFSGALSPSLLHSLSQNSSRLTTPRNSLTLPSIPAGISNMAVGDMSSMLTSLAEESKFLNMMT'
#print len(protein)
#cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGC
AGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTC
CATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG'
#my_seq = DNA.makeSequence(cds,'gli2')
#seq = my_seq.withoutTerminalStopCodon()
#pep = seq.getTranslation()
#print pep.toFasta()
예제 #16
0
    hitsOrganisms[items[7]] = True

# Report how many hits were loaded, then close outfile.
print "> Number of loaded hits: " + str(len(hits))
outfile.close()

# Print every key collected in hitsOrganisms, one per line.
print "> SPECIES <"
for key in hitsOrganisms:
    print key

# NOTE(review): exit(1) stops the script here (assuming the built-in exit),
# so nothing below this line runs -- confirm whether this is a leftover
# debugging stop.
exit(1)

##### SET CONNECTION TO THE ENSEMBL DATABASE ##########################
# Release and account are passed to the Genome constructor below; with
# account=None no explicit HostAccount is supplied.
Release = 67
account = None
yeast = Genome(Species='Neosartorya fischeris',
               Release=Release,
               account=account)
# Rebind outfile to a new file opened for writing.
outfile = open("dataset_fungal_homologs_sequences.csv", "w")
print Species

##### GET SEQUENCES FROM ENSEMBL ######################################
i = 0
for hit in hits:
    i += 1
    print "i: " + str(i)

    identifier = hit
    print "\n***** ZPRACOVANI ZAZNAMU " + identifier + " *****"

    # Selection of species
    genes = yeast.getGenesMatching(StableId="CADNFIAP00000001")
예제 #17
0
def download_database_pycogent(species, release, database_name='ensembl', nucleotide=False):
    """Download the protein-coding genes of ``species``, grouped with paralogs.

    Returns a list of families. Each family is a list of
    ``(stable_id, sequence)`` tuples. The sequence is the string form of the
    canonical transcript's CDS when ``nucleotide`` is true, and of its protein
    sequence otherwise. A gene for which no within-species paralogs are
    reported forms a family of its own. Otherwise the family is made of the
    members of the paralog group. Genes already collected as part of an
    earlier family are skipped.

    Progress is written to ``stderr``. If reading a paralog's sequence raises
    ``AttributeError``, this is logged as a missing canonical transcript and
    the process exits with status 1.
    """
    log = get_log()

    import cogent
    from cogent.db.ensembl import Species, Genome, Compara, HostAccount
    from cogent.db.ensembl.database import Database

    if cogent.version_info != (1, 5, 3):
        log.warning("only tested with pycogent version 1.5.3 (you are running %s)" % cogent.version)

    release, db_name, db_details = get_missing_info(species, release, database_name)

    account = HostAccount(db_details['hostname'],
                          db_details['username'],
                          db_details['password'],
                          port=db_details['port'])

    # Not a typo: getSpeciesName returns the string "None" here, not None.
    if Species.getSpeciesName(species) == 'None':
        log.warning("%s not found in pycogent, attempting to add it manually" % species)
        Species.amendSpecies(species.capitalize().replace('_', ' '), species)

    genome = Genome(species, Release=release, account=account)
    compara = Compara([species], Release=release, account=account)

    # Workaround from the original author. The compara lookup finds more than
    # one database and pycogent connects to the first one, which is always
    # compara_bacteria. The private attributes of the Compara object are
    # therefore overwritten so that it points at the intended database.
    if db_name not in ('ensembl', 'bacteria'):
        log.warning("accessing compara from pycogent with species outside of ensembl-main and ensembl-bacteria is problematic, attempting to patch...")

        from cogent.db.ensembl.host import DbConnection
        from cogent.db.ensembl.name import EnsemblDbName
        import sqlalchemy

        patched_name = EnsemblDbName(compara.ComparaDb.db_name.Name.replace('bacteria', db_name))
        compara.ComparaDb._db = DbConnection(account=account, db_name=patched_name)
        compara.ComparaDb._meta = sqlalchemy.MetaData(compara.ComparaDb._db)

    kind = "CDS" if nucleotide else "protein"
    progress = "\r[downloading %s] got %d sequences "

    def sequence_of(member):
        # String form of the canonical transcript's CDS or protein sequence.
        transcript = member.CanonicalTranscript
        return str(transcript.Cds if nucleotide else transcript.ProteinSeq)

    genes = set()
    families = []

    stderr.write(progress % (kind, len(genes)))

    for gene in genome.getGenesMatching(BioType='protein_coding'):
        stableid = gene.StableId

        # Already seen as a member of an earlier gene family.
        if stableid in genes:
            continue

        genes.add(stableid)

        paralogs = compara.getRelatedGenes(StableId=stableid, Relationship='within_species_paralog')

        if paralogs is None:
            # No paralog group: the gene is a family of one.
            stderr.write(progress % (kind, len(genes)))
            family = [(stableid, sequence_of(gene))]
        else:
            family = []
            for paralog in paralogs.Members:
                paralogid = paralog.StableId
                genes.add(paralogid)

                stderr.write(progress % (kind, len(genes)))

                try:
                    family.append((paralogid, sequence_of(paralog)))
                except AttributeError:
                    log.fatal("pycogent did not find a canonical transcript for %s" % paralogid)
                    exit(1)

        families.append(family)

    stderr.write("\r[downloading %s] got %d sequences\n" % (kind, len(genes)))

    return families
예제 #18
0
'''
import sys
import csv
import os
from cogent.db.ensembl import HostAccount, Species, Genome
import sqlalchemy as sql

# Ensembl release queried by this script.
Release = 70

# Optional custom account: ENSEMBL_ACCOUNT holds "host username password"
# separated by whitespace. Without it, account stays None.
account = None
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)

human = Genome(Species='human', Release=Release, account=account)

def Getexons(ENSG,start,end):
    gene = human.getGeneByStableId(StableId = ENSG)
    print gene.BioType
    print len(gene.Transcripts)
    #print dir(gene.Location)
    #print "\t\t\tstart\t\tEnd"
    #print "from csv\t\t",start,"\t",end
    #print "from location\t",gene.Location.Start,"\t",gene.Location.End
    #print "from ensembl\t",gene.Location.EnsemblStart,"\t", gene.Location.EnsemblEnd
    Not_Found = True
    for transcript in gene.Transcripts:
        #print dir(transcript.Location)
        #print start,end
        #if int(start)-1 == transcript.Location.Start and int(end) == transcript.Location.End:
예제 #19
0
import os

from cogent.db.ensembl import HostAccount, Genome

# Ensembl release to query.
Release = 93

# Optional custom account: ENSEMBL_ACCOUNT holds "host username password"
# separated by whitespace. Without it, account=None is used.
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None

human = Genome(Species='human', Release=Release, account=account)
variants = human.getVariation(Symbol='rs369202065')
for variant in variants:
    print(variant)
    # Public attributes seen on a variant object (taken from a dir() dump
    # that used to sit here as a dead expression statement):
    #   AlleleFreqs, Alleles, Ancestral, Effect, FlankingSeq, Location,
    #   MapWeight, NULL_VALUE, NumAlleles, PeptideAlleles, Seq, Somatic,
    #   Symbol, TranslationLocation, Type, Validation, Variants,
    #   allele_code_table, allele_table, db, featureData, genome,
    #   getAnnotatedSeq, getFeatures, transcript_variation_table,
    #   variation_feature_table, variation_table
예제 #20
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# cogent and sqlalchemy modules need to be installed:
# pip2 install cogent
# pip2 install sqlalchemy
from cogent.db.ensembl import HostAccount, Species, Genome
from pypath import mapping

# Connection to the public Ensembl server, release 78.
Release = 78
account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
human = Genome(Species='human', Release=Release, account=account)

# Each entry: UniProt, seq offset, residue, isoform
positions = [
    ('P00533', 40, 'Q', 1),
    ('P60520', 30, 'P', 1),
]

m = mapping.Mapper()
m.load_uniprot_mappings(['ensg'], bi=True)

# For every gene matching a mapped ID, record
# (ensg, gene location, canonical transcript exons) followed by the fields
# of the original position entry.
positions_ens = []
for p in positions:
    ensgs = m.map_name(p[0], 'uniprot', 'ensg')
    for ensg in ensgs:
        genes = human.getGenesMatching(StableId=ensg)
        for gene in genes:
            positions_ens.append(
                (ensg, gene.Location, gene.CanonicalTranscript.Exons) +
                tuple(p))

# another attempts with biopython --
예제 #21
0
def add_ensembl_gene_data(session,
                          species,
                          ensembl_release,
                          account=None,
                          debug=False):
    """add Ensembl genes and their transcripts to the db session"""
    rr = RunRecord('add_ensembl_gene_data')
    genome = Genome(species, Release=ensembl_release, account=account)

    skip = set(['processed_transcript', 'pseudogene'])
    biotypes = [b for b in genome.getDistinct('BioType') if b not in skip]

    data = []
    unique_gene_ids = set()
    unique_exon_ids = set()
    chromSet = set()
    n = 0
    total_objects = 0
    for biotype in biotypes:
        for gene in genome.getGenesMatching(BioType=biotype):
            # gene.Location.CoordName is the chromosome name
            min_chrom_length = 5  # likely an unconfirmed scaffold
            if len(gene.Location.CoordName) > min_chrom_length:
                rr.addWarning('Skipping chrom', gene.Location.CoordName)
                continue
            chromSet.add(gene.Location.CoordName)

            if gene.StableId not in unique_gene_ids:

                db_gene = Gene(ensembl_id=gene.StableId,
                               symbol=gene.Symbol,
                               biotype=gene.BioType,
                               description=gene.Description,
                               status=gene.Status,
                               chrom=gene.Location.CoordName,
                               start=gene.Location.Start,
                               end=gene.Location.End,
                               strand=gene.Location.Strand)

                unique_gene_ids.add(gene.StableId)
                data.append(db_gene)
            else:
                rr.addWarning('Duplicate gene', gene.StableId)

            for exon in gene.CanonicalTranscript.Exons:
                if exon.StableId not in unique_exon_ids:
                    db_exon = Exon(exon.StableId, exon.Rank,
                                   exon.Location.Start, exon.Location.End)
                    db_exon.gene = db_gene
                    unique_exon_ids.add(exon.StableId)
                    data.append(db_exon)

                else:
                    rr.addWarning('Duplicate exon', exon.StableId)
            n += 1
            if n % 100 == 0:
                print 'Genes processed:', n, '; Db objects created:', len(data)
                if debug:
                    session.add_all(data)
                    session.commit()
                    return
    rr.addInfo('Instantiating chromosomes', chromSet)
    chroms = Chroms(species, chromSet)
    data.append(chroms)

    rr.addInfo('Writing objects into db', len(data))
    session.add_all(data)
    session.commit()
    return chroms
예제 #22
0
# Optional custom account: ENSEMBL_ACCOUNT holds "host username password"
# separated by whitespace. Without it, account stays None.
account = None
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)

# Split the mutation label into its first character, its last character and
# the integer in between (presumably original residue, new residue and
# position -- confirm).
protein_mutation = 'A203T'
protein_A, protein_B = protein_mutation[0], protein_mutation[-1]
codon_index = int(protein_mutation[1:-1])

#result = find_codon_index(protein_A, 'GCA', protein_B)
#print result

# Open the human genome and print the Genome object.
human = Genome(Species='human', Release=Release, account=account)
print(human)

#seqs = {'original' : 'A',
#'mutation' : 'T'
#}
#protein =  LoadSeqs(data = seqs, moltype = PROTEIN)
#print protein.getTranslation()

#protein = 'METSASATASEKQEAKSGILEAAGFPDPGKKASPLVVAAAAAAAVAAQGVPQHLLPPFHAPLPIDMRHQEGRYHYEPHSVHGVHGPPALSGSPVISDISLIRLSPHPAGPGESPFNAPHPYVNPHMEHYLRSVHSSPTLSMISAARGLSPADVAQEHLKERGLFGLPAPGTTPSDYYHQMTLVAGHPAPYGDLLMQSGGAASAPHLHDYLNPVDVSRFSSPRVTPRLSRKRALSISPLSDASLDLQRMIRTSPNSLVAYINNSRSSSAASGSYGHLSAGALSPAFTFPHPINPVAYQQILSQQRGLGSAFGHTPPLIQPSPTFLAQQPMALTSINATPTQLSSSSNCLSDTNQNKQSSESAVSSTVNPVAIHKRSKVKTEPEGLRPASPLALTQGQVSGHGSCGCALPLSQEQLADLKEDLDRDDCKQEAEVVIYETNCHWEDCTKEYDTQEQLVHHINNEHIHGEKKEFVCRWQACTREQKPFKAQYMLVVHMRRHTGEKPHKCTFEGCSKAYSRLENLKTHLRSHTGEKPYVCEHEGCNKAFSNASDRAKHQNRTHSNEKPYICKIPGCTKRYTDPSSLRKHVKTVHGPDAHVTKKQRNDVHLRTPLLKENGDSEAGTEPGGPESTEASSTSQAVEDCLHVRAIKTESSGLCQSSPGAQSSCSSEPSPLGSAPNNDSGVEMPGTGPGSLGDLTALDDTPPGADTSALAAPSAGGLQLRKHMTTMHRFEQLKKEKLKSLKDSCSWAGPTPHTRNTKLPPLPGSGSILENFSGSGGGGPAGLLPNPRLSELSASEVTMLSQLQERRDSSTSTVSSAYTVSRRSSGISPYFSSRRSSEASPLGAGRPHNASSADSYDPISTDASRRSSEASQCSGGSGLLNLTPAQQYSLRAKYAAATGGPPPTPLPGLERMSLRTRLALLDAPERTLPAGCPRPLGPRRGSDGPTYGHGHAGAAPAFPHEAPGGGARRASDPVRRPDALSLPRVQRFHSTHNVNPGPLPPCADRRGLRLQSHPSTDGGLARGAYSPRPPSISENVAMEAVAAGVDGAGPEADLGLPEDDLVLPDDVVQYIKAHASGALDEGTGQVYPTESTGFSDNPRLPSPGLHGQRRMVAADSNVGPSAPMLGGCQLGFGAPSSLNKNNMPVQWNEVSSGTVDALASQVKPPPFPQGNLAVVQQKPAFGQYPGYSPQGLQASPGGLDSTQPHLQPRSGAPSQGIPRVNYMQQLRQPVAGSQCPGMTTTMSPHACYGQVHPQLSPSTISGALNQFPQSCSNMPAKPGHLGHPQQTEVAPDPTTMGNRHRELGVPDSALAGVPPPHPVQSYPQQSHHLAASMSQEGYHQVPSLLPARQPGFMEPQTGPMGVATAGFGLVQPRPPLEPSPTGRHRGVRAVQQQLAYARATGHAMAAMPSSQETAEAVPKGAMGNMGSVPPQPPPQDAGGAPDHSMLYYYGQIHMYEQDGGLENLGSCQVMRSQPPQPQACQDSIQPQPLPSPGVNQVSSTVDSQLLEAPQIDFDAIMDDGDHSSLFSGALSPSLLHSLSQNSSRLTTPRNSLTLPSIPAGISNMAVGDMSSMLTSLAEESKFLNMMT'
#print len(protein)
#cds = 'ATGGAGACGTCTGCCTCAGCCACTGCCTCCGAGAAGCAAGAAGCCAAAAGTGGGATCCTGGAGGCCGCTGGCTTCCCCGACCCGGGTAAAAAGGCCTCTCCTTTGGTGGTGGCTGCAGCGGCAGCAGCAGCGGTAGCTGCCCAAGGAGTGCCGCAGCATCTCTTGCCACCATTCCATGCGCCCCTACCGATTGACATGCGACACCAGGAAGGAAGGTACCATTACGAGCCTCATTCTGTCCACGGTGTGCACGGGCCCCCTGCCCTCAGCGGCAGCCCTGTCATCTCTGACATCTCCTTGATCCGGCTTTCCCCGCACCCGGCTGGCCCTGGGGAGTCCCCCTTCAACGCCCCCCACCCGTACGTGAACCCCCACATGGAGCACTACCTCCGTTCTGTGCACAGCAGCCCCACGCTCTCCATGATCTCTGCAGCCAGGGGCCTCAGCCCCGCTGATGTGGCCCAGGAGCACCTTAAGGAGAGGGGACTGTTTGGCCTTCCTGCTCCAGGCACCACCCCCTCAGACTATTACCACCAGATGACCCTCGTGGCAGGCCACCCCGCGCCCTACGGGGACCTGCTGATGCAGAGCGGGGGCGCTGCCAGCGCACCCCATCTCCACGACTACCTCAACCCCGTGGACGTGTCCCGTTTCTCCAGCCCGCGGGTGACGCCCCGCCTGAGCCGCAAGCGGGCGCTGTCCATCTCCCCACTCTCAGACGCCAGCCTGGACCTGCAGCGGATGATCCGCACCTCACCCAACTCGCTAGTGGCCTACATCAACAACTCCCGAAGCAGCTCGGCGGCCAGCGGTTCCTACGGGCATCTGTCAGCGGGTGCCCTCAGCCCAGCCTTCACCTTCCCCCACCCCATCAACCCCGTGGCCTACCAGCAGATTCTGAGCCAGCAGAGGGGTCTGGGGTCAGCCTTTGGACACACACCACCCCTGATCCAGCCCTCACCCACCTTCCTGGCCCAGCAGCCCATGGCCCTCACCTCCATCAATGCCACGCCCACCCAGCTCAGCAGCAGCAGCAACTGTCTGAGTGACACCAACCAGAACAAGCAGAGCAGTGAGTCGGCCGTCAGCAGCACCGTCAACCCTGTCGCCATTCACAAGCGCAGCAAGGTCAAGACCGAGCCTGAGGGCCTGCGGCCGGCCTCCCCTCTGGCGCTGACGCAGGGCCAGGTGTCTGGACACGGCTCATGTGGGTGTGCCCTTCCCCTCTCCCAGGAGCAGCTGGCTGACCTCAAGGAAGATCTGGACAGGGATGACTGTAAGCAGGAGGCTGAGGTGGTCATCTATGAGACCAACTGCCACTGGGAAGACTGCACCAAGGAGTACGACACCCAGGAGCAGCTGGTGCATCACATCAACAACGAGCACATCCACGGGGAGAAGAAGGAGTTTGTGTGCCGCTGGCAGGCCTGCACGCGGGAGCAGAAGCCCTTCAAGGCGCAGTACATGCTGGTGGTGCACATGCGGCGACACACGGGCGAGAAGCCCCACAAGTGCACGTTCGAGGGCTGCTCGAAGGCCTACTCCCGCCTGGAGAACCTGAAGACACACCTGCGGTCCCACACCGGGGAGAAGCCATATGTGTGTGAGCACGAGGGCTGCAACAAAGCCTTCTCCAACGCCTCGGACCGCGCCAAGCACCAGAATCGCACCCACTCCAACGAGAAACCCTACATCTGCAAGATCCCAGGCTGCACCAAGAGATACACAGACCCCAGCTCTCTCCGGAAGCATGTGAAAACGGTCCACGGCCCAGATGCCCACGTCACCAAGAAGCAGCGCAATGACGTGCACCTCCGCACACCGCTGCTCAAAGAGAATGGGGACAGTGAGGCCGGCACGGAGCCTGGCGGCCCAGAGAGCACCGAGGCCAGCAGCACCAGCCAGGCCGTGGAGGACTGCCTGCACGTCAGAGCCATCAAGACCGAGAGCTCCGGGCTGTGTCAGTCCAGCCCCGGGGCCCAGTCGTCCTGC
AGCAGCGAGCCCTCTCCTCTGGGCAGTGCCCCCAACAATGACAGTGGCGTGGAGATGCCGGGGACGGGGCCCGGGAGCCTGGGAGACCTGACGGCACTGGATGACACACCCCCAGGGGCCGACACCTCAGCCCTGGCTGCCCCCTCCGCTGGTGGCCTCCAGCTGCGCAAACACATGACCACCATGCACCGGTTCGAGCAGCTCAAGAAGGAGAAGCTCAAGTCACTCAAGGATTCCTGCTCATGGGCCGGGCCGACTCCACACACGCGGAACACCAAGCTGCCTCCCCTCCCGGGAAGTGGCTCCATCCTGGAAAACTTCAGTGGCAGTGGGGGCGGCGGGCCCGCGGGGCTGCTGCCGAACCCGCGGCTGTCGGAGCTGTCCGCGAGCGAGGTGACCATGCTGAGCCAGCTGCAGGAGCGCCGCGACAGCTCCACCAGCACGGTCAGCTCGGCCTACACCGTGAGCCGCCGCTCCTCCGGCATCTCCCCCTACTTCTCCAGCCGCCGCTCCAGCGAGGCCTCGCCCCTGGGCGCCGGCCGCCCGCACAACGCGAGCTCCGCTGACTCCTACGACCCCATCTCCACGGACGCGTCGCGGCGCTCGAGCGAGGCCAGCCAGTGCAGCGGCGGCTCCGGGCTGCTCAACCTCACGCCGGCGCAGCAGTACAGCCTGCGGGCCAAGTACGCGGCAGCCACTGGCGGCCCCCCGCCCACTCCGCTGCCGGGCCTGGAGCGCATGAGCCTGCGGACCAGGCTGGCGCTGCTGGACGCGCCCGAGCGCACGCTGCCCGCCGGCTGCCCACGCCCACTGGGGCCGCGGCGTGGCAGCGACGGGCCGACCTATGGCCACGGCCACGCGGGGGCTGCGCCCGCCTTCCCCCACGAGGCTCCAGGCGGCGGAGCCAGGCGGGCCAGCGACCCTGTGCGGCGGCCCGATGCCCTGTCCCTGCCGCGGGTGCAGCGCTTCCACAGCACCCACAACGTGAACCCCGGCCCGCTGCCGCCCTGTGCCGACAGGCGAGGCCTCCGCCTGCAGAGCCACCCGAGCACCGACGGCGGCCTGGCCCGCGGCGCCTACTCGCCCCGGCCGCCTAGCATCAGCGAGAACGTGGCGATGGAGGCCGTGGCGGCAGGAGTGGACGGCGCGGGGCCCGAGGCCGACCTGGGGCTGCCGGAGGACGACCTGGTGCTTCCAGACGACGTGGTGCAGTACATCAAGGCGCACGCCAGTGGCGCTCTGGACGAGGGCACCGGGCAGGTGTATCCCACGGAAAGCACTGGCTTCTCTGACAACCCCAGACTACCCAGCCCGGGGCTGCACGGCCAGCGCAGGATGGTGGCTGCGGACTCCAACGTGGGCCCCTCCGCCCCTATGCTGGGAGGATGCCAGTTAGGCTTTGGGGCGCCCTCCAGCCTGAACAAAAATAACATGCCTGTGCAGTGGAATGAGGTGAGCTCCGGCACCGTAGACGCCCTGGCCAGCCAGGTGAAGCCTCCACCCTTTCCTCAGGGCAACCTGGCGGTGGTGCAGCAGAAGCCTGCCTTTGGCCAGTACCCGGGCTACAGTCCGCAAGGCCTACAGGCTAGCCCTGGGGGCCTGGACAGCACGCAGCCACACCTGCAGCCCCGCAGCGGAGCCCCCTCCCAGGGCATCCCCAGGGTAAACTACATGCAGCAGCTGCGACAGCCAGTGGCAGGCAGCCAGTGTCCTGGCATGACTACCACTATGAGCCCCCATGCCTGCTATGGCCAAGTCCACCCCCAGCTGAGCCCCAGCACCATCAGTGGGGCCCTCAACCAGTTCCCCCAATCCTGCAGCAACATGCCAGCCAAGCCAGGGCATCTGGGGCACCCTCAGCAGACAGAAGTGGCACCTGACCCCACCACGATGGGCAATCGCCACAGGGAACTTGGGGTCCCCGATTCAGCCCTGGCTGGAGTGCCACCACCTCACCCAGTCCAGAGCTACCCACAGCAGAGCCATCACCTGGCAGCCTC
CATGAGCCAGGAGGGCTACCACCAGGTCCCCAGCCTTCTGCCTGCCCGCCAGCCTGGCTTCATGGAGCCCCAAACAGGCCCGATGGGGGTGGCTACAGCAGGCTTTGGCCTAGTGCAGCCCCGGCCTCCCCTCGAGCCCAGCCCCACTGGCCGCCACCGTGGGGTACGTGCTGTGCAGCAGCAGCTGGCCTACGCCAGGGCCACAGGCCATGCCATGGCTGCCATGCCGTCCAGTCAGGAAACAGCAGAGGCTGTGCCCAAGGGAGCGATGGGCAACATGGGGTCGGTGCCTCCCCAGCCGCCTCCGCAGGACGCAGGTGGGGCCCCGGACCACAGCATGCTCTACTACTACGGCCAGATCCACATGTACGAACAGGATGGAGGCCTGGAGAACCTCGGGAGCTGCCAGGTCATGCGGTCCCAGCCACCACAGCCACAGGCCTGTCAGGACAGCATCCAGCCCCAGCCCTTGCCCTCACCAGGGGTCAACCAGGTGTCCAGCACTGTGGACTCCCAGCTCCTGGAGGCCCCCCAGATTGACTTCGATGCCATCATGGATGATGGCGATCACTCGAGTTTGTTCTCGGGTGCTCTGAGCCCCAGCCTCCTCCACAGCCTCTCCCAGAACTCCTCCCGCCTCACCACCCCCCGAAACTCCTTGACCCTGCCCTCCATCCCCGCAGGCATCAGCAACATGGCTGTCGGGGACATGAGCTCCATGCTCACCAGCCTCGCCGAGGAGAGCAAGTTCCTGAACATGATGACCTAG'
#my_seq = DNA.makeSequence(cds,'gli2')
#seq = my_seq.withoutTerminalStopCodon()
#pep = seq.getTranslation()
#print pep.toFasta()
예제 #23
0
# Imports come first so that os and HostAccount are bound before the
# account lookup below uses them.
import os

from cogent.db.ensembl import HostAccount, Genome

# Connection settings: when the ENSEMBL_ACCOUNT environment variable is set it
# must hold exactly three whitespace-separated fields (host, username,
# password); otherwise account stays None and is passed to Genome as such.
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)
else:
    account = None
print(account)

## What Species Are Available?

#from cogent.db.ensembl import Species
#print Species

## Interrogating a Genome

# NOTE(review): Release is not defined anywhere in this excerpt; it is
# presumably set earlier in the surrounding script -- confirm.
human = Genome(Species='human', Release=Release, account=account)
print(human)

# A Note on Coordinate Systems
#
# The positions employed on Ensembl's web site, and in their MySQL database,
# differ from those used internally by cogent.db.ensembl. In all cases where
# you are querying cogent.db.ensembl objects by directly inputting nucleotide
# positions, you can indicate you are using Ensembl coordinates by setting
# ensembl_coord=True.
# If you are explicitly passing in a cogent.db.ensembl region, that argument
# has no effect.

## Selecting Gene
# Via StableID
brca1 = human.getGeneByStableId(StableId='ENSG00000012048')
예제 #24
0
except IndexError:
	print "No gene entered"
	gene=False
	sys.exit()



# Opening of the HTML page: the inline stylesheet inside <head>, followed by
# the opening <body> tag. Each entry is printed on its own line, in order.
_page_header = (
    "<html>",
    "<head>",
    "<style>\ntab1 { padding-left: 4em; }\ntab2 { padding-left: 8em; }\n",
    "body{font-family:helvetica} \ntab3 { padding-left: 12em; }\n p{color:#AACCFF}\n",
    "button {padding: 15px 32px;text-align: center; text-decoration: none;display: inline-block;font-size: 16px;}\n li {margin-top: 0px; margin-right: 5px;}\n</style>",
    "</head>",
    '<body style="background-color:#222233;" text="#FFFFA8">',
)
for _header_line in _page_header:
    print(_header_line)

# Look the requested gene up by symbol in the mouse genome (release 87).
# A lookup by stable ID was tried previously:
#coding=mouse.getGenesMatching(StableID=geneID)
mouse = Genome(Species='mouse', Release=87, account=None)
coding = mouse.getGenesMatching(Symbol=genename)

# Leftover debugging probes:
#print mouse.getGenesMatching()
#print dir(mouse.getGenesMatching())
#print mouse.getGenesMatching().__dict__
#print coding
#sys.exit()

# Abort with an on-page message when the lookup result is falsy.
# NOTE(review): if getGenesMatching returns a lazy iterator/generator, this
# truth test never fires -- confirm against the cogent API.
if not coding:
    print("<p style=\"font-size: 55px;\">This is a fatal error. Can't find your gene<br>")
    print("</body>")
    sys.exit()
예제 #25
0
import os
from cogent.db.ensembl import HostAccount, Genome

# Optional credentials: ENSEMBL_ACCOUNT, when set, must hold exactly three
# whitespace-separated fields (host, username, password).
account = None
credentials = os.environ.get('ENSEMBL_ACCOUNT')
if credentials is not None:
    host, username, password = credentials.split()
    account = HostAccount(host, username, password)

human = Genome('human', Release=75, account=account)

gene_symbols = ['brca1', 'brca2']

genomes1k_url = 'ftp://ftp.1000genomes.ebi.ac.uk/vol1/ftp/release/20130502'

# For every gene matching each symbol, report its location and print the tabix
# command that would extract that region from the per-chromosome VCF.
for gene_symbol in gene_symbols:
    print(gene_symbol)
    genes = human.getGenesMatching(Symbol=gene_symbol)
    for gene in genes:
        location = gene.Location
        chrom = location.CoordName
        start = location.Start
        end = location.End

        print(chrom)
        print("%s %s" % (start, end))

        # The runs of spaces reproduce the whitespace the original
        # backslash-continued literal embedded, so the printed command is
        # unchanged.
        command = (
            './breastcancer/programs/htslib/tabix -h '
            '        %s/ALL.chr%s.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf.gz '
            '        %s:%s-%s > breastcancer/%s.vcf'
        ) % (genomes1k_url, chrom, chrom, start, end, gene_symbol)
        print(command)
        #os.system(command)

        size = end - start
        print("Size %s" % size)
        print(location.Strand)
예제 #26
0
import os
import sqlalchemy as sql
from cogent.db.ensembl import HostAccount, Genome

#account = HostAccount(*os.environ['ENSEMBL_ACCOUNT'].split())
# Optional credentials: ENSEMBL_ACCOUNT, when set, must hold exactly three
# whitespace-separated fields (host, username, password).
account = None
credentials = os.environ.get('ENSEMBL_ACCOUNT')
if credentials is not None:
    host, username, password = credentials.split()
    account = HostAccount(host, username, password)

human = Genome('human', Release=69, account=account)

# Gene of interest. The original comment labels it BRCA1.
# NOTE(review): confirm that this stable ID really is BRCA1.
gene = human.getGeneByStableId(StableId="ENSG00000167131")

# Handles on the core-database tables used for cross-reference lookups.
core_db = human.CoreDb
external_db = core_db.getTable("external_db")
object_xref = core_db.getTable("object_xref")
xref = core_db.getTable("xref")

# Fetch the first row holding the external_db_id whose db_name matches
# 'RefSeq_mRNA'. The result is whatever fetchone() returns, not a bare value.
refseq_lookup = sql.select([external_db.c.external_db_id],
                           external_db.c.db_name.like('RefSeq_mRNA'))
refseq_mrna_id = refseq_lookup.execute().fetchone()


# Announce the query for a specific transcript ID that follows.
print("Querying for mRNA REFSEQ entries for one transcript")
query = sql.select([object_xref, xref],
        sql.and_(xref.c.xref_id==object_xref.c.xref_id,
        object_xref.c.ensembl_id == 1345831,
예제 #27
0
        + "   Position: " + str(range.start) + "-" + str(range.end-1)\
        + "   Length: " + str(range.end - range.start)
        snp_count = seq.count("/")
        if snp_count > 0:
            header = header + "   SNP count: " + str(snp_count)
        else:
            header = header + "   no SNP"
        output.write("\n" + header)
        output.write("\n" + str(seq))
        output.close


#Setting ensembl parameters
release = 81
species = 'Mus musculus'
mouse = Genome(Species=species, Release=release)

# NOTE: `input` shadows the builtin of the same name; the name is kept because
# code outside this block may still refer to it.
input = raw_input('\nEnter ensembl mouse transcript ID or txt file: ')

if input.strip().split('.')[-1] == 'txt':
    # Batch mode: the answer names a .txt file with one transcript ID per line.
    try:
        # `with` closes the file even when processing a transcript raises;
        # the previous explicit close() was skipped on any exception.
        with open(input.strip(), 'r') as fh:
            for transID in fh:
                # Keep only the text before the first '.' on the line.
                transID = transID.strip().split('.')[0]
                if not transID:
                    # Blank line: nothing to look up.
                    continue
                print("\n============ Start to manipulate %s ============" % transID)
                gene, transcript, range_seqs = specific_region(transID)
                if transcript is None:
                    continue
                seq_output(gene, transcript, range_seqs)
    # NOTE(review): this also catches an IOError raised while processing a
    # transcript and reports it as a missing input file.
    except IOError:
        print("File %s not found!!" % input)