예제 #1
0
def main():
    """Write the chromosome sequences of one species/release as FASTA.

    Options come from ``parse_command_line_parameters(**script_info)``.
    The Ensembl account is built from the MySQL options when they are
    given, otherwise from the ``ENSEMBL_ACCOUNT`` environment variable
    (whitespace separated ``host user password``); otherwise ``None`` is
    passed on.

    With ``opts.by_chrom`` each chromosome is written to
    ``<outdir>/<chrom.Name>.fasta``; otherwise every chromosome goes to the
    single file ``<outdir>/<species>-<release>.fasta``.  With
    ``opts.test_run`` nothing is written: the account and target paths are
    printed and the loop stops after the first chromosome.

    Raises:
        AssertionError: if only some of hostname/user/passwd were supplied.
    """
    option_parser, opts, args = parse_command_line_parameters(**script_info)

    mysql_opts = (opts.hostname, opts.user, opts.passwd)
    if None in mysql_opts and len(set(mysql_opts)) != 1:
        # Raised explicitly instead of via ``assert`` so the check is not
        # stripped when Python runs with -O.
        raise AssertionError(
            'You must provide all MySQL options, or none at all.')

    if opts.hostname is not None:
        account = HostAccount(opts.hostname, opts.user, opts.passwd)
    elif 'ENSEMBL_ACCOUNT' in os.environ:
        h, u, p = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(h, u, p)
    else:
        account = None

    if opts.test_run:
        print(account)

    outdir = os.path.abspath(opts.outdir)
    if not os.path.exists(outdir):
        print('FAIL: %s directory does not exist' % outdir)
        exit(-1)

    # Only used in single-file mode; stays None for by_chrom and test runs.
    outfile = None
    if not opts.by_chrom:
        outfile_name = os.path.join(
            outdir, '%s-%s.fasta' % (opts.species, opts.release))
        if not opts.test_run:
            outfile = open(outfile_name, 'w')

    try:
        if opts.test_run:
            print('Will write to: %s' % outdir)
            if not opts.by_chrom:
                print(outfile_name)

        for chrom in get_chrom_seqs(opts.species, opts.release, account,
                                    debug=opts.test_run):
            fasta = chrom.toFasta()

            if opts.by_chrom:
                outfile_name = os.path.join(outdir, '%s.fasta' % chrom.Name)

            if opts.test_run:
                # Dry run: report the first target only.
                print('Will write to: %s' % outfile_name)
                break

            if opts.by_chrom:
                with open(outfile_name, 'w') as chrom_file:
                    chrom_file.write(fasta + '\n')
            else:
                outfile.write(fasta + '\n')
    finally:
        # The combined output file was previously never closed.
        if outfile is not None:
            outfile.close()
def kmerhomology(k, kmerlist, fastadict, max_attempts=3):
    """Count total and mouse/human-conserved occurrences of selected k-mers.

    Every window of length ``k`` in every sequence of ``fastadict`` that is
    also present in ``kmerlist`` is counted, and its genomic interval is
    looked up with ``Compara.getSyntenicRegions`` (PECAN alignment).  For
    each returned region with exactly two members whose aligned sequences
    are identical, one conserved occurrence is recorded.

    Args:
        k: k-mer length (converted with ``int``).
        kmerlist: sequence of k-mers of interest.
        fastadict: maps a header to its sequence.  After replacing ';' with
            a tab and splitting on tabs, header field 1 is the chromosome
            (an optional 'chr' prefix is removed), fields 2 and 3 are the
            integer start and stop, and field 4 is the strand.
        max_attempts: how many times a failing database query is tried
            before the occurrence is given up on (at least 1).  The
            original code tried to retry by decrementing the loop index,
            which has no effect inside ``for i in range(...)``.

    Returns:
        ``(homologydict, sqlalchemyfails)`` where ``homologydict`` maps
        k-mer -> ``[conserved_occurrences, total_occurrences]`` and
        ``sqlalchemyfails`` is the number of failed query attempts.
    """
    k = int(k)
    max_attempts = max(1, int(max_attempts))
    homologydict = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    sqlalchemyfails = 0
    # NOTE(review): connection details are hard-coded here.
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)

    # A set keeps the per-window membership test O(1).
    wanted = set(kmerlist)

    # Guard against an empty kmerlist, which used to raise IndexError here.
    if len(kmerlist) > 0 and k != len(kmerlist[0]):
        sys.stderr.write('Warning! Provided value of k does not match length of given kmer!')

    total_utrs = len(fastadict)
    for UTRcounter, header in enumerate(fastadict, 1):
        if UTRcounter % 50 == 0:
            sys.stderr.write('Determining motif conservation in UTR {0} of {1}...\n'.format(UTRcounter, total_utrs))
        UTRsequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]

        if strand not in ('+', '-'):
            # Previously this reused the coordinates of an earlier window
            # (or raised NameError on the very first one).
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(fields[0], strand))
            continue

        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if mousekmer not in wanted:
                continue

            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Minus strand: coordinates are counted back from ``stop``.
                # Original author's note: the last 50 nt and the stop codon
                # have already been removed from these UTRs.
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            counts = homologydict.setdefault(mousekmer, [0, 0])
            counts[1] += 1  # one more total occurrence

            # The database connection is prone to timeouts, so retry a
            # bounded number of times.  Conserved hits are accumulated
            # locally so a failed, partially iterated attempt is never
            # counted twice.
            for _attempt in range(max_attempts):
                conserved = 0
                try:
                    for synt_region in compara.getSyntenicRegions(Species='mouse', CoordName=chrm, Start=mousekmerstart, End=mousekmerstop, Strand=strand, ensembl_coord=True, align_method='PECAN', align_clade='19 amniota vertebrates Pecan'):
                        membs = synt_region.Members
                        if len(membs) != 2:
                            # no aligned human sequence: skip this region
                            continue
                        # Original author's note: strand sometimes seems
                        # not to get picked up by compara; on the minus
                        # strand this may be the reverse complement.
                        mouseseq = str(membs[0].AlignedSeq)
                        humanseq = str(membs[1].AlignedSeq)
                        if mouseseq == humanseq:
                            conserved += 1
                except (OE, mOE):
                    sys.stderr.write('Genome mysql error!!!')
                    sqlalchemyfails += 1
                    continue  # try the same window again
                counts[0] += conserved
                break

    return homologydict, sqlalchemyfails
def kmerhomologydict(k, fastadict):
    """Count total and mouse/human-conserved occurrences of every k-mer.

    Like a per-k-mer conservation scan over all UTR sequences: every window
    of length ``k`` in every sequence of ``fastadict`` is counted and its
    genomic interval is looked up with ``Compara.getSyntenicRegions``
    (PECAN alignment).  For each returned region with exactly two members
    whose aligned sequences are identical, one conserved occurrence is
    recorded.

    Args:
        k: k-mer length (converted with ``int``).
        fastadict: maps a header to its sequence.  After replacing ';' with
            a tab and splitting on tabs, header field 0 is the ID, field 1
            the chromosome (an optional 'chr' prefix is removed), fields 2
            and 3 the integer start and stop, and field 4 the strand.

    Returns:
        ``(counts, sqlalchemyfails)`` where ``counts`` maps k-mer ->
        ``[conserved_occurrences, total_occurrences]``.  No database errors
        are caught in this function, so ``sqlalchemyfails`` is always 0; it
        is kept in the return value for callers that unpack two values.
    """
    k = int(k)
    counts_by_kmer = {}  # {kmer: [conserved_occurrences, total_occurrences]}
    analyzedUTRs = 0
    sqlalchemyfails = 0
    # NOTE(review): connection details are hard-coded here.
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'human'], Release=61, account=account)

    total_utrs = len(fastadict)
    for UTRcounter, header in enumerate(fastadict, 1):
        UTRsequence = fastadict[header]
        fields = header.replace(';', '\t').split('\t')
        ID = fields[0]
        chrm = fields[1].replace('chr', '')  # change to ensembl style
        start = int(fields[2])
        stop = int(fields[3])
        strand = fields[4]
        # Progress is reported for every UTR.
        sys.stderr.write('Determining motif conservation in UTR {0}, number {1} of {2}, (interrogated {3} so far)...\n'.format(ID, UTRcounter, total_utrs, analyzedUTRs))

        if strand not in ('+', '-'):
            # Previously this reused the coordinates of an earlier window
            # (or raised NameError on the very first one).
            sys.stderr.write('Skipping UTR {0}: unrecognised strand {1!r}\n'.format(ID, strand))
            continue

        # Ensures a UTR is counted as analysed at most once, even when the
        # first window returns several syntenic regions.
        utr_counted = False

        for i in range(len(UTRsequence) - k + 1):
            mousekmer = UTRsequence[i:i + k]
            if strand == '+':
                mousekmerstart = start + i
                mousekmerstop = start + i + k - 1
            else:
                # Minus strand: coordinates are counted back from ``stop``.
                # Original author's note: the last 50 nt and the stop codon
                # have already been removed from these UTRs.
                mousekmerstart = stop - i - k + 1
                mousekmerstop = stop - i

            counts = counts_by_kmer.setdefault(mousekmer, [0, 0])
            counts[1] += 1  # one more total occurrence

            for synt_region in compara.getSyntenicRegions(Species='mouse', CoordName=chrm, Start=mousekmerstart, End=mousekmerstop, Strand=strand, ensembl_coord=True, align_method='PECAN', align_clade='19 amniota vertebrates Pecan'):
                # i == 0 is the window touching the UTR start ('+') or the
                # UTR stop ('-'), i.e. the original once-per-UTR condition.
                if i == 0 and not utr_counted:
                    analyzedUTRs += 1
                    utr_counted = True
                membs = synt_region.Members
                if len(membs) != 2:
                    # no aligned human sequence: skip this region
                    continue
                # Original author's note: strand sometimes seems not to get
                # picked up by compara; on the minus strand this may be the
                # reverse complement of the motif.
                mouseseq = str(membs[0].AlignedSeq)
                humanseq = str(membs[1].AlignedSeq)
                if mouseseq == humanseq:
                    counts[0] += 1  # one more conserved occurrence

    sys.stderr.write('Analyzed {0} of {1} UTRs.\n'.format(analyzedUTRs, total_utrs))

    return counts_by_kmer, sqlalchemyfails
예제 #4
0
def get_genes_from_Ensembl_multiple(args, sourcedata, targetdata):
    """Fetch several genes from Ensembl and append their data.

    Two input modes are selected by ``args.inputType``:

    * ``"id"``: ``args.geneIdListFile`` names a file whose lines are
      ``<species> <geneid>`` (single-space separated); lines with fewer
      than two fields are ignored.
    * ``"name"``: ``args.gene`` is a gene name and ``args.specieslistfile``
      names a file with one species per line (first space-separated field);
      blank lines are ignored.

    For every gene retrieved, ``get_cds_data(gene)`` is added to
    ``sourcedata`` and ``get_gene_data(gene)`` to ``targetdata`` with
    ``+=``.  Missing required arguments are reported with a printed message
    and nothing is fetched.

    Returns:
        The ``(sourcedata, targetdata)`` pair.
    """
    inputtype = args.inputType

    if inputtype == "id":
        geneidlistfile = args.geneIdListFile
        if geneidlistfile is None:
            print("Argument -gidlf <geneidlistfilename> is required")
        else:
            account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
            # ``with`` closes the list file, which was previously leaked.
            with open(geneidlistfile, "r") as handle:
                for line in handle:
                    parse = line.split("\n")[0].split(" ")
                    if len(parse) > 1:
                        species = parse[0]
                        geneid = parse[1]
                        print("Retreving Gene %s" % (geneid,))
                        genome = Genome(species, ENSEMBL_VERSION, account)
                        gene = genome.getGeneByStableId(StableId=geneid)
                        sourcedata += get_cds_data(gene)
                        targetdata += get_gene_data(gene)

    if inputtype == "name":
        genename = args.gene
        if genename is None:
            print("Argument -g <genename> is required")
        specieslistfile = args.specieslistfile
        if specieslistfile is None:
            print("Argument -slf <specieslistfilename> is required")
        if genename is not None and specieslistfile is not None:
            account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
            with open(specieslistfile, "r") as handle:
                for line in handle:
                    species = line.split("\n")[0].split(" ")[0]
                    if not species:
                        # Blank line: there is no species to query.
                        continue
                    print("Retreving Gene %s from species %s"
                          % (genename, species))
                    genome = Genome(species, ENSEMBL_VERSION, account)
                    # Keep the name and the fetched object in separate
                    # variables; reusing one variable made every species
                    # after the first search for a gene object instead of
                    # the gene name.
                    gene = get_gene_from_Ensembl_by_name(genename, genome)
                    sourcedata += get_cds_data(gene)
                    targetdata += get_gene_data(gene)
    return sourcedata, targetdata
예제 #5
0
def ensembl_to_hgnc(gene_list):
    """Return the set of ``Symbol`` values for the given Ensembl stable IDs.

    Each entry of ``gene_list`` is looked up in the human genome
    (release 73) with ``getGeneByStableId`` and the ``Symbol`` attribute of
    the result is collected; duplicates collapse because a set is returned.
    """
    # NOTE(review): database credentials are embedded in the source.
    account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human = Genome('human', Release=73, account=account)

    return set(
        human.getGeneByStableId(StableId=stable_id).Symbol
        for stable_id in gene_list
    )
예제 #6
0
def get_genes_from_Ensembl_pairwise(args, sourcedata, targetdata):
    """Fetch one gene from each of two species and append their data.

    ``args.firstspecies`` and ``args.secondspecies`` select the genomes.
    ``args.inputType`` selects how the genes are found:

    * ``"id"``: by ``args.firstgeneid`` / ``args.secondgeneid``.
    * ``"name"``: by ``args.gene`` in both genomes.

    When both genes have been looked up, ``get_cds_data`` of both is added
    to ``sourcedata`` and ``get_gene_data`` of both to ``targetdata`` with
    ``+=``.  A missing required argument is reported with a printed message
    and the data are returned unchanged.

    Returns:
        The ``(sourcedata, targetdata)`` pair.
    """
    inputtype = args.inputType

    firstspecies = args.firstspecies
    secondspecies = args.secondspecies
    if firstspecies is None:
        print("Argument -s1 <firstspeciesname> is required")
        return sourcedata, targetdata
    if secondspecies is None:
        print("Argument -s2 <secondspeciesname> is required")
        return sourcedata, targetdata

    account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
    firstgenome = Genome(firstspecies, ENSEMBL_VERSION, account)
    secondgenome = Genome(secondspecies, ENSEMBL_VERSION, account)

    # Tracks whether a lookup actually happened.  Previously the data were
    # appended unconditionally, which raised UnboundLocalError whenever an
    # argument was missing or the input type was not recognised.
    retrieved = False
    firstgene = secondgene = None

    if inputtype == "id":
        firstgeneid = args.firstgeneid
        if firstgeneid is None:
            print("Argument -gid1 <firstgeneid> is required")
        secondgeneid = args.secondgeneid
        if secondgeneid is None:
            print("Argument -gid2 <secondgeneid> is required")

        if firstgeneid is not None and secondgeneid is not None:
            print("Retreving Genes %s %s" % (firstgeneid, secondgeneid))

            firstgene = firstgenome.getGeneByStableId(StableId=firstgeneid)
            secondgene = secondgenome.getGeneByStableId(
                StableId=secondgeneid)
            retrieved = True

    if inputtype == "name":
        gene = args.gene
        if gene is None:
            print("Argument -g <genename> is required")
        else:
            # The message used to reference an undefined variable ``name``.
            print("Retreving Genes %s from species %s %s"
                  % (gene, firstspecies, secondspecies))

            firstgene = get_gene_from_Ensembl_by_name(gene, firstgenome)
            secondgene = get_gene_from_Ensembl_by_name(gene, secondgenome)
            retrieved = True

    if retrieved:
        sourcedata += get_cds_data(firstgene) + get_cds_data(secondgene)
        targetdata += get_gene_data(firstgene) + get_gene_data(secondgene)

    return sourcedata, targetdata
예제 #7
0
def main():
    """Create a new Chippy database file populated with Ensembl gene data.

    The database path is
    ``<save_db_dir>/<save_db_prefix>_chippy_<release>_<species>.db``.  If
    that path already exists an error is recorded and nothing is created;
    otherwise the session is made, Ensembl gene data and a dummy expression
    record are added, and the real path of the database is printed.  The
    run record is displayed when ``args.show_log`` is set.
    """
    rr = RunRecord('start_chippy_db')
    rr.addCommands(sys.argv)

    args = script_info['args'].parse()
    save_dir = args.save_db_dir
    create_path(save_dir)

    if not os.path.isdir(save_dir):
        sys.stderr.write('The save_db_dir must be an existing directory.\n')
        return

    release = args.ensembl_release
    species = args.species
    chippy_db_name = ''.join(
        [args.save_db_prefix, '_chippy_', str(release), '_', species, '.db'])
    db_path = os.path.join(save_dir, chippy_db_name)

    if os.path.exists(db_path):
        # Never overwrite an existing database.
        rr.addError('Chippy DB with this name already exists', db_path)
    else:
        session = make_session(db_path)
        account = HostAccount(args.hostname, args.username, args.password,
                              port=args.port)
        add_ensembl_gene_data(session, species, ensembl_release=release,
                              account=account)

        if create_dummy_expr(session):
            rr.addInfo('Dummy data added successfully', 'Expr=1.')
        else:
            rr.addError('Dummy data failed to upload to DB',
                        'Expect bigger problems')

        rr.addInfo('Chippy DB written', db_path)
        print(os.path.realpath(db_path))

    if args.show_log:
        rr.display()
def main():
    """Print transcript stable IDs and exon symbols of one zebrafish gene.

    Looks up gene ``ENSDARG00000027279`` in the zebrafish genome
    (release "81") and prints, for every transcript, its ``StableId``
    followed by the ``Symbol`` of each of its exons.

    The Ensembl account is taken from the ``ENSEMBL_ACCOUNT`` environment
    variable (whitespace separated ``host user password``) when it is set;
    otherwise ``None`` is passed to ``Genome``.
    """
    import os
    from cogent.db.ensembl import Genome, HostAccount

    # Credentials needed to log into the Ensembl database.
    if 'ENSEMBL_ACCOUNT' in os.environ:
        host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
        account = HostAccount(host, username, password)
    else:
        account = None

    sp = "zebrafish"
    gn = "ENSDARG00000027279"
    # Pass the account built above; it was previously computed and then
    # ignored in favour of a literal ``None``.
    specie = Genome(Species=sp, Release="81", account=account)
    gene = specie.getGeneByStableId(StableId=gn)
    for tr in gene.Transcripts:
        print(tr.StableId)
        for ex in tr.Exons:
            # NOTE(review): confirm that exon objects expose ``Symbol``.
            print(ex.Symbol)
예제 #9
0
def hgnc_to_ensembl_id(hgnc_list):
    """Return the distinct 'ENSG' stable IDs of genes with the given symbols.

    Each symbol in ``hgnc_list`` is queried against the human genome
    (release 73) with ``getGenesMatching``; only genes whose ``Symbol`` is
    exactly the queried symbol are kept.  Duplicates are removed and IDs
    that do not start with ``'ENSG'`` are dropped.  The result is a list in
    set-iteration order.
    """
    # NOTE(review): database credentials are embedded in the source.
    account = HostAccount('blackrussian', 'bioinfo', 'A29bcd1234#', port=3306)
    human = Genome('human', Release=73, account=account)

    unique_ids = set()
    for symbol in hgnc_list:
        unique_ids.update(
            candidate.StableId
            for candidate in human.getGenesMatching(Symbol=symbol)
            if candidate.Symbol == symbol
        )

    return [stable_id for stable_id in unique_ids
            if stable_id.startswith('ENSG')]
예제 #10
0
    print codon_B
    #get chaging index
    if codon_B[0] != codon_A[0]:
        codon_index = 1
    if codon_B[1] != codon_A[1]:
        codon_index = 2
    if codon_B[2] != codon_A[2]:
        codon_index = 3
    result['codon_index'] = codon_index
    result['codon_B'] = codon_B
    return result


# Use the credentials in ENSEMBL_ACCOUNT ("host user password", whitespace
# separated) when the variable is set; otherwise leave the account unset.
account = None
if 'ENSEMBL_ACCOUNT' in os.environ:
    host, username, password = os.environ['ENSEMBL_ACCOUNT'].split()
    account = HostAccount(host, username, password)

# Split the mutation string into its first character, its last character
# and the integer in between.
protein_mutation = 'A203T'
protein_A, protein_B = protein_mutation[0], protein_mutation[-1]
codon_index = int(protein_mutation[1:-1])

#result = find_codon_index(protein_A, 'GCA', protein_B)
#print result

human = Genome(Species='human', Release=Release, account=account)
print(human)

#seqs = {'original' : 'A',
예제 #11
0
# Code to merge ensembl gene information with annotation of probes and junctions to get an idea of the gene structure

import pandas as pd
import re
from collections import Counter
from itertools import permutations
import csv
import string
from cogent.db.ensembl import Genome, HostAccount
from difflib import SequenceMatcher

# Module-level Ensembl connection used to build the human genome object.
# The commented-out line is an alternative account on the public
# ensembldb.ensembl.org server; the active account targets a MySQL server
# on 127.0.0.1.
#account = HostAccount('ensembldb.ensembl.org', 'anonymous', '', port=5306)
# NOTE(review): credentials are hard-coded here; consider moving them to
# configuration.
account = HostAccount('127.0.0.1', 'root', 'ensembl', port=3306)
# Ensembl release number passed to Genome below.
Release = 89
HumanDB = Genome(Species='human', Release=Release, account=account)


class SettingTCID_GeneIDError(Exception):
    """Exception that carries a human-readable message in ``msg``.

    ``str(error)`` returns the message unchanged.
    """

    def __init__(self, msg):
        # Forward the message to Exception so that ``args`` is populated;
        # without this ``args`` stayed empty.
        super(SettingTCID_GeneIDError, self).__init__(msg)
        self.msg = msg

    def __str__(self):
        return self.msg


def GeneStructure(TCID,
                  GeneID,
                  Probesets=None,
                  MappingFile="HTA_2_0_Probeset_SequenceIndices.txt",
                  SequenceFile="HTA_2_0_Probeset_Sequences.txt",
                  location="Output",
def ComparaLiftover(gff):
    """Lift mouse 3'UTR intervals over to rat, human, cow and dog.

    A gffutils database ``<basename of gff>.db`` is created in the current
    directory if it does not exist, every ``3'UTR`` feature is trimmed
    (3 nt at one end and 50 nt at the other, depending on strand) and
    looked up with ``Compara.getSyntenicRegions`` (PECAN alignment).  Each
    aligned member region belonging to one of the four target species
    yields a GFF-like row
    ``[chrom, 'ALE', "3'UTR", start, stop, '.', strand, '.', ID]`` where
    ``ID`` is ``<UTR id>_<species>``.  The database file is removed at the
    end.

    Args:
        gff: path of the mouse GFF file.

    Returns:
        ``(ratgff, humangff, cowgff, doggff)``: one list of rows per
        species.
    """
    # NOTE(review): connection details are hard-coded here.
    account = HostAccount('sugarman', 'ensembl', 'ensembl')
    compara = Compara(['mouse', 'rat', 'human', 'cow', 'dog'],
                      Release=61,
                      account=account)
    ratgff = []
    humangff = []
    cowgff = []
    doggff = []
    # Species name as parsed from the region location -> output list.
    # Species without an entry (such as the mouse query itself) are ignored.
    rows_by_species = {
        'Rattus_norvegicus': ratgff,
        'Homo_sapiens': humangff,
        'Bos_taurus': cowgff,
        'Canis_familiaris': doggff,
    }
    opposite = {'+': '-', '-': '+'}

    # Make gff database
    gff_fn = gff
    db_fn = os.path.basename(gff_fn) + '.db'
    if not os.path.isfile(db_fn):
        gffutils.create_db(gff_fn, db_fn)

    db = gffutils.FeatureDB(db_fn)

    for UTR in db.features_of_type('3\'UTR'):
        # Remove stop codons and last 50 nt of UTR
        if UTR.strand == '+':
            UTRstart = int(UTR.start) + 3
            UTRstop = int(UTR.stop) - 50
        elif UTR.strand == '-':
            UTRstart = int(UTR.start) + 50
            UTRstop = int(UTR.stop) - 3
        else:
            # Previously the coordinates of the preceding UTR were reused
            # (or NameError was raised for the first feature).
            sys.stderr.write(
                'Skipping UTR {0}: unrecognised strand {1!r}\n'.format(
                    UTR.id, UTR.strand))
            continue

        for synt_region in compara.getSyntenicRegions(
                Species='mouse',
                CoordName=UTR.chrom.replace('chr', ''),
                Start=UTRstart,
                End=UTRstop,
                Strand=UTR.strand,
                ensembl_coord=True,
                align_method='PECAN',
                align_clade='19 amniota vertebrates Pecan'):
            for region in synt_region.Members:
                if not region.Region:
                    continue
                # The first '-' (between start and stop) becomes ':' so the
                # whole location splits into
                # [species, <type>, chrom, start, stop, orientation].
                locdata = str(region.Region.Location).replace(
                    '-', ':', 1).replace(' ', '_').split(':')
                species = str(locdata[0])
                target_rows = rows_by_species.get(species)
                if target_rows is None:
                    continue

                chrm = 'chr' + str(locdata[2])
                start = locdata[3]
                stop = locdata[4]
                orientation = str(locdata[5])
                # Orientation is relative to the query: '1' keeps the
                # UTR's strand and '-1' flips it.
                if orientation == '1':
                    strand = UTR.strand
                elif orientation == '-1':
                    strand = opposite[UTR.strand]
                else:
                    # Previously the strand of an earlier region was reused
                    # silently for unexpected orientation values.
                    continue

                ID = (str(UTR.id) + '_' + species)
                target_rows.append([
                    chrm, 'ALE', '3\'UTR', start, stop, '.', strand, '.', ID
                ])

    os.remove(db_fn)
    sys.stderr.write(
        'Succesfully found matches in {0} rat regions, {1} human regions, {2} cow regions and {3} dog regions.\n'
        .format(len(ratgff), len(humangff), len(cowgff), len(doggff)))
    return ratgff, humangff, cowgff, doggff
예제 #13
0
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# cogent and sqlalchemy modules need to be installed:
# pip2 install cogent
# pip2 install sqlalchemy
from cogent.db.ensembl import HostAccount, Species, Genome
from pypath import mapping

# Anonymous account on the public Ensembl server, human genome release 78.
Release = 78
account = HostAccount('ensembldb.ensembl.org', 'anonymous', '')
human = Genome(Species='human', Release=Release, account=account)

# Query sites as (UniProt accession, sequence offset, residue, isoform).
positions = [('P00533', 40, 'Q', 1), ('P60520', 30, 'P', 1)]

m = mapping.Mapper()
m.load_uniprot_mappings(['ensg'], bi=True)

# For every query site, every mapped Ensembl gene ID and every gene matching
# that ID, record (ensg, gene location, canonical transcript exons) followed
# by the fields of the original query tuple.
positions_ens = []
for p in positions:
    ensgs = m.map_name(p[0], 'uniprot', 'ensg')
    for ensg in ensgs:
        genes = human.getGenesMatching(StableId=ensg)
        for gene in genes:
            record = (ensg, gene.Location, gene.CanonicalTranscript.Exons)
            positions_ens.append(record + tuple(p))

# another attempt with biopython --
예제 #14
0
reference files. It uses the genomic coordinates to pull the sequence from the
genome and reverse complement if on the negative strand. The sequence currently
in the reference file may refer to the sequence synthesized on the chip, in which
case it is the version with the lowest A's (easier to synthesize). It could also
be the sequence without strand consideration. To be sure, let's just pull from 
the genome.
'''

import MySQLdb
# use cogent to extract sequences from the genome
from cogent.db.ensembl import HostAccount, Genome
import pandas as pd

# Connection to the public Ensembl server, given as host, user, password
# and port, and the human genome object (release 78) built on it.
pycog = HostAccount('ensembldb.ensembl.org', 'anonymous', '', 3306)
hs37 = Genome('human', Release=78, account=pycog)

# Load the tab-separated reference table.
data = pd.read_table('../produced_data/splicemod_data_clean.txt', sep='\t')

# The intron/exon length columns are stored as integers; missing values are
# replaced by 0 first so the conversion from float is possible.
_length_columns = ['intron1_len', 'exon_len', 'intron2_len']
data[_length_columns] = data[_length_columns].fillna(0.0).astype(int)

# Keep only the rows that have chromosome, start and end information.
_has_coordinates = (
    data.chr.notnull() & data.start.notnull() & data.end.notnull())
lib = data[_has_coordinates]


def grab_region(chr, start, end, strand, genome):