def store(cursor, in_path, infile):

    inf   = erropen (in_path+"/"+infile, "r")

    print "storing contents of ", in_path, " file ", infile
    
    ct = 0
    start = time()
    for line in inf:
        ct += 1
        if (not ct%1000):
            print "     %5d    %8.3f" % (ct,  time()-start);
            start = time()

        fixed_fields    = {}
        update_fields   = {}


        line   = line.rstrip()
        field = line.split("\t")
        if len(field) < 4: continue
        [human_stable_id, cognate_stable_id, species, common_name]  =  field
  
        fixed_fields ['ensembl_gene_id'] = human_stable_id  
        fixed_fields ['species']         = species  
  
        update_fields['cognate_gene_id'] = cognate_stable_id
        update_fields['common_name']     = common_name
 
        store_or_update (cursor, 'ortholog', fixed_fields, update_fields)

    inf.close()
Example #2
0
def dump_paralogues(species_list, db_info):

    [local_db, ensembl_db_name, outdir] = db_info
    db = connect_to_mysql()
    cursor = db.cursor()

    for species in species_list:
        print
        print "############################"
        print species
        qry = "use " + ensembl_db_name[species]
        search_db(cursor, qry)

        outfile = "{0}/{1}_para_dump.txt".format(outdir, species)
        print outfile
        #continue
        of = erropen(outfile, "w")
        if not of: continue

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        para_table = 'paralogue'

        ct = 0
        seen = []
        for gene_id in gene_ids:
            ct += 1
            if not ct % 100:
                print "\t", species, "   ", ct, "out of", len(gene_ids)

            if gene_id in seen: continue

            stable_id = gene2stable(cursor, gene_id)

            paralogues = read_paralogues(cursor, gene_id)

            if (paralogues):
                # dump
                for para in paralogues:
                    print >> of, stable_id, para
                seen += paralogues

        of.close()

    cursor.close()
    db.close()
def dump_paralogues(species_list, db_info):
    
    [local_db, ensembl_db_name, outdir] = db_info
    db     = connect_to_mysql()
    cursor = db.cursor()


    for species in species_list:
        print
        print "############################"
        print  species
        qry = "use " + ensembl_db_name[species]
        search_db(cursor, qry)
        
        outfile  = "{0}/{1}_para_dump.txt".format(outdir, species)
        print outfile
        #continue
        of       = erropen (outfile,"w")
        if not of: continue
        
        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')

        para_table = 'paralogue'

        ct   =  0
        seen = []
        for gene_id in gene_ids:
            ct += 1
            if not ct%100: print "\t", species, "   ", ct, "out of", len(gene_ids)

            if gene_id in seen: continue

            stable_id  = gene2stable(cursor, gene_id)
                
            paralogues = read_paralogues(cursor, gene_id)
            
            if ( paralogues):
                # dump
                for para in paralogues:
                    print >> of,  stable_id, para
                seen += paralogues
                
        of.close()
 
    cursor.close()
    db.close()
def store(cursor, in_path, infile):

    table = 'name_resolution'
    inf   = erropen (in_path+"/"+infile, "r")

    print "storing contents of ", infile
    ct = 0
    for line in inf:

        ct += 1

        fixed_fields    = {}
        update_fields   = {}


        line   = line.rstrip()
        fields = line.split("\t")
        #if not ct%100:
        print ct, fields[0]
        if  'ENSG' in fields[-1]: 
            ensembl_gene_id = fields[-1]
        else:
            continue


        # check we are tracking that gene (for example, if it is pseudo, we are not)
        qry  = "select count(1) from exon_homo_sapiens where ensembl_gene_id  = '%s'" % ensembl_gene_id
        rows = search_db(cursor, qry)

        if not rows or not rows[0][0]: continue

        for field in fields[:-1]:

            if not field.replace (' ',''): continue

            fixed_fields ['synonym']   = field.replace("'", "").upper()
            fixed_fields ['stable_id'] = ensembl_gene_id
            store_or_update (cursor, table, fixed_fields, update_fields)

    inf.close()
def dump_orthos (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

     # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)

    # in the afa headers use 'trivial' names for the species: cow, dog, pig, ...
    trivial_name   = translate_to_trivial(cursor, all_species)

    out_path = cfg.get_path('afs_dumps')
    outfile  = "{0}/orthologue_dump.txt".format(out_path)
    print outfile
    of       = erropen (outfile,"w")

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])


    qry = "select * from orthologue"
    rows = search_db (cursor, qry)
    for row in rows:
        [pair_id, human_gene_id, cognate_gene_id, genome_db_id, source] =  row
        species = genome_db_id2species (cursor, genome_db_id)
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        human_stable_id = gene2stable(cursor, human_gene_id)
        switch_to_db (cursor,  ensembl_db_name[species])
        cognate_stable_id = gene2stable(cursor, cognate_gene_id)
        print  >>of,  orthos_tabstring ([human_stable_id, cognate_stable_id, species, trivial_name[species]])


    of.close()
    
    cursor.close()
    db    .close()
Example #6
0
def get_theme_ids(cursor, cfg, theme_name):
    resources = cfg.dir_path['resources']
    fnm = resources + '/' + theme_name+'.txt'
    if not os.path.exists(fnm):
        print fnm, "not found"
        exit(1)

    if not os.path.getsize(fnm) > 0:
        print fnm, "empty"
        exit(1)
        
    inf = erropen(fnm, "r")
    gene_ids = []
    for line in inf:
        line.rstrip()
        [stable_id, name] = line.split("\t")
        qry = "select gene_id, description from gene where stable_id='%s'"% stable_id
        rows = search_db (cursor, qry)
        if not rows: continue
        gene_ids.append(rows[0][0])
    inf.close()

    return gene_ids
def store(cursor, infile):

    inf = erropen(infile, "r")

    total        = 0
    id_not_found = 0
    for line in inf:
        line.rstrip()
        total += 1
        if not total%1000: print "\t", total
        if ( len(line.split()) !=  2 or not 'ENS' in line):
            continue
        [stable_id1, stable_id2] = line.split()
        fixed_fields    = {}
        update_fields   = {}
        
        fixed_fields['gene_id1'] = stable_id1
        fixed_fields['gene_id2'] = stable_id2

        store_or_update (cursor, 'paralog', fixed_fields, update_fields)

    print "done with ", infile, "total ",  total

    inf.close ()
Example #8
0
def store(cursor, infile):

    inf = erropen(infile, "r")

    total = 0
    id_not_found = 0
    for line in inf:
        line.rstrip()
        total += 1
        if not total % 1000: print "\t", total
        if (len(line.split()) != 2 or not 'ENS' in line):
            continue
        [stable_id1, stable_id2] = line.split()
        fixed_fields = {}
        update_fields = {}

        fixed_fields['gene_id1'] = stable_id1
        fixed_fields['gene_id2'] = stable_id2

        store_or_update(cursor, 'paralog', fixed_fields, update_fields)

    print "done with ", infile, "total ", total

    inf.close()
Example #9
0
def main():
    """Write per-gene background data for mutation significance estimates.

    For each gene id returned by get_gene_ids() for homo_sapiens the coding
    cDNA is reassembled from the canonical coding exons (re-creating codons
    split across exon boundaries), translated, and compared with the
    canonical translation.  When the two cannot be reconciled the gene is
    reported in error.log and skipped.  Otherwise a record is appended to
    mut_significance_bg_data.txt with:
      - the CpG nucleotides and their immediate context,
      - per mutation category, the number of silent / nonsense / missense
        single-nucleotide substitutions that are possible,
      - the codon behind every position of the translated sequence.

    NOTE(review): 'categories' and 'category_dict' are not defined in this
    function; presumably they are module-level tables set up by
    fill_category() - confirm.
    """
    db = connect_to_mysql()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    logf = erropen("error.log", "w")
    if not logf: exit(1)

    outf = erropen("mut_significance_bg_data.txt", "w")
    if not outf: exit(1)

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
    gene_ids = get_gene_ids(cursor,
                            biotype='protein_coding',
                            is_known=1,
                            ref_only=True)

    # the categories of mutations for which we will be collecting statistics
    fill_category()
    # for each human gene
    for gene_id in gene_ids:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        stable_id = gene2stable(cursor, gene_id)

        # find all canonical coding  human exons
        # get_canonical_coding_exons also sorts exons by the start in the gene
        canonical_human_exons = get_canonical_coding_exons(
            cursor, gene_id, ensembl_db_name['homo_sapiens'])

        # bail out if there is a problem
        if not canonical_human_exons: continue

        full_reconstituted_cDNA = ""
        prev_codon_piece_plus_right_flank = ""
        for human_exon in canonical_human_exons:
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
             left_flank, right_flank, nucseq] = \
                get_exon_seqs(cursor, human_exon.exon_id, human_exon.is_known)
            # add the split codon
            phase = get_exon_phase(cursor, human_exon.exon_id,
                                   human_exon.is_known)
            left_flank_plus_codon_piece = left_flank + nucseq[:pepseq_transl_start]
            split_codon = ""
            if phase > 0 and prev_codon_piece_plus_right_flank and left_flank:
                offset = (3 - phase) % 3
                # hedge against the possibility that the translation starts
                # right at the start of the exon, but there is supposed to be a phase
                split_codon = (prev_codon_piece_plus_right_flank[:phase]
                               + left_flank_plus_codon_piece[-offset:])
            full_reconstituted_cDNA += split_codon + nucseq[pepseq_transl_start:pepseq_transl_end]
            # what follows the translated region may start the next split codon
            prev_codon_piece_plus_right_flank = nucseq[pepseq_transl_end:] + right_flank

        mitochondrial = is_mitochondrial(cursor, gene_id)
        if mitochondrial:
            full_reconstituted_seq = Seq(full_reconstituted_cDNA).translate(
                table="Vertebrate Mitochondrial").tostring()
        else:
            full_reconstituted_seq = Seq(
                full_reconstituted_cDNA).translate().tostring()

        canonical = get_canonical_transl(acg,
                                         cursor,
                                         gene_id,
                                         'homo_sapiens',
                                         strip_X=False)
        # startswith/endswith instead of [0] / [-1]: indexing an empty
        # sequence raised IndexError and stopped the whole run
        if canonical.startswith('X'):
            # apparently the wrong transcript is annotated as canonical
            print >> logf, "warning", gene_id, stable_id, get_description(
                cursor, gene_id)
            print >> logf, "the deposited canonical sequence starts with X - is there an alternative (?)"
            canonical = canonical[1:]

        if full_reconstituted_seq.endswith('*') and not canonical.endswith('*'):
            canonical += '*'
        if full_reconstituted_seq != canonical:

            if (len(canonical) - len(full_reconstituted_seq) < 3
                    and full_reconstituted_seq in canonical):
                # go with it: a couple of residues missing at either end is tolerated
                print >> logf, "warning", gene_id, stable_id, get_description(
                    cursor, gene_id)
                print >> logf, "missing a couple of amino acids in beginning or in the end"
            else:
                print >> logf, "error", gene_id, stable_id, get_description(
                    cursor, gene_id)
                print >> logf, "error reassembling,  len(full_reconstituted_seq) != len(canonical) ", len(
                    full_reconstituted_seq), len(canonical)
                print >> logf, "canonical:"
                print >> logf, canonical
                print >> logf, "reconstituted:"
                print >> logf, full_reconstituted_seq
                continue

        # nucleotide stats
        count = {'A': 0, 'C': 0, 'C-CpG': 0, 'T': 0, 'G': 0, 'G-CpG': 0}
        is_CpG = {}
        for i in range(len(full_reconstituted_cDNA)):
            is_CpG[i] = False
            if full_reconstituted_cDNA[i] == 'A':
                count['A'] += 1
            elif full_reconstituted_cDNA[i] == 'T':
                count['T'] += 1
            elif full_reconstituted_cDNA[i] == 'C':
                # C followed by G
                if (i + 1 < len(full_reconstituted_cDNA)
                        and full_reconstituted_cDNA[i + 1] == 'G'):
                    count['C-CpG'] += 1
                    is_CpG[i] = True
                else:
                    count['C'] += 1
            elif full_reconstituted_cDNA[i] == 'G':
                # G preceded by C
                if i > 0 and full_reconstituted_cDNA[i - 1] == 'C':
                    count['G-CpG'] += 1
                    is_CpG[i] = True
                else:
                    count['G'] += 1

        # for each category, count how many missense, nonsense and silent
        # single-nucleotide substitutions are possible
        # (consecutive triplets; an incomplete trailing triplet is dropped)
        codons = map(''.join, zip(*[iter(full_reconstituted_cDNA)] * 3))
        silent = {}
        missense = {}
        nonsense = {}
        for cg in categories:
            silent[cg] = 0
            missense[cg] = 0
            nonsense[cg] = 0
        for i in range(len(codons)):
            codon = codons[i]
            aa = full_reconstituted_seq[i]
            for j in range(3):
                nt_position = i * 3 + j
                nt = full_reconstituted_cDNA[nt_position]
                for new_nt in ['A', 'C', 'T', 'G']:
                    if new_nt == nt: continue
                    mutated_codon = mutate(codon, j, new_nt)
                    if mitochondrial:
                        mutated_aa = Seq(mutated_codon).translate(
                            table="Vertebrate Mitochondrial").tostring()
                    else:
                        mutated_aa = Seq(mutated_codon).translate().tostring()
                    cg = category_dict[codon[j]][new_nt][is_CpG[nt_position]]
                    if not cg or not cg in categories:
                        print >> logf, "category problem in ", gene_id, stable_id, get_description(
                            cursor, gene_id)
                        print >> logf, codon, mutated_codon, j, codon[
                            j], new_nt, is_CpG[nt_position], cg
                        print >> logf, i, j, nt_position, nt
                        print >> logf, aa, mutated_aa
                        continue
                    if mutated_aa == aa:
                        silent[cg] += 1
                    elif mutated_aa == "*":
                        nonsense[cg] += 1
                    else:
                        missense[cg] += 1

        print >> outf, stable_id, get_description(cursor, gene_id)
        print >> outf, "# CpG nucleotides (format: cdna_position|nucleotide|codon|context; )"
        print >> outf, "# ('context' contains one nucleotide before and one after the CpG nucleotide)"

        outstr = ""
        for i in range(len(full_reconstituted_cDNA)):
            if is_CpG[i]:
                context = ""
                if i > 0: context += full_reconstituted_cDNA[i - 1]
                context += full_reconstituted_cDNA[i]
                if i < len(full_reconstituted_cDNA) - 1:
                    context += full_reconstituted_cDNA[i + 1]
                # NOTE(review): codons[i / 3] relies on the cDNA length being
                # a multiple of 3 - a CpG in a trailing partial codon would
                # index past the end of 'codons'
                outstr += "%d|%s|%s|%s;" % (i + 1, full_reconstituted_cDNA[i],
                                            codons[i / 3], context)
        print >> outf, outstr

        print >> outf, "# mutations possible (in principle)"
        print >> outf, "# %10s  %5s  %5s  %5s" % ("category", "silent",
                                                  "nonsense", "missense")
        for cg in categories:
            print >> outf, "%10s  %5d  %5d  %5d" % (cg, silent[cg],
                                                    nonsense[cg], missense[cg])

        print >> outf, "# canonical sequence (format: <amino_acid><position_on_peptide_chain><codon>;):"
        outstr = ""
        # the per-codon translation that used to be computed here was never
        # used; the amino acid is taken from the full translation
        for i in range(len(codons)):
            outstr += "%s%d%s;" % (full_reconstituted_seq[i], i + 1, codons[i])
        print >> outf, outstr

        print >> outf, stable_id, "done"

    logf.close()
    # the data file and the database handles were left open before
    outf.close()
    cursor.close()
    db.close()
def multiple_exon_alnmt(gene_list, db_info):


    print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list))

    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # for each human gene
    gene_ct = 0
    tot  = 0
    ok   = 0
    no_maps        = 0
    no_pepseq      = 0
    no_orthologues = 0
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    #gene_list.reverse()
    for gene_id in gene_list:

        start = time()
        gene_ct += 1
        if  not gene_ct%10: print gene_ct, "genes out of", len(gene_list)

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene_ct, len(gene_ids),  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id))
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        ##################################################################
        for human_exon in human_exons:
            
            tot += 1

            # find all orthologous exons the human exon  maps to
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            if verbose: 
                print "\texon no.", tot, " id", human_exon.exon_id,
                if not maps: 
                    print " no maps"
                    print human_exon
                print 
            if not maps: 
                no_maps += 1
                continue

  
            # human sequence to fasta:
            seqname   = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known)
            switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
             left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known)
            if (not pepseq):
                if verbose and  human_exon.is_coding and  human_exon.covering_exon <0: # this should be a master exon
                    print "no pep seq for",  human_exon.exon_id, "coding ", human_exon.is_coding,
                    print "canonical: ",  human_exon.is_canonical
                    print "length of dna ", len(dna_seq)
                no_pepseq += 1
                continue

            # collect seq from all maps, and output them in fasta format
            hassw = False
            headers   = []
            sequences = {}
            exons_per_species = {}

            for map in maps:

                switch_to_db (cursor, ensembl_db_name[map.species_2])
                if map.similarity < min_similarity: continue
                exon    = map2exon(cursor, ensembl_db_name, map)
                pepseq  = get_exon_pepseq (cursor,exon)
                if (not pepseq):
                    continue
                if  map.source == 'sw_sharp':
                    exon_known_code = 2
                    hassw = True
                elif  map.source == 'usearch':
                    exon_known_code = 3
                    hassw = True
                else:
                    exon_known_code = map.exon_known_2
                seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code)
                headers.append(seqname)
                sequences[seqname] = pepseq
                # for split exon concatenation (see below)
                if not map.species_2 in exons_per_species.keys():
                    exons_per_species[map.species_2] = []
                exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]);
                
                    
            if (len(headers) <=1 ):
                if verbose: print "single species in the alignment"
                no_orthologues += 1
                continue
            
            # concatenate exons from the same gene - the alignment program might go wrong otherwise
            concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species)

            fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            output_fasta (fasta_fnm, sequences.keys(), sequences)

            # align
            afa_fnm  = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
            ret      = commands.getoutput(mafftcmd)

            if (verbose): print 'almt to', afa_fnm

            # read in the alignment 
            inf = erropen(afa_fnm, "r")
            aligned_seqs = {}
            for record in SeqIO.parse(inf, "fasta"):
                aligned_seqs[record.id] = str(record.seq)
            inf.close()
            # split back the concatenated exons
            if concatenated: split_concatenated_exons (aligned_seqs, concatenated)

            human_seq_seen = False
            for seq_name, sequence in aligned_seqs.iteritems():
                # if this is one of the concatenated seqs, split them back to two

                ### store the alignment as bitstring
                # Generate the bitmap
                bs         = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0')))
                # The returned value of tobytes() will be padded at the end 
                # with between zero and seven 0 bits to make it byte aligned.
                # I will end up with something that looks like extra alignment gaps, that I'll have to return
                msa_bitmap = bs.tobytes() 
                # Retrieve information on the cognate
                cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':')
                if cognate_exon_known == '2':
                    source = 'sw_sharp'
                elif cognate_exon_known == '3':
                    source = 'usearch'
                else:
                    source = 'ensembl'
                if (cognate_species == 'homo_sapiens'):
                    human_seq_seen = True
                cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor
                switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens
                # Write the bitmap to the database
                #if (cognate_species == 'homo_sapiens'):
                if verbose: # and (source=='sw_sharp' or source=='usearch'):
                    print "storing"
                    print human_exon.exon_id, human_exon.is_known
                    print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source
                    print sequence
                    if not msa_bitmap:
                        print "no msa_bitmap"
                        continue
                store_or_update(cursor, "exon_map",    {"cognate_genome_db_id":cognate_genome_db_id,
                   "cognate_exon_id":cognate_exon_id   ,"cognate_exon_known"  :cognate_exon_known,
                   "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known},
                  {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                 
            ok += 1
            commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)

        if verbose: print " time: %8.3f\n" % (time()-start);

    print "tot: ", tot, "ok: ", ok
    print "no maps ",   no_pepseq
    print "no pepseq ", no_pepseq
    print "no orthologues  ", no_orthologues
    print
Example #11
0
def store(cursor, table,  in_path, infile, species):

    inf   = erropen (in_path+"/"+infile, "r")
    if not inf: exit(1) 


    print "storing contents of ", in_path, " file ", infile
    
    ct = 0
    start = time()
    for line in inf:
        ct += 1
        if (not ct%10000):
            print "   %s   %5d    %8.3f" % (species, ct,  time()-start)
            sys.stdout.flush()
            start = time()

        fixed_fields    = {}
        update_fields   = {}


        line   = line.rstrip()
        field  = line.split("\t")
        if len(field) < 18: 
            print "number of fields smaller than expected"
            continue

        exon_id         = int(field[0])
        ensembl_gene_id =     field[1]
        ensembl_exon_id =     field[2]
        start_in_gene   = int(field[3])
        end_in_gene     = int(field[4])
        strand          = int(field[5])
        is_known        = int(field[6])
        is_coding       = int(field[7])
        is_canonical    = int(field[8])
        is_constitutive = int(field[9])
        species         =     field[10]
        source          =     field[11]
        #if source == 'sw_sharp' or source=='usearch':
        #    human_exon      = field[12]
        #    protein_seq     = field[13]
        # here I have two fields showing where the peptide translation starts and where it ends
        #   left_flank      = field[16]
        #    right_flank     = field[17]
        #    dna_seq         = field[18]
        #    fixed_fields['maps_to_human_exon_id'] = human_exon
        #else:
        protein_seq     =     field[12]
        # here I have two fields showing where the peptide translation starts and where it ends
        left_flank      =     field[15]
        right_flank     =     field[16]
        dna_seq         =     field[17]


        exon_key = ensembl_gene_id + "_" + str(exon_id) + "_" + str(is_known)
        fixed_fields ['exon_key']        = exon_key  
  
        update_fields['ensembl_gene_id'] = ensembl_gene_id
        update_fields['ensembl_exon_id'] = ensembl_exon_id
        update_fields['start_in_gene']   = start_in_gene 
        update_fields['end_in_gene']     = end_in_gene  
        update_fields['strand']          = strand      
        update_fields['is_known']        = is_known    
        update_fields['is_coding']       = is_coding    
        update_fields['is_canonical']    = is_canonical 
        update_fields['is_constitutive'] = is_constitutive
        update_fields['species']         = species         
        update_fields['source']          = source      
        update_fields['protein_seq']     = protein_seq  
        update_fields['left_flank']      = left_flank   
        update_fields['right_flank']     = right_flank  
        update_fields['dna_seq']         = dna_seq     

        store_or_update (cursor, table, fixed_fields, update_fields)

    inf.close()
def multiple_exon_alnmt(species_list, db_info):


    [local_db, ensembl_db_name] = db_info

    verbose  = False

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()


    for species in species_list:

        print
        print "############################"
        print  species

        switch_to_db (cursor,  ensembl_db_name[species])
        gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway')
        if not gene_ids:
            print "no gene_ids"
            continue


        gene_ct       = 0
        tot           = 0
        ok            = 0
        no_maps       = 0
        no_pepseq     = 0
        no_paralogues = 0
        for gene_id in gene_ids:

            if verbose: start = time()
            gene_ct += 1
            if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids)
            if verbose: 
                print
                print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)

            # get the paralogues - only the representative for  the family will have this 
            paralogues = get_paras (cursor, gene_id)  
            if not paralogues:
                if verbose:  print "\t not a template or no paralogues"
                continue

            if verbose:  print "paralogues: ", paralogues

            # get _all_ exons
            template_exons = gene2exon_list(cursor, gene_id)
            if (not template_exons):
                if verbose: print 'no exons for ', gene_id
                continue

            # find all template  exons we are tracking in the database
            for template_exon in template_exons:

                if verbose: print template_exon.exon_id
                maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id,
                                template_exon.is_known, species=species, table='para_exon_map')

                if not maps:
                    no_maps += 1
                    continue

                # output to fasta:
                seqname        = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known)
                exon_seqs_info =  get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known)
                if not exon_seqs_info: continue
                [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
                 left_flank, right_flank, dna_seq] = exon_seqs_info
                if (not pepseq):
                    if ( template_exon.is_coding and  template_exon.covering_exon <0): # this should be a master exon
                        print "no pep seq for",  template_exon.exon_id, "coding ", template_exon.is_coding,
                        print "canonical: ",  template_exon.is_canonical
                        print "length of dna ", len(dna_seq)
                        no_pepseq += 1
                    continue
                
                tot += 1

                sequences = {seqname:pepseq}
                headers   = [seqname]
                for map in maps:
                    exon    = map2exon(cursor, ensembl_db_name, map, paralogue=True)
                    pepseq  = get_exon_pepseq (cursor,exon)
                    if (not pepseq):
                        continue
                    seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2)
                    headers.append(seqname)
                    sequences[seqname] = pepseq

                fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                output_fasta (fasta_fnm, headers, sequences)

                if (len(headers) <=1 ):
                    print "single species in the alignment (?)"
                    no_paralogues += 1
                    continue

                # align
                afa_fnm  = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
                ret      = commands.getoutput(mafftcmd)

                # read in the alignment
                inf = erropen(afa_fnm, "r")
                if not inf:
                    print gene_id
                    continue
                template_seq_seen = False
                for record in SeqIO.parse(inf, "fasta"):
                    ### store the alignment as bitstring
                    # Generate the bitmap
                    bs         = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0')))
                    msa_bitmap = bs.tobytes()
                    # Retrieve information on the cognate
                    label, cognate_exon_id, cognate_exon_known = record.id.split(':')
                    if (label == 'template'):
                        template_seq_seen = True
                    # Write the bitmap to the database
                    #print "updating: ", template_exon.exon_id
                    store_or_update(cursor, "para_exon_map", {"cognate_exon_id"    :cognate_exon_id,
                                                         "cognate_exon_known" :cognate_exon_known,
                                                         "exon_id"            :template_exon.exon_id,
                                                         "exon_known"         :template_exon.is_known},
                                    {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                inf.close()
                ok += 1
                commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)
            if verbose: print " time: %8.3f\n" % (time()-start);
 
        outstr  =  species + " done \n"
        outstr +=  "tot: %d   ok: %d  \n" % (tot,  ok)
        outstr +=  "no maps       %d  \n" % no_pepseq
        outstr +=  "no pepseq     %d  \n" % no_pepseq
        outstr +=  "no paralogues %d  \n" % no_paralogues
        outstr += "\n"
        print outstr
Example #13
0
def dump_exons(species_list, db_info):

    [local_db, ensembl_db_name] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    out_path = "{0}/exons".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(out_path):
        print out_path, "not found"
        exit(1)  # exit on failed output dir check

    for species in species_list:
        #if (not species=='homo_sapiens'):
        #    continue
        outfile = "{0}/{1}_exon_dump.txt".format(out_path, species)
        of = erropen(outfile, "w")
        if not of: continue
        switch_to_db(cursor, ensembl_db_name[species])

        if (species == 'homo_sapiens'):
            gene_ids = get_gene_ids(cursor,
                                    biotype='protein_coding',
                                    is_known=1,
                                    ref_only=True)
        else:
            gene_ids = get_gene_ids(cursor, biotype='protein_coding')

        source = get_analysis_dict(cursor)

        ct = 0
        for gene_id in gene_ids:
            ct += 1
            if (not ct % 1000):
                print species, ct, len(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for ', gene_id
                continue

            for exon in exons:

                if exon.covering_exon > 0: continue
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    continue
                # human readable string describing the source of annotation for this exon
                if exon.is_known == 2:
                    analysis = 'sw_sharp'
                elif exon.is_known == 3:
                    analysis = 'usearch'
                else:
                    analysis = source[exon.analysis_id]
                # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it
                gene_stable_id = gene2stable(cursor, gene_id)
                if (exon.is_known == 1):
                    exon_stable_id = exon2stable(cursor, exon.exon_id)
                elif (exon.is_known == 2):
                    exon_stable_id = 'sw_sharp_' + str(exon.exon_id)
                elif (exon.is_known == 3):
                    exon_stable_id = 'usearch_' + str(exon.exon_id)
                else:
                    exon_stable_id = "anon"

                print >> of, exon_tabstring(exon, gene_stable_id,
                                            exon_stable_id, species, analysis,
                                            exon_seqs[1:])

        of.close()
        print species, "done"

    cursor.close()
    db.close()
def _translate(nucleotides, mitochondrial):
    """Translate a nucleotide string into a peptide string.

    Uses the "Vertebrate Mitochondrial" codon table when *mitochondrial*
    is true, and the default table of Seq.translate() otherwise.
    """
    if mitochondrial:
        return Seq(nucleotides).translate(table="Vertebrate Mitochondrial").tostring()
    return Seq(nucleotides).translate().tostring()


def _reconstitute_cdna(cursor, exons):
    """Stitch the translated parts of *exons* into a single cDNA string.

    *exons* are processed in the order given.  Returns None if
    get_exon_seqs has nothing to offer for one of the exons.
    """
    cdna = ""
    prev_codon_piece_plus_right_flank = ""
    for exon in exons:
        exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
        if not exon_seqs:
            return None
        [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
         left_flank, right_flank, nucseq] = exon_seqs

        # add the split codon
        phase = get_exon_phase(cursor, exon.exon_id, exon.is_known)
        left_flank_plus_codon_piece = left_flank + nucseq[:pepseq_transl_start]
        split_codon = ""
        # hedge against the possibility that the translation starts
        # right at the start of the exon, but there is supposed to be a phase
        if phase > 0 and prev_codon_piece_plus_right_flank and left_flank:
            offset = (3 - phase) % 3
            split_codon = (prev_codon_piece_plus_right_flank[:phase]
                           + left_flank_plus_codon_piece[-offset:])
        cdna += split_codon + nucseq[pepseq_transl_start:pepseq_transl_end]
        prev_codon_piece_plus_right_flank = nucseq[pepseq_transl_end:] + right_flank
    return cdna


def _flag_cpg(cdna):
    """Return a list of booleans, one per nucleotide of *cdna*.

    An entry is True for a 'C' immediately followed by 'G' and for a 'G'
    immediately preceded by 'C'; it is False everywhere else.
    """
    length = len(cdna)
    flags = [False] * length
    for i in range(length):
        if cdna[i] == 'C':
            flags[i] = i + 1 < length and cdna[i + 1] == 'G'
        elif cdna[i] == 'G':
            flags[i] = i > 0 and cdna[i - 1] == 'C'
    return flags


def _dump_gene_background(cursor, acg, ensembl_db_name, gene_id, outf, logf):
    """Write the mutation background data for one human gene to *outf*.

    The cDNA is reassembled from the canonical coding exons and its
    translation is checked against the canonical translation.  Problems
    are reported to *logf*; a gene that cannot be reassembled produces no
    output in *outf*.
    """
    human_db = ensembl_db_name['homo_sapiens']
    # NOTE(review): re-selected for every gene, presumably because some of
    # the helpers below may leave the cursor on another database - confirm
    switch_to_db(cursor, human_db)
    stable_id = gene2stable(cursor, gene_id)

    # find all canonical coding human exons
    # get_canonical_coding_exons also sorts exons by the start in the gene
    canonical_human_exons = get_canonical_coding_exons(cursor, gene_id, human_db)

    # bail out if there is a problem
    if not canonical_human_exons: return

    full_reconstituted_cDNA = _reconstitute_cdna(cursor, canonical_human_exons)
    if full_reconstituted_cDNA is None:
        print >> logf, "error", gene_id, stable_id, get_description(cursor, gene_id)
        print >> logf, "no sequence data for one of the canonical exons"
        return

    mitochondrial = is_mitochondrial(cursor, gene_id)
    full_reconstituted_seq = _translate(full_reconstituted_cDNA, mitochondrial)

    canonical = get_canonical_transl(acg, cursor, gene_id, 'homo_sapiens', strip_X = False)
    # slices rather than indices, so that an empty sequence ends up in the
    # error branch below instead of raising IndexError
    if canonical[:1] == 'X':  # apparently a wrong transcript is annotated as canonical
        print >> logf, "warning", gene_id, stable_id,  get_description(cursor, gene_id)
        print >> logf, "the deposited canonical sequence starts with X - is there an alternative (?)"
        canonical = canonical[1:]

    if full_reconstituted_seq[-1:] == '*' and canonical[-1:] != '*':
        canonical += '*'
    if full_reconstituted_seq != canonical:

        if len(canonical) - len(full_reconstituted_seq) < 3 and full_reconstituted_seq in canonical:
            # go with it - there are not that many of these cases anyway
            print >> logf, "warning", gene_id, stable_id,  get_description(cursor, gene_id)
            print >> logf, "missing a couple of amino acids in beginning or in the end"
        else:
            print >> logf, "error" , gene_id, stable_id, get_description(cursor, gene_id)
            print >> logf, "error reassembling,  len(full_reconstituted_seq) != len(canonical) ", len(full_reconstituted_seq) , len(canonical)
            print >> logf, "canonical:"
            print >> logf, canonical
            print >> logf, "reconstituted:"
            print >> logf, full_reconstituted_seq
            return

    is_CpG = _flag_cpg(full_reconstituted_cDNA)

    # complete codons only; a trailing piece shorter than 3 nt is dropped
    cdna_length = len(full_reconstituted_cDNA)
    codons = [full_reconstituted_cDNA[k:k + 3]
              for k in range(0, cdna_length - cdna_length % 3, 3)]

    # for each category (AT transt, AT transv, CG trans, CG transv, CpG trans,
    # CpG transv): how many missense, how many nonsense, and how many silent
    # single nucleotide changes are possible
    silent   = dict.fromkeys(categories, 0)
    missense = dict.fromkeys(categories, 0)
    nonsense = dict.fromkeys(categories, 0)
    for i, codon in enumerate(codons):
        aa = full_reconstituted_seq[i]
        for j in range(3):
            nt_position = i*3 + j
            nt = full_reconstituted_cDNA[nt_position]
            for new_nt in ['A', 'C', 'T', 'G']:
                if new_nt == nt: continue
                mutated_codon = mutate(codon, j, new_nt)
                mutated_aa = _translate(mutated_codon, mitochondrial)
                cg = category_dict[codon[j]][new_nt][is_CpG[nt_position]]
                if not cg or cg not in categories:
                    print >> logf, "category problem in ", gene_id, stable_id, get_description(cursor, gene_id)
                    print >> logf, codon, mutated_codon, j, codon[j], new_nt, is_CpG[nt_position], cg
                    print >> logf, i, j, nt_position, nt
                    print >> logf, aa, mutated_aa
                    continue
                if mutated_aa == aa:
                    silent[cg] += 1
                elif mutated_aa == "*":
                    nonsense[cg] += 1
                else:
                    missense[cg] += 1

    print >> outf, stable_id, get_description(cursor, gene_id)
    print >> outf, "# CpG nucleotides (format: cdna_position|nucleotide|codon|context; )"
    print >> outf, "# ('context' contains one nucleotide before and one after the CpG nucleotide)"

    cpg_entries = []
    for i in range(cdna_length):
        if not is_CpG[i]: continue
        context = full_reconstituted_cDNA[max(i - 1, 0):i + 2]
        # sliced from the cDNA rather than looked up in codons, so that a
        # nucleotide in a trailing incomplete codon does not raise IndexError
        codon_start = i - i % 3
        codon = full_reconstituted_cDNA[codon_start:codon_start + 3]
        cpg_entries.append("%d|%s|%s|%s;" % (i+1, full_reconstituted_cDNA[i], codon, context))
    print >> outf, "".join(cpg_entries)

    print >> outf,"# mutations possible (in principle)"
    print >> outf,"# %10s  %5s  %5s  %5s" % ("category", "silent", "nonsense", "missense")
    for cg in categories:
        print >> outf,"%10s  %5d  %5d  %5d" % (cg, silent[cg], nonsense[cg], missense[cg])

    print >> outf, "# canonical sequence (format: <amino_acid><position_on_peptide_chain><codon>;):"
    print >> outf, "".join("%s%d%s;" % (full_reconstituted_seq[i], i+1, codon)
                           for i, codon in enumerate(codons))

    print >> outf, stable_id,  "done"


def main():
    """Collect per-gene mutation background data for human genes.

    For every gene id returned by get_gene_ids (protein coding, known,
    reference only) a record is written to "mut_significance_bg_data.txt";
    warnings and errors go to "error.log".  The process exits with
    status 1 if either file cannot be opened.
    """
    db       = connect_to_mysql()
    acg      = AlignmentCommandGenerator()
    cursor   = db.cursor()

    # find db ids and common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)

    logf = erropen("error.log", "w")
    if not logf: exit(1)

    outf = erropen("mut_significance_bg_data.txt", "w")
    if not outf: exit(1)

    try:
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_ids = get_gene_ids(cursor, biotype='protein_coding', is_known=1, ref_only=True)

        # the categories of mutations for which we will be collecting statistics
        fill_category()
        # for each human gene
        for gene_id in gene_ids:
            _dump_gene_background(cursor, acg, ensembl_db_name, gene_id, outf, logf)
    finally:
        # release the files and the connection even if a gene blows up
        outf.close()
        logf.close()
        cursor.close()
        db.close()
Example #15
0
def dump_exons (species_list, db_info):

    
    [local_db, ensembl_db_name] = db_info
    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    cursor = db.cursor()

    out_path = "{0}/exons".format(cfg.get_path('afs_dumps'))
    if not os.path.exists(out_path):
        print out_path, "not found"
        exit (1) # exit on failed output dir check

    for species in species_list:
        #if (not species=='homo_sapiens'):
        #    continue
        outfile  = "{0}/{1}_exon_dump.txt".format(out_path, species)
        of       = erropen (outfile,"w")
        if not of:  continue
        switch_to_db (cursor,  ensembl_db_name[species])

        if (species=='homo_sapiens'):
            gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1, ref_only=True)
        else:
            gene_ids = get_gene_ids (cursor, biotype='protein_coding')

        source = get_analysis_dict(cursor)

        ct     = 0
        for gene_id in gene_ids:
            ct += 1
            if (not  ct%1000):
                print species, ct, len(gene_ids)

            # get _all_ exons
            exons = gene2exon_list(cursor, gene_id)
            if (not exons):
                print 'no exons for ', gene_id
                continue

            for exon in exons:

                if exon.covering_exon  > 0: continue
                # exons seqs are its aa translation, left_flank, right_flank, and dna_seq
                exon_seqs = get_exon_seqs(cursor, exon.exon_id, exon.is_known)
                if (not exon_seqs):
                    continue
                # human readable string describing the source of annotation for this exon
                if exon.is_known==2:
                    analysis = 'sw_sharp'
                elif exon.is_known==3:
                    analysis = 'usearch'
                else:
                    analysis = source[exon.analysis_id] 
                # the first field return by get_exon_seqs is the exon_seq_id, so get rid of it
                gene_stable_id = gene2stable(cursor,gene_id)
                if ( exon.is_known == 1):
                    exon_stable_id = exon2stable(cursor,exon.exon_id)
                elif ( exon.is_known == 2):
                    exon_stable_id = 'sw_sharp_'+str(exon.exon_id)
                elif ( exon.is_known == 3):
                    exon_stable_id = 'usearch_'+str(exon.exon_id)
                else:
                    exon_stable_id = "anon"

                print >> of, exon_tabstring (exon, gene_stable_id, exon_stable_id, species, analysis, exon_seqs[1:])


        of.close()
        print species, "done"
    
    cursor.close()
    db    .close()