def make_map_table(cursor, ensembl_db_name, all_species, human_exons):
    """Tabulate, per species, the map found for each human exon.

    Returns a dict keyed by species name; each value is a dict keyed by
    human exon whose value is the map returned by get_maps for that
    species, or None when there is none.  Species for which no human exon
    has a map are left out of the result.
    """
    # every (species, human exon) slot starts out empty
    table = dict((name, dict.fromkeys(human_exons)) for name in all_species)

    for human_exon in human_exons:
        exon_maps = get_maps(cursor, ensembl_db_name,
                             human_exon.exon_id, human_exon.is_known)
        for exon_map in exon_maps:
            # maps to species we are not tracking are ignored;
            # a later map for the same species overwrites an earlier one
            if exon_map.species_2 in all_species:
                table[exon_map.species_2][human_exon] = exon_map

    # drop the species that do not have the gene at all
    for name in all_species:
        if not any(table[name][human_exon] for human_exon in human_exons):
            del table[name]

    return table
def make_map_table(cursor, ensembl_db_name, all_species, human_exons):
    """Build the species -> human exon -> map lookup.

    Each entry holds the map get_maps reported for that species and human
    exon, or None if there was none.  A species is removed from the lookup
    when none of the human exons has a map in it.
    """
    lookup = {}
    for organism in all_species:
        # None marks "no map known for this exon in this species"
        lookup[organism] = dict((exon, None) for exon in human_exons)

    for exon in human_exons:
        candidates = get_maps(cursor, ensembl_db_name, exon.exon_id,
                              exon.is_known)
        for candidate in candidates:
            target = candidate.species_2
            if target not in all_species:
                continue
            lookup[target][exon] = candidate

    # get rid of species that do not have the gene at all
    for organism in all_species:
        for exon in human_exons:
            if lookup[organism][exon]:
                break
        else:
            # the loop ran to the end without finding a single mapped exon
            del lookup[organism]

    return lookup
def make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, nt=False):
    """Rebuild the alignment of all exons that map to one human exon.

    Each map returned by get_maps carries a bitmap in which a 1 bit stands
    for a residue of the mapped exon's peptide sequence and a 0 bit for a
    gap.  The gapped peptide is reconstructed from the bitmap; with nt set,
    it is passed through expand_pepseq (presumably to obtain the
    nucleotide-level alignment -- confirm against expand_pepseq).  Columns
    that are gaps in every sequence are removed before returning.

    cursor            -- database cursor handed on to the helper queries
    ensembl_db_name   -- dict: species name -> database name
    human_exon_id     -- id of the human exon
    human_exon_known  -- 'is_known' flag of the human exon
    nt                -- if true, return expand_pepseq output instead of
                         the gapped peptide

    Returns a dict: sequence name -> gap-stripped aligned sequence.  The
    dict is empty when no map yields a usable sequence.
    """
    sequence = {}
    # stays 0 when nothing usable is found, so the stripping below is a no-op
    # (previously this name was unbound in that case and the function crashed)
    aln_length = 0

    # find all other exons that map to the human exon
    maps = get_maps(cursor, ensembl_db_name, human_exon_id, human_exon_known)

    for exon_map in maps:
        # get the raw (unaligned) sequence for the exon that maps onto human
        exon_seqs = get_exon_seqs(cursor, exon_map.exon_id_2, exon_map.exon_known_2,
                                  ensembl_db_name[exon_map.species_2])
        if not exon_seqs or len(exon_seqs) < 7:
            continue
        # layout: [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end,
        #          left_flank, right_flank, dna_seq]
        pepseq = exon_seqs[1]

        bs = Bits(bytes=exon_map.bitmap)
        # the bitmap must have exactly one 1 bit per residue
        if bs.count(1) != len(pepseq):
            continue
        residues = iter(pepseq)
        reconst_pepseq = "".join('-' if bit == '0' else next(residues) for bit in bs.bin)

        # come up with a unique name for this sequence
        species = exon_map.species_2
        sequence_name = make_seq_name(cursor, ensembl_db_name, species,
                                      exon_map.exon_id_2, exon_map.exon_known_2, exon_seqs)
        if not sequence_name:  # for whichever reason we still do not have the name here
            sequence_name = "anon_" + species

        if nt:
            aligned = expand_pepseq(reconst_pepseq, exon_seqs[1:])
        else:
            aligned = reconst_pepseq
        if aligned:
            sequence[sequence_name] = aligned
            # NOTE(review): all stored sequences are assumed to have the same
            # length; the length of the last one stored is used for all
            aln_length = len(aligned)

    # strip common gaps: keep only the columns where some sequence has a non-gap
    kept = [pos for pos in range(aln_length)
            if any(seq[pos] != '-' for seq in sequence.values())]

    sequence_stripped = {}
    for name, seq in sequence.items():
        sequence_stripped[name] = "".join(seq[pos] for pos in kept)

    return sequence_stripped
def find_missing_exons(human_gene_list, db_info):
    """Search other species for exons that have no clean map to a human exon.

    For every gene id in human_gene_list the canonical, known human exons
    that are not covered by another exon are collected, and for each species
    the most similar map (similarity at least 'min_accptbl_exon_sim') to each
    of them is tabulated.  Then, for each species that has at least one map,
    every human exon without a warning-free map is passed to find_NNN
    together with the two sequence regions that bound the search.

    human_gene_list -- sequence of human gene ids
    db_info         -- [local_db, ensembl_db_name, method]; only method is
                       used here, the db names are re-read via get_species

    Returns None.  Progress and the sought/unsequenced counters are printed.
    Reads the module-level names verbose and MAX_SEARCH_LENGTH.
    """
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found = 0  # never incremented here; kept because the progress line reports it
    sought = 0
    unsequenced = 0
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose: print(human_gene_id, human_stable, human_description)

        # progress counter
        gene_ct += 1
        if not gene_ct % 10:
            print("processed ", gene_ct, " out of ", len(human_gene_list),
                  "genes")
            print("exons found: ", found, " out of ", sought, "sought")

        # find all human exons for this gene that we are tracking in the database
        human_exons = [
            e for e in gene2exon_list(cursor, human_gene_id)
            if e.covering_exon < 0 and e.is_canonical and e.is_known
        ]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        # 'table' of maps: map_table[species][human exon] is the best map, or None
        map_table = {}
        for species in all_species:
            map_table[species] = dict.fromkeys(human_exons)

        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id,
                                         he.is_known)  # exon data
            for m in maps_for_exon[he]:
                if m.similarity < min_similarity: continue
                # a map to a species we do not track has no row in the table
                if m.species_2 not in map_table: continue
                m_previous = map_table[m.species_2][he]
                # keep the more similar of the two maps
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in list(map_table.keys()):
            if not any(map_table[species][he] for he in human_exons):
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            he.pepseq = get_exon_pepseq(cursor, he,
                                        ensembl_db_name['homo_sapiens'])
            if len(he.pepseq) < 3:  # too short to be worth searching for
                bad_he.append(he)
                continue
            if not any(map_table[sp][he] for sp in map_table
                       if sp != 'homo_sapiens'):
                bad_he.append(he)
        human_exons = [he for he in human_exons if not he in bad_he]

        # keep track of nearest neighbors for each human exon
        previous_exon = {}
        next_exon = {}
        last_idx = len(human_exons) - 1
        for idx, he in enumerate(human_exons):
            previous_exon[he] = human_exons[idx - 1] if idx > 0 else None
            next_exon[he] = human_exons[idx + 1] if idx < last_idx else None

        # fill, starting from the species that are nearest to the human
        if not map_table:
            continue  # whatever

        # The reference species is named explicitly.  The old code passed
        # whatever value the loop variable 'species' happened to hold at this
        # point.  The first entry of the sorted list is dropped.
        species_sorted_from_human = species_sort(cursor,
                                                 list(map_table.keys()),
                                                 'homo_sapiens')[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:  # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                prev = previous_exon[he]
                nxt = next_exon[he]
                if prev and nxt and map_table[species][prev] and map_table[
                        species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                else:
                    # left neighbor is mapped, so it is the right one that is missing
                    no_right.append(he)

            if not one_existing_map: continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right:
                continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species],
                                      exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id,
                                             ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id,
                                               ensembl_db_name[species])
            if not gene_coords: continue

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue
                # get previous region
                prev_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, previous_exon[he])
                if not prev_seq_region: continue
                # get following region
                next_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, next_exon[he])
                if not next_seq_region: continue
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            next_seq_region = None
            for he in reversed(no_left):
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue

                # the following region is looked up once and then reused
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, next_exon[he])
                if not next_seq_region: continue

                # the previous region is eyeballed from the next one;
                # the previous and the next region frame the search region
                prev_seq_region = left_region(next_seq_region,
                                              MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info: continue

                # the previous region is looked up once and then reused
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, previous_exon[he])
                if not prev_seq_region: continue

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region,
                                               MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            print(species, "sought", sought, " unseq", unsequenced)

    # release the database resources opened at the top
    cursor.close()
    db.close()
def find_missing_exons(human_gene_list, db_info):

    # 
    [local_db, ensembl_db_name, method] = db_info
    db  = connect_to_mysql()
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species (cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db (cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found   = 0
    sought  = 0
    unsequenced = 0
    #human_gene_list.reverse()
    for human_gene_id in human_gene_list:

	switch_to_db (cursor, ensembl_db_name['homo_sapiens'])

	# Get stable id and description of this gene -- DEBUG
	human_stable      = gene2stable    (cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
	if verbose:  print human_gene_id, human_stable, human_description

	# progress counter 
	gene_ct += 1
	if (not gene_ct%10): 
            print "processed ",   gene_ct, " out of ", len(human_gene_list), "genes"
            print "exons found: ",  found, " out of ", sought, "sought"

	# find all human exons for this gene that we are tracking in the database 
	human_exons = [e for e in gene2exon_list(cursor, human_gene_id) 
                       if e.covering_exon < 0 and e.is_canonical and e.is_known]
        if not human_exons: 
            print "\t\t no exons found"
            continue

	human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable (cursor, he.exon_id)

        ##################################################################################
        ##################################################################################
	# make 'table' of maps, which is either pointer to the map if it exists, or None
	map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        ################# 
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] =  get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known) # exon data
            for m in maps_for_exon[he]:
                #if m.source ==  'usearch': continue
                #if m.source == 'sw_sharp': continue
                #if m.source == 'sw_sharp': 
                #    print 'sw_sharp'
                #if m.source == 'usearch': 
                #    print 'usearch',  m.similarity, m.species_2, m.exon_id_1, m.exon_id_2
                if m.similarity < min_similarity: continue
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                        continue
                map_table[m.species_2][he] = m


        # get rid of species that do not have the gene at all
        for species in all_species:
            one_exon_found = False
            for he in human_exons:
                if map_table[species][he]:
                    one_exon_found = True
                    break
            if not one_exon_found:
                del map_table[species]
               
        # fill in the peptide sequence field for each human exon
        # get rid of exons  that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq =   get_exon_pepseq (cursor, he, ensembl_db_name['homo_sapiens'])
            if len (he.pepseq) < 3:  # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in  map_table.keys(): 
                if species =='homo_sapiens': continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found:
                bad_he.append(he)
        human_exons = filter (lambda he: not he in bad_he, human_exons)

 
  
        # keep track of nearest neighbors for each human exon
        previous = {}
        next     = {}
        prev     = None
        for he in human_exons:
            previous[he]        = prev
            if prev: next[prev] = he
            prev = he
        next[he] = None

        # fill,  starting from the species that are nearest to the human
        if not map_table.keys():
            continue # whatever

        species_sorted_from_human = species_sort(cursor,map_table.keys(),species)[1:]

        for species in species_sorted_from_human:
            print species
            # see which exons have which neighbors
            #if verbose: print he.exon_id, species
            no_left  = []
            no_right = []
            has_both_neighbors = []
            one_existing_map   = None
            for he in human_exons:
                m =  map_table[species][he]
                if m and not m.warning: # the one existing map should not be a problematic one 
                    one_existing_map = m
                    continue
                prev = previous[he]
                nxt  = next[he]
                if prev and nxt and map_table[species][prev] and map_table[species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt  or not map_table[species][nxt]:
                    no_right.append(he)
            
            if not one_existing_map: continue # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right: continue

            # what is the gene that we are talking about?
            exon_id  = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id  = exon_id2gene_id (cursor, ensembl_db_name[species], exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords =  get_gene_coordinates (cursor, gene_id, ensembl_db_name[species])
            if not gene_coords: continue
            [gene_seq_region_id, gene_start, gene_end, gene_strand] = gene_coords

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:


                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, 
                                              map_table, species, he)
                if not template_info: continue
                # previous_ and next_seq_region are of the type Seq_Region defined on the top of the file
                # get previous region
                prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, 
                                                          map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region: continue
                # get following  region
                next_seq_region = get_neighboring_region  (cursor, ensembl_db_name, 
                                                           map_table, species, gene_coords, he, next[he])
                if not next_seq_region: continue
                sought += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], 
                                                   species, gene_id,  gene_coords, prev_seq_region, 
                                                   next_seq_region, template_info, mitochondrial, method)
                if reply=='NNN':
                    unsequenced += 1


            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                m =  map_table[species][he]
                # check first if we haave already looked into this, and found incomplete region
                #if m and m.warning: continue
                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, map_table, species, he)
                if not template_info: continue

                # get following  region
                if not next_seq_region:
                    next_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, 
                                                              species,  gene_coords, he, next[he])
                if not next_seq_region: continue

                # otherwise it is the last thing we found
                # the previous region is eyeballed from the next on
                # the previous and the  next region frame the search region
                prev_seq_region = left_region (next_seq_region, MAX_SEARCH_LENGTH)
                sought         += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he], 
                                                   species,  gene_id, gene_coords, prev_seq_region, next_seq_region,
                                                   template_info, mitochondrial, method)
                if reply=='NNN':
                    unsequenced += 1
 
            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                m =  map_table[species][he]
                # check first if we haave already looked into this, and found incomplete region
                #if  m and m.warning: continue
                # get template (known exon from the nearest species)
                template_info = get_template (cursor, ensembl_db_name, 
                                                                map_table, species, he)
                if not template_info: continue

                # get following  region
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region (cursor, ensembl_db_name, map_table, 
                                                              species, gene_coords,  he, previous[he])
                if not prev_seq_region: continue
                # otherwise it is the last thing we found
                    

                # the following region is eyeballed from the previous 
                next_seq_region = right_region (prev_seq_region, MAX_SEARCH_LENGTH)
                sought         += 1
                reply = find_NNN (cursor, ensembl_db_name, cfg, acg, he,  maps_for_exon[he], 
                                                    species, gene_id, gene_coords, prev_seq_region, next_seq_region,
                                                    template_info, mitochondrial, method)
                if reply=='NNN':
                    unsequenced += 1
                    
            print species, "sought", sought, " unseq", unsequenced
def multiple_exon_alnmt(gene_list, db_info):


    print "process pid: %d, length of gene list: %d" % ( get_process_id(), len(gene_list))

    [local_db, ensembl_db_name] = db_info

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    

    species  = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])
    gene_ids = get_gene_ids (cursor, biotype='protein_coding', is_known=1)

    # for each human gene
    gene_ct = 0
    tot  = 0
    ok   = 0
    no_maps        = 0
    no_pepseq      = 0
    no_orthologues = 0
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    #gene_list.reverse()
    for gene_id in gene_list:

        start = time()
        gene_ct += 1
        if  not gene_ct%10: print gene_ct, "genes out of", len(gene_list)

        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print gene_ct, len(gene_ids),  gene_id,  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        human_exons = filter (lambda e: e.is_known==1 and e.is_coding and e.covering_exon<0, gene2exon_list(cursor, gene_id))
        human_exons.sort(key=lambda exon: exon.start_in_gene)

        ##################################################################
        for human_exon in human_exons:
            
            tot += 1

            # find all orthologous exons the human exon  maps to
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            if verbose: 
                print "\texon no.", tot, " id", human_exon.exon_id,
                if not maps: 
                    print " no maps"
                    print human_exon
                print 
            if not maps: 
                no_maps += 1
                continue

  
            # human sequence to fasta:
            seqname   = "{0}:{1}:{2}".format('homo_sapiens', human_exon.exon_id, human_exon.is_known)
            switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
            [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
             left_flank, right_flank, dna_seq] = get_exon_seqs (cursor, human_exon.exon_id, human_exon.is_known)
            if (not pepseq):
                if verbose and  human_exon.is_coding and  human_exon.covering_exon <0: # this should be a master exon
                    print "no pep seq for",  human_exon.exon_id, "coding ", human_exon.is_coding,
                    print "canonical: ",  human_exon.is_canonical
                    print "length of dna ", len(dna_seq)
                no_pepseq += 1
                continue

            # collect seq from all maps, and output them in fasta format
            hassw = False
            headers   = []
            sequences = {}
            exons_per_species = {}

            for map in maps:

                switch_to_db (cursor, ensembl_db_name[map.species_2])
                if map.similarity < min_similarity: continue
                exon    = map2exon(cursor, ensembl_db_name, map)
                pepseq  = get_exon_pepseq (cursor,exon)
                if (not pepseq):
                    continue
                if  map.source == 'sw_sharp':
                    exon_known_code = 2
                    hassw = True
                elif  map.source == 'usearch':
                    exon_known_code = 3
                    hassw = True
                else:
                    exon_known_code = map.exon_known_2
                seqname = "{0}:{1}:{2}".format(map.species_2, map.exon_id_2, exon_known_code)
                headers.append(seqname)
                sequences[seqname] = pepseq
                # for split exon concatenation (see below)
                if not map.species_2 in exons_per_species.keys():
                    exons_per_species[map.species_2] = []
                exons_per_species[map.species_2].append ([ map.exon_id_2, exon_known_code]);
                
                    
            if (len(headers) <=1 ):
                if verbose: print "single species in the alignment"
                no_orthologues += 1
                continue
            
            # concatenate exons from the same gene - the alignment program might go wrong otherwise
            concatenated = concatenate_exons (cursor, ensembl_db_name, sequences, exons_per_species)

            fasta_fnm = "{0}/{1}.fa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            output_fasta (fasta_fnm, sequences.keys(), sequences)

            # align
            afa_fnm  = "{0}/{1}.afa".format( cfg.dir_path['scratch'], human_exon.exon_id)
            mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
            ret      = commands.getoutput(mafftcmd)

            if (verbose): print 'almt to', afa_fnm

            # read in the alignment 
            inf = erropen(afa_fnm, "r")
            aligned_seqs = {}
            for record in SeqIO.parse(inf, "fasta"):
                aligned_seqs[record.id] = str(record.seq)
            inf.close()
            # split back the concatenated exons
            if concatenated: split_concatenated_exons (aligned_seqs, concatenated)

            human_seq_seen = False
            for seq_name, sequence in aligned_seqs.iteritems():
                # if this is one of the concatenated seqs, split them back to two

                ### store the alignment as bitstring
                # Generate the bitmap
                bs         = Bits(bin='0b' + re.sub("[^0]","1", sequence.replace('-','0')))
                # The returned value of tobytes() will be padded at the end 
                # with between zero and seven 0 bits to make it byte aligned.
                # I will end up with something that looks like extra alignment gaps, that I'll have to return
                msa_bitmap = bs.tobytes() 
                # Retrieve information on the cognate
                cognate_species, cognate_exon_id, cognate_exon_known = seq_name.split(':')
                if cognate_exon_known == '2':
                    source = 'sw_sharp'
                elif cognate_exon_known == '3':
                    source = 'usearch'
                else:
                    source = 'ensembl'
                if (cognate_species == 'homo_sapiens'):
                    human_seq_seen = True
                cognate_genome_db_id = species2genome_db_id(cursor, cognate_species) # moves the cursor
                switch_to_db(cursor, ensembl_db_name['homo_sapiens']) # so move it back to h**o sapiens
                # Write the bitmap to the database
                #if (cognate_species == 'homo_sapiens'):
                if verbose: # and (source=='sw_sharp' or source=='usearch'):
                    print "storing"
                    print human_exon.exon_id, human_exon.is_known
                    print cognate_species, cognate_genome_db_id, cognate_exon_id, cognate_exon_known, source
                    print sequence
                    if not msa_bitmap:
                        print "no msa_bitmap"
                        continue
                store_or_update(cursor, "exon_map",    {"cognate_genome_db_id":cognate_genome_db_id,
                   "cognate_exon_id":cognate_exon_id   ,"cognate_exon_known"  :cognate_exon_known,
                   "source": source, "exon_id" :human_exon.exon_id, "exon_known":human_exon.is_known},
                  {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                 
            ok += 1
            commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)

        if verbose: print " time: %8.3f\n" % (time()-start);

    print "tot: ", tot, "ok: ", ok
    print "no maps ",   no_pepseq
    print "no pepseq ", no_pepseq
    print "no orthologues  ", no_orthologues
    print
def main():

    no_threads = 1
    special = None

    if len(sys.argv) > 1 and len(sys.argv) < 3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv) == 3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species(cursor)
    species = 'homo_sapiens'
    switch_to_db(cursor, ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids(cursor, ensembl_db_name, cfg, special)
    else:
        print "using all protein coding genes"
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids(cursor, biotype='protein_coding', is_known=1)

    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list:
    #for gene_id in [743609]:
    for sampling_count in range(1000):

        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot = 0
        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])
        print gene2stable(cursor, gene_id), get_description(cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or not human_exon.is_coding):
                continue
            if verbose:
                print
                print "\t human", human_exon.exon_id, human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon,
                                             ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id,
                            human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print "no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species = map.species_2
                    exon = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(
                        cursor, exon, ensembl_db_name[species])
                    if (map.similarity):
                        print "\t", species, map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ", map.exon_id_1, map.exon_known_1
                        print "\tsim", map.similarity,
                        print "\tsource", map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)):
                                print "\talnd seq mismatch"

                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else: reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot == with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d" % (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map
def multiple_exon_alnmt(species_list, db_info):


    [local_db, ensembl_db_name] = db_info

    verbose  = False

    db     = connect_to_mysql()
    cfg    = ConfigurationReader()
    acg    = AlignmentCommandGenerator()
    cursor = db.cursor()


    for species in species_list:

        print
        print "############################"
        print  species

        switch_to_db (cursor,  ensembl_db_name[species])
        gene_ids = get_gene_ids (cursor, biotype='protein_coding')
        #gene_ids = get_theme_ids(cursor, cfg, 'wnt_pathway')
        if not gene_ids:
            print "no gene_ids"
            continue


        gene_ct       = 0
        tot           = 0
        ok            = 0
        no_maps       = 0
        no_pepseq     = 0
        no_paralogues = 0
        for gene_id in gene_ids:

            if verbose: start = time()
            gene_ct += 1
            if not gene_ct%100: print species, gene_ct, "genes out of", len(gene_ids)
            if verbose: 
                print
                print gene_id, gene2stable(cursor, gene_id), get_description (cursor, gene_id)

            # get the paralogues - only the representative for  the family will have this 
            paralogues = get_paras (cursor, gene_id)  
            if not paralogues:
                if verbose:  print "\t not a template or no paralogues"
                continue

            if verbose:  print "paralogues: ", paralogues

            # get _all_ exons
            template_exons = gene2exon_list(cursor, gene_id)
            if (not template_exons):
                if verbose: print 'no exons for ', gene_id
                continue

            # find all template  exons we are tracking in the database
            for template_exon in template_exons:

                if verbose: print template_exon.exon_id
                maps = get_maps(cursor, ensembl_db_name, template_exon.exon_id,
                                template_exon.is_known, species=species, table='para_exon_map')

                if not maps:
                    no_maps += 1
                    continue

                # output to fasta:
                seqname        = "{0}:{1}:{2}".format('template', template_exon.exon_id, template_exon.is_known)
                exon_seqs_info =  get_exon_seqs (cursor, template_exon.exon_id, template_exon.is_known)
                if not exon_seqs_info: continue
                [exon_seq_id, pepseq, pepseq_transl_start, pepseq_transl_end, 
                 left_flank, right_flank, dna_seq] = exon_seqs_info
                if (not pepseq):
                    if ( template_exon.is_coding and  template_exon.covering_exon <0): # this should be a master exon
                        print "no pep seq for",  template_exon.exon_id, "coding ", template_exon.is_coding,
                        print "canonical: ",  template_exon.is_canonical
                        print "length of dna ", len(dna_seq)
                        no_pepseq += 1
                    continue
                
                tot += 1

                sequences = {seqname:pepseq}
                headers   = [seqname]
                for map in maps:
                    exon    = map2exon(cursor, ensembl_db_name, map, paralogue=True)
                    pepseq  = get_exon_pepseq (cursor,exon)
                    if (not pepseq):
                        continue
                    seqname = "{0}:{1}:{2}".format('para', map.exon_id_2, map.exon_known_2)
                    headers.append(seqname)
                    sequences[seqname] = pepseq

                fasta_fnm = "{0}/{1}_{2}_{3}.fa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                output_fasta (fasta_fnm, headers, sequences)

                if (len(headers) <=1 ):
                    print "single species in the alignment (?)"
                    no_paralogues += 1
                    continue

                # align
                afa_fnm  = "{0}/{1}_{2}_{3}.afa".format( cfg.dir_path['scratch'], species, template_exon.exon_id, template_exon.is_known)
                mafftcmd = acg.generate_mafft_command (fasta_fnm, afa_fnm)
                ret      = commands.getoutput(mafftcmd)

                # read in the alignment
                inf = erropen(afa_fnm, "r")
                if not inf:
                    print gene_id
                    continue
                template_seq_seen = False
                for record in SeqIO.parse(inf, "fasta"):
                    ### store the alignment as bitstring
                    # Generate the bitmap
                    bs         = Bits(bin='0b' + re.sub("[^0]","1", str(record.seq).replace('-','0')))
                    msa_bitmap = bs.tobytes()
                    # Retrieve information on the cognate
                    label, cognate_exon_id, cognate_exon_known = record.id.split(':')
                    if (label == 'template'):
                        template_seq_seen = True
                    # Write the bitmap to the database
                    #print "updating: ", template_exon.exon_id
                    store_or_update(cursor, "para_exon_map", {"cognate_exon_id"    :cognate_exon_id,
                                                         "cognate_exon_known" :cognate_exon_known,
                                                         "exon_id"            :template_exon.exon_id,
                                                         "exon_known"         :template_exon.is_known},
                                    {"msa_bitstring":MySQLdb.escape_string(msa_bitmap)})
                inf.close()
                ok += 1
                commands.getoutput("rm "+afa_fnm+" "+fasta_fnm)
            if verbose: print " time: %8.3f\n" % (time()-start);
 
        outstr  =  species + " done \n"
        outstr +=  "tot: %d   ok: %d  \n" % (tot,  ok)
        outstr +=  "no maps       %d  \n" % no_pepseq
        outstr +=  "no pepseq     %d  \n" % no_pepseq
        outstr +=  "no paralogues %d  \n" % no_paralogues
        outstr += "\n"
        print outstr
def main():


    no_threads = 1
    special    = None

    if len(sys.argv) > 1 and  len(sys.argv)<3:
        print "usage: %s <set name> <number of threads> " % sys.argv[0]
        exit(1)
    elif len(sys.argv)==3:

        special = sys.argv[1]
        special = special.lower()
        if special == 'none': special = None

        no_threads = int(sys.argv[2])

    db  = connect_to_mysql()
    cfg = ConfigurationReader()
    cursor = db.cursor()

    # find db ids adn common names for each species db
    [all_species, ensembl_db_name] = get_species (cursor)
    species                        = 'homo_sapiens'
    switch_to_db (cursor,  ensembl_db_name[species])

    if special:
        print "using", special, "set"
        gene_list = get_theme_ids (cursor,  ensembl_db_name, cfg, special )
    else:
        print "using all protein coding genes"
        switch_to_db (cursor,  ensembl_db_name['homo_sapiens'])
        gene_list = get_gene_ids (cursor, biotype='protein_coding', is_known=1)
        
    incomplete = 0
    genes_checked = 0
    #for gene_id in gene_list: 
    #for gene_id in [743609]: 
    for sampling_count in range(1000):
 
        gene_id = choice(gene_list)
        genes_checked += 1
        with_map = 0
        tot      = 0
        switch_to_db (cursor, ensembl_db_name['homo_sapiens'])
        print  gene2stable(cursor, gene_id), get_description (cursor, gene_id)

        # find all exons we are tracking in the database
        human_exons = gene2exon_list(cursor, gene_id)
        human_exons.sort(key=lambda exon: exon.start_in_gene)
        has_a_map = False
        for human_exon in human_exons:
            if (not human_exon.is_canonical or  not human_exon.is_coding): continue
            if verbose:
                print  
                print "\t human",   human_exon.exon_id,  human_exon.is_known
                print "\t ", get_exon_pepseq(cursor, human_exon, ensembl_db_name['homo_sapiens'])
                print "\t checking maps ..."
            maps = get_maps(cursor, ensembl_db_name, human_exon.exon_id, human_exon.is_known)
            tot += 1
            if maps:
                has_a_map = True
                with_map += 1
                #print "ok"
            else:
                print"no maps for exon", human_exon.exon_id
                continue
            if verbose:
                for map in maps:
                    species            = map.species_2
                    exon               = map2exon(cursor, ensembl_db_name, map)
                    unaligned_sequence = get_exon_pepseq(cursor, exon, ensembl_db_name[species])
                    if ( map.similarity):
                        print "\t", species,  map.source, map.exon_id_2, map.exon_known_2
                        print "\tmaps to ",  map.exon_id_1, map.exon_known_1
                        print "\tsim",  map.similarity,
                        print "\tsource",  map.source
                        print "\t", unaligned_sequence
                        if not map.bitmap:
                            print "\t bitmap not assigned"
                        else:
                            bs = Bits(bytes=map.bitmap)
                            reconst_pepseq = ''
                            if (not bs.count(1) == len(unaligned_sequence)): 
                                print "\talnd seq mismatch"
                            
                            else:
                                usi = iter(unaligned_sequence)
                                for c in bs.bin:
                                    if c == '0': reconst_pepseq += '-'
                                    else:        reconst_pepseq += next(usi)
                                print "\tbinary   : ", bs.bin
                                print "\talnd seq: ", reconst_pepseq
                        print
        if not tot== with_map:
            print "####  gene id: %d   total exons: %d     with map:  %d   ( = %d%%) " % \
                (gene_id,  tot,  with_map, int(float(with_map)/tot*100) )
            incomplete += 1

    print "genes checked: %d,  incomplete: %d"  %  (genes_checked, incomplete)
    cursor.close()
    db.close()

    print tot, with_map