def main():
    """Reconstruct the alignment for a single exon named on the command line.

    Command line: <species> <exon_id> <exon_known> <output_name_root>.
    When fewer than four arguments are given, a usage line is printed and
    the process exits with status 1.

    Returns:
        True, after reconstruct_alignment() has returned and the database
        handles have been closed.
    """
    if len(sys.argv) < 5:
        # single-argument print(...) behaves the same under Python 2 and 3
        print("Usage: %s <species>  <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        sys.exit(1)

    species = sys.argv[1]
    exon_id = int(sys.argv[2])
    exon_known = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    # Hard-wired switch: set to True to connect with the default settings
    # instead of the remote server below.
    local_db = False
    if local_db:
        connection_args = {}
    else:
        # NOTE(review): credentials are embedded in the source; consider
        # moving them to a configuration file or the environment.
        connection_args = dict(user="******", passwd="sqljupitersql",
                               host="jupiter.private.bii", port=3307)

    db = connect_to_mysql(**connection_args)
    try:
        cfg = ConfigurationReader(**connection_args)
        cursor = db.cursor()
        try:
            [all_species, ensembl_db_name] = get_species(cursor)
            sorted_species = species_sort(cursor, all_species, species)

            reconstruct_alignment(cursor, cfg, ensembl_db_name, species,
                                  exon_id, exon_known, sorted_species,
                                  output_fnm_root)
        finally:
            # release the cursor even when the reconstruction raises
            cursor.close()
    finally:
        db.close()

    return True
def main():
    """Reconstruct the alignment for a single exon named on the command line.

    Command line: <species> <exon_id> <exon_known> <output_name_root>.
    When fewer than four arguments are given, a usage line is printed and
    the process exits with status 1.

    Returns:
        True, after reconstruct_alignment() has returned and the database
        handles have been closed.
    """
    if len(sys.argv) < 5:
        # single-argument print(...) behaves the same under Python 2 and 3
        print("Usage: %s <species>  <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        sys.exit(1)

    species = sys.argv[1]
    exon_id = int(sys.argv[2])
    exon_known = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    db = connect_to_mysql()
    try:
        cfg = ConfigurationReader()
        cursor = db.cursor()
        try:
            [all_species, ensembl_db_name] = get_species(cursor)
            sorted_species = species_sort(cursor, all_species, species)

            reconstruct_alignment(cursor, cfg, ensembl_db_name, species,
                                  exon_id, exon_known, sorted_species,
                                  output_fnm_root)
        finally:
            # release the cursor even when the reconstruction raises
            cursor.close()
    finally:
        db.close()

    return True
def annotate(gene_list, db_info):
    """Write an annotated FASTA entry for every gene in gene_list.

    For each gene, an annotation is looked up with find_annotation() over a
    species list that starts with the preferred species, and the canonical
    translation is fetched with get_canonical_transl().  A header line and
    the translation are written to "temp_out.fasta".

    Args:
        gene_list: gene ids; must support len().
        db_info: four-item sequence
            [local_db, all_species, ensembl_db_name, species].
            local_db is unpacked but not used here.

    The function exits with status 1 unless species is
    'oryctolagus_cuniculus', because the preferred species list is
    hard-coded for the rabbit.
    """
    [local_db, all_species, ensembl_db_name, species] = db_info
    db = connect_to_mysql()
    # NOTE(review): cfg is not used below; it is still constructed in case
    # ConfigurationReader() has side effects -- confirm and drop.
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    inf = None

    try:
        if verbose:
            print("thread %s annotating %s " % (get_thread_name(), species))

        if not species == "oryctolagus_cuniculus":
            print("The preferred list of species is hardcoded for the rabbit. Consider modifying.")
            exit(1)

        preferred_species = [species, "mus_musculus", "rattus_norvegicus", "homo_sapiens"]
        nearest_species_list = species_sort(cursor, all_species, species)
        # a comprehension yields a list under both Python 2 and Python 3
        species_list = preferred_species + [
            sp for sp in nearest_species_list if sp not in preferred_species
        ]

        # NOTE(review): the output name is fixed, so concurrent callers
        # would all write to the same file -- verify against the caller.
        inf = erropen("temp_out.fasta", "w")

        total = len(gene_list)
        # enumerate() gives the true position; list.index() was a linear
        # scan per gene and returned the first match for repeated ids
        for position, gene_id in enumerate(gene_list):
            switch_to_db(cursor, ensembl_db_name[species])
            ####################
            # get stable id and description of this gene
            stable_id = gene2stable(cursor, gene_id)
            if not position % 100:
                print("%s out of %s" % (position, total))
            if verbose:
                print("=============================================")
                print("%s %s" % (gene_id, stable_id))
            ####################
            # find the annotation from the preferred source organism
            [annot_source, orthology_type, annotation, ortho_stable_ids] = find_annotation(
                cursor, ensembl_db_name, species_list, gene_id
            )
            if verbose:
                print("%s ** %s ** %s" % (annot_source, orthology_type, annotation))

            ###################
            # find splices (for now find the canonical splice)
            switch_to_db(cursor, ensembl_db_name[species])
            canonical_splice = get_canonical_transl(acg, cursor, gene_id, species)

            # output
            if orthology_type == "self" or annotation == "none":
                header = ">{0} {1}".format(stable_id, annotation)
            else:
                header = ">{0} {1} [by sim to {2}, {3}]".format(
                    stable_id, annotation, annot_source, ortho_stable_ids
                )

            inf.write(header + "\n")
            inf.write(str(canonical_splice) + "\n")
    finally:
        # close the output file and the database handles on every exit
        # path, including exceptions and the exit(1) above
        if inf is not None:
            inf.close()
        cursor.close()
        db.close()
# Example #4
# 0
def annotate(gene_list, db_info):
    """Write an annotated FASTA entry for every gene in gene_list.

    For each gene, an annotation is looked up with find_annotation() over a
    species list that starts with the preferred species, and the canonical
    translation is fetched with get_canonical_transl().  A header line and
    the translation are written to "temp_out.fasta".

    Args:
        gene_list: gene ids; must support len().
        db_info: four-item sequence
            [local_db, all_species, ensembl_db_name, species].
            local_db is unpacked but not used here.

    The function exits with status 1 unless species is
    'oryctolagus_cuniculus', because the preferred species list is
    hard-coded for the rabbit.
    """
    [local_db, all_species, ensembl_db_name, species] = db_info
    db = connect_to_mysql()
    # NOTE(review): cfg is not used below; it is still constructed in case
    # ConfigurationReader() has side effects -- confirm and drop.
    cfg = ConfigurationReader()
    acg = AlignmentCommandGenerator()
    cursor = db.cursor()
    inf = None

    try:
        if verbose:
            print("thread %s annotating %s " % (get_thread_name(), species))

        if not species == 'oryctolagus_cuniculus':
            print('The preferred list of species is hardcoded for the rabbit. Consider modifying.')
            exit(1)

        preferred_species = [
            species, 'mus_musculus', 'rattus_norvegicus', 'homo_sapiens'
        ]
        nearest_species_list = species_sort(cursor, all_species, species)
        # a comprehension yields a list under both Python 2 and Python 3
        species_list = preferred_species + [
            sp for sp in nearest_species_list if sp not in preferred_species
        ]

        # NOTE(review): the output name is fixed, so concurrent callers
        # would all write to the same file -- verify against the caller.
        inf = erropen("temp_out.fasta", "w")

        total = len(gene_list)
        # enumerate() gives the true position; list.index() was a linear
        # scan per gene and returned the first match for repeated ids
        for position, gene_id in enumerate(gene_list):
            switch_to_db(cursor, ensembl_db_name[species])
            ####################
            # get stable id and description of this gene
            stable_id = gene2stable(cursor, gene_id)
            if not position % 100:
                print("%s out of %s" % (position, total))
            if verbose:
                print("=============================================")
                print("%s %s" % (gene_id, stable_id))
            ####################
            # find the annotation from the preferred source organism
            [annot_source, orthology_type, annotation,
             ortho_stable_ids] = find_annotation(cursor, ensembl_db_name,
                                                 species_list, gene_id)
            if verbose:
                print("%s ** %s ** %s" % (annot_source, orthology_type, annotation))

            ###################
            # find splices (for now find the canonical splice)
            switch_to_db(cursor, ensembl_db_name[species])
            canonical_splice = get_canonical_transl(acg, cursor, gene_id, species)

            # output
            if orthology_type == 'self' or annotation == 'none':
                header = ">{0} {1}".format(stable_id, annotation)
            else:
                header = ">{0} {1} [by sim to {2}, {3}]".format(
                    stable_id, annotation, annot_source, ortho_stable_ids)

            inf.write(header + "\n")
            inf.write(str(canonical_splice) + "\n")
    finally:
        # close the output file and the database handles on every exit
        # path, including exceptions and the exit(1) above
        if inf is not None:
            inf.close()
        cursor.close()
        db.close()
def get_template(cursor, ensembl_db_name, map_table, species, he):
    """Pick a template exon for human exon `he` from the species nearest to `species`.

    The species in map_table are visited in the order returned by
    species_sort() (first entry dropped).  The first one whose map to `he`
    passes all the checks below supplies the template.

    Args:
        cursor: database cursor handed on to species_sort()/get_exon_seqs().
        ensembl_db_name: mapping species -> database name.
        map_table: map_table[species][human exon] -> map object or None.
        species: the species for which a template is sought.
        he: human exon; its `pepseq` attribute must be set.

    Returns:
        [template_species, template_exon_seq_id, dna_seq, protein_seq,
        template_similarity_to_human], or None when no species qualifies.
    """
    len_human_protein_seq = 1.0 * len(he.pepseq)
    if not len_human_protein_seq:
        # an empty human peptide cannot pass the length-ratio test below
        # (and used to raise ZeroDivisionError there)
        return None

    nearest_species = species_sort(cursor, list(map_table.keys()), species)[1:]
    # I have a problem with the lamprey - it is an outlier to everything else
    if species == 'petromyzon_marinus':
        nearest_species.reverse()

    for nearest in nearest_species:
        m = map_table[nearest][he]
        # skip species without a map, or with a map flagged with a warning
        if not m or m.warning:
            continue

        template_seqs = get_exon_seqs(cursor, m.exon_id_2, m.exon_known_2,
                                      ensembl_db_name[nearest])
        if not template_seqs:
            continue

        [
            exon_seq_id, protein_seq, pepseq_transl_start,
            pepseq_transl_end, left_flank, right_flank, dna_seq
        ] = template_seqs
        # the two peptides must be of comparable length, in both directions;
        # an empty protein_seq is rejected by the first test, so the second
        # division is safe
        if len(protein_seq) / len_human_protein_seq < 0.3:
            continue
        if len_human_protein_seq / len(protein_seq) < 0.3:
            continue
        if not left_flank or not right_flank:
            continue
        if "XX" in protein_seq:
            continue

        return [nearest, exon_seq_id, dna_seq, protein_seq, m.similarity]

    return None
def get_template (cursor, ensembl_db_name, map_table, species, he):
    """Pick a template exon for human exon `he` from the species nearest to `species`.

    The species in map_table are visited in the order returned by
    species_sort() (first entry dropped).  The first one whose map to `he`
    passes all the checks below supplies the template.

    Args:
        cursor: database cursor handed on to species_sort()/get_exon_seqs().
        ensembl_db_name: mapping species -> database name.
        map_table: map_table[species][human exon] -> map object or None.
        species: the species for which a template is sought.
        he: human exon; its `pepseq` attribute must be set.

    Returns:
        [template_species, template_exon_seq_id, dna_seq, protein_seq,
        template_similarity_to_human], or None when no species qualifies.
    """
    len_human_protein_seq = 1.0*len(he.pepseq)
    if not len_human_protein_seq:
        # an empty human peptide cannot pass the length-ratio test below
        # (and used to raise ZeroDivisionError there)
        return None

    # list(...) so that a real list is passed under Python 3 as well
    nearest_species = species_sort(cursor, list(map_table), species)[1:]
    # I have a problem with the lamprey - it is an outlier to everything else
    if species == 'petromyzon_marinus':
        nearest_species.reverse()

    for nearest in nearest_species:
        m = map_table[nearest][he]
        # skip species without a map, or with a map flagged with a warning
        if not m or m.warning:
            continue

        template_seqs = get_exon_seqs(cursor, m.exon_id_2, m.exon_known_2,
                                      ensembl_db_name[nearest])
        if not template_seqs:
            continue

        [exon_seq_id, protein_seq, pepseq_transl_start,
         pepseq_transl_end, left_flank, right_flank, dna_seq] = template_seqs
        # the two peptides must be of comparable length, in both directions;
        # an empty protein_seq is rejected by the first test, so the second
        # division is safe
        if len(protein_seq)/len_human_protein_seq < 0.3:
            continue
        if len_human_protein_seq/len(protein_seq) < 0.3:
            continue
        if not left_flank or not right_flank:
            continue
        if "XX" in protein_seq:
            continue

        return [nearest, exon_seq_id, dna_seq, protein_seq, m.similarity]

    return None
def main():
    """Reconstruct the alignment for a single exon named on the command line.

    Command line: <species> <exon_id> <exon_known> <output_name_root>.
    When fewer than four arguments are given, a usage line is printed and
    the process exits with status 1.

    Returns:
        True, after reconstruct_alignment() has returned and the database
        handles have been closed.
    """
    if len(sys.argv) < 5:
        # single-argument print(...) behaves the same under Python 2 and 3
        print("Usage: %s <species>  <exon_id> <exon_known> <output_name_root>" % sys.argv[0])
        sys.exit(1)

    species         = sys.argv[1]
    exon_id         = int(sys.argv[2])
    exon_known      = int(sys.argv[3])
    output_fnm_root = sys.argv[4]

    db = connect_to_mysql()
    try:
        cfg = ConfigurationReader()
        cursor = db.cursor()
        try:
            [all_species, ensembl_db_name] = get_species(cursor)
            sorted_species = species_sort(cursor, all_species, species)

            reconstruct_alignment(cursor, cfg, ensembl_db_name, species,
                                  exon_id, exon_known, sorted_species,
                                  output_fnm_root)
        finally:
            # release the cursor even when the reconstruction raises
            cursor.close()
    finally:
        db.close()

    return True
def find_missing_exons(human_gene_list, db_info):
    """Search other species for exons that have no usable map to a human exon.

    Opens a database connection, runs the search for every gene in
    human_gene_list and closes the connection again, also when the search
    raises.

    Args:
        human_gene_list: ids of the human genes to process.
        db_info: three-item sequence [local_db, ensembl_db_name, method].
            Only `method` is used (it is handed to find_NNN()); the
            database names are re-read with get_species().
    """
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    try:
        cfg = ConfigurationReader()
        acg = AlignmentCommandGenerator()
        cursor = db.cursor()
        try:
            _find_missing_exons(cursor, cfg, acg, human_gene_list, method)
        finally:
            cursor.close()
    finally:
        db.close()


def _find_missing_exons(cursor, cfg, acg, human_gene_list, method):
    """Run the missing-exon search of find_missing_exons() on an open cursor."""
    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found = 0  # NOTE(review): reported below but never incremented in this function
    sought = 0
    unsequenced = 0
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose:
            print(human_gene_id, human_stable, human_description)

        # progress counter
        gene_ct += 1
        if not gene_ct % 10:
            print("processed ", gene_ct, " out of ", len(human_gene_list),
                  "genes")
            print("exons found: ", found, " out of ", sought, "sought")

        # find all human exons for this gene that we are tracking in the database
        human_exons = [
            e for e in gene2exon_list(cursor, human_gene_id)
            if e.covering_exon < 0 and e.is_canonical and e.is_known
        ]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        # for each (species, human exon) keep the acceptable map with the
        # highest similarity
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id,
                                         he.is_known)  # exon data
            for m in maps_for_exon[he]:
                if m.similarity < min_similarity:
                    continue
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            if not any(map_table[species][he] for he in human_exons):
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons  that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq = get_exon_pepseq(cursor, he,
                                        ensembl_db_name['homo_sapiens'])
            if len(he.pepseq) < 3:  # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in list(map_table.keys()):
                if species == 'homo_sapiens':
                    continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found:
                bad_he.append(he)
        human_exons = [he for he in human_exons if not he in bad_he]

        # keep track of nearest neighbors for each human exon
        # (`following` was called `next`, which shadowed the builtin)
        previous = {}
        following = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            if prev:
                following[prev] = he
            prev = he
        if human_exons:
            following[human_exons[-1]] = None

        # fill,  starting from the species that are nearest to the human
        if not map_table:
            continue  # whatever

        # NOTE(review): `species` here is whatever value the loops above
        # left behind, not necessarily 'homo_sapiens', although the result
        # is named "sorted from human" -- confirm the intended reference
        # species with the author before changing it.
        species_sorted_from_human = species_sort(cursor,
                                                 list(map_table.keys()),
                                                 species)[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:  # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                prev = previous[he]
                nxt = following[he]
                if prev and nxt and map_table[species][prev] and map_table[
                        species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map:
                continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right:
                continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species],
                                      exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id,
                                             ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id,
                                               ensembl_db_name[species])
            if not gene_coords:
                continue

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue
                # get previous region
                prev_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, previous[he])
                if not prev_seq_region:
                    continue
                # get following  region
                next_seq_region = get_neighboring_region(
                    cursor, ensembl_db_name, map_table, species, gene_coords,
                    he, following[he])
                if not next_seq_region:
                    continue
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # get following  region; once found it is reused for the
                # remaining exons of this list
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, following[he])
                if not next_seq_region:
                    continue

                # the previous region is eyeballed from the next one
                # the previous and the  next region frame the search region
                prev_seq_region = left_region(next_seq_region,
                                              MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # get previous region; once found it is reused for the
                # remaining exons of this list
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(
                        cursor, ensembl_db_name, map_table, species,
                        gene_coords, he, previous[he])
                if not prev_seq_region:
                    continue

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region,
                                               MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he,
                                 maps_for_exon[he], species, gene_id,
                                 gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            print(species, "sought", sought, " unseq", unsequenced)
def find_missing_exons(human_gene_list, db_info):
    """Search other species for exons that have no usable map to a human exon.

    Opens a database connection, runs the search for every gene in
    human_gene_list and closes the connection again, also when the search
    raises.

    Args:
        human_gene_list: ids of the human genes to process.
        db_info: three-item sequence [local_db, ensembl_db_name, method].
            Only `method` is used (it is handed to find_NNN()); the
            database names are re-read with get_species().
    """
    [local_db, ensembl_db_name, method] = db_info
    db = connect_to_mysql()
    try:
        cfg = ConfigurationReader()
        acg = AlignmentCommandGenerator()
        cursor = db.cursor()
        try:
            _find_missing_exons(cursor, cfg, acg, human_gene_list, method)
        finally:
            cursor.close()
    finally:
        db.close()


def _find_missing_exons(cursor, cfg, acg, human_gene_list, method):
    """Run the missing-exon search of find_missing_exons() on an open cursor.

    Indentation uses spaces only, and every print is a single-argument
    print(...) so the output is the same under Python 2 and Python 3.
    """
    # find db ids and common names for each species db
    all_species, ensembl_db_name = get_species(cursor)
    # minimal acceptable similarity between exons
    min_similarity = cfg.get_value('min_accptbl_exon_sim')

    switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

    ##################################################################################
    # loop over human genes
    gene_ct = 0
    found = 0  # NOTE(review): reported below but never incremented in this function
    sought = 0
    unsequenced = 0
    for human_gene_id in human_gene_list:

        switch_to_db(cursor, ensembl_db_name['homo_sapiens'])

        # Get stable id and description of this gene -- DEBUG
        human_stable = gene2stable(cursor, human_gene_id)
        human_description = get_description(cursor, human_gene_id)
        if verbose:
            print("%s %s %s" % (human_gene_id, human_stable, human_description))

        # progress counter
        gene_ct += 1
        if not gene_ct % 10:
            print("processed  %s  out of  %s genes" % (gene_ct, len(human_gene_list)))
            print("exons found:  %s  out of  %s sought" % (found, sought))

        # find all human exons for this gene that we are tracking in the database
        human_exons = [e for e in gene2exon_list(cursor, human_gene_id)
                       if e.covering_exon < 0 and e.is_canonical and e.is_known]
        if not human_exons:
            print("\t\t no exons found")
            continue

        human_exons.sort(key=lambda exon: exon.start_in_gene)
        for he in human_exons:
            he.stable_id = exon2stable(cursor, he.exon_id)

        # make 'table' of maps, which is either pointer to the map if it exists, or None
        map_table = {}
        for species in all_species:
            map_table[species] = {}
            for he in human_exons:
                map_table[species][he] = None

        # for each (species, human exon) keep the acceptable map with the
        # highest similarity
        maps_for_exon = {}
        for he in human_exons:
            maps_for_exon[he] = get_maps(cursor, ensembl_db_name, he.exon_id, he.is_known)  # exon data
            for m in maps_for_exon[he]:
                if m.similarity < min_similarity:
                    continue
                m_previous = map_table[m.species_2][he]
                if m_previous and m_previous.similarity > m.similarity:
                    continue
                map_table[m.species_2][he] = m

        # get rid of species that do not have the gene at all
        for species in all_species:
            if not any(map_table[species][he] for he in human_exons):
                del map_table[species]

        # fill in the peptide sequence field for each human exon
        # get rid of exons  that appear in no other species but human (?)
        bad_he = []
        for he in human_exons:
            one_species_found = False
            he.pepseq = get_exon_pepseq(cursor, he, ensembl_db_name['homo_sapiens'])
            if len(he.pepseq) < 3:  # can I ever get rid of all the nonsense I find in Ensembl?
                bad_he.append(he)
                continue
            for species in list(map_table.keys()):
                if species == 'homo_sapiens':
                    continue
                if map_table[species][he]:
                    one_species_found = True
                    break
            if not one_species_found:
                bad_he.append(he)
        # a comprehension yields a list under both Python 2 and Python 3
        human_exons = [he for he in human_exons if not he in bad_he]

        # keep track of nearest neighbors for each human exon
        # (`following` was called `next`, which shadowed the builtin)
        previous = {}
        following = {}
        prev = None
        for he in human_exons:
            previous[he] = prev
            if prev:
                following[prev] = he
            prev = he
        if human_exons:
            following[human_exons[-1]] = None

        # fill,  starting from the species that are nearest to the human
        if not map_table:
            continue  # whatever

        # NOTE(review): `species` here is whatever value the loops above
        # left behind, not necessarily 'homo_sapiens', although the result
        # is named "sorted from human" -- confirm the intended reference
        # species with the author before changing it.
        species_sorted_from_human = species_sort(cursor, list(map_table.keys()), species)[1:]

        for species in species_sorted_from_human:
            print(species)
            # see which exons have which neighbors
            no_left = []
            no_right = []
            has_both_neighbors = []
            one_existing_map = None
            for he in human_exons:
                m = map_table[species][he]
                if m and not m.warning:  # the one existing map should not be a problematic one
                    one_existing_map = m
                    continue
                prev = previous[he]
                nxt = following[he]
                if prev and nxt and map_table[species][prev] and map_table[species][nxt]:
                    has_both_neighbors.append(he)
                elif not prev or not map_table[species][prev]:
                    no_left.append(he)
                elif not nxt or not map_table[species][nxt]:
                    no_right.append(he)

            if not one_existing_map:
                continue  # this shouldn't happen
            if not has_both_neighbors and not no_left and not no_right:
                continue

            # what is the gene that we are talking about?
            exon_id = one_existing_map.exon_id_2
            is_known = one_existing_map.exon_known_2
            gene_id = exon_id2gene_id(cursor, ensembl_db_name[species], exon_id, is_known)
            # is it mitochondrial?
            mitochondrial = is_mitochondrial(cursor, gene_id, ensembl_db_name[species])
            # where is the gene origin (position on the sequence)
            gene_coords = get_gene_coordinates(cursor, gene_id, ensembl_db_name[species])
            if not gene_coords:
                continue

            # fill in exons that have both neighbors:
            # human exon functions as a coordinate here
            for he in has_both_neighbors:

                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue
                # get previous region
                prev_seq_region = get_neighboring_region(cursor, ensembl_db_name,
                                                         map_table, species, gene_coords, he, previous[he])
                if not prev_seq_region:
                    continue
                # get following  region
                next_seq_region = get_neighboring_region(cursor, ensembl_db_name,
                                                         map_table, species, gene_coords, he, following[he])
                if not next_seq_region:
                    continue
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region,
                                 next_seq_region, template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # work backwards
            # use the last known region on the left as the bound
            no_left.reverse()
            next_seq_region = None
            for he in no_left:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name, map_table, species, he)
                if not template_info:
                    continue

                # get following  region; once found it is reused for the
                # remaining exons of this list
                if not next_seq_region:
                    next_seq_region = get_neighboring_region(cursor, ensembl_db_name, map_table,
                                                             species, gene_coords, he, following[he])
                if not next_seq_region:
                    continue

                # the previous region is eyeballed from the next one
                # the previous and the  next region frame the search region
                prev_seq_region = left_region(next_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            # repeat the whole procedure on the right
            prev_seq_region = None
            for he in no_right:
                # get template (known exon from the nearest species)
                template_info = get_template(cursor, ensembl_db_name,
                                             map_table, species, he)
                if not template_info:
                    continue

                # get previous region; once found it is reused for the
                # remaining exons of this list
                if not prev_seq_region:
                    prev_seq_region = get_neighboring_region(cursor, ensembl_db_name, map_table,
                                                             species, gene_coords, he, previous[he])
                if not prev_seq_region:
                    continue

                # the following region is eyeballed from the previous
                next_seq_region = right_region(prev_seq_region, MAX_SEARCH_LENGTH)
                sought += 1
                reply = find_NNN(cursor, ensembl_db_name, cfg, acg, he, maps_for_exon[he],
                                 species, gene_id, gene_coords, prev_seq_region, next_seq_region,
                                 template_info, mitochondrial, method)
                if reply == 'NNN':
                    unsequenced += 1

            print("%s sought %s  unseq %s" % (species, sought, unsequenced))
def main():

    # exon_id comes from the command line
    if len(sys.argv) < 5:
        print "Usage: %s <exon_id>  <exon_known>  <species>  <output name> [nt]" % sys.argv[0]
        exit (1)
        
    exon_id    = long(sys.argv[1])
    exon_known = int(sys.argv[2])
    species    = sys.argv[3]
    afa_name   = sys.argv[4]

    nt =  len(sys.argv)>5 and sys.argv[5]=='nt'
    
    ######################################
    db     = connect_to_mysql(user="******", passwd="sqljupitersql", host="jupiter.private.bii", port=3307)
    cursor = db.cursor()
    [all_species, ensembl_db_name] = get_species (cursor)
   
    if not is_coding_exon(cursor, exon_id, exon_known, ensembl_db_name[species]) and not nt:
        # make an empty file
        cmd = "touch " + afa_name
        ret = commands.getoutput(cmd)
        cursor.close()
        db.close()
        return

    ######################################
    if (species == 'homo_sapiens'):
        [human_exon_id, human_exon_known] = [exon_id, exon_known]
        ok = True
    else:
        # find the human exon this guy maps to
        species_db_id = species2genome_db_id(cursor, species)
        if (species_db_id):
            [human_exon_id, human_exon_known] = find_human_cognate(cursor, ensembl_db_name, exon_id,
                                                                   exon_known, species_db_id)
        ok = species_db_id > 0 and human_exon_id>0

    ######################################
    if (ok):
        alignment = make_exon_alignment(cursor, ensembl_db_name, human_exon_id, human_exon_known, nt)   
    if (ok and alignment):
        # sort the remaining species  taxonomically
        sorted_species = species_sort(cursor, all_species, species)
        sorted_names = sort_names (sorted_species, alignment)
        output_fasta (afa_name, sorted_names, alignment)
    else: 
        # make file consisting of the original sequence only
        if nt:
            seq = get_exon_seqs (cursor, exon_id, exon_known, ensembl_db_name[species])[-1];
        else:
            seq = get_exon_pepseq (cursor, exon_id, exon_known, ensembl_db_name[species])
        if seq:
            alignment = {}
            sequence_name  = make_seq_name (cursor, ensembl_db_name, species, exon_id, exon_known, [])
            alignment[sequence_name] = seq;
            output_fasta (afa_name, [seq_name], alignment)
        else:
            # if not even the original sequence can be found, its definitely somebody else's fault;
            # make an empty file
            cmd = "touch " + afa_name
            ret = commands.getoutput(cmd)

    cursor.close()
    db.close()
    
    return
Beispiel #11
0
def main():
    """Build a species-ordered alignment of a gene's orthologous translations.

    Command line: <gene symbol> [trivial] [prepend]

    The gene symbol is resolved to an Ensembl stable gene id and then to the
    gene_id in the reference (human) database. The canonical translation of
    the reference gene and of every orthologue whose species is known to
    get_species() is passed to write_to_fasta() with
    '<gene symbol>.orthos.fasta' as its target. That file is aligned with
    muscle, and the aligned sequences are reordered by reorder_seqs()
    according to species_sort() into '<gene symbol>.orthos.afa'.

    'trivial' and 'prepend' may appear anywhere on the command line; they
    request trivial species names and a gene-symbol name prefix respectively.

    Returns True on completion.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("usage: %s <gene symbol> [trivial] [prepend]" % argv[0])
        print(
            "trivial = use trivial species name; prepend = prepend gene name")
        exit()

    gene_name = argv[1]
    use_trivial = "trivial" in argv
    use_prefix = "prepend" in argv  # prepend the gene symbol to sequence names
    # the orthologue table is filled only for this species, for the moment
    reference = 'homo_sapiens'

    out_fasta = f"{gene_name}.orthos.fasta"
    out_afa = f"{gene_name}.orthos.afa"
    scratch_fasta = "tmp.fa"
    scratch_log = "tmp.log"
    # start from a clean slate: drop leftovers of any previous run
    for leftover in (out_fasta, out_afa, scratch_fasta, scratch_log):
        if os.path.exists(leftover):
            os.remove(leftover)
    workdir = os.getcwd()

    db = connect_to_mysql(Config.mysql_conf_file)
    cursor = db.cursor()

    # gene symbol -> Ensembl stable gene id
    hgnc_rows = hard_landing_search(
        cursor,
        "select ensembl_gene_id  from identifier_maps.hgnc where approved_symbol='%s'" % gene_name)
    stable_gene_id = hgnc_rows[0][0]

    [all_species, ensembl_db_name] = get_species(cursor)
    reference_db = ensembl_db_name[reference]

    # stable gene id -> gene_id within the reference database
    switch_to_db(cursor, reference_db)
    gene_rows = hard_landing_search(
        cursor,
        "select gene_id from gene where stable_id='%s'" % stable_gene_id)
    gene_id = gene_rows[0][0]

    reference_transl_id = gene2stable_canon_transl_id(cursor, gene_id,
                                                      reference_db)
    write_to_fasta(workdir, reference, reference_transl_id, scratch_fasta,
                   scratch_log, out_fasta)

    print(gene_name, stable_gene_id, gene_id, reference_transl_id)

    # species that actually made it into the fasta; the reference goes first
    aligned_species = [reference]
    orthologue_rows = error_intolerant_search(
        cursor,
        "select  cognate_gene_id, cognate_genome_db_id from orthologues where gene_id=%d" % gene_id)
    for cognate_gene_id, cognate_genome_db_id in orthologue_rows:
        name_rows = hard_landing_search(
            cursor,
            f"select db_name from exolocator_meta.db_names where genome_db_id={cognate_genome_db_id}")
        db_name = name_rows[0][0]
        cognate_transl_id = gene2stable_canon_transl_id(
            cursor, cognate_gene_id, db_name)
        # the species name is whatever precedes "core" in the database name
        species = db_name.partition("core")[0].rstrip("_")
        if species not in all_species:
            continue
        print(db_name, species, cognate_gene_id, cognate_transl_id)
        written = write_to_fasta(workdir, species, cognate_transl_id,
                                 scratch_fasta, scratch_log, out_fasta)
        if written:
            aligned_species.append(species)

    if os.path.exists(scratch_fasta):
        os.remove(scratch_fasta)

    # align everything collected so far
    cmd = f"{Config.muscle} -in {out_fasta} -out tmp.afa"
    subprocess.call(["bash", "-c", cmd])

    ordered_species = species_sort(cursor, aligned_species, reference)
    if use_trivial:
        trivial_names = get_trivial(cursor, ordered_species)
    else:
        trivial_names = None
    if use_prefix:
        name_prefix = gene_name
    else:
        name_prefix = None
    reorder_seqs('tmp.afa', ordered_species, out_afa, trivial_names,
                 name_prefix)
    if os.path.exists('tmp.afa'):
        os.remove('tmp.afa')

    cursor.close()
    db.close()
    return True