Esempio n. 1
0
def load_clans_from_db():
    """
    Retrieves all clan memberships from the clan_membership table and
    returns a dictionary in the form of {clan_acc: [FAM1, FAM2, ...], ...}

    returns: A dictionary mapping each clan accession to the list of its
             family accessions (all values converted with str)
    """

    # name the columns explicitly so the result does not depend on the
    # physical column order of the table
    query = "SELECT clan_acc, rfam_acc FROM clan_membership"

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(raw=True)

        try:
            cursor.execute(query)
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # release the connection even if the query fails
        RfamDB.disconnect(cnx)

    # group the family accessions by clan accession
    clans = {}
    for row in rows:
        clans.setdefault(str(row[0]), []).append(str(row[1]))

    return clans
Esempio n. 2
0
def set_is_singificant_to_zero_multi(non_sig_seqs):
    """
    A function for batching the process of updating full_region tables upon
    clan competition. Updates the full_region table setting is_significant
    field to zero (0) for the list of non significant sequences passed in
    the form of (rfam_acc, rfamseq_acc, seq_start) tuples.

    On a database error a message is printed and the transaction is rolled
    back; the error is not re-raised.

    non_sig_seqs: A list of the non significant regions to be set to zero.
                  The list is product of clan competition.

    returns: void
    """

    # query to update is_significant field to 0
    query = ("UPDATE full_region SET is_significant=0 "
             "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        # execute query batched
        cursor.executemany(query, non_sig_seqs)
        cnx.commit()

    except Exception:
        print("MySQL Update Error. Rolling back...")
        cnx.rollback()

    finally:
        # single cleanup point: handles are released exactly once on both
        # the success and the failure path
        cursor.close()
        RfamDB.disconnect(cnx)
Esempio n. 3
0
def load_clan_members_from_db(clan_acc):
    """
    Retrieves all clan family members from DB and returns a list of the family
    accessions.

    clan_acc: Clan accession as in Rfam

    returns: A list of family accessions (str) that belong to clan_acc
    """

    # the accession is passed as a query parameter so that the driver takes
    # care of quoting
    query = ("SELECT rfam_acc FROM clan_membership "
             "WHERE clan_acc=%s")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(raw=True)

        try:
            cursor.execute(query, (clan_acc,))
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # release the connection even if the query fails
        RfamDB.disconnect(cnx)

    return [str(fam[0]) for fam in rows]
Esempio n. 4
0
def fetch_author_orcid(author_name):
    """
    Searches the author table for an entry whose name or synonyms contain
    author_name and returns the orcid of the first matching row

    :param author_name: The (partial) author name to search for
    :return: The orcid value of the first match, or None if no row matches
    """

    orcid = None

    query = """
            Select orcid from author
            where name like %s or synonyms like %s
            """

    # substring match: wrap the name in SQL wildcards and hand it over as a
    # parameter, so names containing quotes cannot break the statement
    pattern = "%%%s%%" % author_name

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute(query, (pattern, pattern))
            result = cursor.fetchone()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    if result is not None:
        orcid = result[0]

    # This will return none if there's no ORCiD available
    return orcid
Esempio n. 5
0
def set_genome_size(genome_sizes):
    """
    Updates total_length in genome table

    genome_sizes: This can be the path to a json file ({upid: size, ...}) for
    multiple updates or a tuple in the form of (size, upid) for single
    genome, where size is in nucleotides

    return: void
    """

    query = "update genome set total_length=%s where upid=%s"

    # A tuple/list is a single (size, upid) record. It must be recognised
    # before the path check because os.path.isfile raises TypeError when it
    # is given a tuple.
    is_single_record = isinstance(genome_sizes, (tuple, list))

    if not is_single_record and os.path.isfile(genome_sizes):
        with open(genome_sizes, 'r') as gen_size_file:
            genome_size_dict = json.load(gen_size_file)

        genome_size_list = [(str(genome_size_dict[upid]), str(upid))
                            for upid in genome_size_dict]
    else:
        genome_size_list = [genome_sizes]

    # connect only after the input has been parsed successfully
    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.executemany(query, genome_size_list)
            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
def get_full_region_seq_counts():
    """
    Counts the full_region rows of every family and returns the result as a
    dictionary keyed by Rfam family accession (rfam_acc), e.g.
    {'RFXXXXX': N, ...}

    returns: A dictionary of {rfam_acc (str): number of rows (int)}
    """

    count_sql = ("SELECT rfam_acc, count(*) FROM full_region\n"
                 "GROUP BY rfam_acc")

    db_conn = RfamDB.connect()
    db_cursor = db_conn.cursor(buffered=True)

    db_cursor.execute(count_sql)

    # one (rfam_acc, count) pair per family
    family_counts = dict((str(acc), int(total))
                         for acc, total in db_cursor.fetchall())

    # release DB handles
    db_cursor.close()
    RfamDB.disconnect(db_conn)

    return family_counts
Esempio n. 7
0
def fetch_rfam_accs_sorted(order='DESC'):
    """
    Fetch all available Rfam accs from seed_region, sorted by the number of
    seed_region rows per family. DESC by default

    order: The order in which to sort the records (ASC, DESC),
           case-insensitive

    returns: A list of rfam_acc strings in the requested order
    raises: ValueError if order is neither ASC nor DESC
    """

    # The sort direction is a keyword and cannot be sent as a bound
    # parameter, so it is checked against the two legal values before it is
    # placed in the statement.
    sort_order = str(order).strip().upper()

    if sort_order not in ('ASC', 'DESC'):
        raise ValueError("order must be 'ASC' or 'DESC', got %r" % (order,))

    query = ("select rfam_acc from seed_region\n"
             "group by rfam_acc\n"
             "order by count(*) %s" % sort_order)

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute(query)
            rfam_accs = [str(x[0]) for x in cursor.fetchall()]
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    return rfam_accs
Esempio n. 8
0
def fetch_clanin_data():
    """
    Fetches all rfam_ids per clan. To be used for clanin file generation

    :return: A dictionary in the form of {clan_acc: [rfam_id, ...], ...}
    """

    query = ("select cm.clan_acc, f.rfam_id from clan_membership cm, family f "
             "where f.rfam_acc=cm.rfam_acc "
             "order by cm.clan_acc")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute(query)
            clan_pairs = cursor.fetchall()
        finally:
            # the cursor is closed exactly once
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    # build clan membership dictionary
    clan_members = {}
    for clan_acc, rfam_id in clan_pairs:
        clan_members.setdefault(clan_acc, []).append(rfam_id)

    return clan_members
Esempio n. 9
0
def set_is_significant_to_zero_adv(rfam_acc, rfamseq_acc, region):
    """
    Sets the is_significant field to zero (0) for the full_region entry
    identified by rfam_acc, rfamseq_acc and the region start coordinate

    rfam_acc: RNA family accession
    rfamseq_acc: Family specific sequence accession
    region: The seq_start value of the region to update (integer)

    returns: void
    """

    # values are bound as parameters instead of being formatted into the
    # statement text
    query = ("UPDATE full_region SET is_significant=0 "
             "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute(query, (rfam_acc, rfamseq_acc, int(region)))
            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
Esempio n. 10
0
def fetch_clan_pdb_full_region_records(clan_acc):
    """
    Fetches all pdb_full_region regions of the families in a clan

    param clan_acc: A valid Rfam clan accession

    returns: A list with all pdb regions per clan. Each row holds
             (rfam_acc, seq_acc, pdb_start, pdb_end, bit_score, evalue_score)
             where seq_acc is pdb_id and chain joined with '_'. Rows are
             ordered by seq_acc
    """

    # clan_acc is bound as a parameter rather than formatted into the text
    clan_pdb_region_query = ("select pfr.rfam_acc, concat(pfr.pdb_id,'_',pfr.chain) as seq_acc, "
                             "pfr.pdb_start, pfr.pdb_end, pfr.bit_score, pfr.evalue_score "
                             "from pdb_full_region pfr, clan_membership cm "
                             "where cm.rfam_acc=pfr.rfam_acc "
                             "and cm.clan_acc=%s "
                             "order by seq_acc")

    cnx = RfamDB.connect()

    try:
        clan_cursor = cnx.cursor(buffered=True)

        try:
            clan_cursor.execute(clan_pdb_region_query, (clan_acc,))
            clan_sequence_regions = clan_cursor.fetchall()
        finally:
            clan_cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    return clan_sequence_regions
Esempio n. 11
0
def reset_is_significant(clan_comp_type='FULL'):
    """
    This function resets the is_significant field back to 1 for every row
    that currently has is_significant=0. It is meant for clan competition
    initialization and restoration.

    On a database error during the update a message is printed and the
    transaction is rolled back; the error is not re-raised.

    clan_comp_type: 'FULL' to reset full_region or 'PDB' to reset
                    pdb_full_region (case-insensitive)

    returns: void
    raises: ValueError if clan_comp_type is neither 'FULL' nor 'PDB'
    """

    comp_type = clan_comp_type.upper()

    if comp_type == 'FULL':
        # query to fetch all non significant sequences
        select_query = ("SELECT rfam_acc, rfamseq_acc, seq_start FROM full_region "
                        "WHERE is_significant=0")

        # query to update the rows returned by select_query
        update_query = ("UPDATE full_region SET is_significant=1 "
                        "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    elif comp_type == 'PDB':
        select_query = ("SELECT rfam_acc, pdb_id, chain, pdb_start from pdb_full_region "
                        "WHERE is_significant=0")

        update_query = ("UPDATE pdb_full_region SET is_significant=1 "
                        "WHERE rfam_acc=%s AND pdb_id=%s AND chain=%s AND pdb_start=%s")

    else:
        # fail before touching the database instead of hitting an unbound
        # query variable further down
        raise ValueError("clan_comp_type must be 'FULL' or 'PDB', got %r"
                         % (clan_comp_type,))

    cnx = RfamDB.connect()

    try:
        # cursor to fetch data
        d_cursor = cnx.cursor(buffered=True)

        try:
            d_cursor.execute(select_query)

            # In both selects every column but the last is an identifier
            # (converted with str) and the last one is the start coordinate
            # (converted with int).
            seq_regs = [tuple(str(value) for value in row[:-1]) + (int(row[-1]),)
                        for row in d_cursor]
        finally:
            d_cursor.close()

        # get a new cursor for db updates
        u_cursor = cnx.cursor(raw=True)

        try:
            u_cursor.executemany(update_query, seq_regs)
            cnx.commit()
        except Exception:
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
        finally:
            u_cursor.close()
    finally:
        # the connection is released exactly once on every path
        RfamDB.disconnect(cnx)
Esempio n. 12
0
def set_number_of_distinct_families_in_genome(upid):
    """
    Sets the number distinct families with hits in a specific genome defined
    by its corresponding upid

    upid: A specific genome upid to update the number of distinct families.
          If None, every upid returned by fetch_all_upids is updated

    NOTE(review): the select query filters on a name `version` that is not
    defined inside this function; it is expected to exist at module level -
    confirm.

    return: void
    """

    # counts the distinct families with full_region hits in one genome
    select_query = ("select count(distinct rfam_acc) from full_region fr, genseq gs\n"
                    "where fr.rfamseq_acc=gs.rfamseq_acc\n"
                    "and gs.upid=%s\n"
                    "and gs.version=%s")

    # stores the count in the genome table
    update_query = "update genome set num_families=%s where upid=%s"

    if upid is None:
        upids = fetch_all_upids()
    else:
        upids = [upid]

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True)

        try:
            for genome_upid in upids:
                # version is sent as a string, as it was when it was quoted
                # inside the statement text
                cursor.execute(select_query, (genome_upid, str(version)))
                count = cursor.fetchone()[0]

                cursor.execute(update_query, (count, genome_upid))

            # commit all updates at once
            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
Esempio n. 13
0
def update_family_ncbi():
    """
    Updates table family ncbi by adding all distinct taxonomic ids per family.
    For every family the significant full_region hits are joined with
    rfamseq and family, and the distinct (ncbi_id, rfam_id, rfam_acc) rows
    are inserted into family_ncbi. Inserts are committed per family.

    If the insert for a family fails the transaction is rolled back and the
    program exits through sys.exit with an error message.

    :return: void
    """

    # family_ncbi query
    get_ncbi_ids = ("select distinct rs.ncbi_id, f.rfam_id, "
                    "f.rfam_acc from full_region fr, rfamseq rs, family f "
                    "where fr.rfamseq_acc=rs.rfamseq_acc "
                    "and f.rfam_acc=fr.rfam_acc "
                    "and fr.rfam_acc=%s "
                    "and fr.is_significant=1")

    insert_query = "insert into family_ncbi (ncbi_id, rfam_id, rfam_acc) values (%s,%s,%s)"

    cnx = RfamDB.connect()

    cursor = cnx.cursor(buffered=True)
    c_cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("Select rfam_acc from family")
        rfam_accs = [row[0] for row in cursor.fetchall()]

        for rfam_acc in rfam_accs:
            c_cursor.execute(get_ncbi_ids, (rfam_acc,))
            entries_reformatted = [(str(x[0]), str(x[1]), str(x[2]))
                                   for x in c_cursor.fetchall()]

            try:
                cursor.executemany(insert_query, entries_reformatted)
                cnx.commit()

            except Exception:
                cnx.rollback()
                # SystemExit still passes through the finally clause below,
                # so the handles are released before the program exits
                sys.exit("\nError updating family_ncbi table for family %s." % rfam_acc)

    finally:
        cursor.close()
        c_cursor.close()
        RfamDB.disconnect(cnx)

    print("Done updating family_ncbi.")
Esempio n. 14
0
def set_num_full_sig_seqs():
    """
    Updates num_full in family table to hold the number of significant
    sequences rather than the number of sequences in the full alignment

    On a database error during the update a message is printed and the
    transaction is rolled back; "Done" is only printed after a successful
    commit.

    returns: void
    """

    # query to count all significant sequences of a family
    count_query = ("select count(*)\n"
                   "from full_region f\n"
                   "where is_significant=1\n"
                   "and type=\'full\'\n"
                   "and rfam_acc=%s")

    update_query = (
        "update family set num_full=%s where rfam_acc=%s")

    cnx = RfamDB.connect()

    cursor = cnx.cursor(buffered=True)
    c_cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("Select rfam_acc from family")
        rfam_accs = [str(acc[0]) for acc in cursor.fetchall()]

        # (count, rfam_acc) pairs in the order expected by update_query
        counts = []
        for rfam_acc in rfam_accs:
            c_cursor.execute(count_query, (rfam_acc,))
            counts.append((c_cursor.fetchall()[0][0], rfam_acc))

        try:
            cursor.executemany(update_query, counts)
            cnx.commit()
        except Exception:
            # report the failure instead of silently discarding it
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
        else:
            print("Done")

    finally:
        cursor.close()
        c_cursor.close()
        RfamDB.disconnect(cnx)
Esempio n. 15
0
def print_report(no_fams):
    '''
        Calls all check functions, sharing a single DB connection, and
        displays the results on screen

        no_fams: Value forwarded unchanged to the checks that accept it
                 (presumably the number of families - confirm with callers)
    '''

    cnx = RfamDB.connect()

    try:
        check_ss_images(cnx, no_fams)
        check_sunburst(cnx)
        count_rchie_diagrams(cnx, no_fams)
        check_alignment_and_tree(cnx, no_fams)
        check_html_alignment(cnx, no_fams)
    finally:
        # release the connection even if one of the checks raises
        RfamDB.disconnect(cnx)
Esempio n. 16
0
def fasta_gen_handler(seq_file, out_dir, rfam_accessions=None):
    """
    The purpose of this script is to handle the fasta generation process,
    generate individual shell scripts for each available family and submit
    them to the cluster

    seq_file:   Path to the input sequence file (e.g. rfamseq11.fa)
    out_dir:    The output directory where the fasta files will be generated.
                The "scripts" and "log" sub directories are created in it if
                they do not exist
    rfam_accessions: Optional path to a file listing one family accession
                per line. If None, all accessions are fetched from the
                family table

    returns: void
    """

    # fetch family accessions
    if rfam_accessions is None:
        cnx = RfamDB.connect()

        try:
            cursor = cnx.cursor(buffered=True)

            try:
                cursor.execute("SELECT rfam_acc FROM family")
                entries = cursor.fetchall()
            finally:
                cursor.close()
        finally:
            RfamDB.disconnect(cnx)

        families = [str(fam[0]) for fam in entries]
    else:
        with open(rfam_accessions, 'r') as fp:
            # blank lines would otherwise become empty family accessions
            families = [line.strip() for line in fp if line.strip()]

    # create scripts and log dirs within output directory
    scripts_dir = os.path.join(out_dir, "scripts")
    log_dir = os.path.join(out_dir, "log")

    for dir_path in (scripts_dir, log_dir):
        if not os.path.exists(dir_path):
            os.mkdir(dir_path)

    for fam in families:

        # 1. Generate script file
        sh_path = shell_script_generator(seq_file, fam, out_dir, scripts_dir)

        # 2. submit job under group
        # Equivalent to "bsub < sh_path" but without a shell, so paths with
        # spaces or shell metacharacters are handled correctly.
        with open(sh_path, 'r') as script_fp:
            subprocess.call(["bsub"], stdin=script_fp)
Esempio n. 17
0
def load_clan_seqs_from_db(clan_acc):  # tested
    """
    Loads specific clan family sequences from full_region table and returns
    a dictionary structure as
    {Rfam_acc: {Rfseq_acc: [(start, end, evalue), ...]}} for clan competition.

    Each sequence accession maps to a list of regions in order to
    accommodate sequence duplicates

    clan_acc: Clan accession as in Rfam

    returns: The nested dictionary described above
    """

    fam_seqs = {}

    # Fetch clan specific family full_region data; clan_acc is bound as a
    # parameter rather than formatted into the statement
    query = ("SELECT full_region.rfam_acc, full_region.rfamseq_acc, "
             "full_region.seq_start, full_region.seq_end, full_region.evalue_score\n"
             "FROM full_region\n"
             "JOIN (SELECT rfam_acc FROM clan_membership WHERE clan_acc=%s) as CLAN_FAMS\n"
             "ON CLAN_FAMS.rfam_acc=full_region.rfam_acc")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(raw=True)

        try:
            cursor.execute(query, (clan_acc,))

            # build family dictionary of sequences
            for row in cursor:
                region = (int(row[START]), int(row[END]), float(row[EVAL]))

                family_regions = fam_seqs.setdefault(str(row[RFAM_ACC]), {})
                family_regions.setdefault(str(row[SEQ_ACC]), []).append(region)
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    return fam_seqs
Esempio n. 18
0
def set_number_of_species():
    """
    Updates number_of_species in family table with the number of distinct
    ncbi_ids among the significant full_region hits of each family

    On a database error during the update a message is printed and the
    transaction is rolled back; "Done" is only printed after a successful
    commit.

    returns: void
    """

    count_query = ("select count(distinct ncbi_id)\n"
                   "from full_region f, rfamseq r\n"
                   "where r.rfamseq_acc=f.rfamseq_acc\n"
                   "and is_significant=1 and rfam_acc=%s")

    # query to update number_of_species in the family table
    update_query = (
        "update family set number_of_species=%s where rfam_acc=%s")

    cnx = RfamDB.connect()

    cursor = cnx.cursor(buffered=True)
    c_cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("Select rfam_acc from family")
        rfam_accs = [str(acc[0]) for acc in cursor.fetchall()]

        # (count, rfam_acc) pairs in the order expected by update_query
        counts = []
        for rfam_acc in rfam_accs:
            c_cursor.execute(count_query, (rfam_acc,))
            counts.append((c_cursor.fetchall()[0][0], rfam_acc))

        try:
            cursor.executemany(update_query, counts)
            cnx.commit()
        except Exception:
            # report the failure instead of silently discarding it
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
        else:
            print("Done")

    finally:
        cursor.close()
        c_cursor.close()
        RfamDB.disconnect(cnx)
Esempio n. 19
0
def update_chromosome_info_in_genseq():
    """
    Updates chromosome_type and chromosome_name in genseq (version 14.0
    rows) for every genome that has an assembly accession. The chromosome
    details come from fgm.fetch_gca_data; genomes whose assembly accession
    is empty or starts with 'GCF' are skipped.

    returns: void
    """

    genome_query = "select upid, assembly_acc from genome where assembly_acc is not NULL"

    # the values come from an external lookup, so they are bound as
    # parameters rather than formatted into the statement
    update_query = """
                   update genseq set chromosome_type=%s, chromosome_name=%s
                   where upid=%s and rfamseq_acc=%s and version=14.0
                   """

    cnx = RfamDB.connect()

    try:
        # dictionary cursor: rows are accessed by column name
        cursor = cnx.cursor(buffered=True, dictionary=True)

        try:
            cursor.execute(genome_query)
            accessions = cursor.fetchall()
        finally:
            cursor.close()

        upid_gca_dict = {}
        for pair in accessions:
            upid_gca_dict[pair["upid"]] = pair["assembly_acc"]

        cursor = cnx.cursor(buffered=True)

        try:
            for upid in upid_gca_dict:
                assembly_acc = upid_gca_dict[upid]

                if assembly_acc[0:3] == 'GCF' or assembly_acc == '':
                    continue

                data = fgm.fetch_gca_data(upid, assembly_acc, 'kingdom')

                if "fields" not in data:
                    continue

                fields = data["fields"]

                if "chromosomes" not in fields:
                    continue

                for chromosome in fields["chromosomes"]:
                    cursor.execute(update_query,
                                   (str(chromosome["type"]), str(chromosome["name"]),
                                    str(upid), str(chromosome["accession"])))

            # commit all updates at once
            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
0
def set_pdb_is_significant_to_zero(non_sig_seqs):
    """
    Sets pdb_full_region is_significant to 0 for non significant regions in
    non_sig_seqs list

    On a database error a message is printed and the transaction is rolled
    back; the error is not re-raised.

    non_sig_seqs: A list of the non significant regions to be set to zero.
    The list is product of clan competition. Each region is indexed as
    (rfam_acc, "<pdb_id>_<chain>", pdb_start, ...)

    returns: void
    """

    # reformat list by splitting pdb_id and chain
    pdb_reformatted_regions = []

    for competed_region in non_sig_seqs:
        # split pdb_id chain pairs by the first '_' used in concatenation
        # for clan competition; the separator itself is discarded
        pdb_id, _, chain = competed_region[1].partition('_')
        pdb_reformatted_regions.append((str(competed_region[0]), str(pdb_id),
                                        str(chain), int(competed_region[2])))

    # query to update is_significant field to 0
    query = ("update pdb_full_region set is_significant=0 "
             "where rfam_acc=%s and pdb_id=%s and chain=%s and pdb_start=%s")

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        # execute query batched
        cursor.executemany(query, pdb_reformatted_regions)
        cnx.commit()

    except Exception:
        print("MySQL Update Error. Rolling back...")
        cnx.rollback()

    finally:
        # single cleanup point: handles are released exactly once
        cursor.close()
        RfamDB.disconnect(cnx)
Esempio n. 21
0
def update_post_process(jobs_file):
    """
    Updates _post_process table with the job_ids per family assigned by lsf

    On a database error a message is printed and the transaction is rolled
    back; the error is not re-raised.

    jobs_file: This is a tab separated txt file generated from running the
    job_dequeuer.py script that submits the rfam_view_process for each
    family.
    (rfam_acc uuid job_id ...)

    returns: void
    """

    query = ("UPDATE _post_process SET lsf_id=%s "
             "WHERE rfam_acc=%s AND uuid=%s")

    job_ids = []

    # get lsf ids from file
    with open(jobs_file, 'r') as jobs_file_fp:
        for line in jobs_file_fp:
            line = line.strip()

            # tolerate blank lines (e.g. a trailing newline at end of file)
            if not line:
                continue

            fields = line.split('\t')

            # reorder to (job_id, rfam_acc, uuid) to match the query
            job_ids.append((fields[2], fields[0], fields[1]))

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    # update db
    try:
        cursor.executemany(query, job_ids)
        cnx.commit()

    except Exception:
        # rollback to previous state
        print("MySQL Update Error. Rollback...")
        cnx.rollback()

    finally:
        # single cleanup point: handles are released exactly once
        cursor.close()
        RfamDB.disconnect(cnx)
Esempio n. 22
0
def fetch_clan_accessions():
    """
    Reads the clan_acc column of every row in the clan table and returns
    the values as a list of strings

    returns: A list of all clan accessions
    """
    db_conn = RfamDB.connect()
    db_cursor = db_conn.cursor(buffered=True)

    db_cursor.execute("SELECT clan_acc FROM clan")

    # keep the first (and only) column of each row
    accessions = []
    for record in db_cursor.fetchall():
        accessions.append(str(record[0]))

    db_cursor.close()
    RfamDB.disconnect(db_conn)

    return accessions
Esempio n. 23
0
def update_assembly_names(upid_gca_file):
    """
    Loads the upid_gca json file, looks up each assembly through
    fgm.fetch_gca_data to get the assembly names and updates the
    assembly_name field in genome table

    param upid_gca_file: A json file with upid: {"GCA" : GCAxxx, "DOM": domain }

    return: void
    """

    with open(upid_gca_file, 'r') as fp:
        acc_pairs = json.load(fp)

    # a list of (assembly_name, upid) tuples matching the update query
    assembly_names = []

    for upid in acc_pairs:
        data = fgm.fetch_gca_data(upid, acc_pairs[upid]["GCA"], acc_pairs[upid]["DOM"])

        if "fields" in data:
            # a record without an assembly_name entry is skipped like one
            # whose value is None
            assembly_name = data["fields"].get("assembly_name")

            if assembly_name is not None:
                assembly_names.append((assembly_name, upid))

    query = "update genome set assembly_name=%s where upid=%s"

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True, dictionary=True)

        try:
            cursor.executemany(query, assembly_names)
            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
Esempio n. 24
0
def fetch_all_upids():
    """
    Reads the upid column of every row in the genome table

    return: A list of UP/RG ids as stored in genome
    """

    db_conn = RfamDB.connect()
    db_cursor = db_conn.cursor(buffered=True)

    db_cursor.execute("select upid from genome")

    # keep the first (and only) column of each row as a string
    upids = []
    for record in db_cursor.fetchall():
        upids.append(str(record[0]))

    db_cursor.close()
    RfamDB.disconnect(db_conn)

    return upids
Esempio n. 25
0
def fetch_clan_full_region_records(clan_acc):
    """
    Fetches all full_region regions of the families in a clan

    param clan_acc: A valid Rfam clan accession

    returns: A list with all regions from full_region table for a specific
             clan. Each row holds all full_region columns followed by the
             rfam_acc column of the joined clan membership sub query
    """

    # clan_acc is bound as a parameter rather than formatted into the text
    clan_region_query = ("SELECT * FROM full_region\n"
                         "JOIN (SELECT rfam_acc FROM clan_membership WHERE clan_acc=%s) as CLAN_FAMS\n"
                         "ON CLAN_FAMS.rfam_acc=full_region.rfam_acc")

    cnx = RfamDB.connect()

    try:
        clan_cursor = cnx.cursor(buffered=True)

        try:
            clan_cursor.execute(clan_region_query, (clan_acc,))
            clan_sequence_regions = clan_cursor.fetchall()
        finally:
            clan_cursor.close()
    finally:
        RfamDB.disconnect(cnx)

    return clan_sequence_regions
Esempio n. 26
0
def generate_fasta(seq_file, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source (e.g. rfamseq11.fa). It will generate fasta
    files for all families by default. One gzip compressed
    <rfam_acc>.fa.gz file is written per family and only significant
    full_region hits are exported.

    Accessions of regions whose sequence could not be fetched or did not
    pass seq_validator are logged in missing_seqs.log within out_dir.

    seq_file:   The path to rfamseq input file in fasta format, for
                generating the fasta files

    out_dir:    Destination directory where the files will be
                generated

    returns: void
    """

    fp_out = None

    # accession of the family whose file is currently open; None until the
    # first row has been seen
    rfam_acc = None

    # logging sequences not exported
    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(filename=log_file, filemode='w', level=logging.INFO)

    # fetch significant full_region data and sequence descriptions, ordered
    # by family so that each output file is written in one go
    query = (
        "SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
        "FROM full_region fr, rfamseq rf\n"
        "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
        "AND fr.is_significant=1\n"
        "ORDER BY fr.rfam_acc")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(raw=True)

        try:
            cursor.execute(query)

            for region in cursor:
                region_acc = str(region[RFAM_ACC])

                # new family
                if region_acc != rfam_acc:
                    # close the file of the previous family, if any
                    if fp_out is not None:
                        fp_out.close()

                    # open new fasta file
                    fp_out = gzip.open(
                        os.path.join(out_dir, region_acc + ".fa.gz"), 'w')

                    rfam_acc = region_acc

                cmd = "%s -c %s/%s %s %s" % (ESL_PATH, str(
                    region[START]), str(region[END]), seq_file, str(region[SEQ_ACC]))

                proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

                seq = proc.communicate()[0]

                # drop the first output line and join the remaining lines
                # into a single sequence string
                sequence = ''.join(seq.split('\n')[1:])

                if sequence != '' and seq_validator(sequence) is True:
                    # write header
                    fp_out.write(
                        ">%s/%s-%s %s\n" % (str(region[SEQ_ACC]), str(
                            region[START]), str(region[END]), str(region[DESC])))

                    # write sequence
                    fp_out.write(sequence + '\n')

                else:
                    # log the accession of the sequence that was not
                    # exported, as generate_fasta_single does
                    logging.info(str(region[SEQ_ACC]))
        finally:
            # fp_out is still None when the query returned no rows
            if fp_out is not None:
                fp_out.close()

            cursor.close()
    finally:
        # disconnect from DB
        RfamDB.disconnect(cnx)
Esempio n. 27
0
def generate_fasta_single(seq_file, rfam_acc, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source. Works on single family based on rfam_acc.
    Files are generated in a compressed .fa.gz format and only significant
    full_region hits are exported.

    Accessions of regions whose sequence could not be fetched or did not
    pass seq_validator are logged in <rfam_acc>.log within out_dir.

    seq_file:   This is the the path to rfamseq input file in fasta format,
                for generating the fasta files

    rfam_acc:   The rfam_acc of a specific family

    out_dir:    This is the destination directory where the files will be
                generated

    returns: void
    """

    # logging sequences not exported
    log_file = os.path.join(out_dir, rfam_acc + ".log")
    logging.basicConfig(filename=log_file, filemode='w', level=logging.INFO)

    # fetch sequence accessions for specific family - significant only!!
    # rfam_acc is bound as a parameter rather than formatted into the text
    query = (
        "SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
        "FROM full_region fr, rfamseq rf\n"
        "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
        "AND fr.is_significant=1\n"
        "AND fr.rfam_acc=%s")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(raw=True)

        try:
            cursor.execute(query, (rfam_acc,))

            # open a new fasta output file
            fp_out = gzip.open(os.path.join(out_dir, str(rfam_acc) + ".fa.gz"), 'w')

            try:
                for region in cursor:

                    cmd = "%s -c %s/%s %s %s" % (ESL_PATH, str(
                        region[START]), str(region[END]), seq_file, str(region[SEQ_ACC]))

                    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)

                    seq = proc.communicate()[0]

                    # drop the first output line and join the remaining
                    # lines into a single sequence string
                    sequence = ''.join(seq.split('\n')[1:])

                    if sequence != '' and seq_validator(sequence) is True:
                        # write header
                        fp_out.write(
                            ">%s/%s-%s %s\n" % (str(region[SEQ_ACC]), str(
                                region[START]), str(region[END]), str(region[DESC])))

                        # write sequence
                        fp_out.write(sequence + '\n')

                    else:
                        # logging sequences that have not been exported
                        logging.info(str(region[SEQ_ACC]))
            finally:
                # the output file is closed even if a region fails
                fp_out.close()
        finally:
            cursor.close()
    finally:
        # disconnect from DB
        RfamDB.disconnect(cnx)
Esempio n. 28
0
def generate_fasta(seq_file, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source (e.g. rfamseq11.fa). One gzipped fasta file
    (<rfam_acc>.fa.gz) is written in out_dir for every family that has
    significant regions in full_region.

    Regions for which no sequence is returned, or whose sequence is rejected
    by seq_validator, are skipped and their sequence accession is written to
    missing_seqs.log in out_dir.

    seq_file:   The path to rfamseq input file in fasta format, for
                generating the fasta files

    out_dir:    Destination directory where the files will be
                generated
    """

    fp_out = None

    # accession of the family whose fasta file is currently open. It stays
    # None until the first row is read, so the first family always opens
    # a new file
    rfam_acc = None

    # logging sequences not exported
    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    # connect to db
    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    # fetch all significant regions along with the sequence description.
    # Rows are ordered by family so that each family's regions are
    # contiguous and a single output file is open at any time
    query = ("SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
             "FROM full_region fr, rfamseq rf\n"
             "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
             "AND fr.is_significant=1\n"
             "ORDER BY fr.rfam_acc")

    try:
        cursor.execute(query)

        for region in cursor:
            region_family = str(region[RFAM_ACC])

            # new family - switch output file
            if region_family != rfam_acc:
                if fp_out is not None:
                    fp_out.close()

                fp_out = gzip.open(
                    os.path.join(out_dir, region_family + ".fa.gz"), 'w')
                rfam_acc = region_family

            seq_acc = str(region[SEQ_ACC])
            seq_start = str(region[START])
            seq_end = str(region[END])

            cmd = "%s -c %s/%s %s %s" % (ESL_PATH, seq_start, seq_end,
                                         seq_file, seq_acc)

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)

            seq = proc.communicate()[0]

            # drop the first output line (the fasta header) and join the
            # remaining lines into a single sequence string
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (seq_acc, seq_start, seq_end,
                                                 str(region[DESC])))

                # write sequence
                fp_out.write(sequence + '\n')

            else:
                # log the accession of the sequence that was not exported
                logging.info(seq_acc)

    finally:
        # fp_out is still None if the query returned no rows
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)
Esempio n. 29
0
def generate_fasta_single(seq_file, rfam_acc, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source. Works on single family based on rfam_acc.
    The output is written to <rfam_acc>.fa.gz in out_dir.

    Regions for which no sequence is returned, or whose sequence is rejected
    by seq_validator, are skipped and their sequence accession is written to
    <rfam_acc>.log in out_dir.

    seq_file:   This is the the path to rfamseq input file in fasta format,
                for generating the fasta files

    rfam_acc:   The rfam_acc of a specific family

    out_dir:    This is the destination directory where the files will be
                generated
    """

    fp_out = None

    # logging sequences not exported
    log_file = os.path.join(out_dir, rfam_acc + ".log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    # connect to db
    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    # fetch sequence accessions for specific family - significant only!!
    # rfam_acc is passed as a query parameter so that the connector takes
    # care of quoting instead of interpolating it in the SQL string
    query = ("SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
             "FROM full_region fr, rfamseq rf\n"
             "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
             "AND fr.is_significant=1\n"
             "AND fr.rfam_acc=%s")

    try:
        cursor.execute(query, (rfam_acc,))

        # open a new fasta output file
        fp_out = gzip.open(
            os.path.join(out_dir, str(rfam_acc) + ".fa.gz"), 'w')

        for region in cursor:
            seq_acc = str(region[SEQ_ACC])
            seq_start = str(region[START])
            seq_end = str(region[END])

            cmd = "%s -c %s/%s %s %s" % (ESL_PATH, seq_start, seq_end,
                                         seq_file, seq_acc)

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)

            seq = proc.communicate()[0]

            # drop the first output line (the fasta header) and join the
            # remaining lines into a single sequence string
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (seq_acc, seq_start, seq_end,
                                                 str(region[DESC])))

                # write sequence
                fp_out.write(sequence + '\n')

            else:
                # logging sequences that have not been exported
                logging.info(seq_acc)

    finally:
        # fp_out is still None if the query itself failed
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)
Esempio n. 30
0
def set_number_of_genomic_significant_hits(upid):
    """
    Sets the number of significant hits (genome.num_rfam_regions) for a
    specific genome according to its corresponding upid id. The number is
    the count of significant full_region entries whose sequences belong to
    the genome according to genseq.

    upid: A specific genome upid to update the number of significant hits.
          If None, all genomes returned by fetch_all_upids() are updated

    return: void
    """

    # NOTE(review): `version` is not defined in this function - it is
    # assumed to be a module level name. Confirm it exists, otherwise the
    # first query raises a NameError

    # count the significant regions of a genome
    count_query = ("select count(fr.rfamseq_acc)\n"
                   "from full_region fr, genseq gs\n"
                   "where fr.rfamseq_acc=gs.rfamseq_acc\n"
                   "and fr.is_significant=1\n"
                   "and gs.upid=%s\n"
                   "and gs.version=%s")

    # store the count in the genome table
    update_query = "update genome set num_rfam_regions=%s where upid=%s"

    # connect to db
    cnx = RfamDB.connect()

    # get a new buffered cursor
    cursor = cnx.cursor(buffered=True)

    try:
        # a single upid is handled as a list of one, so that both cases
        # share the same code
        if upid is None:
            upids = fetch_all_upids()
        else:
            upids = [upid]

        for genome_upid in upids:
            cursor.execute(count_query, (genome_upid, version))
            count = cursor.fetchone()[0]

            cursor.execute(update_query, (count, genome_upid))

        # commit all updates at once
        cnx.commit()

    finally:
        # disconnect from DB even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)
Esempio n. 31
0
def export_sequences(seq_db, sql, filename=None, out_dir=None):
    """
    Exporting sequences from rfam_live and generating a gzipped fasta file
    by fetching the corresponding regions from seq_db provided as param.

    Regions for which no sequence is returned, or whose sequence is rejected
    by seq_validator, are skipped and their sequence accession is written to
    missing_seqs.log in out_dir.

    seq_db:     A fasta sequence database to extract sequence regions from
                (e.g. rfamseq11.fa)
    sql:        The query to execute (string or valid .sql file). The rows
                are indexed with SEQ_ACC, START, END and DESC, so the query
                has to return its columns in the matching positions
    filename:   Ouput filename without extension (.fa.gz is appended). If
                None, OUT_FILE_NAME is used as is
    out_dir:    A path to the output directory. If None, the current working
                directory is used
    """

    # os.path.join can't handle None, so fall back to the working directory
    if out_dir is None:
        out_dir = os.getcwd()

    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    # sql is either a path to a file containing the query or the query itself
    if os.path.isfile(sql):
        with open(sql, 'r') as fp:
            query = ' '.join(fp.readlines())
    else:
        query = sql

    if filename is not None:
        out_path = os.path.join(out_dir, filename + ".fa.gz")
    else:
        out_path = os.path.join(out_dir, OUT_FILE_NAME)

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    fp_out = None

    try:
        cursor.execute(query)

        # open an output file
        fp_out = gzip.open(out_path, 'w')

        for region in cursor:
            seq_acc = str(region[SEQ_ACC])
            seq_start = str(region[START])
            seq_end = str(region[END])

            # regions are fetched from the seq_db parameter
            cmd = "%s -c %s/%s %s %s" % (ESL_PATH, seq_start, seq_end,
                                         seq_db, seq_acc)

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)

            seq = proc.communicate()[0]

            # drop the first output line (the fasta header) and join the
            # remaining lines into a single sequence string
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (seq_acc, seq_start, seq_end,
                                                 str(region[DESC])))

                # write sequence
                fp_out.write(sequence + '\n')

            else:
                # log the accession of the sequence that was not exported
                logging.info(seq_acc)

    finally:
        # fp_out is still None if the query itself failed
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)
Esempio n. 32
0
def reset_is_significant(clan_comp_type='FULL'):
    """
    This function resets the is_significant field back to 1 for all the
    regions that currently have it set to 0. This should be able to update
    all or part of the table for clan competition initialization and
    restoration.

    If the update fails, a message is printed and the changes are rolled
    back.

    clan_comp_type: The table to reset (case insensitive). 'FULL' resets
                    full_region and 'PDB' resets pdb_full_region

    raises: ValueError if clan_comp_type is neither 'FULL' nor 'PDB'
    """

    comp_type = clan_comp_type.upper()

    if comp_type == 'FULL':
        # query to fetch all non significant sequences
        select_query = (
            "SELECT rfam_acc, rfamseq_acc, seq_start FROM full_region "
            "WHERE is_significant=0")

        # query to update the regions fetched with select_query
        update_query = (
            "UPDATE full_region SET is_significant=1 "
            "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    elif comp_type == 'PDB':
        select_query = (
            "SELECT rfam_acc, pdb_id, chain, pdb_start from pdb_full_region "
            "WHERE is_significant=0")

        update_query = (
            "UPDATE pdb_full_region SET is_significant=1 "
            "WHERE rfam_acc=%s AND pdb_id=%s AND chain=%s AND pdb_start=%s")

    else:
        # fail before connecting to the database, there are no queries to
        # run for an unknown type
        raise ValueError(
            "Unsupported clan_comp_type %r. Use 'FULL' or 'PDB'"
            % (clan_comp_type,))

    cnx = RfamDB.connect()

    try:
        # cursor to fetch data
        d_cursor = cnx.cursor(buffered=True)

        try:
            d_cursor.execute(select_query)

            # construct region list here. The last column of each row is
            # the start coordinate and is converted to int, all the others
            # are converted to str
            if comp_type == 'FULL':
                seq_regs = [(str(row[0]), str(row[1]), int(row[2]))
                            for row in d_cursor]
            else:
                seq_regs = [(str(row[0]), str(row[1]), str(row[2]),
                             int(row[3]))
                            for row in d_cursor]
        finally:
            d_cursor.close()

        # get a new cursor for db updates
        u_cursor = cnx.cursor(raw=True)

        # update db
        try:
            u_cursor.executemany(update_query, seq_regs)
            cnx.commit()
        except Exception:
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
        finally:
            u_cursor.close()

    finally:
        # disconnect exactly once, whether the update succeeded or not
        RfamDB.disconnect(cnx)
def export_rfam_family_files(f_types, out_dir):
    """
    Fetches all Rfam family accessions from rfam_live, checks out each
    family and copies the files in f_types in their corresponding
    directories. Each file is copied to <out_dir>/<f_type>/ and renamed to
    <rfam_acc>.<f_type in lowercase>. The checked out family directory is
    deleted afterwards.

    Note that the current working directory is changed to out_dir.

    f_types: A list of file type keywords we need to
             export (e.g. ["SEED", "CM"])
    out_dir: The path to the output directory. If it does not exist it will
             be created
    """

    # work with an absolute path, so that a relative out_dir is not resolved
    # again after changing the working directory
    out_dir = os.path.abspath(out_dir)

    # Create the output directory if it does not exist
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # if current working directory isn't out_dir, change directory.
    # NOTE(review): fam_dir below expects the checkout under out_dir, which
    # presumes SVN_CHECKOUT checks out in the working directory - confirm
    if os.path.realpath(os.getcwd()) != os.path.realpath(out_dir):
        os.chdir(out_dir)

    # generate specific output directories for each file type
    for f_type in f_types:
        type_dir = os.path.join(out_dir, f_type)
        if not os.path.exists(type_dir):
            os.mkdir(type_dir)

    # get DB connection handle
    cnx = RfamDB.connect()

    # get mysql cursor
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("SELECT rfam_acc FROM family")

        # fetch files for all Rfam family accessions
        for row in cursor:
            rfam_acc = str(row[0])

            # Check out family in out_dir using rfco on lsf
            subprocess.call(SVN_CHECKOUT % rfam_acc, shell=True)

            # path to the checked out family directory
            fam_dir = os.path.join(out_dir, rfam_acc)

            # copy files and rename
            for f_type in f_types:
                filename = rfam_acc + '.' + f_type.lower()
                shutil.copyfile(
                    os.path.join(fam_dir, f_type),
                    os.path.join(out_dir, f_type, filename))

            # delete family dir
            shutil.rmtree(fam_dir)

    finally:
        # close DB connection even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)
Esempio n. 34
0
def export_rfam_family_files(f_types, out_dir):
    """
    Fetches all Rfam family accessions from rfam_live, checks out each
    family and copies the files in f_types in their corresponding
    directories. Each file is copied to <out_dir>/<f_type>/ and renamed to
    <rfam_acc>.<f_type in lowercase>. The checked out family directory is
    deleted afterwards.

    Note that the current working directory is changed to out_dir.

    f_types: A list of file type keywords we need to
             export (e.g. ["SEED", "CM"])
    out_dir: The path to the output directory. If it does not exist it will
             be created
    """

    # work with an absolute path, so that a relative out_dir is not resolved
    # again after changing the working directory
    out_dir = os.path.abspath(out_dir)

    # Create the output directory if it does not exist
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)

    # if current working directory isn't out_dir, change directory.
    # NOTE(review): fam_dir below expects the checkout under out_dir, which
    # presumes SVN_CHECKOUT checks out in the working directory - confirm
    if os.path.realpath(os.getcwd()) != os.path.realpath(out_dir):
        os.chdir(out_dir)

    # generate specific output directories for each file type
    for f_type in f_types:
        type_dir = os.path.join(out_dir, f_type)
        if not os.path.exists(type_dir):
            os.mkdir(type_dir)

    # get DB connection handle
    cnx = RfamDB.connect()

    # get mysql cursor
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("SELECT rfam_acc FROM family")

        # fetch files for all Rfam family accessions
        for row in cursor:
            rfam_acc = str(row[0])

            # Check out family in out_dir using rfco on lsf
            subprocess.call(SVN_CHECKOUT % rfam_acc, shell=True)

            # path to the checked out family directory
            fam_dir = os.path.join(out_dir, rfam_acc)

            # copy files and rename
            for f_type in f_types:
                filename = rfam_acc + '.' + f_type.lower()
                shutil.copyfile(
                    os.path.join(fam_dir, f_type),
                    os.path.join(out_dir, f_type, filename))

            # delete family dir
            shutil.rmtree(fam_dir)

    finally:
        # close DB connection even if closing the cursor fails
        try:
            cursor.close()
        finally:
            RfamDB.disconnect(cnx)