def load_clans_from_db():
    """
    Retrieves all rows of the clan_membership table and groups them per clan

    returns: A dictionary in the form of {clan_acc: [FAM1, FAM2, ...], ...}.
    The key is built from the first column of each row and the list members
    from the second column, both converted to str
    """

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute("SELECT * FROM clan_membership")
        rows = cursor.fetchall()
    finally:
        # release the DB handles even if the query fails
        cursor.close()
        RfamDB.disconnect(cnx)

    clans = {}
    for row in rows:
        # setdefault avoids scanning the list of keys for every single row
        clans.setdefault(str(row[0]), []).append(str(row[1]))

    return clans
def set_is_singificant_to_zero_multi(non_sig_seqs):
    """
    A function for batching the process of updating the full_region table
    upon clan competition. Sets the is_significant field to zero (0) for
    every region in non_sig_seqs. On a database error the transaction is
    rolled back and a message is printed; the error is not re-raised.

    non_sig_seqs: A list of the non significant regions to be set to zero,
    in the form of (rfam_acc, rfamseq_acc, seq_start) tuples. The list is
    product of clan competition.
    """

    # query to update is_significant field to 0
    query = ("UPDATE full_region SET is_significant=0 "
             "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        # execute query batched
        cursor.executemany(query, non_sig_seqs)
        cnx.commit()
    except Exception:
        # single-argument print behaves the same on Python 2 and 3
        print("MySQL Update Error. Rolling back...")
        cnx.rollback()
    finally:
        # handles are released exactly once, on success and on failure
        cursor.close()
        RfamDB.disconnect(cnx)
def load_clan_members_from_db(clan_acc):
    """
    Retrieves all family members of a clan from the clan_membership table

    clan_acc: Clan accession as in Rfam

    returns: A list of the family accessions (rfam_acc) as strings
    """

    # the accession is bound by the driver rather than pasted into the SQL
    query = "SELECT rfam_acc FROM clan_membership WHERE clan_acc=%s"

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query, (clan_acc,))
        rows = cursor.fetchall()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)

    return [str(fam[0]) for fam in rows]
def fetch_author_orcid(author_name):
    """
    Searches the author table for an entry whose name or synonyms contain
    author_name and returns the ORCiD of the first matching row

    author_name: The (partial) author name to look for

    returns: The orcid value of the first match, None if no row matches
    """

    # substring match; the pattern is passed as a bound parameter so quotes
    # in the name cannot break out of the statement
    pattern = "%%%s%%" % author_name
    query = "Select orcid from author where name like %s or synonyms like %s"

    cnx = RfamDB.connect()
    # Get a new buffered cursor
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute(query, (pattern, pattern))
        result = cursor.fetchone()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)

    # This will return None if there's no matching author
    if result is None:
        return None

    return result[0]
def set_genome_size(genome_sizes):
    """
    Updates total_length in genome table

    genome_sizes: This can be the path to a json file ({upid: size, ...})
    for multiple updates or a tuple in the form of (size, upid) for a single
    genome, where size is in nucleotides

    return: void

    raises: ValueError if genome_sizes is neither a (size, upid) pair nor
    the path of an existing file
    """

    query = "update genome set total_length=%s where upid=%s"

    # Check for the tuple form first: os.path.isfile raises TypeError when
    # it is handed a tuple, which made the single genome form unusable
    if isinstance(genome_sizes, (tuple, list)):
        genome_size_list = [tuple(genome_sizes)]
    elif os.path.isfile(genome_sizes):
        with open(genome_sizes, 'r') as gen_size_file:
            genome_size_dict = json.load(gen_size_file)
        genome_size_list = [(str(size), str(upid))
                            for upid, size in genome_size_dict.items()]
    else:
        raise ValueError(
            "genome_sizes must be a (size, upid) tuple or the path to an "
            "existing json file, got %r" % (genome_sizes,))

    # connect only once the input has been validated
    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.executemany(query, genome_size_list)
        cnx.commit()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def get_full_region_seq_counts():
    """
    Counts the rows of full_region per family

    returns: A dictionary with Rfam family accessions (rfam_acc) as keys
    and the number of full_region rows of that family as values
    (e.g. {'RFXXXXX':N,...})
    """

    count_query = ("SELECT rfam_acc, count(*) FROM full_region\n"
                   "GROUP BY rfam_acc")

    # open the DB handles
    connection = RfamDB.connect()
    count_cursor = connection.cursor(buffered=True)

    count_cursor.execute(count_query)

    # one (rfam_acc, count) row per family
    per_family = count_cursor.fetchall()
    result = dict((str(row[0]), int(row[1])) for row in per_family)

    # release the DB handles
    count_cursor.close()
    RfamDB.disconnect(connection)

    return result
def fetch_rfam_accs_sorted(order='DESC'):
    """
    Fetch all Rfam accs found in seed_region, sorted by the number of
    seed_region rows per family. DESC by default

    order: The order in which to sort the records (ASC, DESC), case
    insensitive

    returns: A list of rfam_acc strings

    raises: ValueError if order is not ASC or DESC
    """

    # A sort direction cannot be passed as a bound parameter, so it is
    # checked against the two legal keywords before it reaches the SQL
    direction = str(order).strip().upper()
    if direction not in ('ASC', 'DESC'):
        raise ValueError("order must be 'ASC' or 'DESC', got %r" % (order,))

    query = ("select rfam_acc from seed_region\n"
             "group by rfam_acc\n"
             "order by count(*) %s" % direction)

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute(query)
        rfam_accs = [str(x[0]) for x in cursor.fetchall()]
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)

    return rfam_accs
def fetch_clanin_data():
    """
    Fetches all rfam_ids per clan. To be used for clanin file generation

    :return: A dictionary in the form of {clan_acc: [rfam_id, ...], ...}
    """

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("select cm.clan_acc, f.rfam_id from clan_membership cm, family f "
                       "where f.rfam_acc=cm.rfam_acc "
                       "order by cm.clan_acc")
        clan_pairs = cursor.fetchall()
    finally:
        # the cursor is closed once here
        cursor.close()
        RfamDB.disconnect(cnx)

    # build clan membership dictionary
    clan_members = {}
    for clan_acc, rfam_id in clan_pairs:
        clan_members.setdefault(clan_acc, []).append(rfam_id)

    return clan_members
def set_is_significant_to_zero_adv(rfam_acc, rfamseq_acc, region):
    """
    Sets the is_significant field of a single full_region entry to zero (0).
    The entry is selected by rfam_acc, rfamseq_acc and seq_start

    rfam_acc: RNA family accession
    rfamseq_acc: Family specific sequence accession
    region: The seq_start value of the region to update (converted to int)
    """

    # values are bound by the driver instead of being formatted into the SQL
    query = ("UPDATE full_region SET is_significant=0 "
             "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute(query, (rfam_acc, rfamseq_acc, int(region)))
        cnx.commit()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def fetch_clan_pdb_full_region_records(clan_acc):
    """
    Fetches all pdb_full_region regions of the families of a clan

    param clan_acc: A valid Rfam clan accession

    returns: A list of (rfam_acc, seq_acc, pdb_start, pdb_end, bit_score,
    evalue_score) rows ordered by seq_acc, where seq_acc is the pdb_id and
    chain joined with '_'
    """

    # clan accession is passed as a bound parameter
    clan_pdb_region_query = ("select pfr.rfam_acc, concat(pfr.pdb_id,'_',pfr.chain) as seq_acc, "
                             "pfr.pdb_start, pfr.pdb_end, pfr.bit_score, pfr.evalue_score "
                             "from pdb_full_region pfr, clan_membership cm "
                             "where cm.rfam_acc=pfr.rfam_acc "
                             "and cm.clan_acc=%s "
                             "order by seq_acc")

    cnx = RfamDB.connect()
    clan_cursor = cnx.cursor(buffered=True)

    try:
        clan_cursor.execute(clan_pdb_region_query, (clan_acc,))
        clan_sequence_regions = clan_cursor.fetchall()
    finally:
        clan_cursor.close()
        RfamDB.disconnect(cnx)

    return clan_sequence_regions
def reset_is_significant(clan_comp_type='FULL'):
    """
    This function resets the is_significant field back to 1 for every
    region that currently has is_significant=0. Used for clan competition
    initialization and restoration. On a database error during the update
    the transaction is rolled back and a message is printed.

    clan_comp_type: Which table to reset, case insensitive. 'FULL' works
    on full_region, 'PDB' on pdb_full_region

    raises: ValueError if clan_comp_type is neither 'FULL' nor 'PDB'
    """

    comp_type = clan_comp_type.upper()

    if comp_type == 'FULL':
        # query to fetch all non significant sequences
        select_query = ("SELECT rfam_acc, rfamseq_acc, seq_start FROM full_region "
                        "WHERE is_significant=0")
        # query to update the regions returned by select_query
        update_query = ("UPDATE full_region SET is_significant=1 "
                        "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")
    elif comp_type == 'PDB':
        select_query = ("SELECT rfam_acc, pdb_id, chain, pdb_start from pdb_full_region "
                        "WHERE is_significant=0")
        update_query = ("UPDATE pdb_full_region SET is_significant=1 "
                        "WHERE rfam_acc=%s AND pdb_id=%s AND chain=%s AND pdb_start=%s")
    else:
        # fail before a connection is opened instead of hitting an
        # unbound select_query further down
        raise ValueError("clan_comp_type must be 'FULL' or 'PDB', got %r"
                         % (clan_comp_type,))

    cnx = RfamDB.connect()

    try:
        # cursor to fetch data
        d_cursor = cnx.cursor(buffered=True)
        try:
            d_cursor.execute(select_query)
            # in both modes every column is a string key except the last
            # one, which is the integer start coordinate
            seq_regs = [tuple(str(value) for value in row[:-1]) + (int(row[-1]),)
                        for row in d_cursor]
        finally:
            d_cursor.close()

        # get a new cursor for db updates
        u_cursor = cnx.cursor(raw=True)
        try:
            u_cursor.executemany(update_query, seq_regs)
            cnx.commit()
        except Exception:
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
        finally:
            u_cursor.close()
    finally:
        # disconnect exactly once, whatever happened above
        RfamDB.disconnect(cnx)
def set_number_of_distinct_families_in_genome(upid):
    """
    Sets num_families in the genome table to the number of distinct
    families with hits in full_region for a specific genome

    upid: A specific genome upid to update the number of distinct families.
    If None, all upids returned by fetch_all_upids are updated

    return: void
    """

    # NOTE(review): `version` is not defined in this function; presumably a
    # module level release version - confirm. It is bound as a string so
    # the comparison matches the previously quoted literal
    select_query = ("select count(distinct rfam_acc) from full_region fr, genseq gs\n"
                    "where fr.rfamseq_acc=gs.rfamseq_acc\n"
                    "and gs.upid=%s\n"
                    "and gs.version=%s")

    update_query = "update genome set num_families=%s where upid=%s"

    # a single genome is handled as a list of one
    if upid is None:
        upids = fetch_all_upids()
    else:
        upids = [upid]

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        for genome_upid in upids:
            cursor.execute(select_query, (genome_upid, str(version)))
            count = cursor.fetchone()[0]

            cursor.execute(update_query, (count, genome_upid))

        # commit all updates together
        cnx.commit()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def update_family_ncbi():
    """
    Updates table family_ncbi by inserting all distinct taxonomic ids
    (ncbi_id) of the significant full_region hits of every family. Inserts
    are committed per family. If an insert fails, the transaction is rolled
    back and the program exits with an error message.

    :return: void
    """

    # family_ncbi query, family accession is bound per execution
    get_ncbi_ids = ("select distinct rs.ncbi_id, f.rfam_id, "
                    "f.rfam_acc from full_region fr, rfamseq rs, family f "
                    "where fr.rfamseq_acc=rs.rfamseq_acc "
                    "and f.rfam_acc=fr.rfam_acc "
                    "and fr.rfam_acc=%s "
                    "and fr.is_significant=1")

    insert_query = "insert into family_ncbi (ncbi_id, rfam_id, rfam_acc) values (%s,%s,%s)"

    cnx = RfamDB.connect()
    select_cursor = cnx.cursor(buffered=True)
    insert_cursor = cnx.cursor(buffered=True)

    try:
        select_cursor.execute("Select rfam_acc from family")
        rfam_accs = select_cursor.fetchall()

        for rfam_acc in rfam_accs:
            select_cursor.execute(get_ncbi_ids, (rfam_acc[0],))
            entries_reformatted = [(str(x[0]), str(x[1]), str(x[2]))
                                   for x in select_cursor.fetchall()]

            try:
                insert_cursor.executemany(insert_query, entries_reformatted)
                cnx.commit()
            except Exception:
                cnx.rollback()
                # SystemExit still passes through the finally clause below
                sys.exit("\nError updating family_ncbi table for family %s." % rfam_acc[0])
    finally:
        insert_cursor.close()
        select_cursor.close()
        RfamDB.disconnect(cnx)

    print("Done updating family_ncbi.")
def set_num_full_sig_seqs():
    """
    Updates num_full in family table to hold the number of significant
    sequences (full_region rows with is_significant=1 and type='full')
    rather than the number of sequences in the full alignment. If the
    update fails the transaction is rolled back and a message is printed.
    """

    # query to count all significant sequences of a family
    count_query = ("select count(*)\n"
                   "from full_region f\n"
                   "where is_significant=1\n"
                   "and type='full'\n"
                   "and rfam_acc=%s")

    update_query = "update family set num_full=%s where rfam_acc=%s"

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("Select rfam_acc from family")
        rfam_accs = [str(acc[0]) for acc in cursor.fetchall()]

        # (count, rfam_acc) pairs in the parameter order of update_query
        counts = []
        for rfam_acc in rfam_accs:
            cursor.execute(count_query, (rfam_acc,))
            counts.append((cursor.fetchall()[0][0], rfam_acc))

        try:
            cursor.executemany(update_query, counts)
            cnx.commit()
        except Exception:
            # report the failure instead of rolling back silently
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)

    print("Done")
def print_report(no_fams):
    """
    Opens one DB connection and runs the report checks with it:
    check_ss_images, check_sunburst, count_rchie_diagrams,
    check_alignment_and_tree and check_html_alignment

    no_fams: Passed unchanged to every check except check_sunburst.
    NOTE(review): presumably the number of families - confirm against the
    check functions
    """

    cnx = RfamDB.connect()

    try:
        check_ss_images(cnx, no_fams)
        check_sunburst(cnx)
        count_rchie_diagrams(cnx, no_fams)
        check_alignment_and_tree(cnx, no_fams)
        check_html_alignment(cnx, no_fams)
    finally:
        # release the connection even if one of the checks raises
        RfamDB.disconnect(cnx)
def fasta_gen_handler(seq_file, out_dir, rfam_accessions=None):
    """
    Handles the fasta generation process: generates an individual shell
    script for each family and submits it to the cluster with bsub

    seq_file: Path to the input sequence file (e.g. rfamseq11.fa)
    out_dir: The output directory where the fasta files will be generated.
    The sub directories "scripts" and "log" are created in it if missing
    rfam_accessions: Path to a file listing one family accession per line.
    If None, all accessions in the family table are used
    """

    # fetch family accessions
    if rfam_accessions is None:
        cnx = RfamDB.connect()
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute("SELECT rfam_acc FROM family")
            entries = cursor.fetchall()
        finally:
            cursor.close()
            RfamDB.disconnect(cnx)

        families = [str(fam[0]) for fam in entries]
    else:
        with open(rfam_accessions, 'r') as fp:
            stripped = [line.strip() for line in fp]
        # blank lines would otherwise be submitted as empty accessions
        families = [acc for acc in stripped if acc]

    # create scripts and log dirs within output directory
    scripts_dir = os.path.join(out_dir, "scripts")
    for sub_dir in (scripts_dir, os.path.join(out_dir, "log")):
        if not os.path.exists(sub_dir):
            os.mkdir(sub_dir)

    for fam in families:
        # 1. Generate script file
        sh_path = shell_script_generator(seq_file, fam, out_dir, scripts_dir)

        # 2. submit job. Equivalent to "bsub < sh_path", but the script is
        # fed through stdin directly so the path is never parsed by a shell
        with open(sh_path, 'r') as script_fp:
            subprocess.call(["bsub"], stdin=script_fp)
def load_clan_seqs_from_db(clan_acc):  # tested
    """
    Loads the full_region sequences of all families of a clan and returns
    a dictionary structure as {rfam_acc: {rfamseq_acc: [(start, end,
    evalue), ...]}} for clan competition. A sequence accession maps to a
    list of regions in order to accommodate sequence duplicates

    clan_acc: Clan accession as in Rfam
    """

    # Fetch clan specific family full_region data. The clan accession is
    # passed as a bound parameter
    query = ("SELECT full_region.rfam_acc, full_region.rfamseq_acc, "
             "full_region.seq_start, full_region.seq_end, full_region.evalue_score\n"
             "FROM full_region\n"
             "JOIN (SELECT rfam_acc FROM clan_membership WHERE clan_acc=%s) as CLAN_FAMS\n"
             "ON CLAN_FAMS.rfam_acc=full_region.rfam_acc")

    fam_seqs = {}

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query, (clan_acc,))

        # build family dictionary of sequences
        for row in cursor:
            rfam_acc = str(row[RFAM_ACC])
            seq_acc = str(row[SEQ_ACC])
            region = (int(row[START]), int(row[END]), float(row[EVAL]))

            # creates the family and sequence levels on first sight
            fam_seqs.setdefault(rfam_acc, {}).setdefault(seq_acc, []).append(region)
    finally:
        # close cursor and DB connection
        cursor.close()
        RfamDB.disconnect(cnx)

    return fam_seqs
def set_number_of_species():
    """
    Updates number_of_species in family table with the number of distinct
    ncbi_ids among the significant full_region hits of each family. If the
    update fails the transaction is rolled back and a message is printed.
    """

    # family accession is bound per execution
    count_query = ("select count(distinct ncbi_id)\n"
                   "from full_region f, rfamseq r\n"
                   "where r.rfamseq_acc=f.rfamseq_acc\n"
                   "and is_significant=1 and rfam_acc=%s")

    # query to update number_of_species in the family table
    update_query = "update family set number_of_species=%s where rfam_acc=%s"

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        cursor.execute("Select rfam_acc from family")
        rfam_accs = [str(acc[0]) for acc in cursor.fetchall()]

        # (count, rfam_acc) pairs in the parameter order of update_query
        counts = []
        for rfam_acc in rfam_accs:
            cursor.execute(count_query, (rfam_acc,))
            counts.append((cursor.fetchall()[0][0], rfam_acc))

        try:
            cursor.executemany(update_query, counts)
            cnx.commit()
        except Exception:
            # report the failure instead of rolling back silently
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)

    print("Done")
def update_chromosome_info_in_genseq(version=14.0):
    """
    Updates chromosome_type and chromosome_name in genseq for every genome
    that has an assembly accession. Genomes whose assembly accession is
    empty or starts with 'GCF' are skipped. Chromosome data come from
    fgm.fetch_gca_data, from the "chromosomes" entry under "fields".

    version: The genseq version whose rows are updated. Defaults to 14.0,
    the value that used to be hard coded in the query

    return: void
    """

    genome_query = "select upid, assembly_acc from genome where assembly_acc is not NULL"

    # values are bound by the driver instead of being formatted into the SQL
    update_query = ("update genseq set chromosome_type=%s, chromosome_name=%s "
                    "where upid=%s and rfamseq_acc=%s and version=%s")

    cnx = RfamDB.connect()

    try:
        cursor = cnx.cursor(buffered=True, dictionary=True)
        try:
            cursor.execute(genome_query)
            accessions = cursor.fetchall()
        finally:
            cursor.close()

        upid_gca_dict = {}
        for pair in accessions:
            upid_gca_dict[pair["upid"]] = pair["assembly_acc"]

        cursor = cnx.cursor(buffered=True)
        try:
            for upid, assembly_acc in upid_gca_dict.items():
                if assembly_acc[0:3] == 'GCF' or assembly_acc == '':
                    continue

                # NOTE(review): the meaning of the 'kingdom' argument is
                # not visible here - confirm against fgm.fetch_gca_data
                data = fgm.fetch_gca_data(upid, assembly_acc, 'kingdom')

                if "fields" not in data:
                    continue

                fields = data["fields"]
                if "chromosomes" not in fields:
                    continue

                for chromosome in fields["chromosomes"]:
                    cursor.execute(update_query, (str(chromosome["type"]),
                                                  str(chromosome["name"]),
                                                  str(upid),
                                                  str(chromosome["accession"]),
                                                  version))

            cnx.commit()
        finally:
            cursor.close()
    finally:
        RfamDB.disconnect(cnx)
def set_pdb_is_significant_to_zero(non_sig_seqs):
    """
    Sets pdb_full_region is_significant to 0 for non significant regions in
    non_sig_seqs list. On a database error the transaction is rolled back
    and a message is printed; the error is not re-raised.

    non_sig_seqs: A list of the non significant regions to be set to zero.
    Each region is indexed as (rfam_acc, '<pdb_id>_<chain>', pdb_start).
    The list is product of clan competition.

    returns: void
    """

    # reformat list by splitting pdb_id and chain
    pdb_reformatted_regions = []

    for competed_region in non_sig_seqs:
        # split on the first '_', the separator used when pdb_id and chain
        # were concatenated for clan competition
        pdb_id, _, chain = competed_region[1].partition('_')
        pdb_reformatted_regions.append((str(competed_region[0]), str(pdb_id),
                                        str(chain), int(competed_region[2])))

    # query to update is_significant field to 0
    query = ("update pdb_full_region set is_significant=0 "
             "where rfam_acc=%s and pdb_id=%s and chain=%s and pdb_start=%s")

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        # execute query batched
        cursor.executemany(query, pdb_reformatted_regions)
        cnx.commit()
    except Exception:
        print("MySQL Update Error. Rolling back...")
        cnx.rollback()
    finally:
        # handles are released exactly once, on success and on failure
        cursor.close()
        RfamDB.disconnect(cnx)
def update_post_process(jobs_file):
    """
    Updates _post_process table with the job_ids per family assigned by lsf.
    On a database error the transaction is rolled back and a message is
    printed; the error is not re-raised.

    jobs_file: This is a tab separated txt file generated from running the
    job_dequeuer.py script that submits the rfam_view_process for each
    family. (rfam_acc uuid job_id ...). Blank lines are ignored
    """

    query = ("UPDATE _post_process SET lsf_id=%s "
             "WHERE rfam_acc=%s AND uuid=%s")

    # get lsf ids from file
    job_ids = []
    with open(jobs_file, 'r') as jobs_file_fp:
        for line in jobs_file_fp:
            line = line.strip()
            if not line:
                # e.g. a trailing empty line at the end of the file
                continue
            fields = line.split('\t')
            # reorder to (job_id, rfam_acc, uuid) to match the query
            job_ids.append((fields[2], fields[0], fields[1]))

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    # update db
    try:
        cursor.executemany(query, job_ids)
        cnx.commit()
    except Exception:
        # rollback to previous state
        print("MySQL Update Error. Rollback...")
        cnx.rollback()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def fetch_clan_accessions():
    """
    Reads the clan_acc column of every row in the clan table

    returns: A list with all clan accessions as strings
    """

    connection = RfamDB.connect()
    cursor = connection.cursor(buffered=True)

    # fetch clans
    cursor.execute("SELECT clan_acc FROM clan")

    accessions = []
    for record in cursor.fetchall():
        accessions.append(str(record[0]))

    cursor.close()
    RfamDB.disconnect(connection)

    return accessions
def update_assembly_names(upid_gca_file):
    """
    Loads the upid_gca json file, fetches the assembly name of each genome
    through fgm.fetch_gca_data and updates assembly_name in genome table.
    Genomes for which no assembly name is returned are left untouched

    param upid_gca_file: A json file with upid: {"GCA" : GCAxxx, "DOM": domain }

    return: void
    """

    with open(upid_gca_file, 'r') as fp:
        acc_pairs = json.load(fp)

    # a list of (assembly_name, upid) tuples, in the order of the query
    assembly_names = []

    for upid, accs in acc_pairs.items():
        data = fgm.fetch_gca_data(upid, accs["GCA"], accs["DOM"])

        if "fields" in data:
            # get() also covers a response without an assembly_name entry
            assembly_name = data["fields"].get("assembly_name")
            if assembly_name is not None:
                assembly_names.append((assembly_name, upid))

    query = "update genome set assembly_name=%s where upid=%s"

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True, dictionary=True)

    try:
        cursor.executemany(query, assembly_names)
        cnx.commit()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def fetch_all_upids():
    """
    Reads the upid column of every row in the genome table

    return: A list of UP/RG ids as stored in genome
    """

    connection = RfamDB.connect()
    upid_cursor = connection.cursor(buffered=True)

    upid_cursor.execute("select upid from genome")

    upids = []
    for record in upid_cursor.fetchall():
        upids.append(str(record[0]))

    upid_cursor.close()
    RfamDB.disconnect(connection)

    return upids
def fetch_clan_full_region_records(clan_acc):
    """
    Fetches all full_region regions of the families of a clan

    param clan_acc: A valid Rfam clan accession

    returns: A list with all rows of full_region (joined with the rfam_acc
    of the clan membership sub query) for a specific clan
    """

    # clan accession is passed as a bound parameter
    clan_region_query = ("SELECT * FROM full_region\n"
                         "JOIN (SELECT rfam_acc FROM clan_membership WHERE clan_acc=%s) as CLAN_FAMS\n"
                         "ON CLAN_FAMS.rfam_acc=full_region.rfam_acc")

    cnx = RfamDB.connect()
    clan_cursor = cnx.cursor(buffered=True)

    try:
        clan_cursor.execute(clan_region_query, (clan_acc,))
        clan_sequence_regions = clan_cursor.fetchall()
    finally:
        clan_cursor.close()
        RfamDB.disconnect(cnx)

    return clan_sequence_regions
def generate_fasta(seq_file, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source (e.g. rfamseq11.fa). Generates one
    <rfam_acc>.fa.gz file per family for all significant full_region hits.
    Accessions of regions that could not be exported are logged in
    missing_seqs.log

    seq_file: The path to rfamseq input file in fasta format, for
    generating the fasta files
    out_dir: Destination directory where the files will be generated
    """

    # logging sequences not exported
    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(filename=log_file, filemode='w', level=logging.INFO)

    # fetch full_region data and sequence description. Ordering by family
    # lets each output file be written in a single pass
    query = (
        "SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
        "FROM full_region fr, rfamseq rf\n"
        "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
        "AND fr.is_significant=1\n"
        "ORDER BY fr.rfam_acc")

    fp_out = None
    # family of the currently open file. Starts as None so that the first
    # row opens a file instead of reading an unassigned variable
    current_acc = None

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query)

        for region in cursor:
            rfam_acc = str(region[RFAM_ACC])

            # new family
            if rfam_acc != current_acc:
                if fp_out is not None:
                    fp_out.close()

                # open new fasta file
                fp_out = gzip.open(
                    os.path.join(out_dir, rfam_acc + ".fa.gz"), 'w')
                current_acc = rfam_acc

            cmd = "%s -c %s/%s %s %s" % (ESL_PATH, str(region[START]),
                                         str(region[END]), seq_file,
                                         str(region[SEQ_ACC]))

            proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            seq = proc.communicate()[0]

            # drop the header line and join the sequence lines
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (str(region[SEQ_ACC]),
                                                 str(region[START]),
                                                 str(region[END]),
                                                 str(region[DESC])))
                # write sequence
                fp_out.write(sequence + '\n')
            else:
                # log which sequence has not been exported
                logging.info(str(region[SEQ_ACC]))
    finally:
        # close last file, if any row was returned at all
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB
        cursor.close()
        RfamDB.disconnect(cnx)
def generate_fasta_single(seq_file, rfam_acc, out_dir):
    """
    Uses esl-sfetch to generate a family specific fasta file out of
    seq_file which is provided as source. Works on single family based on
    rfam_acc and exports significant full_region hits only. The file is
    generated in a compressed .fa.gz format. Accessions of regions that
    could not be exported are logged in <rfam_acc>.log

    seq_file: This is the the path to rfamseq input file in fasta format,
    for generating the fasta files
    rfam_acc: The rfam_acc of a specific family
    out_dir: This is the destination directory where the files will be
    generated
    """

    # logging sequences not exported
    log_file = os.path.join(out_dir, rfam_acc + ".log")
    logging.basicConfig(filename=log_file, filemode='w', level=logging.INFO)

    # fetch sequence accessions for specific family - significant only!!
    # The family accession is passed as a bound parameter
    query = (
        "SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
        "FROM full_region fr, rfamseq rf\n"
        "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
        "AND fr.is_significant=1\n"
        "AND fr.rfam_acc=%s")

    fp_out = None

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query, (rfam_acc,))

        # open a new fasta output file
        fp_out = gzip.open(os.path.join(out_dir, str(rfam_acc) + ".fa.gz"), 'w')

        for region in cursor:
            cmd = "%s -c %s/%s %s %s" % (ESL_PATH, str(region[START]),
                                         str(region[END]), seq_file,
                                         str(region[SEQ_ACC]))

            proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
            seq = proc.communicate()[0]

            # drop the header line and join the sequence lines
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (str(region[SEQ_ACC]),
                                                 str(region[START]),
                                                 str(region[END]),
                                                 str(region[DESC])))
                # write sequence
                fp_out.write(sequence + '\n')
            else:
                # logging sequences that have not been exported
                logging.info(str(region[SEQ_ACC]))
    finally:
        # the output file is closed even if a fetch fails half way
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB
        cursor.close()
        RfamDB.disconnect(cnx)
def generate_fasta(seq_file, out_dir):
    """
    Uses esl-sfetch to generate family specific fasta files out of seq_file
    which is provided as source (e.g. rfamseq11.fa). Generates one
    <rfam_acc>.fa.gz file per family for all significant full_region hits.
    Accessions of regions that could not be exported are logged in
    missing_seqs.log

    seq_file: The path to rfamseq input file in fasta format, for
    generating the fasta files
    out_dir: Destination directory where the files will be generated
    """

    # logging sequences not exported
    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    # fetch full_region data and sequence description. Ordering by family
    # lets each output file be written in a single pass
    query = ("SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
             "FROM full_region fr, rfamseq rf\n"
             "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
             "AND fr.is_significant=1\n"
             "ORDER BY fr.rfam_acc")

    fp_out = None
    # family of the currently open file. Starts as None so that the first
    # row opens a file instead of reading an unassigned variable
    current_acc = None

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query)

        for region in cursor:
            rfam_acc = str(region[RFAM_ACC])

            # new family
            if rfam_acc != current_acc:
                if fp_out is not None:
                    fp_out.close()

                # open new fasta file
                fp_out = gzip.open(
                    os.path.join(out_dir, rfam_acc + ".fa.gz"), 'w')
                current_acc = rfam_acc

            cmd = "%s -c %s/%s %s %s" % (ESL_PATH,
                                         str(region[START]), str(region[END]),
                                         seq_file, str(region[SEQ_ACC]))

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)
            seq = proc.communicate()[0]

            # drop the header line and join the sequence lines
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (str(region[SEQ_ACC]),
                                                 str(region[START]),
                                                 str(region[END]),
                                                 str(region[DESC])))
                # write sequence
                fp_out.write(sequence + '\n')
            else:
                # log which sequence has not been exported
                logging.info(str(region[SEQ_ACC]))
    finally:
        # close last file, if any row was returned at all
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB
        cursor.close()
        RfamDB.disconnect(cnx)
def generate_fasta_single(seq_file, rfam_acc, out_dir):
    """
    Uses esl-sfetch to generate a family specific fasta file out of
    seq_file which is provided as source. Works on single family based on
    rfam_acc and exports significant full_region hits only. The file is
    generated in a compressed .fa.gz format. Accessions of regions that
    could not be exported are logged in <rfam_acc>.log

    seq_file: This is the the path to rfamseq input file in fasta format,
    for generating the fasta files
    rfam_acc: The rfam_acc of a specific family
    out_dir: This is the destination directory where the files will be
    generated
    """

    # logging sequences not exported
    log_file = os.path.join(out_dir, rfam_acc + ".log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    # fetch sequence accessions for specific family - significant only!!
    # The family accession is passed as a bound parameter
    query = ("SELECT fr.rfam_acc, fr.rfamseq_acc, fr.seq_start, fr.seq_end, rf.description\n"
             "FROM full_region fr, rfamseq rf\n"
             "WHERE fr.rfamseq_acc=rf.rfamseq_acc\n"
             "AND fr.is_significant=1\n"
             "AND fr.rfam_acc=%s")

    fp_out = None

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query, (rfam_acc,))

        # open a new fasta output file
        fp_out = gzip.open(
            os.path.join(out_dir, str(rfam_acc) + ".fa.gz"), 'w')

        for region in cursor:
            cmd = "%s -c %s/%s %s %s" % (ESL_PATH,
                                         str(region[START]), str(region[END]),
                                         seq_file, str(region[SEQ_ACC]))

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)
            seq = proc.communicate()[0]

            # drop the header line and join the sequence lines
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (str(region[SEQ_ACC]),
                                                 str(region[START]),
                                                 str(region[END]),
                                                 str(region[DESC])))
                # write sequence
                fp_out.write(sequence + '\n')
            else:
                # logging sequences that have not been exported
                logging.info(str(region[SEQ_ACC]))
    finally:
        # the output file is closed even if a fetch fails half way
        if fp_out is not None:
            fp_out.close()

        # disconnect from DB
        cursor.close()
        RfamDB.disconnect(cnx)
def set_number_of_genomic_significant_hits(upid):
    """
    Sets num_rfam_regions in the genome table to the number of significant
    full_region hits of a specific genome

    upid: A specific genome upid to update the number of significant hits.
    If None, all upids returned by fetch_all_upids are updated

    return: void
    """

    # NOTE(review): `version` is not defined in this function; presumably a
    # module level release version - confirm. It is bound as a string so
    # the comparison matches the previously quoted literal
    count_query = ("select count(fr.rfamseq_acc)\n"
                   "from full_region fr, genseq gs\n"
                   "where fr.rfamseq_acc=gs.rfamseq_acc\n"
                   "and fr.is_significant=1\n"
                   "and gs.upid=%s\n"
                   "and gs.version=%s")

    update_query = "update genome set num_rfam_regions=%s where upid=%s"

    # a single genome is handled as a list of one
    if upid is None:
        upids = fetch_all_upids()
    else:
        upids = [upid]

    cnx = RfamDB.connect()
    cursor = cnx.cursor(buffered=True)

    try:
        for genome_upid in upids:
            cursor.execute(count_query, (genome_upid, str(version)))
            count = cursor.fetchone()[0]

            cursor.execute(update_query, (count, genome_upid))

        # commit all updates together
        cnx.commit()
    finally:
        cursor.close()
        RfamDB.disconnect(cnx)
def export_sequences(seq_db, sql, filename=None, out_dir=None):
    """
    Exports sequences by running a query on the database and fetching the
    corresponding regions from seq_db with esl-sfetch. The result is
    written to a gzipped fasta file. Accessions of regions that could not
    be exported are logged in missing_seqs.log within out_dir

    seq_db: A fasta sequence database to extract sequence regions from
    (e.g. rfamseq11.fa)
    sql: The query to execute (string or path to a valid .sql file). The
    rows are indexed with the same column constants used for the
    full_region fasta queries (SEQ_ACC, START, END, DESC)
    filename: Ouput filename without the .fa.gz extension. If None,
    OUT_FILE_NAME is used
    out_dir: A path to the output directory. If None, the current working
    directory is used
    """

    # os.path.join cannot handle the None default
    if out_dir is None:
        out_dir = os.getcwd()

    log_file = os.path.join(out_dir, "missing_seqs.log")
    logging.basicConfig(
        filename=log_file, filemode='w', level=logging.INFO)

    if os.path.isfile(sql):
        with open(sql, 'r') as fp:
            query = ' '.join(fp.readlines())
    else:
        query = sql

    if filename is not None:
        out_path = os.path.join(out_dir, filename + ".fa.gz")
    else:
        out_path = os.path.join(out_dir, OUT_FILE_NAME)

    fp_out = None

    cnx = RfamDB.connect()
    cursor = cnx.cursor(raw=True)

    try:
        cursor.execute(query)

        # open an output file
        fp_out = gzip.open(out_path, 'w')

        for region in cursor:
            # regions are fetched from the seq_db parameter
            cmd = "%s -c %s/%s %s %s" % (ESL_PATH,
                                         str(region[START]), str(region[END]),
                                         seq_db, str(region[SEQ_ACC]))

            proc = subprocess.Popen(
                cmd, shell=True, stdout=subprocess.PIPE)
            seq = proc.communicate()[0]

            # drop the header line and join the sequence lines
            sequence = ''.join(seq.split('\n')[1:])

            if sequence != '' and seq_validator(sequence) is True:
                # write header
                fp_out.write(">%s/%s-%s %s\n" % (str(region[SEQ_ACC]),
                                                 str(region[START]),
                                                 str(region[END]),
                                                 str(region[DESC])))
                # write sequence
                fp_out.write(sequence + '\n')
            else:
                # log which sequence has not been exported
                logging.info(str(region[SEQ_ACC]))
    finally:
        if fp_out is not None:
            fp_out.close()

        cursor.close()
        RfamDB.disconnect(cnx)
def reset_is_significant(clan_comp_type='FULL'):
    """
    This function resets the is_significant field back to 1 for every
    region currently set to 0. It is used for clan competition
    initialization and restoration.

    clan_comp_type: Selects the table to reset (case insensitive).
    'FULL' resets full_region, 'PDB' resets pdb_full_region

    return: void

    raises: ValueError if clan_comp_type is neither 'FULL' nor 'PDB'.
    Errors during the update itself are reported on stdout and rolled back
    """

    comp_type = clan_comp_type.upper()

    if comp_type == 'FULL':
        # query to fetch all non significant sequences
        select_query = (
            "SELECT rfam_acc, rfamseq_acc, seq_start FROM full_region "
            "WHERE is_significant=0")

        # query to update the regions fetched by select_query
        update_query = (
            "UPDATE full_region SET is_significant=1 "
            "WHERE rfam_acc=%s AND rfamseq_acc=%s AND seq_start=%s")

    elif comp_type == 'PDB':
        select_query = (
            "SELECT rfam_acc, pdb_id, chain, pdb_start from pdb_full_region "
            "WHERE is_significant=0")

        update_query = (
            "UPDATE pdb_full_region SET is_significant=1 "
            "WHERE rfam_acc=%s AND pdb_id=%s AND chain=%s AND pdb_start=%s")

    else:
        # fail before a connection is opened; previously this case opened a
        # connection and then died on an undefined query variable
        raise ValueError(
            "Unsupported clan competition type: %s" % clan_comp_type)

    cnx = RfamDB.connect()

    try:
        # cursor to fetch data
        d_cursor = cnx.cursor(buffered=True)

        try:
            d_cursor.execute(select_query)

            # construct region list here, the last column is the start
            # coordinate and is converted to int
            if comp_type == 'FULL':
                seq_regs = [(str(row[0]), str(row[1]), int(row[2]))
                            for row in d_cursor]
            else:
                seq_regs = [
                    (str(row[0]), str(row[1]), str(row[2]), int(row[3]))
                    for row in d_cursor]

        finally:
            d_cursor.close()

        # get a new cursor for db updates
        u_cursor = cnx.cursor(raw=True)

        # update db
        try:
            u_cursor.executemany(update_query, seq_regs)
            cnx.commit()

        except Exception:
            # best effort: report and undo the partial update
            print("MySQL Update Error. Rolling back...")
            cnx.rollback()

        finally:
            u_cursor.close()

    finally:
        # single cleanup point, the connection is released exactly once
        RfamDB.disconnect(cnx)
def export_rfam_family_files(f_types, out_dir):
    """
    Fetches all Rfam family accessions from rfam_live, checks out each
    family and copies the files in f_types in their corresponding
    directories (out_dir/<f_type>/<rfam_acc>.<f_type in lower case>). The
    checked out family directory is deleted after its files are copied

    f_types: A list of file type keywords we need to export
    (e.g. ["SEED", "CM"])
    out_dir: The path to the output directory. If it does not exist it will
    be created

    return: void
    """

    # NOTE(review): a second definition of export_rfam_family_files follows
    # later in this module and replaces this one at import time; one of the
    # two should be removed

    # work on an absolute path, a relative out_dir would point to the wrong
    # location once the working directory changes below
    out_dir = os.path.abspath(out_dir)

    # Create the output directory if it does not exist
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # if current working directory isn't out_dir, change directory
    # (substring check kept from the original implementation)
    if out_dir not in os.getcwd():
        os.chdir(out_dir)

    # generate specific output directories for each file type
    for f_type in f_types:
        type_dir = os.path.join(out_dir, f_type)

        if not os.path.exists(type_dir):
            os.mkdir(type_dir)

    # get DB connection handle
    cnx = RfamDB.connect()

    try:
        # get mysql cursor
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute("SELECT rfam_acc FROM family")

            # fetch files for all Rfam family accessions
            for row in cursor:
                rfam_acc = str(row[0])

                # SVN_CHECKOUT is a command template defined elsewhere;
                # the family is expected to appear in out_dir/<rfam_acc>
                subprocess.call(SVN_CHECKOUT % rfam_acc, shell=True)

                # path to the checked out family directory
                fam_dir = os.path.join(out_dir, rfam_acc)

                # copy files and rename
                for f_type in f_types:
                    filename = rfam_acc + '.' + f_type.lower()

                    shutil.copyfile(
                        os.path.join(fam_dir, f_type),
                        os.path.join(out_dir, f_type, filename))

                # delete family dir
                shutil.rmtree(fam_dir)

        finally:
            cursor.close()

    finally:
        # close DB connection, even if a checkout or copy failed
        RfamDB.disconnect(cnx)
def export_rfam_family_files(f_types, out_dir):
    """
    Fetches all Rfam family accessions from rfam_live, checks out each
    family and copies the files in f_types in their corresponding
    directories (out_dir/<f_type>/<rfam_acc>.<f_type in lower case>). The
    checked out family directory is deleted after its files are copied

    f_types: A list of file type keywords we need to export
    (e.g. ["SEED", "CM"])
    out_dir: The path to the output directory. If it does not exist it will
    be created

    return: void
    """

    # NOTE(review): an earlier definition of export_rfam_family_files
    # exists in this module; this later one is the definition bound at
    # import time, the earlier one should be removed

    # work on an absolute path, a relative out_dir would point to the wrong
    # location once the working directory changes below
    out_dir = os.path.abspath(out_dir)

    # Create the output directory if it does not exist
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)

    # if current working directory isn't out_dir, change directory
    # (substring check kept from the original implementation)
    if out_dir not in os.getcwd():
        os.chdir(out_dir)

    # generate specific output directories for each file type
    for f_type in f_types:
        type_dir = os.path.join(out_dir, f_type)

        if not os.path.exists(type_dir):
            os.mkdir(type_dir)

    # get DB connection handle
    cnx = RfamDB.connect()

    try:
        # get mysql cursor
        cursor = cnx.cursor(buffered=True)

        try:
            cursor.execute("SELECT rfam_acc FROM family")

            # fetch files for all Rfam family accessions
            for row in cursor:
                rfam_acc = str(row[0])

                # SVN_CHECKOUT is a command template defined elsewhere;
                # the family is expected to appear in out_dir/<rfam_acc>
                subprocess.call(SVN_CHECKOUT % rfam_acc, shell=True)

                # path to the checked out family directory
                fam_dir = os.path.join(out_dir, rfam_acc)

                # copy files and rename
                for f_type in f_types:
                    filename = rfam_acc + '.' + f_type.lower()

                    shutil.copyfile(
                        os.path.join(fam_dir, f_type),
                        os.path.join(out_dir, f_type, filename))

                # delete family dir
                shutil.rmtree(fam_dir)

        finally:
            cursor.close()

    finally:
        # close DB connection, even if a checkout or copy failed
        RfamDB.disconnect(cnx)