import psycopg2
import psycopg2.extras

from pfacts003.utils.credentials import get_credentials


def main():
    # Get the password for webuser
    webuser_password = get_credentials('webuser')
    # Connect to the database as the webuser
    connection = psycopg2.connect(
        "dbname='%s' user='%s' host='db' password='%s'"
        % ('pfacts003_test', 'webuser', webuser_password))
    # Make a cursor, through which the database will give us results
    cur = connection.cursor(cursor_factory=psycopg2.extras.DictCursor)
    # Construct an SQL query string
    sql = """
        SELECT tree.family_id, tree_node_name.name
        FROM family, tree, tree_node, tree_node_name
        WHERE tree.id = family.canonical_tree_id
          AND tree_node.tree_id = tree.id
          AND tree_node.left_id = 1
          AND tree_node_name.tree_node_id = tree_node.id
          AND tree_node_name.name LIKE '%Cellulase%';
    """
    # Execute the query on the database server
    cur.execute(sql)
    # Loop through the results
    for row in cur:
        # Format and print each result.
        # Newer versions of psycopg2 allow the columns to be accessed by
        # name, but these are not currently installed.
        family_accession = 'bpg%07d' % int(row[0])
        description = '"%s"' % row[1]
        print '%s: %s' % (family_accession, description)
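# With an upgraded psycopg2, the DictCursor rows above also support access
# by column name. A sketch of the same loop under that assumption (the keys
# follow the SELECT column names):
#
#     for row in cur:
#         family_accession = 'bpg%07d' % int(row['family_id'])
#         description = '"%s"' % row['name']
#         print '%s: %s' % (family_accession, description)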
def connect_to_server(DB_NAME, USER):
    """Connects to postgres database and returns the connection."""
    PWD = get_credentials(USER)
    conn = psycopg2.connect("dbname='%s' user='%s' host='db1' password='%s'"
                            % (DB_NAME, USER, PWD))
    return conn
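# A minimal usage sketch for connect_to_server. The database name and role
# are assumptions, reusing the 'pfacts003_test' database and 'webuser' role
# that appear elsewhere in these scripts:
#
#     conn = connect_to_server('pfacts003_test', 'webuser')
#     cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
#     cur.execute('SELECT 1')
#     print cur.fetchone()
#     conn.close()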
import os
import cPickle

import psycopg2
import psycopg2.extras

from pfacts003.utils.credentials import get_credentials


def main():
    dir_name = '/clusterfs/ohana/external/genomes/QuestForOrthologs/Release5'
    os.chdir(dir_name)
    # Map each UniProt accession to the info we have for it (initially just
    # the taxon id from the accessions file)
    info_of_uniprot_accession = {}
    f = open("all_uniprot_accessions.txt")
    for line in f:
        taxon_id, accession = line.strip().split(',')
        info_of_uniprot_accession[accession] = {}
        info_of_uniprot_accession[accession]['taxon'] = taxon_id
    f.close()
    uniprot_accessions_of_uniprot_id = {}
    bpg_password = get_credentials('bpg_user')
    connection = psycopg2.connect(
        "dbname='%s' user='%s' host='db' password='%s'"
        % ('pfacts003_test', 'bpg_user', bpg_password))
    # Use a named (server-side) cursor so the whole table is streamed
    # rather than pulled into memory at once
    cur = connection.cursor('server_side_cursor',
                            cursor_factory=psycopg2.extras.DictCursor)
    sql = 'SELECT * FROM uniprot_dat_index'
    cur.execute(sql)
    for row in cur:
        if row[1] in info_of_uniprot_accession:
            accession = row[1]
            uniprot_id = int(row[2])
            info_of_uniprot_accession[accession]['uniprot_id'] = uniprot_id
            if uniprot_id not in uniprot_accessions_of_uniprot_id:
                uniprot_accessions_of_uniprot_id[uniprot_id] = set()
            uniprot_accessions_of_uniprot_id[uniprot_id].add(accession)
    f = open("info_of_uniprot_accession.pkl", "w")
    cPickle.dump(info_of_uniprot_accession, f)
    f.close()
    f = open("uniprot_accessions_of_uniprot_id.pkl", "w")
    cPickle.dump(uniprot_accessions_of_uniprot_id, f)
    f.close()
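# A downstream script would reload the two lookup tables with cPickle; a
# minimal sketch (the accession 'P12345' is hypothetical):
#
#     import cPickle
#
#     f = open("info_of_uniprot_accession.pkl")
#     info_of_uniprot_accession = cPickle.load(f)
#     f.close()
#
#     info = info_of_uniprot_accession['P12345']
#     print info['taxon'], info.get('uniprot_id')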
def _handle_conflict(self, gene_info, taxon, gene_id, status, gn_accession,
                     start_position, end_position, orientation):
    """Handle input file conflicts

    It is possible for the input file to have duplicate start and end
    position records. One such example is:

        taxid  GeneID     status    start   end    orientation
        9606   100500719  -         89453   90011  -
        9606   100500719  INFERRED  100     658    +

    The above is a pseudo-gene and one we will not import.

    Another example is:

        taxid  GeneID   status    start     end       orientation
        3702   5007813  REVIEWED  21445902  21447340  -
        3702   5007813  REVIEWED  21445902  21447340  -

    Although the above appears to be a duplicate record, there are fields
    that we are not importing. Showing the other field entries in the same
    row order shows how the entries are different:

        taxid  GeneID   RNA_nucl_accession  RNA_nucl_gi  protein_access.
        3702   5007813  NM_001084269.2      186491663    NP_001077738.1
        3702   5007813  NM_001124039.1      186491666    NP_001117511.1
    """
    # Determine if records are identical:
    if self.gene_id == gene_id and\
       self.status == status and\
       self.start_position == start_position and\
       self.end_position == end_position and\
       self.orientation == orientation:
        # For our concerns, these records are identical, so the conflict
        # doesn't matter
        return

    conn = psycopg2.connect('dbname=%s host=db user=%s password=%s' %
                            (DB_NAME, DB_USER, get_credentials(DB_USER)))
    cur = conn.cursor()
    cur.execute("select uniprot_id from uniprot_gene_id where geneid=%s",
                (gene_id,))
    uniprot = cur.fetchone()

    # If we don't have the gene record, we don't care about this record
    # anyhow
    if uniprot is None:
        return

    # We do have the gene record, so we need to take further steps
    if self.gn_accession.startswith('NC_') and \
            gn_accession.startswith('NW_'):
        # NC trumps NW
        return
    if self.gn_accession.startswith('NW_') and \
            gn_accession.startswith('NC_'):
        # update with new and return
        self._overwrite(gene_info, taxon, gene_id, status, gn_accession,
                        start_position, end_position, orientation)
        return

    print "Warning: Conflict in taxon %s, gene %s. " \
          "New start position %d conflicts with existing start position %d" \
          % (self.taxon, self.gene_id, start_position, self.start_position)
    PHOG_COLLECTION_SQL
    PHOG_DATA_SQL
    FAMILY_TAXA_SQL
    SEQUENCE_COUNT_SQL
    PFAM_ACCESSION_FROM_NAME
    FAMILY_PFAM_DOMAIN_NAME
'''

########## Connection to server

import psycopg2
import psycopg2.extras

from pfacts003.utils.credentials import get_credentials

# Database connection globals
DB_NAME = 'pfacts003_test'
USER = '******'
PWD = get_credentials(USER)


def connect_to_server():
    """Connects to postgres database and returns the cursor."""
    conn = psycopg2.connect("dbname='%s' user='%s' host='db' password='%s'"
                            % (DB_NAME, USER, PWD))
    cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor)
    return cur


def sql_results(cur, sql_query, parameter_tuple):
    """Run the sql query and return results as a list of dictionaries."""
    # Execute with psycopg2's parameter substitution; the DictCursor makes
    # each fetched row a dictionary-like DictRow
    cur.execute(sql_query, parameter_tuple)
    return cur.fetchall()
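# A minimal usage sketch for sql_results: because the query text and the
# parameter tuple are passed separately, psycopg2 handles the quoting
# itself. The query and the id value below are illustrative, not from this
# module:
#
#     cur = connect_to_server()
#     rows = sql_results(
#         cur,
#         "SELECT name FROM tree_node_name WHERE tree_node_id = %s",
#         (42,))
#     for row in rows:
#         print row['name']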
import os
import re
import tempfile
from optparse import OptionParser
from random import randint

import pgdb
from Bio import SeqIO

from pfacts003.utils.credentials import get_credentials

# match_species and sql_escape are helpers defined elsewhere in this module


def main():
    # parse command line options
    usage = "%prog [options] fasta_file_to_cluster"
    opt_parser = OptionParser(usage=usage)
    opt_parser.add_option(
        "-s", "--candidate_seed_species", dest="species_str", default="",
        help="Comma-separated mnemonics of species which may be seeds")
    (options, args) = opt_parser.parse_args()
    if len(args) != 1:
        opt_parser.error('Incorrect number of arguments')
    fasta_file = args[0]
    if not os.path.exists(fasta_file):
        opt_parser.error('fasta file %s not found' % args[0])
    species_list = [species for species
                    in options.species_str.upper().split(',')
                    if species != '']
    if len(species_list) > 0:
        species_re = re.compile('|'.join(species_list))
        is_desired_species = (lambda record:
                              match_species(species_re, record))
    else:
        is_desired_species = (lambda record: True)
    flock_password = get_credentials('flock_user')
    connection = pgdb.connect("db:flock_seeds:flock_user:%s"
                              % flock_password)
    # connection = pgdb.connect("db:flock_seeds::")
    cursor = connection.cursor()
    # Keep generating temporary-file-style names until CREATE TABLE
    # succeeds, i.e. until the name does not collide with an existing table
    need_random_name = True
    while need_random_name:
        handle, random_path = tempfile.mkstemp()
        random_name = os.path.split(random_path)[1]
        os.close(handle)
        os.unlink(random_path)
        sql = """CREATE TABLE %s (
            sequence_key VARCHAR(255) PRIMARY KEY,
            length INTEGER,
            is_seed BOOLEAN,
            has_been_clustered BOOLEAN,
            random_integer INTEGER
        )""" % random_name
        try:
            cursor.execute(sql)
            connection.commit()
            need_random_name = False
        except pgdb.DatabaseError:
            pass
    print random_name
    sql = """CREATE INDEX %s_is_seed ON %s(is_seed)""" \
        % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    sql = """CREATE INDEX %s_has_been_clustered ON %s(has_been_clustered)""" \
        % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    sql = """CREATE INDEX %s_random_integer ON %s(random_integer)""" \
        % (random_name, random_name)
    cursor.execute(sql)
    connection.commit()
    num_inserted_records = 0
    num_records = 0
    f = open(fasta_file, "rU")
    seq_iterator = SeqIO.parse(f, "fasta")
    for record in seq_iterator:
        num_records += 1
        if is_desired_species(record):
            sql = "SELECT sequence_key FROM %s WHERE sequence_key = '%s'" \
                % (random_name, sql_escape(record.id))
            cursor.execute(sql)
            connection.commit()
            if cursor.rowcount > 0:
                # There is a duplicate entry, which we do not expect, so
                # report it
                print "Duplicate entry for sequence key %s" \
                    % sql_escape(record.id)
            else:
                sql = """INSERT INTO %s (
                    sequence_key, length, is_seed,
                    has_been_clustered, random_integer
                ) VALUES ( '%s', %d, false, false, %d )""" \
                    % (random_name, sql_escape(record.id),
                       len(record.seq), randint(1, 10000000))
                cursor.execute(sql)
                connection.commit()
                num_inserted_records += 1
    connection.commit()
    cursor.close()
    f.close()
    connection.close()
    print "Inserted %d of %d records" % (num_inserted_records, num_records)
from optparse import OptionParser
import subprocess
import sys

try:
    from pfacts003.utils.credentials import get_credentials
except ImportError:
    print """
I couldn't import credentials. Are you sure you set up the environment?
Your choices are production, staging and development.
"""
    sys.exit(1)

USER = '******'
password = get_credentials(USER)
if not password:
    print "Could not get password."
    sys.exit(1)


def create_submission(working_dir, dirname='schema_spy_output'):
    contents = """#!/bin/bash
#PBS -e %(working_dir)s/schema_spy_error.log
#PBS -o %(working_dir)s/schema_spy_output.log
#PBS -N schema_spy

# WARNING!
# The bpg password had been retrieved and is included below. This
import numpy
import psycopg2
import psycopg2.extras
from mpi4py import MPI

from pfacts003.utils.credentials import get_credentials

# TAG_DATABASE_ROW and TAG_DATABASE_DONE are MPI message tags defined
# elsewhere in this module


def read_from_database(comm, num_tree_servers, num_uniprot_processors,
                       all_uniprot_ids):
    base_tree_server_id = 1
    base_uniprot_processor_id = 1 + num_tree_servers
    tree_row_info = numpy.zeros(6, dtype='d')
    uniprot_row_info = numpy.zeros(3, dtype='i')
    bpg_password = get_credentials('bpg_user')
    connection = psycopg2.connect(
        "dbname='%s' user='%s' host='db' password='%s'"
        % ('pfacts003_test', 'bpg_user', bpg_password))
    # Named (server-side) cursor, so rows stream instead of being fetched
    # all at once
    cur = connection.cursor('ortholog_cursor',
                            cursor_factory=psycopg2.extras.DictCursor)
    db_row_fetch_time = 0.0
    tree_row_send_time = 0.0
    uniprot_row_send_time = 0.0
    row_prep_time = 0.0
    t1 = MPI.Wtime()
    sql = """SELECT tree_id, tree_node_left_id, tree_node_right_id,
                    duplication_distance,
                    greatest_duplication_distance_of_maximal_descendant,
                    uniprot_id
             FROM tree_node_uniprot_taxonomy_materialized"""
    cur.execute(sql)
    num_database_rows = 0
    db_row_start_t = MPI.Wtime()
    for row in cur:
        db_row_end_t = MPI.Wtime()
        db_row_fetch_time += db_row_end_t - db_row_start_t
        cur_tree_row_send_time = 0.0
        cur_uniprot_row_send_time = 0.0
        num_database_rows += 1
        if num_database_rows > 0 and num_database_rows % 1000000 == 0:
            print "Read %d rows from database so far" % num_database_rows
        tree_id = row[0]
        greatest_duplication_distance_of_maximal_descendant = row[4]
        uniprot_id = row[5]
        if (greatest_duplication_distance_of_maximal_descendant and
                (not row[3] or row[3] > row[4])) or \
                (uniprot_id is not None and uniprot_id in all_uniprot_ids):
            for i in range(6):
                if row[i] is not None:
                    tree_row_info[i] = float(row[i])
                else:
                    # NaN marks a missing value in the fixed-size buffer
                    tree_row_info[i] = numpy.nan
            if uniprot_id is not None and \
                    uniprot_id not in all_uniprot_ids:
                tree_row_info[5] = numpy.nan
            tree_server_num = tree_id % num_tree_servers
            tree_row_send_start_t = MPI.Wtime()
            comm.Send([tree_row_info, MPI.DOUBLE_PRECISION],
                      dest=tree_server_num + base_tree_server_id,
                      tag=TAG_DATABASE_ROW)
            tree_row_send_end_t = MPI.Wtime()
            cur_tree_row_send_time = (tree_row_send_end_t
                                      - tree_row_send_start_t)
            tree_row_send_time += cur_tree_row_send_time
            if not greatest_duplication_distance_of_maximal_descendant:
                uniprot_row_info[0] = int(row[0])  # tree_id
                uniprot_row_info[1] = int(row[1])  # tree_node_left_id
                uniprot_row_info[2] = int(row[5])  # uniprot_id
                uniprot_processor_num = uniprot_id % num_uniprot_processors
                uniprot_row_send_start_t = MPI.Wtime()
                comm.Send([uniprot_row_info, MPI.INT],
                          dest=(uniprot_processor_num
                                + base_uniprot_processor_id),
                          tag=TAG_DATABASE_ROW)
                uniprot_row_send_end_t = MPI.Wtime()
                cur_uniprot_row_send_time = (uniprot_row_send_end_t
                                             - uniprot_row_send_start_t)
                uniprot_row_send_time += cur_uniprot_row_send_time
        db_row_start_t = MPI.Wtime()
        row_prep_time += ((db_row_start_t - db_row_end_t)
                          - cur_tree_row_send_time
                          - cur_uniprot_row_send_time)
    t2 = MPI.Wtime()
    print "Finished reading ", num_database_rows,
    print " rows of the database in ", t2 - t1, " secs"
    print "Total time fetching database rows: ", db_row_fetch_time
    print "Total time sending tree rows: ", tree_row_send_time
    print "Total time sending uniprot rows: ", uniprot_row_send_time
    print "Total time preparing rows to send out: ", row_prep_time
    # Tell every worker that the database stream is finished
    for tree_server_num in range(num_tree_servers):
        comm.Send([MPI.BOTTOM, MPI.INT],
                  dest=tree_server_num + base_tree_server_id,
                  tag=TAG_DATABASE_DONE)
    for uniprot_processor_num in range(num_uniprot_processors):
        comm.Send([MPI.BOTTOM, MPI.INT],
                  dest=(uniprot_processor_num
                        + base_uniprot_processor_id),
                  tag=TAG_DATABASE_DONE)