def clear_data_for_metaomic_assembly(self, metaomic_assembly):
    self.logger.info("Clearing data for meta-omic assembly '%s'" % metaomic_assembly)
    cur = db.get_psycopg2_cursor()
    try:
        # Get MetagenomeSequences.
        cur.execute(
            "select ms.id from metagenome_sequence ms where ms.metagenome_id = %s",
            (metaomic_assembly,))
        metagenome_sequences = cur.fetchall()
        if metagenome_sequences is not None:
            # Delete digest peptides and annotations for each sequence.
            self.logger.info("Deleting MetagenomeSequenceDigestPeptides and MetagenomeAnnotations")
            for ms in metagenome_sequences:
                cur.execute(
                    "delete from metagenome_sequence_digest_peptide where metagenome_sequence_id = %s",
                    (ms[0],))
                cur.execute(
                    "delete from metagenome_annotations where metagenome_sequence_id = %s",
                    (ms[0],))
            db.psycopg2_connection.commit()
            cur = db.get_psycopg2_cursor()
            # Delete the sequences themselves.
            for ms in metagenome_sequences:
                cur.execute("delete from metagenome_sequence where id = %s", (ms[0],))
            db.psycopg2_connection.commit()
            cur = db.get_psycopg2_cursor()
        # Delete Metagenome.
        cur.execute("delete from metagenome where id = %s;", (metaomic_assembly,))
    except Exception:
        self.logger.error("Problem removing '%s'" % metaomic_assembly)
        traceback.print_exc()
    else:
        # Commit the deletes.
        db.psycopg2_connection.commit()
def get_digest(logger, digest_def):
    """ Fetch or create a digest from a digest definition. """
    # Get or create protease.
    protease_id = str(digest_def['protease']['id'])
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protease where protease.id = %s;", (protease_id,))
    results = cur.fetchone()
    if results is None:
        logger.info("No protease exists for the given definition, creating...")
        protease = Protease(**digest_def['protease'])
        cur.execute(
            "insert into protease (id, cleavage_rule) values (%s, %s);",
            (protease_id, str(digest_def['protease']['cleavage_rule'])))
    else:
        protease = Protease(id=results[0], cleavage_rule=results[1])
    db.psycopg2_connection.commit()

    # Get or create digest object.
    cur = db.get_psycopg2_cursor()
    # Not all digestion parameters will have a value, so build the query to
    # account for whichever ones are present.
    query_params = [protease.id]
    digest_query = "select * from digest where digest.protease_id = %s"
    if digest_def.get('max_missed_cleavages') is not None:
        digest_query += " and digest.max_missed_cleavages = %s"
        query_params.append(digest_def.get('max_missed_cleavages'))
    if digest_def.get('min_acids') is not None:
        digest_query += " and digest.min_acids = %s"
        query_params.append(digest_def.get('min_acids'))
    if digest_def.get('max_acids') is not None:
        digest_query += " and digest.max_acids = %s"
        query_params.append(digest_def.get('max_acids'))
    cur.execute(digest_query, query_params)
    results = cur.fetchone()
    db.psycopg2_connection.commit()
    if results is None:
        logger.info("No digest exists for the given definition, creating...")
        digest_kwargs = {}
        digest_kwargs.update(digest_def)
        digest_kwargs['protease'] = protease
        digest = Digest(**digest_kwargs)
        cur = db.get_psycopg2_cursor()
        cur.execute(
            "select * from digest_insert(%s, %s, %s, %s);",
            (protease.id, digest.max_missed_cleavages, digest.min_acids, digest.max_acids))
        digest_result = cur.fetchone()
        if digest_result:
            digest = Digest(
                id=digest_result[0],
                protease=protease,
                max_missed_cleavages=digest_result[2],
                min_acids=digest_result[3],
                max_acids=digest_result[4])
    else:
        digest = Digest(
            id=results[0],
            protease=protease,
            max_missed_cleavages=results[2],
            min_acids=results[3],
            max_acids=results[4])
    db.psycopg2_connection.commit()
    return digest
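# A minimal usage sketch for get_digest(). The digest_def layout (a 'protease'
# dict with 'id' and 'cleavage_rule', plus optional 'max_missed_cleavages',
# 'min_acids', and 'max_acids') follows the keys read above; the trypsin-like
# rule string and parameter values here are illustrative assumptions, not
# values taken from this codebase.
def example_get_digest(logger):
    digest_def = {
        'protease': {
            'id': 'trypsin',                      # hypothetical protease id
            'cleavage_rule': r'([KR](?=[^P]))',   # hypothetical rule string
        },
        'max_missed_cleavages': 0,
        'min_acids': 6,
        'max_acids': 30,
    }
    return get_digest(logger, digest_def)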
def count_peptide_union(taxon_digests=[], logger=None):
    taxon_digest_ids = [taxon_digest.id for taxon_digest in taxon_digests]
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from taxon_count_peptide_union(%s)", (taxon_digest_ids,))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
def count_peptide_union_sa(specialized_assemblies=[], logger=None):
    sa_ids = [sa.id for sa in specialized_assemblies]
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sadp_count_peptide_union(%s)", (sa_ids,))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
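# Usage sketch for the two union counters above. `taxon_digests` is assumed
# to be a list of TaxonDigest objects (as fetched in the redundancy-table
# scripts below) and `assemblies` a list of Specialized_Assembly objects;
# both helpers only read the .id attribute of what is passed in.
def example_union_counts(taxon_digests, assemblies, logger=None):
    union_count = count_peptide_union(taxon_digests=taxon_digests, logger=logger)
    sa_union_count = count_peptide_union_sa(specialized_assemblies=assemblies, logger=logger)
    return union_count, sa_union_count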
def process_taxon_digest_peptide_batch(self, taxon_digest, batch, logger=None):
    if not logger:
        logger = self.logger
    start_time = time.time()
    taxon_digest_ids = []
    peptide_ids = []
    peptide_counts = []
    logger.info("Creating %s new taxon digest peptides..." % (len(batch)))
    for row in batch:
        # Each row is (peptide_id, count, taxon_digest_id).
        peptide_ids.append(row[0])
        peptide_counts.append(row[1])
        taxon_digest_ids.append(row[2])
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select taxon_digest_peptide_insert(%s, %s, %s);",
        (peptide_ids, taxon_digest_ids, peptide_counts))
    db.psycopg2_connection.commit()
    total_time = time.time() - start_time
    logger.info("taxon digest time elapsed: %s" % (total_time))
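# Hedged sketch of feeding one batch through the method above. The rows come
# from get_peptide_count() (see process_fasta_file below) in
# (peptide_id, count, taxon_digest_id) order; the ids here are hypothetical,
# and `ingestor` stands in for whatever object owns this method.
def example_taxon_digest_peptide_batch(ingestor, taxon_digest):
    batch = [
        (101, 3, taxon_digest.id),  # peptide 101 seen 3 times in this digest
        (102, 1, taxon_digest.id),
    ]
    ingestor.process_taxon_digest_peptide_batch(taxon_digest, batch)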
def run(self):
    try:
        # Get a db cursor.
        cur = db.get_psycopg2_cursor()
        self.logger.info("Clearing data for specialized assemblies '%s'" % self.specialized_assembly_ids)
        cur.execute(
            "select sa.id from specialized_assembly sa where sa.genome_name in %s",
            (tuple(self.specialized_assembly_ids),))
        specialized_assembly_results = cur.fetchall()
        if specialized_assembly_results is None:
            self.logger.info("No matching specialized assemblies found. Nothing was changed")
            exit()
        for specialized_assembly in specialized_assembly_results:
            self.logger.info("Clearing data for specialized assembly '%s'" % specialized_assembly)
            self.clear_data_for_specialized_assembly(specialized_assembly)
    except Exception as e:
        self.logger.error("Problem removing specialized_assemblies: '%s'" % e)
        traceback.print_exc()
        db.psycopg2_connection.rollback()
    else:
        db.psycopg2_connection.commit()
def clear_data_for_taxon(self, taxon):
    self.logger.info("Clearing data for taxon '%s'" % taxon)
    cur = db.get_psycopg2_cursor()
    try:
        # Get TaxonDigests.
        cur.execute("select td.id from taxon_digest td where td.taxon_id = %s", (taxon,))
        taxon_digests = cur.fetchall()
        if taxon_digests is not None:
            # Delete TaxonDigestPeptides and TaxonDigests.
            self.logger.info("Deleting TaxonDigestPeptides and TaxonDigests")
            for td in taxon_digests:
                cur.execute("delete from taxon_digest_peptide where taxon_digest_id = %s", (td[0],))
                cur.execute("delete from taxon_digest where id = %s", (td[0],))
        # Delete TaxonProteins.
        self.logger.info("Deleting TaxonProteins")
        cur.execute("delete from taxon_protein where taxon_id = %s", (taxon,))
        # Delete Taxon.
        cur.execute("delete from taxon where id = %s;", (taxon,))
    except Exception:
        self.logger.error("Problem removing '%s'" % taxon)
        traceback.print_exc()
    else:
        # Commit the deletes.
        db.psycopg2_connection.commit()
def run(self):
    try:
        # Get a db cursor.
        cur = db.get_psycopg2_cursor()
        self.logger.info("Clearing data for taxon '%s'" % self.taxon_ids)
        cur.execute("select t.id from taxon t where t.id in %s", (tuple(self.taxon_ids),))
        taxon_results = cur.fetchall()
        if taxon_results is None:
            self.logger.info("No matching taxons found. Nothing was changed")
            exit()
        for taxon in taxon_results:
            self.logger.info("Clearing data for taxon '%s'" % taxon)
            self.clear_data_for_taxon(taxon)
    except Exception as e:
        self.logger.error("Problem removing taxons: '%s'" % e)
        traceback.print_exc()
        db.psycopg2_connection.rollback()
    else:
        db.psycopg2_connection.commit()
def run(self):
    # Read in sequences to query.
    sequences = []
    max_dist = self.args.max_distance
    if self.args.sequence_file:
        with open(self.args.sequence_file, 'r') as f:
            sequences = [line.strip() for line in f.readlines()]
    elif self.args.sequence:
        sequences = [self.args.sequence]
    # Read in what to search: genomes ('g'), specialized assemblies ('sa'),
    # metagenomes ('m'), or everything ('all'). Genomes is the default.
    search_type = 'g'
    if self.args.type:
        search_type = self.args.type
    if not sequences:
        argparser.error(
            "Provide a query sequence via the '--sequence' option, "
            "or a set of sequences via the --sequence-file option")
    # Print headers.
    headers = ['query', 'taxon', 'lev_distance', 'match']
    print(','.join(headers))
    # Execute query for each sequence and print results.
    cur = db.get_psycopg2_cursor()
    for seq in sequences:
        if search_type == 'g' or search_type == 'all':
            print('GENOMIC RESULTS')
            print('search sequence,id,name')
            cur.execute(
                "select id, genome_name from genomic_query_taxon_by_peptide_sequence_new(%s)",
                (seq,))
            for row in cur.fetchall():
                print(','.join([str(s) for s in [seq] + list(row)]))
        if search_type == 'sa' or search_type == 'all':
            print('\n')
            print('SPECIALIZED ASSEMBLY RESULTS')
            print('search sequence,genome name,sequence id')
            cur.execute(
                "select specialized_assembly_name, specialized_assembly_sequence from specialized_assembly_taxon_query_by_peptide_sequence(%s)",
                (seq,))
            for row in cur.fetchall():
                print(','.join([str(s) for s in [seq] + list(row)]))
        if search_type == 'm' or search_type == 'all':
            print('\n')
            print('METAGENOMIC RESULTS')
            print('search sequence,metagenome name')
            cur.execute(
                "select metagenome_name from metagenomic_query_by_peptide_sequence(%s)",
                (seq,))
            for row in cur.fetchall():
                print(','.join([str(s) for s in [seq] + list(row)]))
def update_existing_peptides_(self, sequences, existing_peptides):
    if not sequences:
        return
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from peptide where peptide.sequence in %s", (tuple(sequences),))
    for record in cur.fetchall():
        peptide = Peptide(id=record[0], sequence=record[1], mass=record[2])
        existing_peptides[peptide.sequence] = peptide
    db.psycopg2_connection.commit()
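# Usage sketch: update_existing_peptides_ fills the passed-in dict, keyed by
# sequence, with Peptide objects already present in the db. The peptide
# sequences below are illustrative placeholders, and `ingestor` stands in for
# the object that owns this method.
def example_update_existing_peptides(ingestor):
    existing_peptides = {}
    ingestor.update_existing_peptides_(['PEPTIDEK', 'SEQVENCER'], existing_peptides)
    return existing_peptides  # {sequence: Peptide, ...} for matches found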
def main():
    logger = logging.getLogger('metaomic_assemblies')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    cur = db.get_psycopg2_cursor()
    cur.execute("select m.name from metagenome m;")
    logger.info("Meta-omic Assemblies")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    cur = db.get_psycopg2_cursor()
    cur.execute("select t.id from taxon t;")
    logger.info("Taxons")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def main():
    args = argparser.parse_args()
    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    # Check that specialized assembly genome names or file were provided.
    if not (args.sa_ids or args.sa_id_file):
        raise Exception("Must provide --sa-ids or --sa-id-file option")
    # Get specialized assemblies.
    if args.sa_ids:
        sa_ids = args.sa_ids
    else:
        with open(args.sa_id_file, 'r') as f:
            sa_ids = [row[0] for row in csv.reader(f)]
    logger.info("Specialized Assembly Ids: %s" % (sa_ids))
    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select sa.id, sa.genome_name from specialized_assembly sa where sa.genome_name = any(%s);",
        (sa_ids,))
    sa_digests = []
    for record in cur:
        sa = Specialized_Assembly(id=record[0], genome_name=record[1])
        logger.info("Specialized Assembly: %s %s" % (sa.genome_name, sa.id))
        sa_digests.append(sa)
    db.psycopg2_connection.commit()
    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables_sa(sa_digests, logger=logger)
    # Output tables.
    for table_id, table in list(tables.items()):
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'w', newline='') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)
    logger.info("Done.")
def get_venter_annotations(self, seq_ids, logger=None):
    if not logger:
        logger = self.logger
    the_annotations = extract_venter_annotations(self, list(seq_ids.keys()), logger)
    accession_numbers = []
    metagenome_sequence_ids = []
    scaffold_ids = []
    orf_ids = []
    orf_nums = []
    annotations = []
    gene_names = []
    orf_tax_levels = []
    orf_taxonomies = []
    orf_tax_ids = []
    contig_tax_levels = []
    contig_taxonomies = []
    contig_tax_ids = []
    for annot in the_annotations:
        accession_numbers.append(annot[0])
        metagenome_sequence_ids.append(seq_ids[annot[0]])
        scaffold_ids.append(annot[1])
        orf_ids.append(annot[2])
        orf_nums.append(int(annot[3]) if annot[3] is not None else None)
        annotations.append(annot[4])
        gene_names.append(annot[5])
        orf_tax_levels.append(annot[6])
        orf_taxonomies.append(annot[7])
        orf_tax_ids.append(int(annot[8]) if annot[8] is not None else None)
        contig_tax_levels.append(annot[9])
        contig_taxonomies.append(annot[10])
        contig_tax_ids.append(int(annot[11]) if annot[11] is not None else None)
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select * from metagenome_annotation_insert(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
        (accession_numbers, metagenome_sequence_ids, scaffold_ids, orf_ids,
         orf_nums, annotations, gene_names, orf_tax_levels, orf_taxonomies,
         orf_tax_ids, contig_tax_ids, contig_taxonomies, contig_tax_levels))
    db.psycopg2_connection.commit()
def main():
    args = argparser.parse_args()
    logger = logging.getLogger('redundancy_tables')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    # Check that taxon ids or taxon id file were provided.
    if not (args.taxon_ids or args.taxon_id_file):
        raise Exception("Must provide --taxon-ids or --taxon-id-file option")
    # Get taxons.
    if args.taxon_ids:
        taxon_ids = args.taxon_ids
    else:
        with open(args.taxon_id_file, 'r') as f:
            taxon_ids = [row[0] for row in csv.reader(f)]
    logger.info("Taxon Ids: %s" % (taxon_ids))
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select * from taxon_digest td where td.taxon_id in (select t.id from taxon t where t.id = any(%s));",
        (taxon_ids,))
    taxon_digests = []
    for record in cur:
        td = TaxonDigest(id=record[0], taxon=record[1], digest=record[2])
        logger.info("Taxon Digest: %s" % (td.id))
        taxon_digests.append(td)
    db.psycopg2_connection.commit()
    # Generate the redundancy tables.
    tables = redundancy.generate_redundancy_tables(taxon_digests, logger=logger)
    # Create output dir if it does not exist.
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Output tables.
    for table_id, table in list(tables.items()):
        table_file = os.path.join(args.output_dir, table_id + '.csv')
        logger.info("Writing '%s'..." % table_file)
        with open(table_file, 'w', newline='') as f:
            w = csv.writer(f)
            for row in table:
                w.writerow(row)
    logger.info("Done.")
def clear_data_for_specialized_assembly(self, specialized_assembly):
    self.logger.info("Clearing data for specialized assembly '%s'" % specialized_assembly)
    cur = db.get_psycopg2_cursor()
    try:
        # Get Specialized Assembly Sequences.
        cur.execute(
            "select sas.id from specialized_assembly_sequence sas where sas.specialized_assembly_id = %s",
            (specialized_assembly,))
        specialized_assembly_sequences = cur.fetchall()
        if specialized_assembly_sequences is not None:
            # Delete sequences and digest peptides.
            self.logger.info("Deleting Specialized Assembly Sequences and Digests")
            for sas in specialized_assembly_sequences:
                cur.execute(
                    "delete from specialized_assembly_digest_peptide where specialized_assembly_sequence_id = %s",
                    (sas[0],))
            db.psycopg2_connection.commit()
            cur = db.get_psycopg2_cursor()
            for sas in specialized_assembly_sequences:
                cur.execute("delete from specialized_assembly_sequence where id = %s", (sas[0],))
            db.psycopg2_connection.commit()
            cur = db.get_psycopg2_cursor()
        # Delete Specialized Assembly.
        cur.execute("delete from specialized_assembly where id = %s;", (specialized_assembly,))
    except Exception:
        self.logger.error("Problem removing '%s'" % specialized_assembly)
        traceback.print_exc()
    else:
        # Commit the deletes.
        db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('specialized_assemblies')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    cur = db.get_psycopg2_cursor()
    cur.execute("select sa.genome_name, sa.type_flag from specialized_assembly sa;")
    logger.info("Specialized Assemblies")
    logger.info("Genome Name    Type")
    for record in cur:
        logger.info("%s    %s" % (record[0], record[1]))
    db.psycopg2_connection.commit()
def main():
    logger = logging.getLogger('metaomic_taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    cur = db.get_psycopg2_cursor()
    cur.execute(
        "select mt.tax_species from metagenome_taxon mt "
        "where mt.ncbi_id in (select distinct ma.contig_tax_id from metagenome_annotations ma) "
        "or mt.ncbi_id in (select distinct ma.orf_tax_id from metagenome_annotations ma)")
    logger.info("Meta-omic Assembly Taxons")
    for record in cur:
        logger.info("%s" % (record[0]))
    db.psycopg2_connection.commit()
def count_peptide_union_combined(sa_ids=[], td_ids=[], logger=None):
    # Count the union of peptides across specialized assemblies and taxon
    # digests; the union logic lives in the sa_taxon_count_peptide_union
    # stored procedure.
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sa_taxon_count_peptide_union(%s, %s)", (sa_ids, td_ids))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
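# Sketch of a combined count across both data types. The ids below are
# hypothetical primary keys: sa_ids from the specialized_assembly table and
# td_ids from the taxon_digest table.
def example_combined_counts(logger=None):
    sa_ids = [1, 2]           # specialized_assembly ids (hypothetical)
    td_ids = [10, 11, 12]     # taxon_digest ids (hypothetical)
    return count_peptide_union_combined(sa_ids=sa_ids, td_ids=td_ids, logger=logger)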
def run(self):
    try:
        # Get a db cursor.
        cur = db.get_psycopg2_cursor()
        self.logger.info("Clearing data for meta-omic assemblies '%s'" % self.metaomic_ids)
        cur.execute("select m.id from metagenome m where m.id in %s", (tuple(self.metaomic_ids),))
        metaomic_results = cur.fetchall()
        if metaomic_results is None:
            self.logger.info("No matching meta-omic assemblies found. Nothing was changed")
            exit()
        for metaomic_assembly in metaomic_results:
            self.logger.info("Clearing data for meta-omic assembly '%s'" % metaomic_assembly)
            self.clear_data_for_metaomic_assembly(metaomic_assembly)
    except Exception as e:
        self.logger.error("Problem removing metaomic_assemblies: '%s'" % e)
        traceback.print_exc()
        db.psycopg2_connection.rollback()
    else:
        db.psycopg2_connection.commit()
def process_peptide_batch(self, metagenome_sequence_digests_dict, logger=None):
    if not logger:
        logger = self.logger
    # Assemble combined peptide sequences across metagenome digests. Each
    # metagenome sequence can have many peptides.
    combined_peptide_sequences = set()
    for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
        for sequence in data['peptide_sequences']:
            combined_peptide_sequences.add(sequence)
    # Collect peptides returned from the bulk insert.
    existing_peptides = {}
    # Create non-existent peptides in bulk.
    start_time = time.time()
    num_new_peptides = 0
    peptide_sequences = []
    peptide_masses = []
    for sequence in combined_peptide_sequences:
        num_new_peptides += 1
        # Calculate the mass of the peptide.
        mass = get_aa_sequence_mass(sequence)
        peptide_sequences.append(sequence)
        peptide_masses.append(mass)
    logger.info("Creating %s new peptides..." % num_new_peptides)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from peptide_insert(%s, %s);",
                (peptide_sequences, peptide_masses))
    for record in cur:
        try:
            peptide = Peptide(id=record[0], sequence=record[1])
            existing_peptides[peptide.sequence] = peptide
        except Exception:
            logger.exception("Error processing peptide, skipping")
            continue
    total_time = time.time() - start_time
    self.total_peptide_time = self.total_peptide_time + total_time
    logger.info("peptide time elapsed: %s" % (total_time))
    self.stats['Peptide'] += num_new_peptides
    # Create histogram of peptide sequence occurrences for each sequence.
    num_peptide_instances = 0
    for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)
    # Create metagenome sequence digest peptide instances in bulk.
    start_time = time.time()
    pdp_peptide_ids = []
    pdp_metagenome_sequence_ids = []
    pdp_digest_ids = []
    pdp_peptide_counts = []
    for sequenceId, data in list(metagenome_sequence_digests_dict.items()):
        for sequence, count in list(data['peptide_histogram'].items()):
            peptide = existing_peptides[sequence]
            pdp_peptide_ids.append(peptide.id)
            pdp_metagenome_sequence_ids.append(data['metagenome_sequence'].id)
            pdp_digest_ids.append(data['digest'].id)
            pdp_peptide_counts.append(count)
    cur.execute(
        "select metagenome_sequence_digest_peptide_insert(%s, %s, %s, %s);",
        (pdp_peptide_ids, pdp_metagenome_sequence_ids, pdp_digest_ids, pdp_peptide_counts))
    db.psycopg2_connection.commit()
    total_time = time.time() - start_time
    logger.info("metagenome sequence digest time elapsed: %s" % (total_time))
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
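# For reference, the metagenome_sequence_digests_dict consumed above is built
# in process_metagenome_sequence_batch (see below). A minimal hypothetical
# entry, keyed by metagenome sequence id:
#
# {
#     42: {
#         'peptide_sequences': ['PEPTIDEK', 'SEQVENCER'],   # from cleave()
#         'metagenome_sequence': metagenome_sequence_obj,   # Metagenome_Sequence
#         'digest': digest_obj,                             # Digest
#     },
# }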
def process_peptide_batch(self, batch, logger=None):
    if not logger:
        logger = self.logger
    # Assemble combined peptide sequences and protein digests.
    combined_peptide_sequences = set()
    protein_ids = []
    digest_ids = []
    protein_digests = []
    protein_digests_dict = {}
    for proteinId, data in list(batch.items()):
        for sequence in data['peptide_sequences']:
            combined_peptide_sequences.add(sequence)
        pd = data['protein_digest']
        protein_ids.append(pd.protein.id)
        digest_ids.append(pd.digest.id)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from protein_digest_insert(%s, %s);",
                (protein_ids, digest_ids))
    # Iterate through the protein_digest records returned from the insert and
    # build a ProteinDigest object for each.
    for record in cur:
        try:
            protein_digest = ProteinDigest(id=record[0], protein=record[1], digest=record[2])
            protein_digests.append(protein_digest)
            batch_record = batch.get(record[1])
            protein_digests_dict[record[1]] = {
                'peptide_sequences': batch_record['peptide_sequences'],
                'protein_digest': protein_digest,
            }
        except Exception:
            logger.exception("Error processing protein digest, skipping")
            continue
    db.psycopg2_connection.commit()
    self.stats['ProteinDigest'] += len(protein_digests)
    # Collect peptides returned from the bulk insert.
    existing_peptides = {}
    # Create non-existent peptides in bulk.
    start_time = time.time()
    num_new_peptides = 0
    peptide_sequences = []
    peptide_masses = []
    for sequence in combined_peptide_sequences:
        num_new_peptides += 1
        mass = get_aa_sequence_mass(sequence)
        peptide_sequences.append(sequence)
        peptide_masses.append(mass)
    logger.info("Creating %s new peptides..." % num_new_peptides)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from peptide_insert(%s, %s);",
                (peptide_sequences, peptide_masses))
    for record in cur:
        try:
            peptide = Peptide(id=record[0], sequence=record[1])
            existing_peptides[peptide.sequence] = peptide
        except Exception:
            logger.exception("Error processing peptide, skipping")
            continue
    self.stats['Peptide'] += num_new_peptides
    # Create histogram of peptide sequence occurrences for each protein.
    num_peptide_instances = 0
    for proteinId, data in list(protein_digests_dict.items()):
        peptides_histogram = defaultdict(int)
        for sequence in data['peptide_sequences']:
            peptides_histogram[sequence] += 1
        data['peptide_histogram'] = peptides_histogram
        # Update number of peptide instances.
        num_peptide_instances += len(peptides_histogram)
    total_time = time.time() - start_time
    logger.info("peptide time elapsed: %s" % (total_time))
    # Create protein digest peptide instances in bulk.
    logger.info("Creating %s new protein digest peptides..." % (num_peptide_instances))
    start_time = time.time()
    pdp_peptide_ids = []
    pdp_protein_digest_ids = []
    pdp_peptide_counts = []
    for proteinId, data in list(protein_digests_dict.items()):
        for sequence, count in list(data['peptide_histogram'].items()):
            peptide = existing_peptides[sequence]
            pdp_peptide_ids.append(peptide.id)
            pdp_protein_digest_ids.append(data['protein_digest'].id)
            pdp_peptide_counts.append(count)
    total_time = time.time() - start_time
    logger.info("protein digest loop time elapsed: %s" % (total_time))
    cur = db.get_psycopg2_cursor()
    cur.execute("select protein_digest_peptide_insert(%s, %s, %s);",
                (pdp_peptide_ids, pdp_protein_digest_ids, pdp_peptide_counts))
    db.psycopg2_connection.commit()
    total_time = time.time() - start_time
    logger.info("protein digest time elapsed: %s" % (total_time))
    self.stats['ProteinDigestPeptide'] += num_peptide_instances
def main():
    start_time = time.time()
    args = argparser.parse_args()
    logger = logging.getLogger('proteomz_annotations')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    filename = args.annotation_file
    if filename:
        logger.info("Starting annotation ingest")
        metagenome_annotations = {}
        orf_ids = []  # i.e. node_123_orf format
        total_annotations = 0
        # Count lines so the last (partial) batch can be flushed.
        with open(filename, 'rt') as f:
            line_count = len(list(csv.reader(f)))
        with open(filename, 'rt') as f:
            reader = csv.reader(f)
            try:
                count = 1
                batch_count = 0
                # Handle ingestion in batches to reduce the number of hits on the db.
                # Expected columns: orf_id, contig_tax_name, contig_ncbi_tax_id,
                # orf_tax_name, orf_ncbi_tax_id.
                for annot in reader:
                    if count > 1:  # skip the header row
                        orf_id = annot[0]
                        contig_tax_name = annot[1] or None
                        contig_ncbi_tax_id = annot[2]
                        if not contig_ncbi_tax_id or contig_ncbi_tax_id == '#N/A':
                            contig_ncbi_tax_id = None
                        else:
                            contig_ncbi_tax_id = int(contig_ncbi_tax_id)
                        orf_tax_name = annot[3] or None
                        orf_ncbi_tax_id = annot[4]
                        if not orf_ncbi_tax_id or orf_ncbi_tax_id == '#N/A':
                            orf_ncbi_tax_id = None
                        else:
                            orf_ncbi_tax_id = int(orf_ncbi_tax_id)
                        # Save the annotations from the csv into a dict for later use.
                        metagenome_annotations[orf_id] = {
                            "orf_id": orf_id,
                            "contig_tax_name": contig_tax_name,
                            "contig_tax_id": contig_ncbi_tax_id,
                            "orf_tax_name": orf_tax_name,
                            "orf_tax_id": orf_ncbi_tax_id,
                        }
                        orf_ids.append(orf_id)
                        batch_count = batch_count + 1
                        if batch_count == 1000 or count == line_count:
                            a_numbers = []
                            metagenome_sequence_ids = []
                            contig_ncbi_tax_ids = []
                            contig_tax_names = []
                            orf_ncbi_tax_ids = []
                            orf_tax_names = []
                            # Look up the already populated sequence ids to
                            # match them to the accession numbers.
                            cur = db.get_psycopg2_cursor()
                            cur.execute(
                                "select id, sequence_id from metagenome_sequence ms where ms.sequence_id = any(%s);",
                                (orf_ids,))
                            db.psycopg2_connection.commit()
                            for seq_id, a_number in cur:
                                if seq_id is not None:
                                    # Populate the arrays that will be passed
                                    # to postgres for a bulk insert.
                                    metagenome_sequence_ids.append(seq_id)
                                    meta_annon = metagenome_annotations[a_number]
                                    a_numbers.append(meta_annon["orf_id"])
                                    contig_ncbi_tax_ids.append(meta_annon["contig_tax_id"])
                                    contig_tax_names.append(meta_annon["contig_tax_name"])
                                    orf_ncbi_tax_ids.append(meta_annon["orf_tax_id"])
                                    orf_tax_names.append(meta_annon["orf_tax_name"])
                            cur = db.get_psycopg2_cursor()
                            cur.execute(
                                "select * from metagenome_annotation_insert(%s::text[], %s::numeric[], %s::numeric[], %s::text[], %s::numeric[], %s::text[]);",
                                (a_numbers, metagenome_sequence_ids, contig_ncbi_tax_ids,
                                 contig_tax_names, orf_ncbi_tax_ids, orf_tax_names))
                            db.psycopg2_connection.commit()
                            total_annotations = total_annotations + batch_count
                            logger.info("ingested: %s annotations" % (total_annotations))
                            metagenome_annotations = {}
                            orf_ids = []
                            batch_count = 0
                    count = count + 1
            except csv.Error as e:
                logger.error('file %s, line %d: %s' % (filename, reader.line_num, e))
def process_fasta_file(self, path):
    base_msg = "Processing file '%s'..." % path
    file_logger = self.get_child_logger(id(path), base_msg, self.logger)
    # Make sure the specified file exists before we try to do anything with it.
    if os.path.exists(path):
        file_logger.info("Found file to ingest.")
    else:
        file_logger.info("Could not find file to ingest at %s" % path)
        exit()
    # Get taxon from filename.
    taxon_id = os.path.splitext(os.path.basename(path))[0]
    # Get taxon object from db or create a new one.
    taxon = Taxon(id=taxon_id)
    cur = db.get_psycopg2_cursor()
    cur.execute("select t.id from taxon t where t.id = %s;", (taxon_id,))
    taxon_result = cur.fetchone()
    db.psycopg2_connection.commit()
    if taxon_result is None:
        # Add a taxon to the DB.
        cur.execute("insert into taxon (id) values (%s);", (taxon_id,))
        db.psycopg2_connection.commit()
        self.stats['Taxon'] += 1
        file_logger.info("Created taxon '%s'" % taxon_id)
    # Check if TaxonDigest record exists in db.
    cur.execute(
        "select t.id from taxon_digest t where t.taxon_id = %s and t.digest_id = %s;",
        (taxon_id, self.digest.id))
    db.psycopg2_connection.commit()
    taxon_digest_result = cur.fetchone()
    taxon_digest = TaxonDigest(taxon=taxon, digest=self.digest)
    if taxon_digest_result:
        # If the digest has already been run on this taxon, don't do anything.
        file_logger.info(
            ("Taxon '%s' has already been digested with"
             " digest '%s', skipping.") % (taxon_id, self.digest))
        return
    else:
        # Otherwise create a new TaxonDigest.
        cur.execute(
            "insert into taxon_digest (taxon_id, digest_id) values (%s, %s);",
            (taxon_digest.taxon.id, taxon_digest.digest.id))
        db.psycopg2_connection.commit()
        self.stats['TaxonDigest'] += 1
    # Process protein sequences in batches.
    file_logger.info("Counting # of protein sequences...")
    num_proteins = 0
    for metadata, sequence in fasta.read(path):
        num_proteins += 1
    file_logger.info("%s total protein sequences." % num_proteins)
    batch_size = 1000
    batch_counter = 0
    batch = []
    protein_logger = self.get_child_logger(
        "%s_proteins" % id(file_logger), "Processing proteins...", file_logger)
    protein_logger.info("")
    for metadata, sequence in fasta.read(path):
        # Check the sequence against the expected amino acids; if this regex
        # matches, the sequence contains a non-amino-acid character and is
        # not a valid sequence.
        if VALID_AAS.search(sequence):
            file_logger.info("Tried to ingest invalid protein sequence %s" % sequence)
        else:
            batch.append((metadata, sequence))
            batch_counter += 1
            if (batch_counter % batch_size) == 0:
                self.process_protein_batch(batch, taxon, logger=protein_logger)
                protein_logger.info(
                    ("%s of %s (%.1f%%)") %
                    (batch_counter, num_proteins, 100.0 * batch_counter / num_proteins))
                batch = []
    # Process the final partial batch.
    self.process_protein_batch(batch, taxon, logger=protein_logger)
    # Create taxon digest peptide records in batches.
    batch_size = 10000
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from get_peptide_count(%s, %s);", (self.digest.id, taxon.id))
    tdp_batch = []
    tdp_counter = 0
    for row in cur.fetchall():
        tdp_counter += 1
        tdp_batch.append(row)
        if (tdp_counter % batch_size) == 0:
            self.process_taxon_digest_peptide_batch(taxon_digest, tdp_batch, logger=file_logger)
            tdp_batch = []
    self.process_taxon_digest_peptide_batch(taxon_digest, tdp_batch, logger=file_logger)
    self.stats['TaxonDigestPeptide'] += tdp_counter
    self.logger.info("Done processing file '%s'" % path)
    db.psycopg2_connection.commit()
def main():
    """ Process arguments. """
    argparser = argparse.ArgumentParser(
        description=('Update specialized assembly lineage from CSV file.'))
    argparser.add_argument('--filepath', help=('CSV file containing genome lineage'))
    start_time = time.time()
    logger = logging.getLogger('update_specialized_Assembly_taxons')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.INFO)
    args = argparser.parse_args()
    if args.filepath:
        filename = args.filepath
        print(filename)
        with open(filename, 'rt') as f:
            reader = csv.reader(f)
            try:
                count = 1
                for row in reader:
                    # Get the ncbi_id and genome (MAG or SAG) name and update
                    # the specialized_assembly table with this info.
                    if count > 1:  # skip the header row
                        fasta_file_name = row[0]
                        genome_id = os.path.splitext(os.path.basename(fasta_file_name))[0]
                        taxon_name = row[1]
                        print("Genome ID", genome_id)
                        ncbi_id = row[2]
                        tax_group = row[3]
                        tax_kingdom = row[4]
                        tax_phylum = row[5]
                        tax_class = row[6]
                        tax_order = row[7]
                        tax_family = row[8]
                        tax_genus = row[9]
                        tax_species = row[10].replace(".", "")
                        ncbi_taxon_name = row[11]
                        cur = db.get_psycopg2_cursor()
                        cur.execute(
                            "update specialized_assembly set ncbi_id = %s where genome_name = %s",
                            (ncbi_id, taxon_name))
                        db.psycopg2_connection.commit()
                        # Build a dot-separated lineage string and normalize
                        # characters that would break the hierarchy format.
                        hierachy = ".".join((tax_group, tax_kingdom, tax_phylum,
                                             tax_class, tax_order, tax_family,
                                             tax_genus, tax_species))
                        hierachy = hierachy.replace(" ", "_")
                        hierachy = hierachy.replace("-", "_")
                        hierachy = hierachy.replace("(", "_")
                        hierachy = hierachy.replace(")", "_")
                        hierachy = hierachy.replace("[", "")
                        hierachy = hierachy.replace("]", "")
                        hierachy = hierachy.replace("'", "")
                        hierachy = hierachy.replace("..", ".")
                        hierachy = hierachy.replace(":", ".")
                        if hierachy.endswith("."):
                            hierachy = hierachy[:-1]
                        ncbi_url = "http://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?id=" + ncbi_id
                        print("URL: ", ncbi_url)
                        cur.execute(
                            "select nt.ncbi_id from ncbi_taxonomy nt where nt.ncbi_id = %s;",
                            (ncbi_id,))
                        taxon_result = cur.fetchone()
                        if taxon_result is None:
                            cur.execute(
                                "insert into ncbi_taxonomy(ncbi_id, tax_group, tax_kingdom, tax_phylum, tax_class, "
                                "tax_order, tax_family, tax_genus, tax_species, hierachy, ncbi_url)"
                                " values(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s);",
                                (ncbi_id, tax_group, tax_kingdom, tax_phylum,
                                 tax_class, tax_order, tax_family, tax_genus,
                                 tax_species, hierachy, ncbi_url))
                            db.psycopg2_connection.commit()
                        else:
                            cur.execute(
                                "update ncbi_taxonomy set ncbi_id = %s, tax_group = %s, tax_kingdom = %s, tax_phylum = %s, tax_class = %s, "
                                "tax_order = %s, tax_family = %s, tax_genus = %s, tax_species = %s, hierachy = %s, ncbi_url = %s where ncbi_id = %s",
                                (ncbi_id, tax_group, tax_kingdom, tax_phylum,
                                 tax_class, tax_order, tax_family, tax_genus,
                                 tax_species, hierachy, ncbi_url, ncbi_id))
                            db.psycopg2_connection.commit()
                    count = count + 1
            except csv.Error as e:
                logger.error('file %s, line %d: %s' % (filename, reader.line_num, e))
def count_peptide_union_sa_ids(sa_ids=[], logger=None):
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from sadp_count_peptide_union(%s)", (sa_ids,))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
def run(self):
    # Read in sequences to query.
    sequences = []
    max_dist = self.args.max_distance
    if self.args.sequence_file:
        with open(self.args.sequence_file, 'r') as f:
            sequences = [line.strip() for line in f.readlines()]
    elif self.args.sequence:
        sequences = [self.args.sequence]
    # Read in what to search: genomes ('g'), specialized assemblies ('sa'),
    # metagenomes ('m'), or everything ('all'). Genomes is the default.
    search_type = 'g'
    if self.args.type:
        search_type = self.args.type
    if not sequences:
        argparser.error(
            "Provide a query sequence via the '--sequence' option, "
            "or a set of sequences via the --sequence-file option")
    # Print headers.
    headers = ['query', 'taxon', 'lev_distance', 'match']
    print(','.join(headers))
    # Execute query for each sequence and print results.
    cur = db.get_psycopg2_cursor()
    for seq in sequences:
        if search_type == 'g' or search_type == 'all':
            hierachy = []
            results = []
            taxon_lca = ''
            print('GENOMIC RESULTS')
            print('LCA,search sequence,id,name')
            if max_dist == 0:
                cur.execute(
                    "select id, genome_name, hierachy from genomic_query_taxon_by_peptide_sequence_new(%s)",
                    (seq,))
            else:
                # Change this query when we add fuzzy matching.
                cur.execute(
                    "select id, genome_name, hierachy from genomic_query_taxon_by_peptide_sequence_new(%s)",
                    (seq,))
            for row in cur.fetchall():
                hierachy.append(row[2])
                results.append(','.join([seq, str(row[0]), row[1]]))
            if hierachy:
                cur.execute("select * from genomic_lca(%s);", [hierachy])
                lca = cur.fetchone()
                if lca is not None:
                    taxonHierachy = lca[0].split(".")
                    # Iterate through the taxon hierarchy backwards until we
                    # find the first record that is not "unclassified".
                    for l in taxonHierachy[::-1]:
                        if l.lower() != "unclassified":
                            taxon_lca = l
                            break
                else:
                    taxon_lca = "Unknown"
            for r in results:
                print(','.join([taxon_lca, r]))
        if search_type == 'sa' or search_type == 'all':
            hierachy = []
            results = []
            print('\n')
            print('SPECIALIZED ASSEMBLY RESULTS')
            print('LCA,search sequence,genome name,sequence id,NCBI ID')
            cur.execute(
                "select specialized_assembly_name, specialized_assembly_sequence, ncbi_id from specialized_assembly_taxon_query_by_peptide_sequence(%s)",
                (seq,))
            for row in cur.fetchall():
                ncbi_id = row[2]
                hierachy.append(ncbi_id)
                results.append(','.join([str(s) for s in [seq] + list(row)]))
            if hierachy:
                assembly_lca = ""
                # Get the least common ancestor for the hierarchy list.
                cur.execute("select * from specialized_assembly_lca(%s);", [hierachy])
                lca = cur.fetchone()
                if lca is not None:
                    taxonHierachy = lca[0].split(".")
                    # Iterate through the taxon hierarchy backwards until we
                    # find the first record that is not "unclassified".
                    for l in taxonHierachy[::-1]:
                        if l.lower() != "unclassified":
                            assembly_lca = l
                            break
                else:
                    assembly_lca = "Unknown"
                for r in results:
                    print(','.join([assembly_lca, r]))
        if search_type == 'm' or search_type == 'all':
            hierachy = []
            results = []
            print('\n')
            print('METAGENOMIC RESULTS')
            print('LCA,search sequence,metagenome name,NCBI ID')
            cur.execute(
                "select metagenome_name, contig_tax_id, orf_tax_id from metagenomic_query_by_peptide_sequence(%s)",
                (seq,))
            for row in cur.fetchall():
                name = row[0]
                contig_tax_id = row[1]
                orf_tax_id = row[2]
                # Prefer the contig-level taxon id; fall back to the ORF-level id.
                preferred_tax_id = contig_tax_id or orf_tax_id
                if preferred_tax_id is None:
                    continue
                hierachy.append(preferred_tax_id)
                results.append(','.join([seq, name, str(preferred_tax_id)]))
            # Get the least common ancestor for the hierarchy list.
            if hierachy:
                cur.execute("select * from metagenomic_lca2(%s);", [hierachy])
                lca = cur.fetchone()
                metagenome_lca = ""
                if lca is not None:
                    taxonHierachy = lca[0].split(".")
                    # Iterate through the taxon hierarchy backwards until we
                    # find the first record that is not "unclassified".
                    for l in taxonHierachy[::-1]:
                        if l.lower() != "unclassified":
                            metagenome_lca = l
                            break
                else:
                    metagenome_lca = "Unknown"
                for r in results:
                    print(','.join([metagenome_lca, r]))
    db.psycopg2_connection.commit()
def process_metagenome_sequence_batch(self, batch, metagenome, logger=None):
    """ Process a batch of metagenome sequences with the given digest. """
    if not batch:
        return
    if not logger:
        logger = self.logger
    # Initialize collection of undigested sequences.
    undigested_sequences = {}
    metagenome_sequences = []
    metagenome_ids = []
    metadata_list = []
    metagenome_accession_ids = {}
    # Create sequences which do not exist in the db and add them to the
    # undigested collection.
    start_time = time.time()
    num_new_sequences = 0
    for metadata, sequence in batch:
        num_new_sequences += 1
        # Add sequence, metagenome id, and metadata to their respective lists
        # to be passed to the postgres stored procedure.
        metagenome_sequences.append(sequence)
        metadata_list.append(metadata)
        metagenome_ids.append(metagenome.id)
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from metagenome_sequence_insert(%s, %s, %s);",
                (metagenome_sequences, metagenome_ids, metadata_list))
    # Iterate through the sequence records returned from the insert and build
    # a Metagenome_Sequence object for each.
    for record in cur:
        try:
            meta_seq = Metagenome_Sequence(
                id=record[0], sequence=record[1],
                metagenome_id=record[2], sequence_id=record[3])
        except Exception:
            logger.exception("Error processing metagenome sequence, skipping")
            continue
        undigested_sequences[record[0]] = meta_seq
        metagenome_accession_ids[meta_seq.sequence_id] = meta_seq.id
    db.psycopg2_connection.commit()
    total_time = time.time() - start_time
    logger.info("time elapsed: %s" % (total_time))
    self.stats['Protein'] += num_new_sequences
    # Digest the undigested sequences.
    if undigested_sequences:
        undigested_batch = {}
        peptide_counter = 0
        for metagenome_sequence in list(undigested_sequences.values()):
            peptide_sequences = cleave(
                metagenome_sequence.sequence,
                self.digest.protease.cleavage_rule,
                self.logger,
                self.digest.max_missed_cleavages,
                min_acids=self.digest.min_acids,
                max_acids=self.digest.max_acids,
            )
            peptide_counter += len(peptide_sequences)
            undigested_batch[metagenome_sequence.id] = {
                'peptide_sequences': peptide_sequences,
                'metagenome_sequence': metagenome_sequence,
                'digest': self.digest,
            }
        self.process_peptide_batch(undigested_batch, logger)
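# Hedged sketch of the cleave() call used above, in isolation: given a
# sequence and a digest, produce the peptide list. The argument order is
# taken from the call in process_metagenome_sequence_batch; the protein
# sequence itself is a hypothetical placeholder.
def example_cleave(digest, logger):
    sequence = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQ"  # hypothetical protein
    return cleave(
        sequence,
        digest.protease.cleavage_rule,
        logger,
        digest.max_missed_cleavages,
        min_acids=digest.min_acids,
        max_acids=digest.max_acids,
    )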
def count_common_peptides_ids(taxon_digest_ids=[], logger=None):
    cur = db.get_psycopg2_cursor()
    cur.execute("select * from taxon_count_common_peptides(%s, %s)",
                (taxon_digest_ids, len(taxon_digest_ids)))
    count = len(cur.fetchall())
    db.psycopg2_connection.commit()
    return count
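# Usage sketch: count the peptides shared by every taxon digest in a set.
# The ids below are hypothetical taxon_digest primary keys.
def example_common_peptides(logger=None):
    taxon_digest_ids = [10, 11, 12]  # hypothetical taxon_digest ids
    return count_common_peptides_ids(taxon_digest_ids=taxon_digest_ids, logger=logger)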
def process_fasta_file(self, path):
    base_msg = "Processing file '%s'..." % path
    file_logger = self.get_child_logger(id(path), base_msg, self.logger)
    # Get metagenome name from filename. This may not be the best way to do
    # this; it might be better to let the user supply a name.
    metagenome_name = os.path.splitext(os.path.basename(path))[0]
    cur = db.get_psycopg2_cursor()
    cur.execute("select m.id from metagenome m where m.name = %s;", (metagenome_name,))
    metagenome_result = cur.fetchone()
    db.psycopg2_connection.commit()
    if metagenome_result is None:
        # Add a metagenome to the DB.
        cur.execute("select * from metagenome_insert(%s);", (metagenome_name,))
        metagenome_result = cur.fetchone()
        self.stats['Metagenome'] += 1
        file_logger.info("Created metagenome '%s'" % metagenome_name)
    metagenome = Metagenome(id=metagenome_result[0], name=metagenome_name)
    # Check if the metagenome has already been digested with the given
    # digestion agent.
    cur.execute(
        "select md.digest_id from metagenome_sequence_digest_peptide md "
        "where md.metagenome_sequence_id in "
        "(select ms.id from metagenome_sequence ms where ms.metagenome_id = %s) "
        "and md.digest_id = %s;",
        (metagenome.id, self.digest.id))
    db.psycopg2_connection.commit()
    metagenome_digest_result = cur.fetchone()
    if metagenome_digest_result:
        # If the digest has been run on this metagenome, don't do anything.
        file_logger.info(
            ("Metagenome '%s' has already been digested with"
             " digest '%s', skipping.") % (metagenome_name, self.digest))
        return
    # Process metagenome sequences in batches.
    file_logger.info("Counting # of metagenome sequences...")
    num_proteins = 0
    for metadata, sequence in fasta.read(path):
        num_proteins += 1
    file_logger.info("%s total metagenome sequences." % num_proteins)
    batch_size = 999
    batch_counter = 0
    batch = []
    protein_logger = self.get_child_logger(
        "%s_proteins" % id(file_logger),
        "Processing metagenome sequences...", file_logger)
    protein_logger.info("")
    for metadata, sequence in fasta.read(path):
        # Check the sequence against the expected amino acids; if this regex
        # matches, the sequence contains a non-amino-acid character and is
        # not a valid sequence.
        if VALID_AAS.search(sequence):
            file_logger.info("Tried to ingest invalid protein sequence %s" % sequence)
        else:
            batch.append((metadata, sequence))
            batch_counter += 1
            if (batch_counter % batch_size) == 0:
                self.process_metagenome_sequence_batch(batch, metagenome, logger=protein_logger)
                protein_logger.info(
                    ("%s of %s (%.1f%%)") %
                    (batch_counter, num_proteins, 100.0 * batch_counter / num_proteins))
                batch = []
    # Process the final partial batch.
    self.process_metagenome_sequence_batch(batch, metagenome, logger=protein_logger)
    protein_logger.info("Total Peptide Time: %s" % self.total_peptide_time)