def add_kingdoms(cazy_taxa_dict, connection): """Add new Kingdoms objects to database. Check existing kingdom objects in the db against kingdoms retrieved from the CAZy txt file, so as to only add new kingdoms. :param cazy_taxa_dict: dict of kingdoms and organisms from the cazy_data dict {kingdom: {organism}} :param connection: open sqlalchemy connection to a local SQLite db engine Return nothing """ kingdom_table_dict = get_table_dicts.get_kingdom_table_dict(connection) # dict {kingdom: {organisms}} # retrieve the Kingdoms retrieved from the CAZy txt file cazy_kingdoms = list(cazy_taxa_dict.keys()) existing_kingdom_records = list(kingdom_table_dict.keys()) # create list of tuples for db insert kingdoms_db_insert_values = [(kngdm, ) for kngdm in cazy_kingdoms if kngdm not in existing_kingdom_records] if len(kingdoms_db_insert_values) != 0: insert_data(connection, 'Kingdoms', ['kingdom'], kingdoms_db_insert_values) return
def add_cazy_families(cazy_data, connection): """Add CAZy families and subfamilies to local CAZyme database :param cazy_data: dict of data extracted from the txt file {gbk_accession: {kingdom:str, organism:str, families{fam:subfam}}} :param connection: open sqlalchemy connection to an SQLite db engine Return nothing""" logger = logging.getLogger(__name__) # get list of CAZy (sub)families already present in the db fam_table_dict = get_table_dicts.get_fams_table_dict( connection) # {family subfamily: db_family_id} existing_fam_records = list(fam_table_dict.keys()) families_db_insert_values = set() # new fam records to add to db for genbank_accession in tqdm(cazy_data, desc='Extracting CAZy fams from CAZy data'): for cazy_fam in cazy_data[genbank_accession]["families"]: subfamilies = cazy_data[genbank_accession]["families"][cazy_fam] for subfam in subfamilies: if subfam is None: cazy_subfam = "_" else: cazy_subfam = subfam fam_key = f"{cazy_fam} {cazy_subfam}" if fam_key not in existing_fam_records: families_db_insert_values.add( (cazy_fam, subfam)) # add None for subfam if is None if len(families_db_insert_values) != 0: logger.info(f"Inserting {len(families_db_insert_values)} " "new family records into the CazyFamilies table") insert_data(connection, 'CazyFamilies', ['family', 'subfamily'], list(families_db_insert_values)) else: logger.info( "Found no new CAZy family records to add the CazyFamilies table") return
def add_source_organisms(taxa_dict, connection): """Add taxonomy (source organism) data to the local CAZyme database :param taxa_dict: dict of taxa data {kingdom: set(organism)} to be added to the db :param connection: open sqlalchemy connection to SQLite engine Return nothing """ logger = logging.getLogger(__name__) # retrieve db kingdom objects for retrieving the kingdom_id for the Taxs table kingdom_table_dict = get_table_dicts.get_kingdom_table_dict(connection) # {kingdom: kingdom_id} tax_table_dict = get_table_dicts.get_taxs_table_dict(connection) # {genus species: {'tax_id': int(db_tax_id), 'kingdom_id': int(kingdom_id)} # compare taxa already in the db against taxa retrieved from the CAZy txt file # to identify new taxa objects to be added to the db taxonomy_db_insert_values = set() records_to_update = set() # used incase kingdom has changed for a species for kingdom in tqdm( taxa_dict, total=len(list(taxa_dict.keys())), desc='Create tax objects per Kingdom', ): kingdom_id = kingdom_table_dict[kingdom] organisms = taxa_dict[kingdom] for organism in organisms: # organisms from the CAZy txt file try: existing_record_data = tax_table_dict[organism] # check kingdom is correct existing_record_kngdm_id = tax_table_dict[organism][ 'kingdom_id'] if existing_record_kngdm_id != kingdom_id: records_to_update.add(( genus, species, kingdom_id, )) except KeyError: # organism not in the db, build new record genus = organism.split(" ")[0] species = ' '.join(organism.split(" ")[1:]) new_record = ( genus, species, kingdom_id, ) taxonomy_db_insert_values.add(new_record) if len(taxonomy_db_insert_values) != 0: logger.info( f"Adding {len(taxonomy_db_insert_values)} new tax records to the db" ) insert_data( connection, 'Taxs', ['genus', 'species', 'kingdom_id'], list(taxonomy_db_insert_values), ) else: logger.info("No new tax records to add to the db") if len(records_to_update) != 0: logger.info( f"Updating the parent Kingdom for {len(records_to_update)} tax records in the db" ) with connection.begin(): for record in records_to_update: connection.execute( text( "UPDATE Taxs " f"SET kingdom_id = {record[2]} " f"WHERE genus = '{record[0]}' AND species = '{record[1]}'" )) return
def add_genbank_fam_relationships(cazy_data, connection, args): """Add GenBank accession - CAZy family relationships to db :param cazy_data: dict of data extracted from the txt file {gbk_accession: {kingdom:str, organism:str, families{fam:subfam}}} :param connection: open sqlalchemy connection to an SQLite db engine :param args: cmd-line args parser Return nothing """ logger = logging.getLogger(__name__) gbk_fam_db_insert_values = set() # new records to add gbk_fam_records_to_del = set() # records/relationships to delete # get dict of GenBank and CazyFamilies tables, used for getting gbk_ids and fam_ids of accessions and # families without entries in the CazyFamilies_Genbanks table gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection) # {genbank_accession: 'taxa_id': int, 'gbk_id': int} fam_table_dict = get_table_dicts.get_fams_table_dict(connection) # {'fam subfam': fam_id} # load current relationships in the db gbk_fam_table_dict, existing_rel_tuples = get_table_dicts.get_gbk_fam_table_dict( connection) # {genbank_accession: {'families': {str(fam subfam): int(fam_id)}, 'gbk_id': int(gbk_db_id)} } for genbank_accession in tqdm( cazy_data, desc="Extracting Genbank-Family relationships from CAZy data"): gbk_id = gbk_table_dict[genbank_accession]['gbk_id'] cazy_fam_dict = cazy_data[genbank_accession]['families'] # cazy_data = { gbk_acc: {'families': {fam: {subfamilies}} } } # cazy_fam_dict = {fam : {subfamilies}} try: existing_relation_dict = gbk_fam_table_dict[genbank_accession][ 'families'] # existing_relation_dict = { str(fam subfam) : fam_id } for fam in cazy_fam_dict: subfamilies = cazy_fam_dict[ fam] # set of subfams for parent fam from CAZy data for subfam in subfamilies: # add each fam-subfam pair if subfam is None: family_key = f"{fam} _" else: family_key = f"{fam} {subfam}" try: existing_relation_dict[family_key] # already in db, don't add relationship again except KeyError: fam_id = fam_table_dict[family_key] new_row = ( gbk_id, fam_id, ) if new_row not in existing_rel_tuples: gbk_fam_db_insert_values.add(( gbk_id, fam_id, )) except KeyError: # GenBank not present in the Genbanks_CazyFamilies table, create new record for fam in cazy_fam_dict: subfamilies = cazy_fam_dict[ fam] # set of subfams for parent fam from CAZy data for subfam in subfamilies: # add each fam-subfam pair if subfam is None: family_key = f"{fam} _" else: family_key = f"{fam} {subfam}" fam_id = fam_table_dict[family_key] new_row = ( gbk_id, fam_id, ) if new_row not in existing_rel_tuples: gbk_fam_db_insert_values.add(( gbk_id, fam_id, )) if len(gbk_fam_db_insert_values) != 0: logger.info( f"Adding {len(gbk_fam_db_insert_values)} new GenBank accession - " "CAZy (sub)family relationships to the db") insert_data( connection, 'Genbanks_CazyFamilies', ['genbank_id', 'family_id'], list(gbk_fam_db_insert_values), ) else: logger.info( "No new Genbank accession-CAZy (sub)family relationships to add to the db" ) if (len(gbk_fam_records_to_del) != 0): logger.info( "Deleting {(len(gbk_fam_records_to_del)} GenBank accession - " "CAZy (sub)family relationships\n" "that are the db but are no longer in CAZy") with connection.begin(): for record in gbk_fam_records_to_del: # record = (genbank_id, fam_id,) stmt = ( delete(genbanks_families).\ where(genbanks_families.c.genbank_id == record[0]).\ where(genbanks_families.c.family_id == record[1]) ) connection.execute(stmt) return
def add_genbanks(cazy_data, connection): """Add GenBank accessions with tax data to the db :param cazy_data: dict of CAZy data {gbk_acc: {kingdom: {str}, organism: {str}, families: {(fam, subfam, )}}} :param connection: open sqlalchemy connection to an SQLite db Return Nothing """ logger = logging.getLogger(__name__) # retrieve existing records from the db gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection) existing_gbk_records = list(gbk_table_dict.keys()) taxa_table_dict = get_table_dicts.get_taxs_table_dict(connection) # {genus species: {'tax_id': db_tax_id, 'kingdom_id': kingdom_id} gbk_record_updates = set( ) # {gbk_accession: 'taxa_id': (new taxa_id) int, 'gbk_id': int} gbk_db_insert_values = set() for gbk_accession in tqdm(cazy_data, desc="Compiling Genbank records for insertion"): if gbk_accession not in existing_gbk_records: organism = list(cazy_data[gbk_accession]['organism'])[0] taxa_id = taxa_table_dict[organism]['tax_id'] gbk_db_insert_values.add(( gbk_accession, taxa_id, )) else: # check if need to update taxa_id # get the taxa_id for the existing record existing_record_id = gbk_table_dict[gbk_accession]['taxa_id'] # get the taxa_id for the organism listed in the CAZy txt file organism = list(cazy_data[gbk_accession]['organism'])[0] cazy_data_taxa_id = taxa_table_dict[organism]['tax_id'] if cazy_data_taxa_id != existing_record_id: # need to update the record gbk_record_updates.add( (cazy_data_taxa_id, gbk_table_dict[gbk_accession]['gbk_id'])) if len(gbk_db_insert_values) != 0: logger.info( f"Inserting {len(gbk_db_insert_values)} GenBank accessions into the db" ) insert_data(connection, 'Genbanks', ['genbank_accession', 'taxonomy_id'], list(gbk_db_insert_values)) if len(gbk_record_updates) != 0: logger.info( f"Updating {len(gbk_record_updates)} Genbank table records with new taxonomy IDs" ) with connection.begin(): for record in gbk_record_updates: connection.execute( text("UPDATE Taxs " f"SET taxonomy_id = {record[1]} " f"WHERE genbank_id = '{record[0]}'")) return
def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args): """Add UniProt data to the local CAZyme database :param uniprot_dict: dict containing data retrieved from UniProt :param gbk_dict: dict representing data from the Genbanks table :param connection: open sqlalchemy conenction to an SQLite db engine :param args: cmd-line args parser Return nothing. """ # load the current Uniprot records in the local CAZyme db # {acc: {name: str, gbk_id: int, seq: str, seq_date:str } } uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection) uniprot_insert_values = set() # the following variables containing objects in the local db that are to be updated update_record_gbk_id = set() # ((uniprot_accession, gbk_id),) update_record_name = set() # ((uniprot_accession, retrieved_name), ) update_record_seq = set() # ((uniprot_accession, retrieved_seq), ) for uniprot_acc in tqdm(uniprot_dict, desc="Separating new and existing records"): # check if the UniProt accession is already in the local CAZyme db try: uniprot_table_dict[uniprot_acc] # Uniprot accession is already in the local CAZyme db # check if the GenBank id is the same existing_gbk_id = uniprot_table_dict[uniprot_acc]['genbank_id'] retrieved_gbk_acc = uniprot_dict[uniprot_acc]["genbank_accession"][ 'gbk_acc'] retrieved_gbk_id = gbk_dict[retrieved_gbk_acc] if existing_gbk_id != retrieved_gbk_id: update_record_gbk_id.add((uniprot_acc, retrieved_gbk_id)) if args.name_update: # check if the name has changed existing_name = uniprot_table_dict[uniprot_acc]['name'] retrieved_name = uniprot_dict[uniprot_acc]['name'] if existing_name != retrieved_name: update_record_name.add((uniprot_acc, retrieved_name)) if args.sequence: # only add seq if sequence is not there if uniprot_table_dict[uniprot_acc]['seq'] is None: update_record_seq.add( (uniprot_acc, uniprot_dict[uniprot_acc]["sequence"])) if args.seq_update: # update seq if a newer version is available or no seq present if uniprot_table_dict[uniprot_acc]['seq'] is None: update_record_seq.add(( uniprot_acc, uniprot_dict[uniprot_acc]["sequence"], uniprot_dict[uniprot_acc]["seq_date"], )) else: existing_date = uniprot_table_dict[uniprot_acc]['seq_date'] retrieved_date = uniprot_dict[uniprot_acc]['seq_date'] if existing_date < retrieved_date: # existing date is older, update seq update_record_seq.add(( uniprot_acc, uniprot_dict[uniprot_acc]["sequence"], uniprot_dict[uniprot_acc]["seq_date"], )) except KeyError: # new record to add to local CAZyme db if args.sequence or args.seq_update: genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"][ 'gbk_acc'] gbk_id = gbk_dict[genbank_acc] uniprot_name = uniprot_dict[uniprot_acc]["name"] seq = uniprot_dict[uniprot_acc]["sequence"] date = uniprot_dict[uniprot_acc]["seq_date"] uniprot_insert_values.add( (gbk_id, uniprot_acc, uniprot_name, seq, date)) else: # not retrieving protein sequences genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"][ 'gbk_acc'] gbk_id = gbk_dict[genbank_acc] uniprot_name = uniprot_dict[uniprot_acc]["name"] uniprot_insert_values.add((gbk_id, uniprot_acc, uniprot_name)) if len(uniprot_insert_values) != 0: if args.sequence or args.seq_update: columns = [ 'genbank_id', 'uniprot_accession', 'uniprot_name', 'sequence', 'seq_update_date' ] else: columns = ['genbank_id', 'uniprot_accession', 'uniprot_name'] insert_data(connection, "Uniprots", columns, list(uniprot_insert_values)) if len(update_record_gbk_id) != 0: with connection.begin(): for record in tqdm(update_record_gbk_id, desc="Updating UniProt-Gbk relationships"): connection.execute( text("UPDATE Uniprots " f"SET genbank_id = {record[1]} " f"WHERE uniprot_accession = '{record[0]}'")) if len(update_record_name) != 0: with connection.begin(): for record in tqdm(update_record_name, desc="Updating UniProt protein names"): connection.execute( text("UPDATE Uniprots " f"SET uniprot_name = {record[1]} " f"WHERE uniprot_accession = '{record[0]}'")) if len(update_record_seq) != 0: with connection.begin(): for record in tqdm(update_record_seq, desc="Updating UniProt protein seqs"): connection.execute( text( "UPDATE Uniprots " f"SET sequence = {record[1]}, seq_update_date = {record[2]} " f"WHERE uniprot_accession = '{record[0]}'")) return
def add_pdb_accessions(uniprot_dict, gbk_dict, connection, args): """Add PDB accessions to the local CAZyme database :param uniprot_dict: dict containing data retrieved from UniProt :param gbk_dict: dict representing data from the Genbanks table :param connection: open sqlalchemy conenction to an SQLite db engine :param args: cmd-line args parser Return nothing. """ logger = logging.getLogger(__name__) pdbs_to_delete = set() # only used if args.delete_old_pdbs is true relationships_to_delete = set( ) # only used if args.delete_old_pdbs is true # load in PDB objects in the local CAZyme db pdb_table_dict = get_table_dicts.get_pdb_table_dict(connection) # {pdb_accession: pdb_db_id} # First, identify new PDB accessions to add to the database pdb_insert_values = set() for uniprot_acc in tqdm(uniprot_dict, desc="Identifying new PDBs to add to db"): for pdb in uniprot_dict[uniprot_acc]["pdb"]: try: pdb_table_dict[pdb] except KeyError: pdb_insert_values.add((pdb, )) if len(pdb_insert_values) != 0: logger.warning( f"Adding {len(pdb_insert_values)} PDB accessions to the database") insert_data(connection, "Pdbs", ["pdb_accession"], list(pdb_insert_values)) # reload the updated pdb table pdb_table_dict = get_table_dicts.get_pdb_table_dict(connection) # load in Genbanks_Pdbs relationship table gbk_pdb_rel_table_dict = get_table_dicts.get_gbk_pdb_table_dict(connection) # {gbk_db_id: {pdb_db_id} } # convert the data from UniProt into a dict of {gbk_db_id: pdb_db_id} # to identify pdb-protein relationships retrieved from UniProt gbk_pdb_insert_values = set() for uniprot_acc in tqdm( uniprot_dict, desc="Identifying new protein-PDB relationships to add to db"): genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] gbk_db_id = gbk_dict[genbank_acc] # set of pdb_accessions retrieved from UniProt retrieved_pdbs = uniprot_dict[uniprot_acc]["pdb"] if len(retrieved_pdbs) == 0: continue # set of pdb_db_ids the protein (gbk_db_id) is already related to in the db try: existing_pdb_relationships = gbk_pdb_rel_table_dict[gbk_db_id] except KeyError: existing_pdb_relationships = set() for pdb_acc in retrieved_pdbs: pdb_db_id = pdb_table_dict[pdb_acc] try: if pdb_db_id not in existing_pdb_relationships: # genbank and pdb records not yet stored together in the relationship table gbk_pdb_insert_values.add((gbk_db_id, pdb_db_id)) except KeyError: # genbank not listed in the Genbanks_Pdbs relationship table gbk_pdb_insert_values.add((gbk_db_id, pdb_db_id)) if args.delete_old_pdbs: # convert gbk_pdb_rel_table_dict to be keyed by pdb_db_ids and valued by set of gbk_db_ids pdb_gbk_relationships = {} for existing_gbk_id in gbk_pdb_rel_table_dict: existing_pdbs = gbk_pdb_rel_table_dict[existing_gbk_id] for pdb in existing_pdbs: try: pdb_gbk_relationships[pdb].add(existing_gbk_id) except KeyError: pdb_gbk_relationships[pdb] = {existing_gbk_id} # for each pdb_db_id related to the current protein in the db # get the corresponding pdb_accession all_existing_pdb_ids = list(set(pdb_table_dict.values())) all_existing_pdb_accs = list(pdb_table_dict.keys()) for pdb_db_id in existing_pdb_relationships: position = all_existing_pdb_ids.index(pdb_db_id) pdb_acc = all_existing_pdb_accs[position] if pdb_acc not in retrieved_pdbs: # delete genbank-pdb relationship relationships_to_delete.add((gbk_db_id, pdb_db_id)) # check if can delete pdb accession because it is not linked to any other proteins if len( pdb_gbk_relationships[pdb_db_id] ) != 1: # deleting one pdb accession will not leave not related to any genbanks pdbs_to_delete.add((pdb_acc)) if len(gbk_pdb_insert_values) != 0: insert_data(connection, "Genbanks_Pdbs", ["genbank_id", "pdb_id"], list(gbk_pdb_insert_values)) if args.delete_old_pdbs and len(pdbs_to_delete) != 0: with connection.begin(): for record in tqdm(pdbs_to_delete, desc="Deleteing old PDB accessions"): # record = (pdb_acc,) connection.execute( text("DELETE Pdbs " f"WHERE pdb_accession = '{record[0]}'")) if args.delete_old_pdbs and len(relationships_to_delete) != 0: with connection.begin(): for record in tqdm(relationships_to_delete, desc="Deleteing old Genbank-PDB relationships"): # record = (pdb_acc,) connection.execute( text( "DELETE Genbanks_Pdbs " f"WHERE genbank_id = '{record[0]}' AND pdb_id = '{record[1]}'" )) return
def add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args): """Add EC numbers to the local CAZyme database :param uniprot_dict: dict containing data retrieved from UniProt :param all_ecs: set of all EC numbers retrieved from UniProt :param gbk_dict: dict representing data from the Genbanks table :param connection: open sqlalchemy conenction to an SQLite db engine :param args: cmd-line args parser Return nothing. """ # load EC records in the local CAZyme db ec_table_dict = get_table_dicts.get_ec_table_dict(connection) # identify new EC numbers to add to the EC table existing_ecs = list(ec_table_dict.keys()) ec_insert_values = set() for ec in all_ecs: if ec[0] not in existing_ecs: ec_insert_values.add(ec) if len(ec_insert_values) != 0: insert_data(connection, "Ecs", ["ec_number"], list(ec_insert_values)) # load in the newly updated EC table from the local CAZyme db ec_table_dict = get_table_dicts.get_ec_table_dict(connection) # load in gbk_ec table, contains the gbk-ec number relationships ec_gbk_table_dict = get_table_dicts.get_ec_gbk_table_dict( connection) # {ec_id: {gbk ids}} if args.delete_old_ec: gbk_ec_table_dict = get_table_dicts.get_gbk_ec_table_dict( connection) # {gbk_id: {ec ids}} ecs_rel_to_delete = set() # set of tuples (gbk_id, ec_id) # compile list of tuples to insert into the Genbanks_Ecs table gbk_ec_insert_values = set() for uniprot_acc in tqdm(uniprot_dict, desc="Updating EC numbers"): genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc'] gbk_id = gbk_dict[genbank_acc] retrieved_ec_numbers = uniprot_dict[uniprot_acc][ "ec"] # EC#s retrieved from UniProt for ec in retrieved_ec_numbers: ec_id = ec_table_dict[ec] try: existing_gbk_relationships = ec_gbk_table_dict[ec_id] # check if protein-ec# relationship is already in the db if gbk_id not in existing_gbk_relationships: gbk_ec_insert_values.add((gbk_id, ec_id)) except KeyError: # when adding relationship for the first time gbk_ec_insert_values.add((gbk_id, ec_id)) if args.delete_old_ec: existing_ec_relationships = gbk_ec_table_dict[gbk_id] for ec in retrieved_ec_numbers: ec_id = ec_table_dict[ec] if ec_id not in existing_ec_relationships: ecs_rel_to_delete.add((gbk_id, ec_id)) if len(gbk_ec_insert_values) != 0: insert_data(connection, "Genbanks_Ecs", ["genbank_id", "ec_id"], list(gbk_ec_insert_values)) if args.delete_old_ec and len(ecs_rel_to_delete) != 0: with connection.begin(): for record in tqdm(ecs_rel_to_delete, desc="Deleteing old GenBank-EC relationships"): # record = (genbank_id, ec_id,) stmt = ( delete(genbanks_ecs).\ where(genbanks_ecs.c.genbank_id == record[0]).\ where(genbanks_ecs.c.ec_id == record[1]) ) connection.execute(stmt) return