def get_ec_gbk_table_dict(connection):
    """Group the Genbank db IDs in the local CAZyme db by EC number db ID.

    The Genbank and Ec tables are joined through the Genbank.ecs
    relationship, so only Genbank records with at least one EC number
    annotation contribute to the result.

    :param connection: open sqlalchemy connection to an SQLite db

    Return dict {ec_id: {gbk ids}}
    """
    with Session(bind=connection) as session:
        gbk_ec_pairs = session.query(Genbank, Ec).join(Ec, Genbank.ecs).all()

    ec_gbk_table_dict = {}

    for genbank, ec in gbk_ec_pairs:
        # create the set on first sight of the EC id, then add to it
        ec_gbk_table_dict.setdefault(ec.ec_id, set()).add(genbank.genbank_id)

    return ec_gbk_table_dict
def get_gbk_fam_table_dict(connection):
    """Build a dict representing the records present in the Genbanks_CazyFamilies table.

    If a GenBank accession is in the db but has no CazyFamily instances
    related to it, the GenBank accession is not returned by the query and
    therefore does not appear in either return value.

    Families without a subfamily are keyed with '_' in place of the subfamily.

    :param connection: open sqlalchemy connection to an SQLite3 db engine

    Return
    - dict: {gbk_acc: {'families': {'fam subfam': fam_id}, 'gbk_id': gbk_id}}
    - set of tuples: (gbk_id, fam_id), each representing one row in the table
    """
    with Session(bind=connection) as session:
        all_gbk_fam_records = session.query(Genbank, CazyFamily).\
            join(CazyFamily, Genbank.families).\
            all()

    existing_rel_tuples = set()  # set of tuples (gbk_id, fam_id)
    gbk_fam_table_dict = {}  # {gbk_acc: {'families': {'fam subfam': fam_id}, 'gbk_id': gbk_id}}

    for record in tqdm(all_gbk_fam_records, ' Retreving existing gbk-fam relationships from db'):
        genbank, cazy_family = record[0], record[1]

        gbk_accession = genbank.genbank_accession
        gbk_id = genbank.genbank_id
        fam_id = cazy_family.family_id

        subfamily = '_' if cazy_family.subfamily is None else cazy_family.subfamily
        fam_key = f'{cazy_family.family} {subfamily}'

        existing_rel_tuples.add((gbk_id, fam_id))

        gbk_entry = gbk_fam_table_dict.setdefault(
            gbk_accession,
            {'families': {}, 'gbk_id': gbk_id},
        )
        # Store every family under 'families' (previously the second and later
        # families of an accession were written next to 'families'/'gbk_id').
        # The first fam_id seen for a given 'fam subfam' key is kept.
        gbk_entry['families'].setdefault(fam_key, fam_id)

    return gbk_fam_table_dict, existing_rel_tuples
def get_ec_table_dict(connection):
    """Create a dict of the Ec records present in the db.

    :param connection: open sqlalchemy db engine connection

    Return dict {ec_number: ec_id}
    """
    with Session(bind=connection) as session:
        ec_records = session.query(Ec).all()

    # {ec_number: ec_id}
    return {
        ec.ec_number: ec.ec_id
        for ec in tqdm(ec_records, desc="Retrieving existing EC# records")
    }
def get_pdb_table_dict(connection):
    """Create a dict of the Pdb records present in the db.

    :param connection: open sqlalchemy db engine connection

    Return dict {pdb_accession: pdb_db_id}
    """
    with Session(bind=connection) as session:
        pdb_records = session.query(Pdb).all()

    # {pdb_accession: pdb_db_id}
    return {
        pdb.pdb_accession: pdb.pdb_id
        for pdb in tqdm(pdb_records, desc="Loading existing PDB db records")
    }
def get_kingdom_table_dict(connection):
    """Load the Kingdom records from the db and compile a dict {kingdom: id}.

    :param connection: open sqlalchemy db engine connection

    Return dict {kingdom: kingdom_db_id}
    """
    with Session(bind=connection) as session:
        kingdom_records = session.query(Kingdom).all()

    # {kingdom: kingdom_db_id}
    return {entry.kingdom: entry.kingdom_id for entry in kingdom_records}
def get_gbk_seq(gbk_dict, query_data, connection):
    """Retrieve GenBank protein sequences for the provided Gbks.

    The sequence and its update date are stored in query_data under the keys
    'gbk_sequence' and 'gbk_sequence_date'. query_data is modified in place
    and also returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    gbk_accessions = list(gbk_dict.keys())

    # retrieve the Genbank records of the selected accessions
    with Session(bind=connection) as session:
        gbk_query = session.query(Genbank).\
            filter(Genbank.genbank_accession.in_(gbk_accessions)).\
            all()

    if len(gbk_query) == 0:
        logger.warning("No GenBank records retrieved for any of the selected GenBank accessions.")
        return query_data

    for record in tqdm(gbk_query, desc="Getting GenBank protein sequences"):
        # A single-entity query yields Genbank objects, not row tuples, so the
        # attributes are read straight off the record (record[0] would fail).
        gbk_acc = record.genbank_accession

        annotations = query_data.setdefault(gbk_acc, {})

        # Only warn when a sequence was already stored for this accession;
        # an entry holding other annotations is not a duplicate sequence.
        if 'gbk_sequence' in annotations:
            logger.warning(
                f"Multiple GBK records found for GBK acc {gbk_acc}\n"
                "Retreiving only one gbk sequence."
            )

        annotations['gbk_sequence'] = record.sequence
        annotations['gbk_sequence_date'] = record.seq_update_date

    return query_data
def get_gbk_kingdom_dict(connection):
    """Compile a nested dict of GenBank accessions grouped by kingdom, genus and species.

    :param connection: open sqlalchemy db connection

    Return dict {kingdom: {genus: {species: {protein_accessions}}}}
    """
    with Session(bind=connection) as session:
        query_results = session.query(Genbank, Taxonomy, Kingdom).\
            join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
            join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
            all()

    genbank_kingdom_dict = {}  # kingdom: {genus: {species: {protein_accessions}}}

    for genbank, taxonomy, kingdom in tqdm(
        query_results,
        desc="Retreving GenBank accessions and taxonomy",
    ):
        # walk down the nesting, creating each missing level on the way
        genera = genbank_kingdom_dict.setdefault(kingdom.kingdom, {})
        species_accessions = genera.setdefault(taxonomy.genus, {})
        accessions = species_accessions.setdefault(taxonomy.species, set())
        accessions.add(genbank.genbank_accession)

    return genbank_kingdom_dict
def get_gbk_table_seq_dict(connection):
    """Compile a dict of the sequence data of every Genbank record in the db.

    :param connection: open connection to an SQLite3 database

    Return dict {genbank_accession: {'sequence': str, 'seq_date': str}}
    """
    with Session(bind=connection) as session:
        genbank_records = session.query(Genbank).all()

    # {genbank_accession: {'sequence': str, 'seq_date': str}}
    return {
        f"{record.genbank_accession}": {
            'sequence': record.sequence,
            'seq_date': record.seq_update_date,
        }
        for record in genbank_records
    }
def get_gbk_table_dict(connection):
    """Compile a dict of the db IDs stored for every Genbank record in the db.

    :param connection: open connection to an SQLite3 database

    Return dict {genbank_accession: {'taxa_id': taxonomy db id, 'gbk_id': genbank db id}}
    """
    with Session(bind=connection) as session:
        genbank_records = session.query(Genbank).all()

    # {genbank_accession: {'taxa_id': taxonomy db id, 'gbk_id': genbank db id}}
    return {
        f"{record.genbank_accession}": {
            'taxa_id': record.taxonomy_id,
            'gbk_id': record.genbank_id,
        }
        for record in genbank_records
    }
def apply_ec_filters(
    current_gbk_objs,
    ec_filters,
    connection,
):
    """Keep only the Genbank records annotated with at least one of the given EC numbers.

    Logs an error and terminates the program (exit code 1) when no protein
    in the db matches any of the EC numbers.

    :param current_gbk_objs: list of db Genbank objs retrieved from the db
    :param ec_filters: set of EC numbers to limit the retrieval of data to
    :param connection: open sqlaclchemy connection for an SQLite db

    Return set of db Genbank objects.
    """
    logger = logging.getLogger(__name__)

    # rows of (genbank_id, ) for every protein annotated with a selected EC number
    ec_gbk_ids = set()

    for ec in tqdm(ec_filters, desc="Retrieving gbks for EC# filters"):
        with Session(bind=connection) as session:
            matching_rows = session.query(Genbank.genbank_id).\
                join(Ec, Genbank.ecs).\
                filter(Ec.ec_number == ec).\
                all()

        ec_gbk_ids.update(matching_rows)

    if not ec_gbk_ids:
        logger.error(
            "Retrieved NO proteins matching the provided EC numbers\n"
            "Check the local CAZyme db contains the EC numbers provided\n"
            "Terminating program"
        )
        sys.exit(1)

    # the query returns one-element rows, so compare against a one-element tuple
    return {
        gbk_record
        for gbk_record in tqdm(current_gbk_objs, desc="Checking gbk records against EC filters")
        if (gbk_record.genbank_id, ) in ec_gbk_ids
    }
def get_ids(genbank_accessions, connection):
    """Get the local CAZyme database IDs for the provided GenBank accessions.

    Accessions with no matching Genbank record in the db are skipped and a
    warning is logged for each, so they are absent from the returned dict.

    :param genbank_accessions: set of GenBank accessions
    :param connection: open sqlalchemy engine connection

    Return dict, keyed by GenBank accession and valued by database record ID.
    """
    logger = logging.getLogger(__name__)

    gbk_dict = {}

    # one session serves every lookup instead of opening one per accession
    with Session(bind=connection) as session:
        for accession in tqdm(genbank_accessions, desc="Getting local db record IDs"):
            gbk_record = session.query(Genbank).\
                filter(Genbank.genbank_accession == accession).\
                first()

            # .first() returns None when no row matches the accession
            if gbk_record is None:
                logger.warning(
                    f"No record found in the local db for GenBank accession {accession}"
                )
                continue

            gbk_dict[accession] = gbk_record.genbank_id

    return gbk_dict
def get_ec_annotations(gbk_dict, query_data, connection):
    """Retrieve EC number annotations for the provided Gbks.

    EC numbers are collected per accession in a set stored under the key
    'ec_numbers'. query_data is modified in place and also returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the Genbank records and the Ec records related to them
    with Session(bind=connection) as session:
        ec_query = session.query(Genbank, Ec).\
            join(Ec, Genbank.ecs).\
            filter(Genbank.genbank_accession.in_(selected_accessions)).\
            all()

    if not ec_query:
        logger.warning("No EC annotations retrieved for any of the selected GenBank accessions.")
        return query_data

    for genbank, ec in tqdm(ec_query, desc="Getting EC number annotations"):
        annotations = query_data.setdefault(genbank.genbank_accession, {})
        annotations.setdefault('ec_numbers', set()).add(ec.ec_number)

    return query_data
def get_pdb_accessions(gbk_dict, query_data, connection):
    """Retrieve PDB accessions for the provided Gbks.

    PDB accessions are collected per GenBank accession in a set stored under
    the key 'pdb_accessions'. query_data is modified in place and also returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the Genbank records and the Pdb records related to them
    with Session(bind=connection) as session:
        pdb_query = session.query(Genbank, Pdb).\
            join(Pdb, Genbank.pdbs).\
            filter(Genbank.genbank_accession.in_(selected_accessions)).\
            all()

    if not pdb_query:
        logger.warning("No PDB accessions retrieved for any of the selected GenBank accessions.")
        return query_data

    for genbank, pdb in tqdm(pdb_query, desc="Getting PDB accessions"):
        annotations = query_data.setdefault(genbank.genbank_accession, {})
        annotations.setdefault('pdb_accessions', set()).add(pdb.pdb_accession)

    return query_data
def get_fams_table_dict(connection):
    """Create a dict of the CazyFamily records present in the db.

    Records without a subfamily use '_' in place of the subfamily in the key.

    :param connection: open sqlalchemy db engine connection

    Return dict {family subfamily: db_family_id}
    """
    with Session(bind=connection) as session:
        family_records = session.query(CazyFamily).all()

    db_fam_dict = {}

    for record in family_records:
        subfam = '_' if record.subfamily is None else record.subfamily
        db_fam_dict[f"{record.family} {subfam}"] = record.family_id

    return db_fam_dict
def get_uniprot_table_dict(connection):
    """Create a dict of the Uniprot records present in the db.

    :param connection: open sqlalchemy db engine connection

    Return dict {acc: {'name': ..., 'genbank_id': ..., 'seq': ..., 'seq_date': ...}}
    """
    with Session(bind=connection) as session:
        uniprot_records = session.query(Uniprot).all()

    # keyed by UniProt accession
    return {
        uniprot.uniprot_accession: {
            "name": uniprot.uniprot_name,
            "genbank_id": uniprot.genbank_id,
            "seq": uniprot.sequence,
            "seq_date": uniprot.seq_update_date,
        }
        for uniprot in tqdm(uniprot_records, desc="Retrieving existing UniProt records from db")
    }
def get_taxs_table_dict(connection):
    """Create a dict of the Taxonomy records present in the db.

    The key is 'genus species', or the genus alone when the species is an
    empty string.

    :param connection: open sqlalchemy db engine connection

    Return dict {genus species: {'tax_id': db_tax_id, 'kingdom_id': kingdom_id}}
    """
    with Session(bind=connection) as session:
        taxonomy_records = session.query(Taxonomy).all()

    db_tax_dict = {}

    for record in taxonomy_records:
        if len(record.species) != 0:
            organism = f"{record.genus} {record.species}"
        else:
            organism = f"{record.genus}"

        db_tax_dict[organism] = {
            'tax_id': record.taxonomy_id,
            'kingdom_id': record.kingdom_id,
        }

    return db_tax_dict
def get_gbk_pdb_table_dict(connection):
    """Create a dict of the Genbank-Pdb relationships present in the db.

    :param connection: open sqlalchemy db engine connection

    Return dict {gbk_db_id: {pdb_db_id} }
    """
    with Session(bind=connection) as session:
        gbk_pdb_pairs = session.query(Genbank, Pdb).join(Pdb, Genbank.pdbs).all()

    gbk_pdb_table_dict = {}  # {gbk_db_id: {pdb_db_id}}

    for genbank, pdb in tqdm(gbk_pdb_pairs, desc="Loading existing Genbank_Pdbs db records"):
        # create the set on first sight of the Genbank id, then add to it
        gbk_pdb_table_dict.setdefault(genbank.genbank_id, set()).add(pdb.pdb_id)

    return gbk_pdb_table_dict
def get_tax_annotations(gbk_dict, query_data, connection, args):
    """Retrieve kingdom, genus and/or scientific name of the source organism for the provided Gbks.

    Which values are stored is controlled by args.include: 'kingdom', 'genus'
    and 'organism' (the latter stored as 'genus species'). When a value is
    already stored for an accession a warning is logged and the value is
    overwritten. query_data is modified in place and also returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    def store_taxon_value(accession, field, value):
        """Write one taxonomy value, warning if the field was already filled."""
        annotations = query_data.setdefault(accession, {})
        if field in annotations:
            logger.warning(
                f"Multiple taxa found for {accession}\n"
                "Retreiving only one record."
            )
        annotations[field] = value

    selected_accessions = list(gbk_dict.keys())

    # retrieve the data from the Taxonomy and Kingdom tables
    with Session(bind=connection) as session:
        tax_query = session.query(Genbank, Taxonomy, Kingdom).\
            join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
            join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
            filter(Genbank.genbank_accession.in_(selected_accessions)).\
            all()

    if not tax_query:
        logger.warning("No taxonomy data retrieved for any of the selected GenBank accessions.")
        return query_data

    for record in tqdm(tax_query, desc="Getting taxonomy data"):
        gbk_acc = record[0].genbank_accession
        taxonomy = record[1]

        if 'kingdom' in args.include:
            store_taxon_value(gbk_acc, 'kingdom', record[2].kingdom)

        if 'genus' in args.include:
            store_taxon_value(gbk_acc, 'genus', taxonomy.genus)

        if 'organism' in args.include:
            store_taxon_value(gbk_acc, 'organism', f"{taxonomy.genus} {taxonomy.species}")

    return query_data
def get_class_fam_genbank_accessions(
    class_filters,
    family_filters,
    connection,
):
    """Retrieve the GenBank accessions of proteins from user selected CAZy classes and (sub)families

    When both filter collections are empty, every (Genbank, Taxonomy, Kingdom)
    row that joins to at least one CazyFamily is returned as-is. Otherwise one
    query is run per class filter and per family filter, the results are
    pooled, and duplicates are removed before returning.

    :param class_filters: set of CAZy classes to retrieve data for
    :param family_filters: set of CAZy families to retrieve data for
    :param connection: open sqlaclchemy connection for an SQLite db

    Return list of db objects containing a Genbank obj, Taxonomy obj and Kingdom obj.
    """
    logger = logging.getLogger(__name__)

    # pooled query rows from every class and family filter
    initially_selected_gbk = []

    if len(class_filters) == 0 and len(family_filters) == 0:
        logger.warning("No class or family filters applied")
        # could retrieve all GenBank accessions
        # NOTE(review): this path returns the raw query result without the
        # set() de-duplication applied on the filtered path below; the join on
        # CazyFamily presumably yields one row per family - confirm callers
        # tolerate repeated rows.
        with Session(bind=connection) as session:
            gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                join(CazyFamily, Genbank.families).\
                all()

        initially_selected_gbk = gbk_query
        return initially_selected_gbk

    if len(class_filters) != 0:
        logger.warning("Applying CAZy class filter(s)")
        for cazy_class in tqdm(
            class_filters,
            desc="Retrieving GenBank accessions for selected CAZy classes"):
            # presumably maps a CAZy class name to the prefix of its family
            # names; CLASS_ABBREVIATIONS is defined outside this block
            class_abbrev = CLASS_ABBREVIATIONS[cazy_class]

            # perform a subquery to retrieve all CAZy families in the CAZy class
            # (families whose name starts with the class abbreviation)
            inner_stmt = select(CazyFamily.family).where(
                CazyFamily.family.like(f'{class_abbrev}%'))
            subq = inner_stmt.subquery()
            aliased_families = aliased(CazyFamily, subq)
            stmt = select(aliased_families)

            # perform query to retrieve proteins in the CAZy families
            with Session(bind=connection) as session:
                gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                    join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                    join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                    join(CazyFamily, Genbank.families).\
                    filter(CazyFamily.family.in_(stmt)).\
                    all()

            initially_selected_gbk += gbk_query

    if len(family_filters) != 0:
        logger.warning("Applying CAZy family filter(s)")
        for cazy_family in tqdm(
            family_filters,
            desc="Retrieving GenBank accessions for selected CAZy families"):
            # subquery selecting the family column where it equals the filter
            # NOTE(review): the same statement is reused for the subfamily
            # branch below, where it is compared against CazyFamily.subfamily;
            # confirm a subfamily name can ever equal a value in the family
            # column, otherwise that branch cannot match any row.
            inner_stmt = select(
                CazyFamily.family).where(CazyFamily.family == cazy_family)
            subq = inner_stmt.subquery()
            aliased_families = aliased(CazyFamily, subq)
            stmt = select(aliased_families)

            # a filter containing '_' is treated as a subfamily name
            if cazy_family.find('_') != -1:  # subfamily
                with Session(bind=connection) as session:
                    gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                        join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                        join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                        join(CazyFamily, Genbank.families).\
                        filter(CazyFamily.subfamily.in_(stmt)).\
                        all()

            else:
                with Session(bind=connection) as session:
                    gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                        join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                        join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                        join(CazyFamily, Genbank.families).\
                        filter(CazyFamily.family.in_(stmt)).\
                        all()

            initially_selected_gbk += gbk_query

    # drop rows retrieved by more than one filter
    return list(set(initially_selected_gbk))
def get_uniprot_data(gbk_dict, query_data, connection, args):
    """Retrieve UniProt data for the provided Gbks.

    Which values are stored is controlled by args.include:
    - 'uniprot_acc': stored under 'uniprot_accession'
    - 'uniprot_name': stored under 'uniprot_name'
    - 'uniprot_seq': stored under 'uniprot_sequence' and 'uniprot_sequence_date'

    When a value is already stored for an accession a warning is logged and
    the value is overwritten. query_data is modified in place and also returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    gbk_accessions = list(gbk_dict.keys())

    # retrieve the Genbank records and the Uniprot records linked to them
    with Session(bind=connection) as session:
        uniprot_query = session.query(Genbank, Uniprot).\
            join(Uniprot, (Uniprot.genbank_id == Genbank.genbank_id)).\
            filter(Genbank.genbank_accession.in_(gbk_accessions)).\
            all()

    if len(uniprot_query) == 0:
        logger.warning("No UniProt records retrieved for any of the selected GenBank accessions.")
        return query_data

    for record in tqdm(uniprot_query, desc="Getting UniProt data"):
        gbk_acc = record[0].genbank_accession
        uniprot = record[1]

        if 'uniprot_acc' in args.include:
            annotations = query_data.setdefault(gbk_acc, {})
            if 'uniprot_accession' in annotations:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one."
                )
            annotations['uniprot_accession'] = uniprot.uniprot_accession

        if 'uniprot_name' in args.include:
            annotations = query_data.setdefault(gbk_acc, {})
            if 'uniprot_name' in annotations:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one."
                )
            annotations['uniprot_name'] = uniprot.uniprot_name

        if 'uniprot_seq' in args.include:
            annotations = query_data.setdefault(gbk_acc, {})
            if 'uniprot_sequence' in annotations:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one record."
                )
            # Always use the 'uniprot_'-prefixed keys; a previously unseen
            # accession used to be stored under 'sequence'/'sequence_date'.
            annotations['uniprot_sequence'] = uniprot.sequence
            annotations['uniprot_sequence_date'] = uniprot.seq_update_date

    return query_data
def get_class_fam_annotations(gbk_dict, query_data, connection, args):
    """Retrieve CAZy class and/or family annotations for the provided Gbks.

    Which values are stored is controlled by args.include: 'class', 'family'
    and 'subfamily'. Each is collected per accession in a set stored under the
    key of the same name. The class is derived from the leading non-digit
    characters of the family name. query_data is modified in place and also
    returned.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the data from the CAZy Family table
    with Session(bind=connection) as session:
        fam_table_query = session.query(Genbank, CazyFamily).\
            join(CazyFamily, Genbank.families).\
            filter(Genbank.genbank_accession.in_(selected_accessions)).\
            all()

    if not fam_table_query:
        logger.warning(
            "No CAZy class/family annotations retrieved for any of the selected "
            "GenBank accessions."
        )
        return query_data

    for record in tqdm(fam_table_query, desc="Getting CAZy class/family annotations"):
        gbk_acc = record[0].genbank_accession
        family_record = record[1]

        if 'class' in args.include:
            # match the class letters plus the first digit, then drop the digit
            class_match = re.match(r"\D{2,3}\d", family_record.family)
            cazy_class = class_match.group()[:-1]
            query_data.setdefault(gbk_acc, {}).setdefault('class', set()).add(cazy_class)

        if 'family' in args.include:
            fam = family_record.family
            query_data.setdefault(gbk_acc, {}).setdefault('family', set()).add(fam)

        if 'subfamily' in args.include:
            subfam = family_record.subfamily
            query_data.setdefault(gbk_acc, {}).setdefault('subfamily', set()).add(subfam)

    return query_data