예제 #1
0
def add_kingdoms(cazy_taxa_dict, connection):
    """Add new Kingdoms objects to database.
    
    Check existing kingdom objects in the db against kingdoms retrieved from the 
    CAZy txt file, so as to only add new kingdoms.

    :param cazy_taxa_dict: dict of kingdoms and organisms from the cazy_data dict
        {kingdom: {organism}}
    :param connection: open sqlalchemy connection to a local SQLite db engine
    
    Return nothing
    """
    kingdom_table_dict = get_table_dicts.get_kingdom_table_dict(connection)
    # dict {kingdom: {organisms}}

    # retrieve the Kingdoms retrieved from the CAZy txt file
    cazy_kingdoms = list(cazy_taxa_dict.keys())
    existing_kingdom_records = list(kingdom_table_dict.keys())

    # create list of tuples for db insert
    kingdoms_db_insert_values = [(kngdm, ) for kngdm in cazy_kingdoms
                                 if kngdm not in existing_kingdom_records]

    if len(kingdoms_db_insert_values) != 0:
        insert_data(connection, 'Kingdoms', ['kingdom'],
                    kingdoms_db_insert_values)

    return
예제 #2
0
def add_cazy_families(cazy_data, connection):
    """Add CAZy families and subfamilies to local CAZyme database
    
    :param cazy_data: dict of data extracted from the txt file
        {gbk_accession: {kingdom:str, organism:str, families{fam:subfam}}}
    :param connection: open sqlalchemy connection to an SQLite db engine
    
    Return nothing"""
    logger = logging.getLogger(__name__)

    # get list of CAZy (sub)families already present in the db
    fam_table_dict = get_table_dicts.get_fams_table_dict(
        connection)  # {family subfamily: db_family_id}

    existing_fam_records = list(fam_table_dict.keys())

    families_db_insert_values = set()  # new fam records to add to db

    for genbank_accession in tqdm(cazy_data,
                                  desc='Extracting CAZy fams from CAZy data'):
        for cazy_fam in cazy_data[genbank_accession]["families"]:
            subfamilies = cazy_data[genbank_accession]["families"][cazy_fam]

            for subfam in subfamilies:
                if subfam is None:
                    cazy_subfam = "_"
                else:
                    cazy_subfam = subfam
                fam_key = f"{cazy_fam} {cazy_subfam}"

                if fam_key not in existing_fam_records:
                    families_db_insert_values.add(
                        (cazy_fam, subfam))  # add None for subfam if is None

    if len(families_db_insert_values) != 0:
        logger.info(f"Inserting {len(families_db_insert_values)} "
                    "new family records into the CazyFamilies table")
        insert_data(connection, 'CazyFamilies', ['family', 'subfamily'],
                    list(families_db_insert_values))
    else:
        logger.info(
            "Found no new CAZy family records to add the CazyFamilies table")

    return
예제 #3
0
def add_source_organisms(taxa_dict, connection):
    """Add taxonomy (source organism) data to the local CAZyme database
    
    :param taxa_dict: dict of taxa data {kingdom: set(organism)} to be added to the db
    :param connection: open sqlalchemy connection to SQLite engine
    
    Return nothing
    """
    logger = logging.getLogger(__name__)

    # retrieve db kingdom objects for retrieving the kingdom_id for the Taxs table
    kingdom_table_dict = get_table_dicts.get_kingdom_table_dict(connection)
    # {kingdom: kingdom_id}
    tax_table_dict = get_table_dicts.get_taxs_table_dict(connection)
    # {genus species: {'tax_id': int(db_tax_id), 'kingdom_id': int(kingdom_id)}

    # compare taxa already in the db against taxa retrieved from the CAZy txt file
    # to identify new taxa objects to be added to the db

    taxonomy_db_insert_values = set()
    records_to_update = set()  # used incase kingdom has changed for a species

    for kingdom in tqdm(
            taxa_dict,
            total=len(list(taxa_dict.keys())),
            desc='Create tax objects per Kingdom',
    ):
        kingdom_id = kingdom_table_dict[kingdom]
        organisms = taxa_dict[kingdom]

        for organism in organisms:  # organisms from the CAZy txt file

            try:
                existing_record_data = tax_table_dict[organism]
                # check kingdom is correct
                existing_record_kngdm_id = tax_table_dict[organism][
                    'kingdom_id']
                if existing_record_kngdm_id != kingdom_id:
                    records_to_update.add((
                        genus,
                        species,
                        kingdom_id,
                    ))

            except KeyError:  # organism not in the db, build new record
                genus = organism.split(" ")[0]
                species = ' '.join(organism.split(" ")[1:])
                new_record = (
                    genus,
                    species,
                    kingdom_id,
                )
                taxonomy_db_insert_values.add(new_record)

    if len(taxonomy_db_insert_values) != 0:
        logger.info(
            f"Adding {len(taxonomy_db_insert_values)} new tax records to the db"
        )
        insert_data(
            connection,
            'Taxs',
            ['genus', 'species', 'kingdom_id'],
            list(taxonomy_db_insert_values),
        )
    else:
        logger.info("No new tax records to add to the db")
    if len(records_to_update) != 0:
        logger.info(
            f"Updating the parent Kingdom for {len(records_to_update)} tax records in the db"
        )
        with connection.begin():
            for record in records_to_update:
                connection.execute(
                    text(
                        "UPDATE Taxs "
                        f"SET kingdom_id = {record[2]} "
                        f"WHERE genus = '{record[0]}' AND species = '{record[1]}'"
                    ))

    return
예제 #4
0
def add_genbank_fam_relationships(cazy_data, connection, args):
    """Add GenBank accession - CAZy family relationships to db
    
    :param cazy_data: dict of data extracted from the txt file
        {gbk_accession: {kingdom:str, organism:str, families{fam:subfam}}}
    :param connection: open sqlalchemy connection to an SQLite db engine
    :param args: cmd-line args parser

    Return nothing
    """
    logger = logging.getLogger(__name__)

    gbk_fam_db_insert_values = set()  # new records to add
    gbk_fam_records_to_del = set()  # records/relationships to delete

    # get dict of GenBank and CazyFamilies tables, used for getting gbk_ids  and fam_ids of accessions and
    # families without entries in the CazyFamilies_Genbanks table

    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    # {genbank_accession: 'taxa_id': int, 'gbk_id': int}

    fam_table_dict = get_table_dicts.get_fams_table_dict(connection)
    # {'fam subfam': fam_id}

    # load current relationships in the db
    gbk_fam_table_dict, existing_rel_tuples = get_table_dicts.get_gbk_fam_table_dict(
        connection)
    # {genbank_accession: {'families': {str(fam subfam): int(fam_id)}, 'gbk_id': int(gbk_db_id)} }

    for genbank_accession in tqdm(
            cazy_data,
            desc="Extracting Genbank-Family relationships from CAZy data"):

        gbk_id = gbk_table_dict[genbank_accession]['gbk_id']

        cazy_fam_dict = cazy_data[genbank_accession]['families']
        # cazy_data = { gbk_acc: {'families': {fam: {subfamilies}} } }
        # cazy_fam_dict = {fam : {subfamilies}}

        try:
            existing_relation_dict = gbk_fam_table_dict[genbank_accession][
                'families']
            # existing_relation_dict = { str(fam subfam) : fam_id }

            for fam in cazy_fam_dict:
                subfamilies = cazy_fam_dict[
                    fam]  # set of subfams for parent fam from CAZy data

                for subfam in subfamilies:  # add each fam-subfam pair
                    if subfam is None:
                        family_key = f"{fam} _"
                    else:
                        family_key = f"{fam} {subfam}"

                    try:
                        existing_relation_dict[family_key]
                        # already in db, don't add relationship again

                    except KeyError:
                        fam_id = fam_table_dict[family_key]

                        new_row = (
                            gbk_id,
                            fam_id,
                        )
                        if new_row not in existing_rel_tuples:
                            gbk_fam_db_insert_values.add((
                                gbk_id,
                                fam_id,
                            ))

        except KeyError:  # GenBank not present in the Genbanks_CazyFamilies table, create new record

            for fam in cazy_fam_dict:
                subfamilies = cazy_fam_dict[
                    fam]  # set of subfams for parent fam from CAZy data

                for subfam in subfamilies:  # add each fam-subfam pair
                    if subfam is None:
                        family_key = f"{fam} _"
                    else:
                        family_key = f"{fam} {subfam}"

                    fam_id = fam_table_dict[family_key]

                    new_row = (
                        gbk_id,
                        fam_id,
                    )
                    if new_row not in existing_rel_tuples:
                        gbk_fam_db_insert_values.add((
                            gbk_id,
                            fam_id,
                        ))

    if len(gbk_fam_db_insert_values) != 0:

        logger.info(
            f"Adding {len(gbk_fam_db_insert_values)} new GenBank accession - "
            "CAZy (sub)family relationships to the db")
        insert_data(
            connection,
            'Genbanks_CazyFamilies',
            ['genbank_id', 'family_id'],
            list(gbk_fam_db_insert_values),
        )
    else:
        logger.info(
            "No new Genbank accession-CAZy (sub)family relationships to add to the db"
        )

    if (len(gbk_fam_records_to_del) != 0):
        logger.info(
            "Deleting {(len(gbk_fam_records_to_del)} GenBank accession - "
            "CAZy (sub)family relationships\n"
            "that are the db but are no longer in CAZy")
        with connection.begin():
            for record in gbk_fam_records_to_del:
                # record = (genbank_id, fam_id,)
                stmt = (
                    delete(genbanks_families).\
                    where(genbanks_families.c.genbank_id == record[0]).\
                    where(genbanks_families.c.family_id == record[1])
                )
                connection.execute(stmt)

    return
예제 #5
0
def add_genbanks(cazy_data, connection):
    """Add GenBank accessions with tax data to the db
    
    :param cazy_data: dict of CAZy data
        {gbk_acc: {kingdom: {str}, organism: {str}, families: {(fam, subfam, )}}}
    :param connection: open sqlalchemy connection to an SQLite db
    
    Return Nothing
    """
    logger = logging.getLogger(__name__)

    # retrieve existing records from the db
    gbk_table_dict = get_table_dicts.get_gbk_table_dict(connection)
    existing_gbk_records = list(gbk_table_dict.keys())

    taxa_table_dict = get_table_dicts.get_taxs_table_dict(connection)
    # {genus species: {'tax_id': db_tax_id, 'kingdom_id': kingdom_id}

    gbk_record_updates = set(
    )  # {gbk_accession: 'taxa_id': (new taxa_id) int, 'gbk_id': int}
    gbk_db_insert_values = set()

    for gbk_accession in tqdm(cazy_data,
                              desc="Compiling Genbank records for insertion"):
        if gbk_accession not in existing_gbk_records:
            organism = list(cazy_data[gbk_accession]['organism'])[0]
            taxa_id = taxa_table_dict[organism]['tax_id']
            gbk_db_insert_values.add((
                gbk_accession,
                taxa_id,
            ))

        else:  # check if need to update taxa_id
            # get the taxa_id for the existing record
            existing_record_id = gbk_table_dict[gbk_accession]['taxa_id']

            # get the taxa_id for the organism listed in the CAZy txt file
            organism = list(cazy_data[gbk_accession]['organism'])[0]
            cazy_data_taxa_id = taxa_table_dict[organism]['tax_id']

            if cazy_data_taxa_id != existing_record_id:
                # need to update the record
                gbk_record_updates.add(
                    (cazy_data_taxa_id,
                     gbk_table_dict[gbk_accession]['gbk_id']))

    if len(gbk_db_insert_values) != 0:
        logger.info(
            f"Inserting {len(gbk_db_insert_values)} GenBank accessions into the db"
        )
        insert_data(connection, 'Genbanks',
                    ['genbank_accession', 'taxonomy_id'],
                    list(gbk_db_insert_values))

    if len(gbk_record_updates) != 0:
        logger.info(
            f"Updating {len(gbk_record_updates)} Genbank table records with new taxonomy IDs"
        )
        with connection.begin():
            for record in gbk_record_updates:
                connection.execute(
                    text("UPDATE Taxs "
                         f"SET taxonomy_id = {record[1]} "
                         f"WHERE genbank_id = '{record[0]}'"))

    return
def add_uniprot_accessions(uniprot_dict, gbk_dict, connection, args):
    """Add UniProt data to the local CAZyme database
    
    :param uniprot_dict: dict containing data retrieved from UniProt
    :param gbk_dict: dict representing data from the Genbanks table
    :param connection: open sqlalchemy conenction to an SQLite db engine
    :param args: cmd-line args parser

    Return nothing.
    """
    # load the current Uniprot records in the local CAZyme db
    # {acc: {name: str, gbk_id: int, seq: str, seq_date:str } }
    uniprot_table_dict = get_table_dicts.get_uniprot_table_dict(connection)

    uniprot_insert_values = set()

    # the following variables containing objects in the local db that are to be updated
    update_record_gbk_id = set()  # ((uniprot_accession, gbk_id),)
    update_record_name = set()  # ((uniprot_accession, retrieved_name), )
    update_record_seq = set()  # ((uniprot_accession, retrieved_seq), )

    for uniprot_acc in tqdm(uniprot_dict,
                            desc="Separating new and existing records"):
        # check if the UniProt accession is already in the local CAZyme db
        try:
            uniprot_table_dict[uniprot_acc]
            # Uniprot accession is already in the local CAZyme db

            # check if the GenBank id is the same
            existing_gbk_id = uniprot_table_dict[uniprot_acc]['genbank_id']
            retrieved_gbk_acc = uniprot_dict[uniprot_acc]["genbank_accession"][
                'gbk_acc']
            retrieved_gbk_id = gbk_dict[retrieved_gbk_acc]
            if existing_gbk_id != retrieved_gbk_id:
                update_record_gbk_id.add((uniprot_acc, retrieved_gbk_id))

            if args.name_update:
                # check if the name has changed
                existing_name = uniprot_table_dict[uniprot_acc]['name']
                retrieved_name = uniprot_dict[uniprot_acc]['name']

                if existing_name != retrieved_name:
                    update_record_name.add((uniprot_acc, retrieved_name))

            if args.sequence:  # only add seq if sequence is not there
                if uniprot_table_dict[uniprot_acc]['seq'] is None:
                    update_record_seq.add(
                        (uniprot_acc, uniprot_dict[uniprot_acc]["sequence"]))

            if args.seq_update:  # update seq if a newer version is available or no seq present
                if uniprot_table_dict[uniprot_acc]['seq'] is None:
                    update_record_seq.add((
                        uniprot_acc,
                        uniprot_dict[uniprot_acc]["sequence"],
                        uniprot_dict[uniprot_acc]["seq_date"],
                    ))
                else:
                    existing_date = uniprot_table_dict[uniprot_acc]['seq_date']
                    retrieved_date = uniprot_dict[uniprot_acc]['seq_date']

                    if existing_date < retrieved_date:  # existing date is older, update seq
                        update_record_seq.add((
                            uniprot_acc,
                            uniprot_dict[uniprot_acc]["sequence"],
                            uniprot_dict[uniprot_acc]["seq_date"],
                        ))

        except KeyError:  # new record to add to local CAZyme db

            if args.sequence or args.seq_update:
                genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"][
                    'gbk_acc']
                gbk_id = gbk_dict[genbank_acc]
                uniprot_name = uniprot_dict[uniprot_acc]["name"]
                seq = uniprot_dict[uniprot_acc]["sequence"]
                date = uniprot_dict[uniprot_acc]["seq_date"]

                uniprot_insert_values.add(
                    (gbk_id, uniprot_acc, uniprot_name, seq, date))

            else:  # not retrieving protein sequences
                genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"][
                    'gbk_acc']
                gbk_id = gbk_dict[genbank_acc]
                uniprot_name = uniprot_dict[uniprot_acc]["name"]

                uniprot_insert_values.add((gbk_id, uniprot_acc, uniprot_name))

    if len(uniprot_insert_values) != 0:
        if args.sequence or args.seq_update:
            columns = [
                'genbank_id', 'uniprot_accession', 'uniprot_name', 'sequence',
                'seq_update_date'
            ]
        else:
            columns = ['genbank_id', 'uniprot_accession', 'uniprot_name']

        insert_data(connection, "Uniprots", columns,
                    list(uniprot_insert_values))

    if len(update_record_gbk_id) != 0:
        with connection.begin():
            for record in tqdm(update_record_gbk_id,
                               desc="Updating UniProt-Gbk relationships"):
                connection.execute(
                    text("UPDATE Uniprots "
                         f"SET genbank_id = {record[1]} "
                         f"WHERE uniprot_accession = '{record[0]}'"))

    if len(update_record_name) != 0:
        with connection.begin():
            for record in tqdm(update_record_name,
                               desc="Updating UniProt protein names"):
                connection.execute(
                    text("UPDATE Uniprots "
                         f"SET uniprot_name = {record[1]} "
                         f"WHERE uniprot_accession = '{record[0]}'"))

    if len(update_record_seq) != 0:
        with connection.begin():
            for record in tqdm(update_record_seq,
                               desc="Updating UniProt protein seqs"):
                connection.execute(
                    text(
                        "UPDATE Uniprots "
                        f"SET sequence = {record[1]}, seq_update_date = {record[2]} "
                        f"WHERE uniprot_accession = '{record[0]}'"))

    return
def add_pdb_accessions(uniprot_dict, gbk_dict, connection, args):
    """Add PDB accessions to the local CAZyme database

    :param uniprot_dict: dict containing data retrieved from UniProt
    :param gbk_dict: dict representing data from the Genbanks table
    :param connection: open sqlalchemy conenction to an SQLite db engine
    :param args: cmd-line args parser

    Return nothing.
    """
    logger = logging.getLogger(__name__)

    pdbs_to_delete = set()  # only used if args.delete_old_pdbs is true
    relationships_to_delete = set(
    )  # only used if args.delete_old_pdbs is true

    # load in PDB objects in the local CAZyme db
    pdb_table_dict = get_table_dicts.get_pdb_table_dict(connection)
    # {pdb_accession: pdb_db_id}

    # First, identify new PDB accessions to add to the database
    pdb_insert_values = set()
    for uniprot_acc in tqdm(uniprot_dict,
                            desc="Identifying new PDBs to add to db"):
        for pdb in uniprot_dict[uniprot_acc]["pdb"]:
            try:
                pdb_table_dict[pdb]
            except KeyError:
                pdb_insert_values.add((pdb, ))

    if len(pdb_insert_values) != 0:
        logger.warning(
            f"Adding {len(pdb_insert_values)} PDB accessions to the database")
        insert_data(connection, "Pdbs", ["pdb_accession"],
                    list(pdb_insert_values))
        # reload the updated pdb table
        pdb_table_dict = get_table_dicts.get_pdb_table_dict(connection)

    # load in Genbanks_Pdbs relationship table
    gbk_pdb_rel_table_dict = get_table_dicts.get_gbk_pdb_table_dict(connection)
    # {gbk_db_id: {pdb_db_id} }

    # convert the data from UniProt into a dict of {gbk_db_id: pdb_db_id}
    # to identify pdb-protein relationships retrieved from UniProt
    gbk_pdb_insert_values = set()
    for uniprot_acc in tqdm(
            uniprot_dict,
            desc="Identifying new protein-PDB relationships to add to db"):
        genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc']
        gbk_db_id = gbk_dict[genbank_acc]

        # set of pdb_accessions retrieved from UniProt
        retrieved_pdbs = uniprot_dict[uniprot_acc]["pdb"]
        if len(retrieved_pdbs) == 0:
            continue

        # set of pdb_db_ids the protein (gbk_db_id) is already related to in the db
        try:
            existing_pdb_relationships = gbk_pdb_rel_table_dict[gbk_db_id]
        except KeyError:
            existing_pdb_relationships = set()

        for pdb_acc in retrieved_pdbs:
            pdb_db_id = pdb_table_dict[pdb_acc]

            try:
                if pdb_db_id not in existing_pdb_relationships:
                    # genbank and pdb records not yet stored together in the relationship table
                    gbk_pdb_insert_values.add((gbk_db_id, pdb_db_id))
            except KeyError:
                # genbank not listed in the Genbanks_Pdbs relationship table
                gbk_pdb_insert_values.add((gbk_db_id, pdb_db_id))

        if args.delete_old_pdbs:
            # convert gbk_pdb_rel_table_dict to be keyed by pdb_db_ids and valued by set of gbk_db_ids
            pdb_gbk_relationships = {}
            for existing_gbk_id in gbk_pdb_rel_table_dict:
                existing_pdbs = gbk_pdb_rel_table_dict[existing_gbk_id]
                for pdb in existing_pdbs:
                    try:
                        pdb_gbk_relationships[pdb].add(existing_gbk_id)
                    except KeyError:
                        pdb_gbk_relationships[pdb] = {existing_gbk_id}

            # for each pdb_db_id related to the current protein in the db
            # get the corresponding pdb_accession
            all_existing_pdb_ids = list(set(pdb_table_dict.values()))
            all_existing_pdb_accs = list(pdb_table_dict.keys())

            for pdb_db_id in existing_pdb_relationships:
                position = all_existing_pdb_ids.index(pdb_db_id)
                pdb_acc = all_existing_pdb_accs[position]

                if pdb_acc not in retrieved_pdbs:
                    # delete genbank-pdb relationship
                    relationships_to_delete.add((gbk_db_id, pdb_db_id))

                    # check if can delete pdb accession because it is not linked to any other proteins
                    if len(
                            pdb_gbk_relationships[pdb_db_id]
                    ) != 1:  # deleting one pdb accession will not leave not related to any genbanks
                        pdbs_to_delete.add((pdb_acc))

    if len(gbk_pdb_insert_values) != 0:
        insert_data(connection, "Genbanks_Pdbs", ["genbank_id", "pdb_id"],
                    list(gbk_pdb_insert_values))

    if args.delete_old_pdbs and len(pdbs_to_delete) != 0:
        with connection.begin():
            for record in tqdm(pdbs_to_delete,
                               desc="Deleteing old PDB accessions"):
                # record = (pdb_acc,)
                connection.execute(
                    text("DELETE Pdbs "
                         f"WHERE pdb_accession = '{record[0]}'"))

    if args.delete_old_pdbs and len(relationships_to_delete) != 0:
        with connection.begin():
            for record in tqdm(relationships_to_delete,
                               desc="Deleteing old Genbank-PDB relationships"):
                # record = (pdb_acc,)
                connection.execute(
                    text(
                        "DELETE Genbanks_Pdbs "
                        f"WHERE genbank_id = '{record[0]}' AND pdb_id = '{record[1]}'"
                    ))

    return
def add_ec_numbers(uniprot_dict, all_ecs, gbk_dict, connection, args):
    """Add EC numbers to the local CAZyme database

    :param uniprot_dict: dict containing data retrieved from UniProt
    :param all_ecs: set of all EC numbers retrieved from UniProt
    :param gbk_dict: dict representing data from the Genbanks table
    :param connection: open sqlalchemy conenction to an SQLite db engine
    :param args: cmd-line args parser

    Return nothing.
    """
    # load EC records in the local CAZyme db
    ec_table_dict = get_table_dicts.get_ec_table_dict(connection)

    # identify new EC numbers to add to the EC table
    existing_ecs = list(ec_table_dict.keys())
    ec_insert_values = set()

    for ec in all_ecs:
        if ec[0] not in existing_ecs:
            ec_insert_values.add(ec)

    if len(ec_insert_values) != 0:
        insert_data(connection, "Ecs", ["ec_number"], list(ec_insert_values))

        # load in the newly updated EC table from the local CAZyme db
        ec_table_dict = get_table_dicts.get_ec_table_dict(connection)

    # load in gbk_ec table, contains the gbk-ec number relationships
    ec_gbk_table_dict = get_table_dicts.get_ec_gbk_table_dict(
        connection)  # {ec_id: {gbk ids}}

    if args.delete_old_ec:
        gbk_ec_table_dict = get_table_dicts.get_gbk_ec_table_dict(
            connection)  # {gbk_id: {ec ids}}
        ecs_rel_to_delete = set()  # set of tuples (gbk_id, ec_id)

    # compile list of tuples to insert into the Genbanks_Ecs table
    gbk_ec_insert_values = set()

    for uniprot_acc in tqdm(uniprot_dict, desc="Updating EC numbers"):
        genbank_acc = uniprot_dict[uniprot_acc]["genbank_accession"]['gbk_acc']
        gbk_id = gbk_dict[genbank_acc]

        retrieved_ec_numbers = uniprot_dict[uniprot_acc][
            "ec"]  # EC#s retrieved from UniProt
        for ec in retrieved_ec_numbers:
            ec_id = ec_table_dict[ec]

            try:
                existing_gbk_relationships = ec_gbk_table_dict[ec_id]
                # check if protein-ec# relationship is already in the db
                if gbk_id not in existing_gbk_relationships:
                    gbk_ec_insert_values.add((gbk_id, ec_id))
            except KeyError:  # when adding relationship for the first time
                gbk_ec_insert_values.add((gbk_id, ec_id))

        if args.delete_old_ec:
            existing_ec_relationships = gbk_ec_table_dict[gbk_id]
            for ec in retrieved_ec_numbers:
                ec_id = ec_table_dict[ec]
                if ec_id not in existing_ec_relationships:
                    ecs_rel_to_delete.add((gbk_id, ec_id))

    if len(gbk_ec_insert_values) != 0:
        insert_data(connection, "Genbanks_Ecs", ["genbank_id", "ec_id"],
                    list(gbk_ec_insert_values))

    if args.delete_old_ec and len(ecs_rel_to_delete) != 0:
        with connection.begin():
            for record in tqdm(ecs_rel_to_delete,
                               desc="Deleteing old GenBank-EC relationships"):
                # record = (genbank_id, ec_id,)
                stmt = (
                    delete(genbanks_ecs).\
                    where(genbanks_ecs.c.genbank_id == record[0]).\
                    where(genbanks_ecs.c.ec_id == record[1])
                )
                connection.execute(stmt)

    return