Ejemplo n.º 1
0
def get_ec_gbk_table_dict(connection):
    """Compile the Genbank-EC number relationships in the local db into a dict.

    Every (Genbank, Ec) pair linked through ``Genbank.ecs`` is retrieved and
    the Genbank record IDs are grouped by the ID of the EC number they are
    annotated with.

    :param connection: open sqlalchemy connection to an SQLite db

    Return dict {ec_id: {gbk ids}}
    """
    with Session(bind=connection) as session:
        gbk_ec_pairs = (
            session.query(Genbank, Ec)
            .join(Ec, Genbank.ecs)
            .all()
        )

    ec_gbk_table_dict = {}

    for gbk, ec in gbk_ec_pairs:
        genbank_id = gbk.genbank_id
        ec_id = ec.ec_id
        # create the set of Genbank IDs the first time an EC ID is seen
        ec_gbk_table_dict.setdefault(ec_id, set()).add(genbank_id)

    return ec_gbk_table_dict
Ejemplo n.º 2
0
def get_gbk_fam_table_dict(connection):
    """Build dict representing the records present in the Genbanks_CazyFamilies table

    If a GenBank accession is in the db but has no CazyFamilies instances related to it,
    the GenBank accession is not returned when querying the db.

    Every family of an accession is stored inside that accession's 'families' dict.
    A family without a subfamily is keyed with '_' in place of the subfamily name.

    :param connection: open sqlalchemy connection to an SQLite3 db engine

    Return
    - dict: {gbk_acc: {'families': {'fam subfam': fam_id}, 'gbk_id': gbk_id}}
    - set of tuples: (gbk_id, fam_id), each representing one row in the table
    """
    with Session(bind=connection) as session:
        all_gbk_fam_records = session.query(Genbank, CazyFamily).\
            join(CazyFamily, Genbank.families).\
            all()

    existing_rel_tuples = set()  # set of tuples (gbk_id, fam_id)

    gbk_fam_table_dict = {}
    # {gbk_acc: {'families': {'fam subfam': fam_id}, 'gbk_id': gbk_id}}

    for record in tqdm(all_gbk_fam_records, ' Retreving existing gbk-fam relationships from db'):
        gbk_accession = record[0].genbank_accession
        gbk_id = record[0].genbank_id

        family = record[1].family
        subfamily = '_' if record[1].subfamily is None else record[1].subfamily
        fam_id = record[1].family_id
        fam_key = f'{family} {subfamily}'

        existing_rel_tuples.add((gbk_id, fam_id))

        try:
            families = gbk_fam_table_dict[gbk_accession]['families']
        except KeyError:
            # first family seen for this accession
            gbk_fam_table_dict[gbk_accession] = {
                'families': {fam_key: fam_id},
                'gbk_id': gbk_id,
            }
        else:
            # Additional families belong in the nested 'families' dict, not at
            # the top level next to 'gbk_id'. The first ID seen for a key is kept.
            families.setdefault(fam_key, fam_id)

    return gbk_fam_table_dict, existing_rel_tuples
Ejemplo n.º 3
0
def get_ec_table_dict(connection):
    """Map every EC number stored in the local db to its record ID.

    All Ec records are loaded in a single query.

    :param connection: open sqlalchemy db engine connection

    Return dict {ec_number: ec_id}
    """
    with Session(bind=connection) as session:
        ec_rows = session.query(Ec).all()

    progress = tqdm(ec_rows, desc="Retrieving existing EC# records")

    # {ec_number: ec_id}
    return {row.ec_number: row.ec_id for row in progress}
Ejemplo n.º 4
0
def get_pdb_table_dict(connection):
    """Map every PDB accession stored in the local db to its record ID.

    All Pdb records are loaded in a single query.

    :param connection: open sqlalchemy db engine connection

    Return dict {pdb_accession: pdb_db_id}
    """
    with Session(bind=connection) as session:
        pdb_rows = session.query(Pdb).all()

    progress = tqdm(pdb_rows, desc="Loading existing PDB db records")

    # {pdb_accession: pdb_db_id}
    return {row.pdb_accession: row.pdb_id for row in progress}
Ejemplo n.º 5
0
def get_kingdom_table_dict(connection):
    """Map every kingdom stored in the local db to its record ID.

    All Kingdom records are loaded in a single query.

    :param connection: open sqlalchemy db engine connection

    Return dict {kingdom: kingdom_db_id}
    """
    with Session(bind=connection) as session:
        kingdom_rows = session.query(Kingdom).all()

    # {kingdom: kingdom_db_id}
    return {row.kingdom: row.kingdom_id for row in kingdom_rows}
Ejemplo n.º 6
0
def get_gbk_seq(gbk_dict, query_data, connection):
    """Retrieve GenBank protein sequences for the provided Gbks.

    The sequence and its update date are written into query_data under the keys
    'gbk_sequence' and 'gbk_sequence_date'. If a sequence is already stored for an
    accession, a warning is logged and the stored sequence is overwritten.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db (updated in place)
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    gbk_accessions = list(gbk_dict.keys())

    # retrieve the Genbank records of the selected accessions
    with Session(bind=connection) as session:
        gbk_query = session.query(Genbank).\
            filter(Genbank.genbank_accession.in_(gbk_accessions)).\
            all()

    if len(gbk_query) == 0:
        logger.warning("No GenBank records retrieved for any of the selected GenBank accessions.")
        return query_data

    for record in tqdm(gbk_query, desc="Getting GenBank protein sequences"):
        # a single-entity query yields the Genbank objects themselves, not row tuples
        gbk_acc = record.genbank_accession
        seq = record.sequence
        seq_date = record.seq_update_date

        gbk_data = query_data.setdefault(gbk_acc, {})

        # only warn when a sequence was already stored for this accession
        if 'gbk_sequence' in gbk_data:
            logger.warning(
                f"Multiple GBK records found for GBK acc {gbk_acc}\n"
                "Retreiving only one gbk sequence."
            )

        gbk_data['gbk_sequence'] = seq
        gbk_data['gbk_sequence_date'] = seq_date

    return query_data
Ejemplo n.º 7
0
def get_gbk_kingdom_dict(connection):
    """Group the GenBank accessions in the local db by kingdom, genus and species.

    Genbank, Taxonomy and Kingdom records are joined on their taxonomy and
    kingdom IDs, and each accession is filed under its source organism.

    :param connection: open sqlalchemy db connection

    Return dict {kingdom: {genus: {species: {protein_accessions}}}}
    """
    with Session(bind=connection) as session:
        joined_rows = (
            session.query(Genbank, Taxonomy, Kingdom)
            .join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id))
            .join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id))
            .all()
        )

    genbank_kingdom_dict = {}  # kingdom: {genus: {species: {protein_accessions}}}

    for row in tqdm(joined_rows, desc="Retreving GenBank accessions and taxonomy"):
        accession = row[0].genbank_accession
        genus = row[1].genus
        species = row[1].species
        kingdom = row[2].kingdom

        # create each missing level on the way down to the accession set
        genera = genbank_kingdom_dict.setdefault(kingdom, {})
        species_map = genera.setdefault(genus, {})
        species_map.setdefault(species, set()).add(accession)

    return genbank_kingdom_dict
Ejemplo n.º 8
0
def get_gbk_table_seq_dict(connection):
    """Map every GenBank accession in the local db to its sequence data.

    All Genbank records are loaded in a single query.

    :param connection: open connection to an SQLite3 database

    Return dict {genbank_accession: {'sequence': str, 'seq_date': str}}
    """
    with Session(bind=connection) as session:
        genbank_rows = session.query(Genbank).all()

    # {genbank_accession: {'sequence': str, 'seq_date': str}}
    return {
        f"{row.genbank_accession}": {
            'sequence': row.sequence,
            'seq_date': row.seq_update_date,
        }
        for row in genbank_rows
    }
Ejemplo n.º 9
0
def get_gbk_table_dict(connection):
    """Map every GenBank accession in the local db to its taxonomy and record IDs.

    All Genbank records are loaded in a single query.

    :param connection: open connection to an SQLite3 database

    Return dict {genbank_accession: {'taxa_id': int, 'gbk_id': int}}
    """
    with Session(bind=connection) as session:
        genbank_rows = session.query(Genbank).all()

    # {genbank_accession: {'taxa_id': int, 'gbk_id': int}}
    return {
        f"{row.genbank_accession}": {
            'taxa_id': row.taxonomy_id,
            'gbk_id': row.genbank_id,
        }
        for row in genbank_rows
    }
def apply_ec_filters(
    current_gbk_objs,
    ec_filters,
    connection,
):
    """Keep only the Genbank records annotated with at least one of the given EC numbers.

    The Genbank IDs linked to each EC number are collected from the db, then
    the already retrieved Genbank records are checked against that collection.
    The program is terminated (exit code 1) if no Genbank ID matches any of
    the EC numbers.

    :param current_gbk_objs: list of db Genbank objs retrieved from the db
    :param ec_filters: set of EC numbers to limit the retrieval of data to
    :param connection: open sqlaclchemy connection for an SQLite db

    Return set of db Genbank objects.
    """
    logger = logging.getLogger(__name__)

    # rows of the form (genbank_id, ) for every protein annotated with a selected EC number
    matching_id_rows = set()

    for ec_number in tqdm(ec_filters, desc="Retrieving gbks for EC# filters"):
        with Session(bind=connection) as session:
            id_rows = (
                session.query(Genbank.genbank_id)
                .join(Ec, Genbank.ecs)
                .filter(Ec.ec_number == ec_number)
                .all()
            )

        matching_id_rows.update(id_rows)

    if not matching_id_rows:
        logger.error(
            "Retrieved NO proteins matching the provided EC numbers\n"
            "Check the local CAZyme db contains the EC numbers provided\n"
            "Terminating program")
        sys.exit(1)

    progress = tqdm(current_gbk_objs, desc="Checking gbk records against EC filters")

    # the query returned single-column rows, so compare against 1-tuples
    return {gbk for gbk in progress if (gbk.genbank_id, ) in matching_id_rows}
def get_ids(genbank_accessions, connection):
    """Get the local CAZyme database IDs for the list of provided GenBank accessions.

    Accessions with no record in the local db are logged with a warning and left out of
    the returned dict, instead of raising an AttributeError on the missing record.

    :param genbank_accessions: set of GenBank accessions
    :param connection: open sqlalchemy engine connection

    Return dict, keyed by GenBank accession and valued by database record ID.
    """
    logger = logging.getLogger(__name__)

    gbk_dict = {}

    # one session serves all look-ups; there is no need to open one per accession
    with Session(bind=connection) as session:
        for accession in tqdm(genbank_accessions,
                              desc="Getting local db record IDs"):
            gbk_record = session.query(Genbank).\
                filter(Genbank.genbank_accession == accession).\
                first()

            # first() returns None when the accession is not in the db
            if gbk_record is None:
                logger.warning(
                    f"No record found in the local db for GenBank accession {accession}"
                )
                continue

            gbk_dict[accession] = gbk_record.genbank_id

    return gbk_dict
Ejemplo n.º 12
0
def get_ec_annotations(gbk_dict, query_data, connection):
    """Add the EC numbers annotated to the selected Gbks to query_data.

    The EC numbers of each accession are collected in a set stored under the
    'ec_numbers' key of that accession's entry in query_data.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the Genbank-Ec pairs of the selected accessions
    with Session(bind=connection) as session:
        gbk_ec_pairs = (
            session.query(Genbank, Ec)
            .join(Ec, Genbank.ecs)
            .filter(Genbank.genbank_accession.in_(selected_accessions))
            .all()
        )

    if not gbk_ec_pairs:
        logger.warning("No EC annotations retrieved for any of the selected GenBank accessions.")
        return query_data

    for pair in tqdm(gbk_ec_pairs, desc="Getting EC number annotations"):
        gbk_acc = pair[0].genbank_accession
        ec_number = pair[1].ec_number

        # create the accession entry and its EC number set on first use
        query_data.setdefault(gbk_acc, {}).setdefault('ec_numbers', set()).add(ec_number)

    return query_data
Ejemplo n.º 13
0
def get_pdb_accessions(gbk_dict, query_data, connection):
    """Add the PDB accessions linked to the selected Gbks to query_data.

    The PDB accessions of each GenBank accession are collected in a set stored
    under the 'pdb_accessions' key of that accession's entry in query_data.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the Genbank-Pdb pairs of the selected accessions
    with Session(bind=connection) as session:
        gbk_pdb_pairs = (
            session.query(Genbank, Pdb)
            .join(Pdb, Genbank.pdbs)
            .filter(Genbank.genbank_accession.in_(selected_accessions))
            .all()
        )

    if not gbk_pdb_pairs:
        logger.warning("No PDB accessions retrieved for any of the selected GenBank accessions.")
        return query_data

    for pair in tqdm(gbk_pdb_pairs, desc="Getting PDB accessions"):
        gbk_acc = pair[0].genbank_accession
        pdb_accession = pair[1].pdb_accession

        # create the accession entry and its PDB accession set on first use
        query_data.setdefault(gbk_acc, {}).setdefault('pdb_accessions', set()).add(pdb_accession)

    return query_data
Ejemplo n.º 14
0
def get_fams_table_dict(connection):
    """Map every CAZy (sub)family stored in the local db to its record ID.

    Keys are built as '<family> <subfamily>'; a record without a subfamily
    uses '_' in place of the subfamily name.

    :param connection: open sqlalchemy db engine connection

    Return dict {family subfamily: db_family_id}
    """
    with Session(bind=connection) as session:
        family_rows = session.query(CazyFamily).all()

    db_fam_dict = {}

    for row in family_rows:
        subfam = '_' if row.subfamily is None else row.subfamily
        db_fam_dict[f"{row.family} {subfam}"] = row.family_id

    return db_fam_dict
Ejemplo n.º 15
0
def get_uniprot_table_dict(connection):
    """Map every UniProt accession stored in the local db to its record data.

    All Uniprot records are loaded in a single query.

    :param connection: open sqlalchemy db engine connection

    Return dict {acc: {name: str, genbank_id: int, seq: str, seq_date: str}}
    """
    with Session(bind=connection) as session:
        uniprot_rows = session.query(Uniprot).all()

    progress = tqdm(uniprot_rows, desc="Retrieving existing UniProt records from db")

    # {acc: {name: str, genbank_id: int, seq: str, seq_date: str}}
    return {
        row.uniprot_accession: {
            "name": row.uniprot_name,
            "genbank_id": row.genbank_id,
            "seq": row.sequence,
            "seq_date": row.seq_update_date,
        }
        for row in progress
    }
Ejemplo n.º 16
0
def get_taxs_table_dict(connection):
    """Map every organism stored in the local db to its taxonomy and kingdom IDs.

    Keys are built as '<genus> <species>'; a record with an empty species is
    keyed by the genus alone.

    :param connection: open sqlalchemy db engine connection

    Return dict {genus species: {'tax_id': db_tax_id, 'kingdom_id': kingdom_id}}
    """
    with Session(bind=connection) as session:
        taxonomy_rows = session.query(Taxonomy).all()

    db_tax_dict = {}

    for row in taxonomy_rows:
        if len(row.species) == 0:
            organism = f"{row.genus}"
        else:
            organism = f"{row.genus} {row.species}"

        db_tax_dict[organism] = {
            'tax_id': row.taxonomy_id,
            'kingdom_id': row.kingdom_id,
        }

    return db_tax_dict
Ejemplo n.º 17
0
def get_gbk_pdb_table_dict(connection):
    """Compile the Genbank-Pdb relationships in the local db into a dict.

    Every (Genbank, Pdb) pair linked through ``Genbank.pdbs`` is retrieved and
    the Pdb record IDs are grouped by the ID of the Genbank record they belong to.

    :param connection: open sqlalchemy db engine connection

    Return dict {gbk_db_id: {pdb_db_id} }
    """
    with Session(bind=connection) as session:
        gbk_pdb_pairs = (
            session.query(Genbank, Pdb)
            .join(Pdb, Genbank.pdbs)
            .all()
        )

    gbk_pdb_table_dict = {}  # {gbk_db_id: {pdb_db_id}}

    for pair in tqdm(gbk_pdb_pairs, desc="Loading existing Genbank_Pdbs db records"):
        genbank_id = pair[0].genbank_id
        pdb_id = pair[1].pdb_id

        # create the set of Pdb IDs the first time a Genbank ID is seen
        gbk_pdb_table_dict.setdefault(genbank_id, set()).add(pdb_id)

    return gbk_pdb_table_dict
Ejemplo n.º 18
0
def get_tax_annotations(gbk_dict, query_data, connection, args):
    """Add the kingdom, genus and/or organism name of the selected Gbks to query_data.

    Only the fields named in args.include ('kingdom', 'genus', 'organism') are
    stored. When a field is already present for an accession, a warning is
    logged and the value is overwritten by the most recently read record.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    def store_field(accession, field, value):
        """Write value under field for accession, warning when it replaces an earlier value."""
        accession_data = query_data.setdefault(accession, {})

        if field in accession_data:
            logger.warning(
                f"Multiple taxa found for {accession}\n"
                "Retreiving only one record."
            )

        accession_data[field] = value

    selected_accessions = list(gbk_dict.keys())

    # retrieve the data from the Genbank, Taxonomy and Kingdom tables
    with Session(bind=connection) as session:
        tax_rows = (
            session.query(Genbank, Taxonomy, Kingdom)
            .join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id))
            .join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id))
            .filter(Genbank.genbank_accession.in_(selected_accessions))
            .all()
        )

    if not tax_rows:
        logger.warning("No taxonomy data retrieved for any of the selected GenBank accessions.")
        return query_data

    for row in tqdm(tax_rows, desc="Getting taxonomy data"):
        gbk_acc = row[0].genbank_accession

        if 'kingdom' in args.include:
            store_field(gbk_acc, 'kingdom', row[2].kingdom)

        if 'genus' in args.include:
            store_field(gbk_acc, 'genus', row[1].genus)

        if 'organism' in args.include:
            genus = row[1].genus
            species = row[1].species
            store_field(gbk_acc, 'organism', f"{genus} {species}")

    return query_data
def get_class_fam_genbank_accessions(
    class_filters,
    family_filters,
    connection,
):
    """Retrieve the GenBank accessions of proteins from user selected CAZy classes and (sub)families

    When both filter sets are empty, every (Genbank, Taxonomy, Kingdom) row that is
    linked to at least one CAZy family is returned as retrieved, without de-duplication.

    Otherwise one query is run per CAZy class and one per CAZy (sub)family, the results
    are pooled, and duplicates are removed before returning.

    :param class_filters: set of CAZy classes to retrieve data for
    :param family_filters: set of CAZy families to retrieve data for
    :param connection: open sqlaclchemy connection for an SQLite db

    Return list of db objects containing a Genbank obj, Taxonomy obj and Kingdom obj.
    """
    logger = logging.getLogger(__name__)

    # pool of rows gathered across all class and family queries
    initially_selected_gbk = []

    if len(class_filters) == 0 and len(family_filters) == 0:
        logger.warning("No class or family filters applied")
        # no filters: retrieve every protein that is annotated with a CAZy family
        with Session(bind=connection) as session:
            gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                join(CazyFamily, Genbank.families).\
                all()

            initially_selected_gbk = gbk_query

        # returned as retrieved; unlike the filtered path, duplicates are not removed
        return initially_selected_gbk

    if len(class_filters) != 0:
        logger.warning("Applying CAZy class filter(s)")
    for cazy_class in tqdm(
            class_filters,
            desc="Retrieving GenBank accessions for selected CAZy classes"):
        # CLASS_ABBREVIATIONS is defined outside this block; a class missing from it
        # makes this lookup fail
        class_abbrev = CLASS_ABBREVIATIONS[cazy_class]

        # perform a subquery to retrieve all CAZy families in the CAZy class,
        # i.e. the families whose name starts with the class abbreviation
        inner_stmt = select(CazyFamily.family).where(
            CazyFamily.family.like(f'{class_abbrev}%'))
        subq = inner_stmt.subquery()
        aliased_families = aliased(CazyFamily, subq)
        stmt = select(aliased_families)

        # perform query to retrieve proteins in the CAZy families
        with Session(bind=connection) as session:
            gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                join(CazyFamily, Genbank.families).\
                filter(CazyFamily.family.in_(stmt)).\
                all()

        initially_selected_gbk += gbk_query

    if len(family_filters) != 0:
        logger.warning("Applying CAZy family filter(s)")
    for cazy_family in tqdm(
            family_filters,
            desc="Retrieving GenBank accessions for selected CAZy families"):
        # subquery selecting the family names equal to the filter value
        inner_stmt = select(
            CazyFamily.family).where(CazyFamily.family == cazy_family)
        subq = inner_stmt.subquery()
        aliased_families = aliased(CazyFamily, subq)
        stmt = select(aliased_families)

        # an underscore in the filter value marks it as a subfamily
        if cazy_family.find('_') != -1:  # subfamily
            # NOTE(review): the subquery above matches on CazyFamily.family, while this
            # branch compares CazyFamily.subfamily against it -- confirm that subfamily
            # filters actually retrieve records
            with Session(bind=connection) as session:
                gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                    join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                    join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                    join(CazyFamily, Genbank.families).\
                    filter(CazyFamily.subfamily.in_(stmt)).\
                    all()

        else:
            with Session(bind=connection) as session:
                gbk_query = session.query(Genbank, Taxonomy, Kingdom).\
                    join(Taxonomy, (Taxonomy.kingdom_id == Kingdom.kingdom_id)).\
                    join(Genbank, (Genbank.taxonomy_id == Taxonomy.taxonomy_id)).\
                    join(CazyFamily, Genbank.families).\
                    filter(CazyFamily.family.in_(stmt)).\
                    all()

        initially_selected_gbk += gbk_query

    # a protein can match several filters, so drop the duplicated rows
    return list(set(initially_selected_gbk))
Ejemplo n.º 20
0
def get_uniprot_data(gbk_dict, query_data, connection, args):
    """Retrieve UniProt data for the provided Gbks.

    Only the data named in args.include is stored:
    - 'uniprot_acc' -> key 'uniprot_accession'
    - 'uniprot_name' -> key 'uniprot_name'
    - 'uniprot_seq' -> keys 'uniprot_sequence' and 'uniprot_sequence_date'

    When a value is already stored for an accession, a warning is logged and the value is
    overwritten by the most recently read record.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db (updated in place)
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    gbk_accessions = list(gbk_dict.keys())

    # retrieve the data from the Genbank and Uniprot tables
    with Session(bind=connection) as session:
        uniprot_query = session.query(Genbank, Uniprot).\
            join(Uniprot, (Uniprot.genbank_id == Genbank.genbank_id)).\
            filter(Genbank.genbank_accession.in_(gbk_accessions)).\
            all()

    if len(uniprot_query) == 0:
        logger.warning("No UniProt records retrieved for any of the selected GenBank accessions.")
        return query_data

    for record in tqdm(uniprot_query, desc="Getting UniProt data"):
        gbk_acc = record[0].genbank_accession

        if 'uniprot_acc' in args.include:
            uniprot_accession = record[1].uniprot_accession

            gbk_data = query_data.setdefault(gbk_acc, {})
            if 'uniprot_accession' in gbk_data:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one."
                )
            gbk_data['uniprot_accession'] = uniprot_accession

        if 'uniprot_name' in args.include:
            uniprot_name = record[1].uniprot_name

            gbk_data = query_data.setdefault(gbk_acc, {})
            if 'uniprot_name' in gbk_data:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one."
                )
            gbk_data['uniprot_name'] = uniprot_name

        if 'uniprot_seq' in args.include:
            seq = record[1].sequence
            seq_date = record[1].seq_update_date

            gbk_data = query_data.setdefault(gbk_acc, {})
            if 'uniprot_sequence' in gbk_data:
                logger.warning(
                    f"Multiple UniProt records found for GBK acc {gbk_acc}\n"
                    "Retreiving only one record."
                )
            # Always use the 'uniprot_' prefixed keys, including for an accession
            # that had no entry yet, so the sequence is found under the same keys
            # in every case.
            gbk_data['uniprot_sequence'] = seq
            gbk_data['uniprot_sequence_date'] = seq_date

    return query_data
Ejemplo n.º 21
0
def get_class_fam_annotations(gbk_dict, query_data, connection, args):
    """Add the CAZy class, family and/or subfamily annotations of the selected Gbks to query_data.

    Only the annotations named in args.include ('class', 'family', 'subfamily')
    are stored. Each is collected in a set under the key of the same name in
    the accession's entry in query_data.

    :param gbk_dict: dict of selected GenBank accessions {acc: id}
    :param query_data: dict containing all data retrieved from the db
    :param connection: open sqlaclchemy connection for an SQLite db
    :param args: cmd-line args parser

    Return query_data: dict containing all data retrieved from the db
    """
    logger = logging.getLogger(__name__)

    selected_accessions = list(gbk_dict.keys())

    # retrieve the Genbank-CazyFamily pairs of the selected accessions
    with Session(bind=connection) as session:
        gbk_fam_pairs = (
            session.query(Genbank, CazyFamily)
            .join(CazyFamily, Genbank.families)
            .filter(Genbank.genbank_accession.in_(selected_accessions))
            .all()
        )

    if not gbk_fam_pairs:
        logger.warning(
            "No CAZy class/family annotations retrieved for any of the selected "
            "GenBank accessions."
        )
        return query_data

    for pair in tqdm(gbk_fam_pairs, desc="Getting CAZy class/family annotations"):
        gbk_acc = pair[0].genbank_accession

        if 'class' in args.include:
            family_name = pair[1].family
            # the match is the leading non-digit characters plus one digit; drop the digit
            class_name = re.match(r"\D{2,3}\d", family_name).group()[:-1]
            query_data.setdefault(gbk_acc, {}).setdefault('class', set()).add(class_name)

        if 'family' in args.include:
            family_name = pair[1].family
            query_data.setdefault(gbk_acc, {}).setdefault('family', set()).add(family_name)

        if 'subfamily' in args.include:
            subfamily_name = pair[1].subfamily
            query_data.setdefault(gbk_acc, {}).setdefault('subfamily', set()).add(subfamily_name)

    return query_data