def query_local_DB(hits, db):
    """Place hits in their genomic context using a local SQLite3 database.

    The database is the one created using the makedb module. Hits are
    bucketed by their ``subject`` value, those values are handed to
    ``database.query_genes`` and every returned row is turned into a
    Subject sitting on the matching Scaffold of the matching Organism.

    NOTE(review): another ``query_local_DB(hits, database)`` definition
    appears later in the visible source; if both live in the same module the
    later one replaces this one at import time - confirm which is intended.

    Args:
        hits (list): Hit objects; each must expose a ``subject`` attribute.
            The ``subject`` of every matched hit is overwritten with the
            gene name returned by the database.
        db: Database handle forwarded unchanged to ``database.query_genes``.

    Returns:
        list: Organism objects, in order of first appearance in the query
        results.
    """
    # Bucket hits by the subject they point at; each key is one query term.
    by_subject = defaultdict(list)
    for entry in hits:
        by_subject[entry.subject].append(entry)

    found = {}
    rows = database.query_genes(list(by_subject), db)
    for rowid, name, start_pos, end_pos, strand, scaffold, organism in rows:
        # Create containers lazily, the first time each one is encountered.
        if organism not in found:
            found[organism] = Organism(organism, "")
        scaffolds = found[organism].scaffolds
        if scaffold not in scaffolds:
            scaffolds[scaffold] = Scaffold(scaffold)

        # Buckets are keyed by the string form of the row ID.
        matched = by_subject[str(rowid)]
        for entry in matched:
            entry.subject = name

        gene = Subject(
            id=rowid,
            name=name,
            hits=matched,
            start=int(start_pos),
            end=int(end_pos),
            strand=strand,
        )
        scaffolds[scaffold].subjects.append(gene)

    return list(found.values())
def parse_IPG_table(results, hits):
    """Attach Hit objects to the genomic context described by an IPG table.

    Steps:
        1. Group the hits by subject ID (``group_hits``) and parse the table
           into identical protein groups (``parse_IP_groups``).
        2. For every group, collect the hits linked to any of its members
           (``find_IPG_hits``); groups without hits are logged and skipped.
        3. For every usable group member, build a Subject holding copies of
           those hits and file it under its Organism (by name and strain)
           and Scaffold, creating those objects on first encounter.

    A group member is ignored when it lacks scaffold, start, end or strand,
    when its source is neither "RefSeq" nor "INSDC", or when its
    (scaffold, start, end) triple has been seen before.

    Args:
        results (list): Results from IPG search.
        hits (list): Hit objects that were used to query NCBI.

    Returns:
        list: Organism objects containing hits sorted into genomic scaffolds.
    """
    hits_by_subject = group_hits(hits)
    ip_groups = parse_IP_groups(results)

    visited = set()
    organisms = defaultdict(dict)

    # Iterate over a snapshot of the keys so groups can be popped as we go.
    for ipg in list(ip_groups):
        members = ip_groups.pop(ipg)

        matched = find_IPG_hits(members, hits_by_subject)
        if not matched:
            LOG.warning("Found no hits for IPG %s", ipg)
            continue

        for entry in members:
            # Incomplete entries cannot be placed on a scaffold
            required = [entry.scaffold, entry.start, entry.end, entry.strand]
            if not all(required):
                continue

            # Skip vectors, single Gene nucleotide entries, etc
            if entry.source not in ("RefSeq", "INSDC"):
                continue

            # Many identical protein_id can share one CDS region; keep the
            # first entry for each scaffold/coordinate combination only.
            location = (entry.scaffold, entry.start, entry.end)
            if location in visited:
                continue
            visited.add(location)

            org = entry.organism
            st = entry.strain
            acc = entry.scaffold

            # Strip the strain out of the organism name when it is embedded
            if st in org:
                org = org.replace(st, "").strip()

            strains = organisms[org]
            if st not in strains:
                strains[st] = Organism(name=org, strain=st)
            scaffolds = strains[st].scaffolds
            if acc not in scaffolds:
                scaffolds[acc] = Scaffold(acc)

            # Each Subject receives its own copies of the group's hits,
            # relabelled with this member's protein ID.
            copies = [hit.copy(subject=entry.protein_id) for hit in matched]
            subject = Subject(
                hits=copies,
                name=entry.protein_id,
                ipg=ipg,
                end=int(entry.end),
                start=int(entry.start),
                strand=entry.strand,
            )
            scaffolds[acc].subjects.append(subject)

    collected = []
    for strains in organisms.values():
        collected.extend(strains.values())
    return collected
def query_local_DB(hits, database):
    """Build Organisms/Scaffolds using database.DB instance.

    This function essentially mirrors parse_IPG_table, but is adapted to the
    JSON database created using cblaster makedb.

    Protein headers in the DIAMOND database follow the form "i_j_k" where i,
    j and k refer to the database indexes of organisms, scaffolds and
    proteins, respectively. For example, >2_56_123 refers to the 123rd
    protein of the 56th scaffold of the 2nd organism in the database.

    Context of each hit is found by directly accessing those indices in the
    database, and then Organism, Scaffold and Subject objects are generated
    as in parse_IPG_table.

    Hits whose header cannot be parsed into three integers, or whose protein
    has no identifier, are logged and skipped; they never contribute an
    Organism, Scaffold or Subject to the result.

    Args:
        hits (list): Hit objects created during cblaster search. The
            ``subject`` of every placed hit is overwritten with the protein
            identifier.
        database (database.DB): cblaster database object.

    Returns:
        list: Organism objects containing hits sorted into genomic scaffolds.
    """
    organisms = defaultdict(dict)

    # Form non-redundant dictionary of hits. Each key will become a unique
    # Subject.
    hit_dict = defaultdict(list)
    for hit in hits:
        hit_dict[hit.subject].append(hit)

    for hit_index, subject_hits in hit_dict.items():
        # Hit headers should follow form "i_j_k", where i, j and k refer to
        # the database indexes of organisms, scaffolds and proteins,
        # respectively.
        # e.g. >2_56_123 => 123rd protein of 56th scaffold of the 2nd organism
        try:
            i, j, k = [int(index) for index in hit_index.split("_")]
        except ValueError:
            LOG.exception("Hit has malformed header")
            # Without valid indices there is nothing to look up; carrying on
            # would use unbound (or the previous iteration's) i, j and k.
            continue

        organism = database.organisms[i]
        scaffold = organism.scaffolds[j]
        protein = scaffold.features[k]

        # Want to report just protein ID, not lineage. Resolve it before
        # creating any containers so a skipped hit leaves no empty
        # Organism/Scaffold behind.
        identifier = find_identifier(protein.qualifiers)
        if not identifier:
            LOG.warning(
                "Could not find identifier for hit %s, skipping", hit_index
            )
            continue

        # For brevity...
        org = organism.name
        st = organism.strain
        sc = scaffold.accession

        # Instantiate new Organism/Scaffold objects on first encounter
        if st not in organisms[org]:
            organisms[org][st] = Organism(org, st)
        if sc not in organisms[org][st].scaffolds:
            organisms[org][st].scaffolds[sc] = Scaffold(sc)

        for hit in subject_hits:
            hit.subject = identifier

        # Save genomic location on the Subject instance
        subject = Subject(
            name=identifier,
            hits=subject_hits,
            start=protein.location.min(),
            end=protein.location.max(),
            strand=protein.location.strand,
        )
        organisms[org][st].scaffolds[sc].subjects.append(subject)

    return [
        organism
        for strains in organisms.values()
        for organism in strains.values()
    ]