Exemple #1
0
def query_local_DB(hits, db):
    """Build Organism objects for hits found in a local SQLite3 database.

    The database is expected to have been created using the makedb module.
    Hits are first grouped by their subject attribute; the group keys are then
    handed to database.query_genes() together with db. Each row that comes back
    is turned into a Subject and placed on its Scaffold and Organism, creating
    those containers the first time they are seen. The subject attribute of
    every hit matched to a row is overwritten with the gene name from that row.

    Args:
        hits (list): Hit objects; grouped by their subject attribute.
        db: Database handle, forwarded unchanged to database.query_genes().
    Returns:
        List of Organism objects holding the located subjects.
    """
    # Group hits sharing a subject so that each subject is looked up only once
    by_subject = defaultdict(list)
    for hit in hits:
        by_subject[hit.subject].append(hit)

    found = {}
    rows = database.query_genes(list(by_subject), db)
    for rowid, name, start_pos, end_pos, strand, scaffold, organism in rows:
        # Create Organism and Scaffold containers on first encounter
        if organism not in found:
            found[organism] = Organism(organism, "")
        parent = found[organism]
        if scaffold not in parent.scaffolds:
            parent.scaffolds[scaffold] = Scaffold(scaffold)

        # Hits are keyed by the stringified row ID; relabel them with the
        # gene name returned by the query
        matched = by_subject[str(rowid)]
        for hit in matched:
            hit.subject = name

        parent.scaffolds[scaffold].subjects.append(
            Subject(
                id=rowid,
                name=name,
                hits=matched,
                start=int(start_pos),
                end=int(end_pos),
                strand=strand,
            )
        )

    return list(found.values())
Exemple #2
0
def parse_IPG_table(results, hits):
    """Place hits in their genomic context using an IPG table.

    Hits are grouped with group_hits() and the table is split into identical
    protein groups (IPGs) with parse_IP_groups(). For every IPG, the hits
    linked to it are collected with find_IPG_hits(); IPGs without any hits are
    logged and skipped. Each remaining group member that passes the filters
    below becomes a Subject carrying its own copies of those hits, and is
    appended to the matching Scaffold of the matching Organism. Organism and
    Scaffold objects are created the first time they are encountered.

    A group member is ignored when:
      - its scaffold, start, end or strand is missing/falsy,
      - its source is neither "RefSeq" nor "INSDC", or
      - its (scaffold, start, end) combination has already been processed.

    Args:
        results (list): Results from IPG search.
        hits (list): Hit objects that were used to query NCBI.
    Returns:
        List of Organism objects containing hits sorted into genomic scaffolds.
    """
    hits_by_subject = group_hits(hits)
    groups = parse_IP_groups(results)

    # Only these sources are kept; this avoids vectors, single gene
    # nucleotide entries, etc.
    accepted_sources = ("RefSeq", "INSDC")

    visited = set()
    organisms = defaultdict(dict)

    # Iterate over a snapshot of the keys, since groups are popped as we go
    for ipg in list(groups):
        members = groups.pop(ipg)

        ipg_hits = find_IPG_hits(members, hits_by_subject)
        if not ipg_hits:
            LOG.warning("Found no hits for IPG %s", ipg)
            continue

        for entry in members:
            # Incomplete entries cannot be placed on a scaffold
            required = (entry.scaffold, entry.start, entry.end, entry.strand)
            if not all(required):
                continue

            if entry.source not in accepted_sources:
                continue

            # The same CDS region may be listed under many identical
            # protein IDs; only the first occurrence is used
            location = (entry.scaffold, entry.start, entry.end)
            if location in visited:
                continue
            visited.add(location)

            name = entry.organism
            strain = entry.strain
            accession = entry.scaffold

            # Strip the strain from the organism name when it is embedded in it
            if strain in name:
                name = name.replace(strain, "").strip()

            strains = organisms[name]
            if strain not in strains:
                strains[strain] = Organism(name=name, strain=strain)
            organism = strains[strain]
            if accession not in organism.scaffolds:
                organism.scaffolds[accession] = Scaffold(accession)

            # Every Subject receives its own copies of the hits, relabelled
            # with the protein ID of this group member
            subject = Subject(
                hits=[hit.copy(subject=entry.protein_id) for hit in ipg_hits],
                name=entry.protein_id,
                ipg=ipg,
                end=int(entry.end),
                start=int(entry.start),
                strand=entry.strand,
            )
            organism.scaffolds[accession].subjects.append(subject)

    # Flatten {organism name: {strain: Organism}} into a single list
    flattened = []
    for strains in organisms.values():
        flattened.extend(strains.values())
    return flattened
Exemple #3
0
def query_local_DB(hits, database):
    """Build Organisms/Scaffolds using database.DB instance.

    This function essentially mirrors parse_IPG_table, but is adapted to the JSON
    database created using cblaster makedb. Protein headers in the DIAMOND database
    follow the form "i_j_k" where i, j and k refer to the database indexes of organisms,
    scaffolds and proteins, respectively. For example, >2_56_123 refers to the 123rd
    protein of the 56th scaffold of the 2nd organism in the database. Context of each
    hit is found by directly accessing those indices in the database, and then
    Organism, Scaffold and Subject objects are generated as in parse_IPG_table.

    Hits are skipped (and a message is logged) when their header does not follow
    the "i_j_k" form, or when no identifier can be found for the protein they
    point to. Skipped hits do not create Organism or Scaffold objects. For all
    other hits, the subject attribute is overwritten with the protein identifier.

    Args:
        hits (list): Hit objects created during cblaster search.
        database (database.DB): cblaster database object.
    Returns:
        Organism objects containing hits sorted into genomic scaffolds.
    """

    organisms = defaultdict(dict)

    # Form non-redundant dictionary of hits. Each key will become a unique Subject.
    hit_dict = defaultdict(list)
    for hit in hits:
        hit_dict[hit.subject].append(hit)

    for hit_index, subject_hits in hit_dict.items():
        # Hit headers should follow form "i_j_k", where i, j and k refer to the
        # database indexes of organisms, scaffolds and proteins, respectively.
        # e.g. >2_56_123 => 123rd protein of 56th scaffold of the 2nd organism
        try:
            i, j, k = [int(index) for index in hit_index.split("_")]
        except ValueError:
            # Covers both non-integer fields and a wrong number of fields.
            # Skip the hit: carrying on would use unbound indices, or the
            # indices left over from the previous iteration.
            LOG.exception("Hit has malformed header")
            continue

        organism = database.organisms[i]
        scaffold = organism.scaffolds[j]
        protein = scaffold.features[k]

        # Want to report just protein ID, not lineage. Resolve it before any
        # Organism/Scaffold is created so skipped hits leave nothing behind.
        identifier = find_identifier(protein.qualifiers)
        if not identifier:
            LOG.warning("Could not find identifier for hit %s, skipping",
                        hit_index)
            continue

        # For brevity...
        org = organism.name
        st = organism.strain
        sc = scaffold.accession

        # Instantiate new Organism/Scaffold objects on first encounter
        if st not in organisms[org]:
            organisms[org][st] = Organism(org, st)
        if sc not in organisms[org][st].scaffolds:
            organisms[org][st].scaffolds[sc] = Scaffold(sc)

        for hit in subject_hits:
            hit.subject = identifier

        # Save genomic location on the Subject holding these hits
        subject = Subject(name=identifier,
                          hits=subject_hits,
                          start=protein.location.min(),
                          end=protein.location.max(),
                          strand=protein.location.strand)

        organisms[org][st].scaffolds[sc].subjects.append(subject)

    return [
        organism for strains in organisms.values()
        for organism in strains.values()
    ]