Exemple #1
0
    def _search(self, taxids):
        """Search GenBank for sequence ids matching taxids.

        The search term is built by self._buildSearchTerm from taxids and
        the current self.thoroughness. While fewer than self.nseqs new
        sequence ids have been collected, self.thoroughness is incremented
        and the search repeated, until self.max_thoroughness is exceeded.

        Ids already recorded in self.deja_vues are never returned, so
        calling _search again only yields sequences not seen before.

        Args:
            taxids: sequence of taxon ids; queried in chunks of at most 5.

        Returns:
            List of unique, previously unseen sequence ids, in the order
            they were first returned by the searches.

        Side effects:
            Advances self.thoroughness (it is reset to 1 between chunks of
            taxids, but not before the first chunk) and adds every returned
            id to self.deja_vues.
        """
        # limit number of taxids to 5, search recursively if more than 5
        if len(taxids) > 5:
            res = self._search(taxids[:5])
            # each chunk of taxids starts again from the lowest thoroughness
            self.thoroughness = 1
            res.extend(self._search(taxids[5:]))
            return res
        # set for O(1) membership tests; also absorbs ids found in this call
        # so that an id returned at several thoroughness levels is only
        # counted once towards the nseqs target
        seen = set(self.deja_vues)
        seqids = []
        while len(seqids) < self.nseqs:
            if self.thoroughness > self.max_thoroughness:
                # keep searching until enough sequences have been
                #  found or until the max thoroughness has been hit
                break
            search_term = self._buildSearchTerm(taxids, self.thoroughness)
            # first query only asks how many records match ...
            seqcount = etools.eSearch(search_term, logger=self.logger)['Count']
            if int(seqcount) >= 1:
                # ... second query retrieves ALL matching seqids
                matches = etools.eSearch(search_term, logger=self.logger,
                                         retMax=seqcount)['IdList']
                for seqid in matches:
                    # skip ids seen in earlier calls or earlier iterations
                    if seqid not in seen:
                        seen.add(seqid)
                        seqids.append(seqid)
            self.thoroughness += 1
        # record the new ids so later calls do not return them again
        self.deja_vues.extend(seqids)
        self.deja_vues = list(set(self.deja_vues))
        return seqids
Exemple #2
0
def _findOutgroupIds(parentid, logger, minrecords):
    """Walk up the taxonomy from parentid until sister taxa are found.

    At each step the children of the current taxon's parent are collected
    (excluding the current taxon itself) and kept if GenBank holds more
    than minrecords nucleotide records for them. If none qualify, the
    search moves one level up.

    Raises:
        TaxonomicRankError: if the walk reaches taxid '131567' (cellular
            organisms), i.e. likely a name resolution error or the names
            given are too diverse.
    """
    # assumptions:
    #  1. NCBI taxonomy is not paraphyletic
    # ids are compared as strings throughout
    parentid = str(parentid)
    outgroup_ids = []
    while not outgroup_ids:
        if parentid == '131567':
            raise TaxonomicRankError()
        # get parent of parent
        grandparentid = etools.eFetch(parentid, logger=logger,
                                      db="taxonomy")[0]['ParentTaxId']
        # find all children
        candidates = etools.findChildren(grandparentid, logger=logger,
                                         next=True)
        for candidate in candidates:
            # skip the child that contains the ingroup
            if str(candidate) == parentid:
                continue
            # search genbank for nuc records (avoids e.g. extinct organisms
            #  without any sequence data)
            term = 'txid' + str(candidate) + '[PORGN]'
            nuc_record = etools.eSearch(term, logger=logger)
            # there must be more than minrecords nuc records
            if int(nuc_record['Count']) > minrecords:
                outgroup_ids.append(candidate)
        # make grandparentid the new parentid; normalise to str so the
        #  comparisons above behave the same on every iteration
        parentid = str(grandparentid)
    return outgroup_ids


def _getOutgroupMetaData(outgroup_ids, logger):
    """Return (rank, scientific name) for the first id in outgroup_ids.

    If there is more than one id, ' et al.' is appended to both values.
    """
    etal_bool = len(outgroup_ids) > 1
    # with several ids only the first is looked up; a single id is passed
    #  on in its list, as before
    query = outgroup_ids[0] if etal_bool else outgroup_ids
    record = etools.eFetch(query, logger=logger, db="taxonomy")[0]
    rank, unique_name = record['Rank'], record['ScientificName']
    if etal_bool:
        rank += ' et al.'
        unique_name += ' et al.'
    return rank, unique_name


def getOutgroup(namesdict, parentid, logger, outgroupid=None, minrecords=1000):
    """Return namesdict with suitable outgroup

    Args:
        namesdict: dict of names; modified in place by adding an
            "outgroup" entry with keys "txids", "unique_name" and "rank".
        parentid: taxon id of the parent of all ids in namesdict; only
            used when outgroupid is not given.
        logger: logger passed on to the etools calls.
        outgroupid: optional taxon id to use as the outgroup instead of
            searching for one.
        minrecords: a candidate outgroup must have more than this many
            nucleotide records in GenBank.

    Returns:
        The same namesdict object, with the "outgroup" entry set.

    Raises:
        TaxonomicRankError: if no outgroup is found before reaching
            cellular organisms (taxid '131567').
    """
    if outgroupid:
        outgroup_ids = [outgroupid]
    else:
        outgroup_ids = _findOutgroupIds(parentid, logger, minrecords)
    rank, unique_name = _getOutgroupMetaData(outgroup_ids, logger)
    # add outgroup_ids to namesdict, converted to ints
    namesdict["outgroup"] = {"txids": [int(e) for e in outgroup_ids],
                             "unique_name": unique_name,
                             "rank": rank}
    return namesdict