Beispiel #1
0
    def return_profile_OTF(self, fam):
        """
        Build the binary event profile of one family on the fly.

        The family's orthology data is read with ``self.READ_ORTHO`` and
        turned into an annotated tree by ``self.HAM_PIPELINE``. Every tree
        node whose name is a key of ``self.taxaIndex`` can set a column in a
        row vector made of three consecutive segments, each
        ``len(self.taxaIndex)`` columns wide, in the fixed order
        presence, loss, duplication.

        :param fam: family id; a string is treated as a HOG id and converted
            with ``hashutils.hogid2fam`` first
        :return: ``{fam: {'mat': profile, 'tree': tree}}`` where ``profile``
            is a ``(1, 3 * len(self.taxaIndex))`` numpy array of zeros and
            ones and ``tree`` is the object returned by ``HAM_PIPELINE``
        """
        if isinstance(fam, str):
            fam = hashutils.hogid2fam(fam)
        ortho_fam = self.READ_ORTHO(fam)
        tp = self.HAM_PIPELINE([fam, ortho_fam])

        presence, losses, dupl = [], [], []
        # A single pass over the tree collects all three event types.
        for node in tp.traverse():
            if node.name not in self.taxaIndex:
                continue
            column = self.taxaIndex[node.name]
            if node.nbr_genes > 0:
                presence.append(column)
            if node.lost:
                losses.append(column)
            if node.dupl:
                dupl.append(column)

        ntaxa = len(self.taxaIndex)
        hog_matrix_raw = np.zeros((1, 3 * ntaxa))
        # The tuple order fixes each segment's offset explicitly instead of
        # relying on dict iteration order.
        for segment, columns in enumerate((presence, losses, dupl)):
            # An empty list would become a float array, which numpy rejects
            # as an index, so empty segments are skipped.
            if columns:
                hog_matrix_raw[:, np.asarray(columns) + segment * ntaxa] = 1
        return {fam: {'mat': hog_matrix_raw, 'tree': tp}}
Beispiel #2
0
    def return_profile_complements(self, fam):
        """
        Intended to return profiles for each loss, to search for
        complementary hogs.

        NOTE(review): this method looks unfinished. It reads ``presence``,
        ``profiler`` and ``n``, none of which is bound in this method's
        scope, so unless module-level names with those spellings exist it
        raises ``NameError`` before returning. See the inline notes.

        :param fam: family id; a ``str`` is converted with
            ``hashutils.hogid2fam`` first
        :return: ``{fam: {'mat': hog_matrix_raw, 'hash': tp}}``; note that the
            ``'hash'`` key holds the tree returned by ``HAM_PIPELINE``
        """
        if type(fam) is str:
            fam = hashutils.hogid2fam(fam)
        ortho_fam = self.READ_ORTHO(fam)
        tp = self.HAM_PIPELINE([fam, ortho_fam])

        # Names of the tree nodes flagged as lost that are also keys of
        # self.taxaIndex.
        losses = set([
            n.name for n in tp.traverse()
            if n.lost and n.name in self.taxaIndex
        ])
        # These are the roots of the families we are looking for.
        # We just assume no duplications or losses from this point.

        # NOTE(review): `profiler` is neither a parameter nor a local here;
        # presumably `self.tree` or a module-level object was meant - confirm.
        # `ancestral_nodes` is never read afterwards.
        ancestral_nodes = ([
            n for n in profiler.tree.traverse() if n.name in losses
        ])
        # `losses` is rebound to an empty list, so the loss and duplication
        # segments of the matrix below are always left at zero.
        losses = []
        dupl = []
        # NOTE(review): `n` is only bound inside the comprehensions above; in
        # Python 3 it is not visible here. `complements` is never read.
        complements = {n.name + '_loss': []}

        # NOTE(review): `presence` is never assigned in this method.
        indices = dict(
            zip(['presence', 'loss', 'dup'], [presence, losses, dupl]))

        # One row with three segments of len(self.taxaIndex) columns; segment
        # i belongs to the i-th key of `indices` in iteration order.
        hog_matrix_raw = np.zeros((1, 3 * len(self.taxaIndex)))
        for i, event in enumerate(indices):
            if len(indices[event]) > 0:
                # `taxindex` is assigned but never used.
                taxindex = np.asarray(indices[event])
                hogindex = np.asarray(indices[event]) + i * len(self.taxaIndex)
                hog_matrix_raw[:, hogindex] = 1

        return {fam: {'mat': hog_matrix_raw, 'hash': tp}}
Beispiel #3
0
    def pull_hashes(self, hoglist):
        """
        Look up the stored hash of every hog in ``hoglist``.

        Each entry is converted to a string, mapped to its family id with
        ``hashutils.hogid2fam`` and then resolved through
        ``hashutils.fam2hash_hdf5`` against ``self.hashes_h5`` using
        ``self.nsamples``.

        :param hoglist: iterable of hog ids
        :return: a dict mapping each entry of ``hoglist`` to its hash
        """
        hashes = {}
        for hog in hoglist:
            fam = hashutils.hogid2fam(str(hog))
            # Instance attributes are read per item, so an empty input never
            # touches them.
            hashes[hog] = hashutils.fam2hash_hdf5(
                fam, self.hashes_h5, nsamples=self.nsamples)
        return hashes
Beispiel #4
0
    def hog_query(self, hog_id=None, fam_id=None, k=100):
        """
        Query the LSH index with the hash of a single family.

        :param hog_id: query hog id; when given it is converted with
            ``hashutils.hogid2fam`` and takes precedence over ``fam_id``
        :param fam_id: query fam id, used when ``hog_id`` is None
        :param k: passed through as the second argument of
            ``self.lshobj.query`` (presumably the number of hits requested)
        :return: the result of ``self.lshobj.query`` for the query hash
        """
        family = fam_id if hog_id is None else hashutils.hogid2fam(hog_id)
        signature = hashutils.fam2hash_hdf5(
            family, self.hashes_h5, nsamples=self.nsamples)
        return self.lshobj.query(signature, k)
Beispiel #5
0
    def return_profile_OTF_DCA(self, fam, lock=None):
        """
        Return the profile of one family as a string for DCA pipelines.

        The family's orthology data is read with ``self.READ_ORTHO``, turned
        into a tree by ``self.HAM_PIPELINE`` and serialised with
        ``hashutils.tree2str_DCA`` so it can serve as one row of an input
        "alignment".

        :param fam: family id; a string is treated as a HOG id and converted
            with ``hashutils.hogid2fam`` first
        :param lock: optional object with ``acquire()`` / ``release()``; when
            given it is held only while ``READ_ORTHO`` runs
        :return: ``{fam: {'dcastr': dcastr, 'tree': tree}}``
        """
        if isinstance(fam, str):
            fam = hashutils.hogid2fam(fam)
        if lock is None:
            ortho_fam = self.READ_ORTHO(fam)
        else:
            lock.acquire()
            try:
                ortho_fam = self.READ_ORTHO(fam)
            finally:
                # Release even if the read fails, so other workers sharing
                # the lock are not blocked forever.
                lock.release()
        tp = self.HAM_PIPELINE([fam, ortho_fam])
        dcastr = hashutils.tree2str_DCA(tp, self.taxaIndex)
        return {fam: {'dcastr': dcastr, 'tree': tp}}
Beispiel #6
0
    def hog_query_sorted(self, hog_id=None, fam_id=None, k=100):
        """
        Query the LSH index and rank the hits by Jaccard similarity.

        The hits returned by ``self.lshobj.query`` are resolved to their
        hashes with ``self.pull_hashes``, and each is scored against the
        query hash with its ``jaccard`` method.

        :param hog_id: query hog id; when given it is converted with
            ``hashutils.hogid2fam`` and takes precedence over ``fam_id``
        :param fam_id: query fam id, used when ``hog_id`` is None
        :param k: passed through as the second argument of
            ``self.lshobj.query`` (presumably the number of hits requested)
        :return: a dict mapping each hit to its Jaccard score against the
            query, inserted from highest to lowest score (dicts keep
            insertion order on Python 3.7+)
        """
        if hog_id is not None:
            fam_id = hashutils.hogid2fam(hog_id)
        query_hash = hashutils.fam2hash_hdf5(fam_id,
                                             self.hashes_h5,
                                             nsamples=self.nsamples)
        results = self.lshobj.query(query_hash, k)
        hashes = self.pull_hashes(results)

        scores = {hog: hashes[hog].jaccard(query_hash) for hog in hashes}
        # sorted(..., reverse=True) replaces the broken sort-then-reverse():
        # list.reverse() works in place and returns None.
        ranked = sorted(scores.items(), key=lambda item: item[1], reverse=True)
        return dict(ranked)