class AccumulateRankings(ComputeCube):
    """
    A compute Cube that receives rankings and assembles them in a list.

    Incoming items are read positionally: ``data[0]`` and ``data[1][1]`` are
    only used for their lengths, ``data[2]`` is the ranking that gets
    accumulated, ``data[3]`` is the method name and an optional fifth element
    ``data[4]`` is stored as the dataset identifier. ``end`` emits a single
    ``(ranking_list, nb_ka, method)`` tuple on ``success``.
    """

    classification = [["Compute", "Accumulator"]]

    url = parameter.StringParameter(
        'url',
        default="http://10.0.1.22:4242",
        help_text="Url of the Restful FastROCS Server for the request")

    intake = ObjectInputPort('intake')
    success = ObjectOutputPort('success')

    def begin(self):
        """Reset the accumulator state before any item is processed."""
        self.ranking_list = []
        self.dataset_id = None
        # Initialised here so end() never hits an AttributeError when no item
        # was received.
        self.nb_ka = None
        self.method = None

    def process(self, data, port):
        """Store the ranking of one item and remember its metadata.

        ``nb_ka`` and ``method`` are overwritten by every item, so the values
        emitted by ``end`` are those of the last item received.
        """
        self.ranking_list.append(data[2])
        self.nb_ka = len(data[0]) - len(data[1][1])
        self.method = data[3]
        if len(data) == 5:
            self.dataset_id = data[4]

    def end(self):
        """Emit the accumulated rankings, if any were received."""
        # NOTE: removing the uploaded dataset (an HTTP DELETE on
        # <url>/datasets/<dataset_id>/) was commented out in the original
        # code; ``dataset_id`` is recorded but nothing is sent to the server.
        if not self.ranking_list:
            # Without any input there is no nb_ka/method to report.
            self.log.info("No ranking received, nothing to emit")
            return
        self.success.emit((self.ranking_list, self.nb_ka, self.method))
# Example #2
# 0
class IndexInputCube(SourceCube):
    """
    An input cube that reads an index log and returns the baitsets.

    Every chunk read from the input is decoded as UTF-8 and split on the
    literal ``"set "``; the text before the first occurrence is discarded and
    each remaining record is yielded as a ``(set_id, baitset)`` tuple.
    """

    classification = [["Input"]]

    success = ObjectOutputPort('success')

    limit = parameter.IntegerParameter(
        'limit',
        required=False,
        description='Read up to N items from this cube')

    data_in = parameter.DataSetInputParameter(
        'data_in',
        required=True,
        title='Index log',
        description='The index log to read from')

    def begin(self):
        """Open the index log.

        ``stream_file`` is used when ``config_from_env()`` returns a
        configuration; otherwise ``data_in`` is opened as a local binary file.
        """
        self.in_orion = config_from_env() is not None
        if self.in_orion:
            self.stream = stream_file(self.args.data_in)
        else:
            self.stream = open(str(self.args.data_in), 'rb')

    def __iter__(self):
        """Yield ``(set_id, baitset)`` tuples, at most ``limit`` of them.

        ``baitset`` is the list of space-separated tokens of the record
        without its first and last token; purely numeric tokens are converted
        to ``int``, the others are kept as strings. When ``limit`` is unset
        (or not positive) every record is yielded.
        """
        max_idx = self.args.limit
        if max_idx is not None:
            max_idx = int(max_idx)
        count = 0

        try:
            for chunk in self.stream:
                records = chunk.decode('utf-8').split("set ")[1:]
                for record in records:
                    tokens = record.split(" ")
                    # NOTE(review): only the second-to-last character of the
                    # first token is used as id, so a multi-digit set id would
                    # be truncated - confirm against the log format.
                    set_id = int(tokens[0][-2:-1])
                    baitset = [
                        int(token) if token.isdigit() else token
                        for token in tokens[1:-1]
                    ]
                    yield (set_id, baitset)
                    count += 1
                    # Count after yielding so exactly ``limit`` items come
                    # out, and return (not break) so that the limit also
                    # holds across the following chunks.
                    if max_idx is not None and count == max_idx:
                        return
        finally:
            # The local file was opened by begin() and is owned by this cube.
            if not self.in_orion:
                self.stream.close()
class AccuMolList(ComputeCube):
    """
    Compute cube that collects every molecule received on ``intake`` and
    emits them as one single list from ``end``.
    """

    classification = [["Compute", "Concat"]]

    intake = MoleculeInputPort('intake')
    success = ObjectOutputPort('success')

    def begin(self):
        """Start with an empty collection."""
        self.mol_list = []

    def process(self, mol, port):
        """Keep the incoming molecule; nothing is emitted at this stage."""
        self.mol_list += [mol]

    def end(self):
        """Send the whole collection downstream in one message."""
        collected = self.mol_list
        self.success.emit(collected)
class IndexGenerator(ComputeCube):
    """
    Compute cube that ignores its input and, from ``end``, emits ten random
    baitsets: each one is a sorted list of 33 distinct indices drawn from
    ``0..99``, sent as ``(iteration_number, baitset)``.
    """

    classification = [["Compute", "Index"]]

    intake = ObjectInputPort('intake')
    success = ObjectOutputPort('success')
    tags = [['IndexGenerator_tags']]
    title = 'IndexGenerator_Title'

    def begin(self):
        self.log.info("Generator Cube runnig")

    def process(self, data, port):
        # Incoming items are deliberately discarded.
        pass

    def end(self):
        total = 100
        nb_index = 33
        iteration = 10

        for idx in range(iteration):
            drawn = set()
            # Rejection sampling: keep drawing until enough distinct indices
            # have been collected.
            while len(drawn) < nb_index:
                candidate = random.randint(0, total - 1)
                if candidate not in drawn:
                    drawn.add(candidate)

            self.baitset = sorted(drawn)
            self.success.emit((idx, self.baitset))
# Example #5
# 0
class Test(SourceCube):
    """
    Source cube yielding one hard-coded item made of two molecules titled
    ``A`` and ``B``, a baitset, an empty ranking and dataset information.
    The ``data_in`` parameter is declared but not read by this cube.
    """
    success = ObjectOutputPort('success')
    data_in = parameter.DataSetInputParameter(
        'data_in',
        required=True,
        title='Index log',
        description='The index log to read from')

    def begin(self):
        pass

    def __iter__(self):
        # Build the molecules first, then fill them from their SMILES.
        act_list = [oechem.OEGraphMol(), oechem.OEGraphMol()]
        for target, smiles in zip(act_list, ('c1cccc1 A', 'c1cccc1 B')):
            oechem.OESmilesToMol(target, smiles)

        yield (act_list, (0, [0]), [], (0, {"A": 0, "B": 1}))

    def end(self):
        pass
class AnalyseRankings(ComputeCube):
    """
    Compute cube that turns a list of rankings into per-position averages.

    The input is a ``(ranking_list, nb_ka, method)`` tuple. For every ranking
    two running percentages are computed at each position: ``RR`` is
    ``100 * count_ka / nb_ka`` and ``HR`` is ``100 * count_ka / position``,
    where ``count_ka`` is the number of entries seen so far whose fifth field
    equals 1. Both are then averaged per position over all rankings and
    emitted together with the method name.
    """

    classification = [["Compute", "Analysis"]]

    fptype = parameter.IntegerParameter(
        'fptype',
        default=105,
        help_text="Fingerprint type to use for the ranking")

    topn = parameter.IntegerParameter(
        'topn',
        default=100,
        help_text="Number of top molecules returned in the ranking")

    intake = ObjectInputPort('intake')
    success = ObjectOutputPort('success')

    def process(self, data, port):
        """Analyse one ``(ranking_list, nb_ka, method)`` item and emit the
        ``(results_avg, method)`` tuple."""
        self.ranking_list = data[0]
        self.nb_ka = data[1]
        self.method = data[2]
        self.results_avg = self.ranking_analysis()

        self.success.emit((self.results_avg, self.method))

    def ranking_analysis(self):
        """Return a DataFrame of the averaged ``RR`` and ``HR`` values.

        The frame has one row per ranking position (at most ``topn`` rows)
        and two columns, ``'Average RR <name>'`` and ``'Average HR <name>'``.

        Raises:
            ValueError: if ``self.method`` is neither ``'Fingerprint'`` nor
                ``'FastROCS'``.
            KeyError: if the method is ``'Fingerprint'`` and ``fptype`` is
                not one of 102, 104 or 105.
        """
        # Resolve the column suffix first so an unsupported method fails with
        # a clear message instead of an unbound local variable further down.
        if self.method == 'Fingerprint':
            fptypes = {102: 'path', 104: 'circular', 105: 'tree'}
            name = 'FP_' + fptypes[self.args.fptype]
        elif self.method == 'FastROCS':
            name = 'FR'
        else:
            raise ValueError(
                "Unsupported ranking method: {!r}".format(self.method))

        frames = []
        for ranking in self.ranking_list:
            rows = []
            count_ka = 0
            for count, mol in enumerate(ranking, start=1):
                if mol[4] == 1:
                    count_ka += 1
                rows.append((100 * count_ka / self.nb_ka,
                             100 * count_ka / count))
            # Build each frame in one go rather than growing it row by row.
            frames.append(
                pd.DataFrame(rows, columns=['RR', 'HR'], dtype=float))

        if frames:
            results = pd.concat(frames)
        else:
            # No ranking at all: keep the expected columns so the group-by
            # below returns empty averages instead of raising.
            results = pd.DataFrame(columns=['RR', 'HR'], dtype=float)

        # Every frame is indexed 0..n-1, so grouping on the index averages
        # the values found at the same position of each ranking.
        by_position = results.groupby(results.index)
        results_avg = pd.DataFrame()
        results_avg['Average RR ' + name] = by_position['RR'].mean()
        results_avg['Average HR ' + name] = by_position['HR'].mean()

        return results_avg.head(self.args.topn)
class ParallelInsertKARestfulROCS(ParallelComputeCube):
    """
    Parallel cube that scores the molecules of an uploaded dataset against
    the baitset molecules through a RESTful FastROCS server and inserts them
    into an existing ranking.

    The input is an ``(act_list, baitset, ranking, dataset_infos)`` tuple
    where ``baitset`` is ``(baitset_id, indices)`` and ``dataset_infos`` is
    ``(dataset_identifier, {title: index})``. Ranking entries are
    ``(smiles, title, score, baitset_id, ka_tag)`` tuples ordered by
    decreasing score.
    """

    classification = [["ParallelCompute"]]

    url = parameter.StringParameter(
        'url',
        default="http://10.0.1.22:4242",
        help_text="Url of the Restful FastROCS Server for the request")

    topn = parameter.IntegerParameter(
        'topn',
        default=100,
        help_text="Number of top molecules returned in the ranking")

    data_input = ObjectInputPort('data_input')
    success = ObjectOutputPort('success')

    def process(self, data, port):
        """Query the server for one baitset and emit the updated ranking.

        Emits ``(act_list, baitset, ranking, dataset_identifier,
        'FastROCS')`` on ``success``.
        """
        self.act_list = data[0]
        self.baitset = data[1]
        self.ranking = data[2]
        self.dataset_infos = data[3]
        self.log.info("processing KA for baitset : " + str(self.baitset[0]))

        self.dataset_identifier = self.dataset_infos[0]
        self.add_queries()
        self.get_results()
        for tanimoto, mol in self.cur_scores.values():
            self.update_ranking(mol, tanimoto, True)

        self.success.emit((self.act_list, self.baitset, self.ranking,
                           self.dataset_infos[0], 'FastROCS'))

    def add_queries(self):
        """Post one query per baitset molecule and record the query ids in
        ``self.query_id_list``."""
        url = self.args.url + "/queries/"
        self.query_id_list = []
        parameters = {
            "num_hits": self.args.topn,
            "dataset_identifier": self.dataset_identifier,
        }
        for idx in self.baitset[1]:
            # Only a unique file name is needed: the molecule is written
            # through an OpenEye stream, so the handle is closed right away
            # instead of being leaked.
            self.query = tempfile.NamedTemporaryFile(suffix='.oeb',
                                                     mode='wb',
                                                     delete=False)
            self.query.close()
            try:
                with oechem.oemolostream(self.query.name) as ofs:
                    oechem.OEWriteMolecule(ofs, self.act_list[idx])
                with open(self.query.name, "rb") as query_file:
                    response = requests.post(url,
                                             files={"query": query_file},
                                             data=parameters)
            finally:
                # Remove the temporary file even when the request fails.
                os.remove(self.query.name)
            self.query_id_list.append(response.json()["id"])

    def get_results(self):
        """Wait for every query and keep the best score of each molecule.

        Fills ``self.cur_scores`` with ``{title: (score, molecule copy)}``
        for the result molecules that are not part of the baitset; when a
        title shows up in several results the highest ``TanimotoCombo``
        wins.
        """
        self.cur_scores = {}
        bait_indices = self.baitset[1]
        title_to_index = self.dataset_infos[1]

        for query_id in self.query_id_list:
            url = self.args.url + "/queries/{}/".format(query_id)
            tries = 0
            # Poll with a linearly growing delay (1s, 2s, ...).
            # NOTE(review): a job that never reaches "COMPLETED" keeps this
            # loop running forever - confirm the server's failure statuses.
            while True:
                tries += 1
                time.sleep(tries)
                data = requests.get(url).json()
                if data["status"]["job"] == "COMPLETED":
                    break
            results_data = requests.get(self.args.url + data["results"])

            temp = tempfile.NamedTemporaryFile(suffix='.oeb',
                                               mode='wb',
                                               delete=False)
            try:
                # Close the file before it is read back and removed.
                with temp:
                    temp.write(results_data.content)
                with oechem.oemolistream(temp.name) as results:
                    for mol in results.GetOEGraphMols():
                        title = mol.GetTitle()
                        if title_to_index[title] in bait_indices:
                            continue
                        tanimoto_combo = float(
                            oechem.OEGetSDData(mol, "TanimotoCombo"))
                        best = self.cur_scores.get(title)
                        if best is None or best[0] < tanimoto_combo:
                            self.cur_scores[title] = (tanimoto_combo,
                                                      mol.CreateCopy())
            finally:
                os.remove(temp.name)

    def update_ranking(self, mol, max_tanimoto, ka_tag):
        """Insert ``mol`` into ``self.ranking`` according to its score.

        The molecule is placed after every leading entry with a strictly
        higher score. Nothing happens when the ranking already holds
        ``topn`` entries and the score is below the last one. Afterwards the
        ranking is cut back to ``topn`` entries plus any entries tied with
        the last kept score.
        """
        if (self.ranking and len(self.ranking) >= self.args.topn
                and max_tanimoto < self.ranking[-1][2]):
            return

        # Use the running position instead of list.index(): index() is a
        # linear search and returns the first of several equal entries.
        index = 0
        for position, top_mol in enumerate(self.ranking):
            if max_tanimoto < top_mol[2]:
                index = position + 1
            else:
                break

        entry = (oechem.OEMolToSmiles(mol), mol.GetTitle(), max_tanimoto,
                 self.baitset[0], ka_tag)
        # A new list is built so the list received as input is not mutated.
        self.ranking = self.ranking[:index] + [entry] + self.ranking[index:]

        i = self.args.topn - 1
        while i < len(self.ranking) - 1:
            if self.ranking[i][2] != self.ranking[i + 1][2]:
                self.ranking = self.ranking[:i + 1]
                break
            i += 1

    def end(self):
        pass
class ParallelFastROCSRanking(ComputeCube):
    """
    A compute Cube that receives a list of molecules and a baitset of
    indices, queries a FastROCS server with every baitset molecule and
    returns the merged ranking of the server molecules.

    The input is an ``(act_list, baitset, ranking, dataset_infos)`` tuple
    where ``baitset`` is ``(baitset_id, indices)``. Ranking entries are
    ``(smiles, title, score, baitset_id, ka_tag)`` tuples ordered by
    decreasing score.
    """

    classification = [["Compute", "FastROCS", "Similarity"]]

    url = parameter.StringParameter(
        'url',
        default="http://10.0.61.25:4711",
        help_text="Url of the FastROCS Server for the request")

    dataset_name = parameter.StringParameter(
        'dataset_name',
        default="screening_database",
        help_text="Name of the screening database")

    topn = parameter.IntegerParameter(
        'topn',
        default=100,
        help_text="Number of top molecules returned in the ranking")

    data_input = ObjectInputPort('data_input')
    success = ObjectOutputPort('success')

    def begin(self):
        pass

    def process(self, data, port):
        """Rank the screening dataset against one baitset.

        Emits ``(act_list, baitset, ranking, dataset_infos, 'FastROCS')`` on
        ``success``.
        """
        self.act_list = data[0]
        self.baitset = data[1]
        self.ranking = data[2]
        self.dataset_infos = data[3]

        self.log.info("start ranking baitset number {}".format(
            self.baitset[0]))

        # Look up the identifier of the screening dataset by its name.
        url = self.args.url + "/datasets/?name={}".format(
            self.args.dataset_name)
        dataset = requests.get(url).json()
        self.dataset_identifier = int(dataset["id"])

        count = 0
        self.add_queries()
        for query_id in self.query_id_list:
            cur_rank = self.get_result(query_id)
            if len(self.ranking) == 0:
                self.ranking = cur_rank
            else:
                self.merge_ranking(cur_rank)
            count += 1
            self.log.info("Baitset " + str(self.baitset[0]) + " : " +
                          str(count) + " requests processed")

        sys.stdout.flush()
        self.log.info("Emitting ranking baitset " + str(self.baitset[0]))
        self.success.emit((self.act_list, self.baitset, self.ranking,
                           self.dataset_infos, 'FastROCS'))

    def add_queries(self):
        """Post one query per baitset molecule and record the query ids in
        ``self.query_id_list``."""
        url = self.args.url + "/queries/"
        self.query_id_list = []
        parameters = {
            "num_hits": self.args.topn,
            "dataset_identifier": self.dataset_identifier,
        }
        for idx in self.baitset[1]:
            # Only a unique file name is needed: the molecule is written
            # through an OpenEye stream, so the handle is closed right away
            # instead of being leaked.
            self.query = tempfile.NamedTemporaryFile(suffix='.oeb',
                                                     mode='wb',
                                                     delete=False)
            self.query.close()
            try:
                with oechem.oemolostream(self.query.name) as ofs:
                    oechem.OEWriteMolecule(ofs, self.act_list[idx])
                with open(self.query.name, "rb") as query_file:
                    response = requests.post(url,
                                             files={"query": query_file},
                                             data=parameters)
            finally:
                # Remove the temporary file even when the request fails.
                os.remove(self.query.name)
            self.query_id_list.append(response.json()["id"])

    def get_result(self, query_id):
        """Wait for ``query_id`` to complete and return its hits.

        Returns a list of ``(smiles, title, TanimotoCombo, baitset_id,
        False)`` tuples in the order of the result file.
        """
        url = self.args.url + "/queries/{}/".format(query_id)
        tries = 0
        # The first request is immediate, then the delay grows by a minute
        # per attempt.
        # NOTE(review): a job that never reaches "COMPLETED" keeps this loop
        # running forever - confirm the server's failure statuses.
        while True:
            time.sleep(60 * tries)
            tries += 1
            data = requests.get(url).json()
            if data["status"]["job"] == "COMPLETED":
                break
        results_data = requests.get(self.args.url + data["results"])

        cur_rank = []
        temp = tempfile.NamedTemporaryFile(suffix='.oeb',
                                           mode='wb',
                                           delete=False)
        try:
            # Close the file before it is read back and removed.
            with temp:
                temp.write(results_data.content)
            with oechem.oemolistream(temp.name) as results:
                for mol in results.GetOEGraphMols():
                    cur_rank.append(
                        (oechem.OEMolToSmiles(mol), mol.GetTitle(),
                         float(oechem.OEGetSDData(mol, 'TanimotoCombo')),
                         self.baitset[0], False))
        finally:
            os.remove(temp.name)
        return cur_rank

    def merge_ranking(self, ranking):
        """Merge ``ranking`` into ``self.ranking``.

        Both lists are expected to be ordered by decreasing score. The merge
        keeps the first occurrence of every title, prefers the entry of
        ``self.ranking`` when scores are equal, and stops filling once
        ``topn`` entries are kept, except for entries tied with the last
        kept score.
        """
        merged_list = []
        id_set = set()
        topn = self.args.topn

        def accept(entry):
            """Keep ``entry`` if there is room left or it ties with the last
            kept score; return whether it was kept."""
            if len(merged_list) < topn or (
                    merged_list and entry[2] == merged_list[-1][2]):
                merged_list.append(entry)
                id_set.add(entry[1])
                return True
            return False

        i = 0
        j = 0
        while i < len(self.ranking):
            # Take every new entry that strictly outscores the current one.
            while j < len(ranking) and ranking[j][2] > self.ranking[i][2]:
                if ranking[j][1] in id_set:
                    j += 1
                elif accept(ranking[j]):
                    j += 1
                else:
                    break

            # Duplicates are detected on the title only; the former extra
            # test of the whole entry against the id set was always true.
            if self.ranking[i][1] in id_set:
                i += 1
            elif accept(self.ranking[i]):
                i += 1
            else:
                break

        # Whatever is left of the new ranking.
        while j < len(ranking):
            if ranking[j][1] in id_set:
                j += 1
            elif accept(ranking[j]):
                j += 1
            else:
                break

        self.ranking = merged_list
class ParallelFastFPInsertKA(ParallelComputeCube):
    """
    Parallel cube that inserts the molecules of ``act_list`` which are not
    part of the baitset into an existing fingerprint ranking.

    The input is an ``(act_list, baitset, ranking)`` tuple where ``baitset``
    is ``(baitset_id, indices)``. Every molecule outside the baitset is
    scored with its highest Tanimoto similarity to the baitset molecules.
    Ranking entries are ``(smiles, title, score, baitset_id, ka_tag)``
    tuples ordered by decreasing score.
    """

    classification = [["ParallelCompute"]]

    fptype = parameter.IntegerParameter(
        'fptype',
        default=105,
        help_text="Fingerprint type to use for the ranking")

    topn = parameter.IntegerParameter(
        'topn',
        default=100,
        help_text="Number of top molecules returned in the ranking")

    data_input = ObjectInputPort('data_input')
    success = ObjectOutputPort('success')

    def process(self, data, port):
        """Update the ranking of one item and emit
        ``(act_list, baitset, ranking, 'Fingerprint')``."""
        self.act_list = data[0]
        self.baitset = data[1]
        self.ranking = data[2]
        self.fp_list = []

        self.calculate_fp()
        self.insert_known_actives()

        self.success.emit(
            (self.act_list, self.baitset, self.ranking, 'Fingerprint'))

    def calculate_fp(self):
        """Append one fingerprint of type ``fptype`` per molecule of
        ``act_list`` to ``self.fp_list``, in the same order."""
        for mol in self.act_list:
            fp = oegraphsim.OEFingerPrint()
            oegraphsim.OEMakeFP(fp, mol, self.args.fptype)
            self.fp_list.append(fp)

    def insert_known_actives(self):
        """Score and insert every molecule whose index is not in the
        baitset, tagged as known active."""
        # Membership test rather than a walk over the index list: the result
        # no longer depends on the baitset being sorted and duplicate-free.
        bait_indices = set(self.baitset[1])
        for position, mol in enumerate(self.act_list):
            if position in bait_indices:
                continue
            simval = self.calc_sim_val(self.fp_list[position])
            self.update_ranking(mol, simval, True)

    def calc_sim_val(self, fp):
        """Return the highest Tanimoto similarity between ``fp`` and the
        baitset fingerprints (0 for an empty baitset)."""
        maxval = 0
        for idx in self.baitset[1]:
            # OETanimoto is provided by the OEGraphSim toolkit.
            tanimoto = oegraphsim.OETanimoto(fp, self.fp_list[idx])
            if tanimoto > maxval:
                maxval = tanimoto
        return maxval

    def update_ranking(self, mol, max_tanimoto, ka_tag):
        """Insert ``mol`` into ``self.ranking`` according to its score.

        The molecule is placed after every leading entry with a strictly
        higher score. Nothing happens when the ranking already holds
        ``topn`` entries and the score is below the last one. Afterwards the
        ranking is cut back to ``topn`` entries plus any entries tied with
        the last kept score.
        """
        if (self.ranking and len(self.ranking) >= self.args.topn
                and max_tanimoto < self.ranking[-1][2]):
            return

        # Use the running position instead of list.index(): index() is a
        # linear search and returns the first of several equal entries.
        index = 0
        for position, top_mol in enumerate(self.ranking):
            if max_tanimoto < top_mol[2]:
                index = position + 1
            else:
                break

        entry = (oechem.OEMolToSmiles(mol), mol.GetTitle(), max_tanimoto,
                 self.baitset[0], ka_tag)
        # A new list is built so the list received as input is not mutated.
        self.ranking = self.ranking[:index] + [entry] + self.ranking[index:]

        i = self.args.topn - 1
        while i < len(self.ranking) - 1:
            if self.ranking[i][2] != self.ranking[i + 1][2]:
                self.ranking = self.ranking[:i + 1]
                break
            i += 1

    def end(self):
        pass
class ParallelFastFPRanking(ParallelComputeCube):
    """
    A compute Cube that receives a list of molecules with a baitset of
    indices, requests a hit list from a FastFingerPrint server for every
    baitset molecule and returns the merged ranking.

    The input is an ``(act_list, baitset, ranking)`` tuple where ``baitset``
    is ``(baitset_id, indices)``. Ranking entries are
    ``(smiles, title, score, baitset_id, ka_tag)`` tuples ordered by
    decreasing score.
    """

    classification = [["Compute", "Fingerprint", "Similarity"]]

    url = parameter.StringParameter(
        'url',
        default="http://10.0.62.124:8081",
        help_text="Url of the FastFingerPrint Server for the request")

    fptype = parameter.IntegerParameter(
        'fptype',
        default=105,
        help_text="Fingerprint type to use for the ranking")

    topn = parameter.IntegerParameter(
        'topn',
        default=100,
        help_text="Number of top molecules returned in the ranking")

    data_input = ObjectInputPort('data_input')
    success = ObjectOutputPort('success')

    def begin(self):
        pass

    def process(self, data, port):
        """Rank the server database against one baitset and emit
        ``(act_list, baitset, ranking)``.

        Raises:
            KeyError: if ``fptype`` is not one of 102, 104 or 105.
        """
        self.act_list = data[0]
        self.baitset = data[1]
        self.ranking = data[2]
        fptypes = {102: 'path', 104: 'circular', 105: 'tree'}
        database = fptypes[self.args.fptype] + "_db"
        for idx in self.baitset[1]:
            smiles = oechem.OEMolToSmiles(self.act_list[idx])
            safe_smiles = parse.quote(smiles)
            url = "%s/%s/hitlist?smiles=%s&oformat=csv&maxhits=%d" % (
                self.args.url, database, safe_smiles, self.args.topn)
            response = requests.get(url)
            lines = response.content.decode().split('\n')

            cur_rank = []
            # The first line is skipped as the CSV header. Blank lines are
            # ignored instead of blindly dropping the last line, so an empty
            # response no longer raises and a body without trailing newline
            # does not lose its last hit.
            for line in lines[1:]:
                if not line.strip():
                    continue
                cur_mol = line.split(',')
                cur_rank.append((cur_mol[0], cur_mol[1], float(cur_mol[4]),
                                 self.baitset[0], False))

            if len(self.ranking) == 0:
                self.ranking = cur_rank
            else:
                self.merge_ranking(cur_rank)

        self.success.emit((self.act_list, self.baitset, self.ranking))

    def merge_ranking(self, ranking):
        """Merge ``ranking`` into ``self.ranking``.

        Both lists are expected to be ordered by decreasing score. The merge
        keeps the first occurrence of every title, prefers the entry of
        ``self.ranking`` when scores are equal, and stops filling once
        ``topn`` entries are kept, except for entries tied with the last
        kept score.
        """
        merged_list = []
        id_set = set()
        topn = self.args.topn

        def accept(entry):
            """Keep ``entry`` if there is room left or it ties with the last
            kept score; return whether it was kept."""
            if len(merged_list) < topn or (
                    merged_list and entry[2] == merged_list[-1][2]):
                merged_list.append(entry)
                id_set.add(entry[1])
                return True
            return False

        i = 0
        j = 0
        while i < len(self.ranking):
            # Take every new entry that strictly outscores the current one.
            while j < len(ranking) and ranking[j][2] > self.ranking[i][2]:
                if ranking[j][1] in id_set:
                    j += 1
                elif accept(ranking[j]):
                    j += 1
                else:
                    break

            # Duplicates are detected on the title only; the former extra
            # test of the whole entry against the id set was always true.
            if self.ranking[i][1] in id_set:
                i += 1
            elif accept(self.ranking[i]):
                i += 1
            else:
                break

        # Whatever is left of the new ranking.
        while j < len(ranking):
            if ranking[j][1] in id_set:
                j += 1
            elif accept(ranking[j]):
                j += 1
            else:
                break

        self.ranking = merged_list

    def update_ranking(self, mol, max_tanimoto, ka_tag):
        """Insert ``mol`` into ``self.ranking`` according to its score.

        The molecule is placed after every leading entry with a strictly
        higher score. Nothing happens when the ranking already holds
        ``topn`` entries and the score is below the last one. Afterwards the
        ranking is cut back to ``topn`` entries plus any entries tied with
        the last kept score. This method is not called by ``process``.
        """
        if (self.ranking and len(self.ranking) >= self.args.topn
                and max_tanimoto < self.ranking[-1][2]):
            return

        # Use the running position instead of list.index(): index() is a
        # linear search and returns the first of several equal entries.
        index = 0
        for position, top_mol in enumerate(self.ranking):
            if max_tanimoto < top_mol[2]:
                index = position + 1
            else:
                break

        entry = (oechem.OEMolToSmiles(mol), mol.GetTitle(), max_tanimoto,
                 self.baitset[0], ka_tag)
        # A new list is built so the list received as input is not mutated.
        self.ranking = self.ranking[:index] + [entry] + self.ranking[index:]

        i = self.args.topn - 1
        while i < len(self.ranking) - 1:
            if self.ranking[i][2] != self.ranking[i + 1][2]:
                self.ranking = self.ranking[:i + 1]
                break
            i += 1
class PrepareRanking(ComputeCube):
    """
    Compute cube that pairs the list of active molecules received on
    ``act_input`` with every baitset received on ``baitset_input``.

    Baitsets are buffered until the molecule list is available. One item is
    then emitted per baitset: ``(act_list, baitset, ranking)``, extended with
    ``dataset_infos`` when ``method`` is ``'FastROCS'``. In that case the
    molecules are first uploaded as a dataset to the server.
    """

    url = parameter.StringParameter(
        'url',
        default="http://10.0.1.22:4242",
        help_text="Url of the Restful FastROCS Server for the request")

    method = parameter.StringParameter('method',
                                       default='Fingerprint',
                                       help_text='Method used for the ranking')

    act_input = ObjectInputPort('act_input')
    baitset_input = ObjectInputPort('baitset_input')
    success = ObjectOutputPort('success')

    def begin(self):
        """Reset the buffers before any item is processed."""
        self.baitsets = []
        self.act_list = []
        self.ranking = []
        self.dataset_infos = None

    def process(self, data, port):
        """Store the incoming item, then emit every buffered baitset once
        the molecule list is known."""
        # Port names are compared by value: identity comparison (``is``)
        # against a string literal is not guaranteed to match.
        if port == 'act_input':
            self.act_list = data
            if self.args.method == 'FastROCS':
                self.dataset_infos = self.add_dataset()
        elif port == 'baitset_input':
            self.baitsets.append(data)

        if len(self.act_list) > 0:
            while self.baitsets:
                if self.args.method == 'FastROCS':
                    self.success.emit((self.act_list, self.baitsets.pop(),
                                       self.ranking, self.dataset_infos))
                else:
                    self.success.emit(
                        (self.act_list, self.baitsets.pop(), self.ranking))

    def add_dataset(self):
        """Upload ``act_list`` as a dataset to the server.

        Returns:
            A ``(dataset_id, act_mol_idx)`` tuple where ``dataset_id`` is the
            ``"id"`` field of the server's JSON answer and ``act_mol_idx``
            maps each molecule title to its index in ``act_list``.
        """
        url = self.args.url + "/datasets/"
        act_mol_idx = {}

        # Only a unique file name is needed: the molecules are written
        # through an OpenEye stream, so the handle is closed right away
        # instead of being leaked.
        self.dataset = tempfile.NamedTemporaryFile(suffix='.oeb',
                                                   mode='wb',
                                                   delete=False)
        self.dataset.close()
        try:
            with oechem.oemolostream(self.dataset.name) as ofs:
                for idx, mol in enumerate(self.act_list):
                    act_mol_idx[mol.GetTitle()] = idx
                    oechem.OEWriteMolecule(ofs, mol)

            with open(self.dataset.name, 'rb') as dataset:
                parameters = {
                    "dataset": (self.dataset.name, dataset,
                                'application/octet-stream'),
                    "name": 'dataset of active molecules',
                }
                multipart_data = MultipartEncoder(fields=parameters)
                response = requests.post(
                    url,
                    data=multipart_data,
                    headers={"content-type": multipart_data.content_type})
        finally:
            # Remove the temporary file even when the upload fails.
            os.remove(self.dataset.name)

        data = response.json()
        return (data["id"], act_mol_idx)