Esempio n. 1
0
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Fit a decision tree on the model data and score the test set.

        Trains on active (label 1) and inactive (label 0) descriptor rows,
        predicts on the descriptors read from descriptors_file, and writes
        one {"name", "score"} JSON object per line to output_file.
        """
        inputoutput_utils.create_parent_directory(output_file)
        classifier = tree.DecisionTreeClassifier()
        active_data = model_configuration["data"]["active"]
        inactive_data = model_configuration["data"]["inactive"]
        # Labels follow the concatenation order of the training data.
        labels = [1] * len(active_data) + [0] * len(inactive_data)
        classifier.fit(active_data + inactive_data, labels)
        predictions = classifier.predict(
            _extract_descriptors(descriptors_file))
        names = _extract_names(fragments_file)
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for position, predicted in enumerate(predictions):
                # Newline-separate records without a trailing newline.
                if position > 0:
                    output_stream.write("\n")
                json.dump({"name": names[position],
                           "score": float(predicted)}, output_stream)
def extract_fragments(input_files: list, input_type: str, output_files: list,
                      extraction_options: dict):
    """Extract fragments from each input file into a JSON-lines output.

    Molecules are loaded via the loader registered for input_type; each
    produces one record (name, SMILES, fragments) in the matching output
    file. Logs the total number of fragments extracted.
    """
    for output_path in output_files:
        inputoutput_utils.create_parent_directory(output_path)

    fragment_count = 0
    load_molecules = _LOAD_FUNCTIONS[input_type]
    for file_index, source_path in enumerate(input_files):
        # The holder tells the JSON-lines writer whether a separator
        # is still needed.
        holder = {"first": True}
        with open(output_files[file_index], "w",
                  encoding="utf-8") as output_stream:
            for molecule in load_molecules(source_path):
                record = {
                    "name": molecule.GetProp("_Name"),
                    "smiles": rdkit.Chem.MolToSmiles(molecule),
                    "fragments": _extract_fragments_from_molecule(
                        molecule, extraction_options["fragments"],
                        extraction_options),
                }
                fragment_count += len(record["fragments"])
                _append_object_to_jsonlines(output_stream, record, holder)
    logging.info("\tfragments total: %d", fragment_count)
Esempio n. 3
0
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score each molecule by its best similarity to an active molecule.

     Fragment indexes are canonicalized through the configured groups
     (every member of a group maps to the group's first index) before
     computing the maximum _compute_sim over the actives.
     """
     inputoutput_utils.create_parent_directory(output_file)
     groups = model_configuration["configuration"]["groups"]
     actives = model_configuration["data"]["active"]
     with open(output_file, "w", encoding="utf-8") as output_stream:
         with open(fragments_file, "r", encoding="utf-8") as input_stream:
             for position, raw_line in enumerate(input_stream):
                 molecule = json.loads(raw_line)
                 canonical_indexes = []
                 for fragment in molecule["fragments"]:
                     fragment_index = fragment["index"]
                     for group in groups:
                         if fragment_index in group:
                             # Represent the whole group by its first member.
                             fragment_index = group[0]
                             break
                     if fragment_index not in canonical_indexes:
                         canonical_indexes.append(fragment_index)
                 best = max(_compute_sim(active, canonical_indexes)
                            for active in actives)
                 if position > 0:
                     output_stream.write("\n")
                 json.dump({"name": molecule["name"], "score": best},
                           output_stream)
Esempio n. 4
0
def print_graph(activity_files: list, directory: str, nicknames: list,
                input_type: str):
    """Plot one metric (AUC/EF1/EF5/...) across several activity files.

    Saves the figure to <directory>/<METRIC>.png (or ./<METRIC>.png when
    directory is empty) and opens a fresh figure afterwards.
    """
    metric = input_type.upper()
    metric_values = []
    for activity_path in activity_files:
        with open(activity_path, "r", encoding="utf-8") as activity_file:
            for raw_line in activity_file:
                metric_values.append(json.loads(raw_line)[metric])
    plt.plot(nicknames, metric_values, marker="o")
    # Human-friendly axis labels for the enrichment factors.
    axis_labels = {"EF1": "EF 1%", "EF5": "EF 5%"}
    plt.ylabel(axis_labels.get(metric, metric))

    if directory != "":
        file_name = directory + "/" + metric + ".png"
    else:
        file_name = metric + ".png"
    inputoutput_utils.create_parent_directory(file_name)
    plt.xticks(rotation=90, fontsize="x-small")
    plt.tight_layout()
    plt.savefig(file_name, dpi=150)
    plt.figure()
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score molecules with a Tanimoto-like overlap against the actives.

     The molecule's distinct fragment indexes are mapped via _make_groups
     before counting how many fall into the active set; the score is
     overlap / (|actives| + |distinct indexes| - overlap).
     """
     inputoutput_utils.create_parent_directory(output_file)
     actives = model_configuration["data"]["active"]
     groups = model_configuration["configuration"]["groups"]
     with open(output_file, "w", encoding="utf-8") as output_stream:
         with open(fragments_file, "r", encoding="utf-8") as input_stream:
             for position, raw_line in enumerate(input_stream):
                 molecule = json.loads(raw_line)
                 distinct_indexes = []
                 for fragment in molecule["fragments"]:
                     if fragment["index"] not in distinct_indexes:
                         distinct_indexes.append(fragment["index"])
                 grouped = _make_groups(distinct_indexes, groups)
                 overlap = sum(1 for index in grouped if index in actives)
                 similarity = overlap / (len(actives)
                                         + len(distinct_indexes) - overlap)
                 if position > 0:
                     output_stream.write("\n")
                 json.dump({"name": molecule["name"], "score": similarity},
                           output_stream)
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Score molecules by the best Tanimoto similarity of hashed
        atom-pair fingerprints against the active molecules."""
        inputoutput_utils.create_parent_directory(output_file)
        nbits = model_configuration["configuration"]["nbits"]

        def fingerprint_of(smiles: str):
            # SMILES may arrive wrapped in literal quotes; strip them.
            molecule = Chem.MolFromSmiles(smiles.strip("\""))
            return Pairs.GetHashedAtomPairFingerprint(molecule, nBits=nbits)

        active_fingerprints = [
            fingerprint_of(smiles)
            for smiles in model_configuration["data"]["active"]
        ]
        with open(output_file, "w", encoding="utf-8") as output_stream:
            with open(fragments_file, "r", encoding="utf-8") as input_stream:
                for position, raw_line in enumerate(input_stream):
                    molecule = json.loads(raw_line)
                    test_fingerprint = fingerprint_of(molecule["smiles"])
                    best = max(
                        DataStructs.TanimotoSimilarity(test_fingerprint,
                                                       reference)
                        for reference in active_fingerprints)
                    if position > 0:
                        output_stream.write("\n")
                    json.dump({"name": molecule["name"], "score": best},
                              output_stream)
Esempio n. 7
0
def _print_histogram(baseline_val: int, input_file: str,
                     output_directory: str) -> list:
    """Plot a 10-bin histogram of the AUC values found in input_file.

    Saves the figure to <output_directory>/AUC.png annotated with the
    baseline AUC, and returns the list of AUC values read.
    (Fix: removed the unused local `props`.)
    """
    inputoutput_utils.create_parent_directory(output_directory + "/0")
    textstr = "baseline AUC: " + str(float(round(baseline_val, 5)))
    auc = []
    with open(input_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            auc.append(json.loads(new_line)["AUC"])
    min_val = min(auc)
    max_val = max(auc)
    diff_step = (max_val - min_val) / 10
    # 11 bin edges spanning [min_val, max_val] in 10 equal steps.
    steps = [min_val + i * diff_step for i in range(10)]
    steps.append(max_val)
    arr = plt.hist(auc, bins=steps, color="blue")
    plt.xticks(steps, rotation=90)
    for i in range(10):
        # Label each bar with its count, centered over the bin.
        plt.text(arr[1][i] + diff_step / 2,
                 arr[0][i],
                 str(int(arr[0][i])),
                 horizontalalignment="center")
    plt.text(steps[7], int(arr[0][0]), textstr)
    plt.tight_layout()
    plt.savefig(output_directory + "/AUC.png", dpi=1000)
    plt.figure()
    return auc
Esempio n. 8
0
def _model_and_score_and_evaluate(active_fragments: str, test_fragments: str, test_activity: str,
                                  num: int, output_directory: str, maximal_num: int):
    """Build, score, and evaluate every model spec in one configuration file.

    Reads configuration<maximal_num>_<num>.json (one JSON model spec per
    line). For each spec whose evaluation output does not exist yet it
    creates the model from the active fragments, scores the test fragments,
    attaches activities, and writes the metrics under
    <output_directory>/evaluations/.
    """
    inputoutput_utils.create_parent_directory(output_directory + "/scorefiles/0")
    inputoutput_utils.create_parent_directory(output_directory + "/activities/0")
    with open(output_directory + "/configurationfiles/configuration" + str(maximal_num) + "_" + str(num) + ".json", "r",
              encoding="utf-8") as input_file:
        for new_line in input_file:
            line = json.loads(new_line)
            # Skip specs already evaluated (allows resuming a partial run).
            if os.path.isfile(output_directory + "/evaluations/" + line["evaluation"]):
                continue

            new_model = model_factory.create_model(line["model_name"])
            model = new_model.create_model(active_fragments, "", "", "",
                                           line)
            new_model.score_model(model, test_fragments, "",
                                  output_directory + "/scorefiles/score" + line["evaluation"])

            # run add_activity
            activity = add_activity.read_activity(test_activity)
            add_activity.add_activity_and_write_to_json(output_directory + "/scorefiles/score" + line["evaluation"],
                                                        activity,
                                                        output_directory + "/activities/activity" + line["evaluation"])

            # run compute_evaluation
            score_act = compute_evaluation.read_file_with_score_and_activity(output_directory + "/activities/activity"
                                                                            + line["evaluation"])
            activity = compute_evaluation.sort_activity(score_act)
            compute_evaluation.evaluation(activity, output_directory + "/evaluations/" + line["evaluation"])
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score molecules by the best Tanimoto similarity of topological
     torsion fingerprints against the active molecules."""
     inputoutput_utils.create_parent_directory(output_file)
     torsion_size = int(
         model_configuration["configuration"]["fragments"][0]["size"])

     def fingerprint_of(smiles: str):
         # SMILES may arrive wrapped in literal quotes; strip them.
         molecule = Chem.MolFromSmiles(smiles.strip("\""))
         return Torsions.GetTopologicalTorsionFingerprintAsIntVect(
             molecule, torsion_size)

     active_fingerprints = [
         fingerprint_of(smiles)
         for smiles in model_configuration["data"]["active"]
     ]
     with open(output_file, "w", encoding="utf-8") as output_stream:
         with open(fragments_file, "r", encoding="utf-8") as input_stream:
             for position, raw_line in enumerate(input_stream):
                 molecule = json.loads(raw_line)
                 test_fingerprint = fingerprint_of(molecule["smiles"])
                 best = max(
                     DataStructs.TanimotoSimilarity(test_fingerprint,
                                                    reference)
                     for reference in active_fingerprints)
                 if position > 0:
                     output_stream.write("\n")
                 json.dump({"name": molecule["name"], "score": best},
                           output_stream)
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Score molecules by the best Tanimoto overlap of hashed fragment
        indexes against each active molecule's index list."""
        inputoutput_utils.create_parent_directory(output_file)
        nbit = int(model_configuration["configuration"]["nbits"])
        actives = model_configuration["data"]["active"]

        with open(output_file, "w", encoding="utf-8") as output_stream:
            with open(fragments_file, "r", encoding="utf-8") as input_stream:
                for position, raw_line in enumerate(input_stream):
                    molecule = json.loads(raw_line)
                    # Fold every fragment index into the nbit hash space.
                    test_indexes = [
                        int(fragment["index"]) % nbit
                        for fragment in molecule["fragments"]
                    ]
                    best = 0
                    for active_indexes in actives:
                        shared = _intersection_of_two_arrays(active_indexes,
                                                             test_indexes)
                        similarity = len(shared) / (len(test_indexes)
                                                    + len(active_indexes)
                                                    - len(shared))
                        best = max(best, similarity)
                    if position > 0:
                        output_stream.write("\n")
                    json.dump({"name": molecule["name"], "score": best},
                              output_stream)
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score each molecule by the average per-fragment bonus/penalty.

     A fragment whose index appears in the model data adds
     active_parameter; any other fragment subtracts inactive_parameter.
     The score is the total divided by the fragment count. Writes one
     {"name", "score"} JSON object per line to output_file.
     """
     inputoutput_utils.create_parent_directory(output_file)
     active_parameter = int(
         model_configuration["configuration"]["active_parameter"])
     inactive_parameter = int(
         model_configuration["configuration"]["inactive_parameter"])
     # Precompute the known indexes once (was an O(n) scan per fragment).
     known_indexes = {int(index) for index in model_configuration["data"]}
     # Explicit encoding added for consistency with the other
     # score_model implementations.
     with open(output_file, "w", encoding="utf-8") as streamo:
         first = True
         with open(fragments_file, "r", encoding="utf-8") as stream:
             for line in stream:
                 molecule = json.loads(line)
                 suma = 0
                 for fragment in molecule["fragments"]:
                     if int(fragment["index"]) in known_indexes:
                         suma += active_parameter
                     else:
                         suma -= inactive_parameter
                 # NOTE(review): a molecule with zero fragments divides by
                 # zero here, exactly as in the original — confirm inputs
                 # always carry at least one fragment.
                 sim = suma / len(molecule["fragments"])
                 score = {"name": molecule["name"], "score": sim}
                 if first:
                     first = False
                 else:
                     streamo.write("\n")
                 json.dump(score, streamo)
def _main():
    """Select the N best records from a metrics file and write them out.

    Sorts the records by the configured primary metric (descending,
    stable) and writes the top configuration["best"] records as JSON
    lines, each carrying the groups and all three metrics.
    """
    configuration = _read_configuration()
    inputoutput_utils.create_parent_directory(configuration["output"])
    metric = configuration["type"]
    # Which two secondary metrics accompany each primary metric.
    secondary_metrics = {
        "AUC": ("EF1", "EF5"),
        "EF1": ("AUC", "EF5"),
        "EF5": ("AUC", "EF1"),
    }
    if metric not in secondary_metrics:
        print("Wrong type!")
        print("It has to be: AUC, EF1 or EF5")
        exit(1)
    str1, str2 = secondary_metrics[metric]
    records = []
    with open(configuration["input_file"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            records.append((line["groups"], line[metric],
                            line[str1], line[str2]))
    best_count = int(configuration["best"])
    if len(records) < best_count:
        print("The input file does not have that much good results!")
        print("Number of results in input file: " + str(len(records)))
        print("Number you wanted to select: " + configuration["best"])
        print("Can not be like that!")
        exit(1)
    # Stable descending sort by the primary metric — replaces the
    # original hand-rolled bubble sort over four parallel lists.
    records.sort(key=lambda record: record[1], reverse=True)
    with open(configuration["output"], "w", encoding="utf-8") as output_stream:
        for groups, primary, first_other, second_other in records[:best_count]:
            model = {
                "groups": groups,
                metric: primary,
                str1: first_other,
                str2: second_other
            }
            json.dump(model, output_stream)
            output_stream.write("\n")
def _main():
    """Create the configured model from fragment/descriptor files and
    save it as JSON."""
    configuration = _read_configuration()
    with open(configuration["configuration"], "r", encoding="utf-8") as input_stream:
        model_configuration = json.load(input_stream)

    new_model = model_factory.create_model(model_configuration["model_name"])
    model = new_model.create_model(configuration["active_fragments"],
                                   configuration["inactive_fragments"],
                                   configuration["active_descriptors"],
                                   configuration["inactive_descriptors"],
                                   model_configuration)

    inputoutput_utils.create_parent_directory(configuration["output"])
    new_model.save_to_json_file(configuration["output"], model)
Esempio n. 14
0
def add_activity_and_write_to_json(input_score: str, activity: list,
                                   output_file: str):
    """Join each score line with its activity value into a JSON-lines file.

    The n-th line of input_score is paired with activity[n].
    """
    inputoutput_utils.create_parent_directory(output_file)
    with open(output_file, "w", encoding="utf-8") as output_stream:
        with open(input_score, "r", encoding="utf-8") as score_stream:
            for line_number, raw_line in enumerate(score_stream):
                parsed = json.loads(raw_line)
                record = {
                    "name": parsed["name"],
                    "score": parsed["score"],
                    "activity": activity[line_number]
                }
                # Newline-separate records without a trailing newline.
                if line_number > 0:
                    output_stream.write("\n")
                json.dump(record, output_stream)
def _print_histogram(input_files: list, nicknames: list,
                     output_directory: str):
    """Draw a stacked histogram of AUC values, one series per input file."""
    inputoutput_utils.create_parent_directory(output_directory + "/0")
    all_auc = []
    for input_path in input_files:
        with open(input_path, "r", encoding="utf-8") as input_stream:
            all_auc.append([json.loads(raw_line)["AUC"]
                            for raw_line in input_stream])
    plt.hist(all_auc, stacked=True, density=False)
    plt.legend(nicknames, loc="upper right")
    plt.savefig(output_directory + "/AUC.png", dpi=1000)
    plt.figure()
def _make_configuration_files(input_file: str, output_directory: str,
                              model_name: str, cpu_counts: int,
                              cutoff_val: int) -> list:
    """Split all index-pair model configurations across cpu_counts files.

    Collects the distinct fragment indexes from input_file, forms every
    unordered pair, and writes the pairs as JSON-lines model specs into
    configuration<i>.json (one file per CPU; the last also takes the
    remainder). Returns the list of range boundaries used for the split.
    """
    from itertools import combinations

    inputoutput_utils.create_parent_directory(output_directory +
                                              "/configurationfiles/0")
    active_indexes = []
    with open(input_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            for item in line["fragments"]:
                if item["index"] not in active_indexes:
                    active_indexes.append(item["index"])

    # Every unordered pair of distinct indexes, in first-seen order
    # (replaces the hand-rolled double loop).
    pair_list = [list(pair) for pair in combinations(active_indexes, 2)]

    chunk_size = len(pair_list) // cpu_counts
    ranges = [i * chunk_size for i in range(cpu_counts)]
    ranges.append(len(pair_list))

    for i in range(cpu_counts):
        output_file = (output_directory + "/configurationfiles/configuration"
                       + str(i) + ".json")
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for j in range(ranges[i], ranges[i + 1]):
                if cutoff_val == -1:
                    model = {
                        "model_name": model_name,
                        "groups": [pair_list[j]]
                    }
                else:
                    model = {
                        "model_name": model_name,
                        "cutoff": cutoff_val,
                        "groups": [pair_list[j]]
                    }
                # Newline-separate records without a trailing newline.
                if j > ranges[i]:
                    output_stream.write("\n")
                json.dump(model, output_stream)

    return ranges
def evaluation(activity_arr: list, output_file: str):
    """Compute AUC/EF1/EF5/RIE/BEDROC metrics and write them as JSON."""
    inputoutput_utils.create_parent_directory(output_file)
    metrics = {
        "AUC": Scoring.CalcAUC(activity_arr, 0),
        "EF1": Scoring.CalcEnrichment(activity_arr, 0, [0.01])[0],
        "EF5": Scoring.CalcEnrichment(activity_arr, 0, [0.05])[0],
        "RIE": Scoring.CalcRIE(activity_arr, 0, 100),
        "BEDROC": Scoring.CalcBEDROC(activity_arr, 0, 100)
    }
    with open(output_file, "w", encoding="utf-8") as stream:
        json.dump(metrics, stream)
Esempio n. 18
0
def _print_graph(val: float, values: list, type1: str, output: str):
    """Plot the distribution of metric values against a baseline.

    For AUC a 10-bin histogram is drawn; for other metrics a frequency
    polygon with a red vertical line marking the baseline value.
    (Fixes: removed the unused local `props`; replaced the hand-rolled
    counting loop over ambiguous name `l` with collections.Counter.)
    """
    from collections import Counter

    inputoutput_utils.create_parent_directory(output)
    textstr = "baseline " + type1 + ": " + "%.6f" % (val)
    if type1 == "AUC":
        min_val = min(values)
        max_val = max(values)
        diff_step = (max_val - min_val) / 10
        # 11 bin edges spanning [min_val, max_val] in 10 equal steps.
        steps = [i * diff_step + min_val for i in range(10)]
        steps.append(max_val)
        arr = plt.hist(values, bins=steps, color="blue")
        plt.xticks(steps, rotation=90)
        for i in range(10):
            plt.text(arr[1][i], arr[0][i], str(int(arr[0][i])),
                     horizontalalignment="left")
        plt.text(steps[7], int(arr[0][0]), textstr)
        plt.tight_layout()
        plt.savefig(output)
    else:
        # Frequency of each distinct value; sorting first keeps the keys
        # in ascending order, matching the original insertion order.
        counts = Counter(sorted(values))
        item = list(counts.keys())
        values = list(counts.values())
        xval = [val - 2] + [val] + item
        plt.plot(item, values, marker="o")
        plt.xticks(xval)

        plt.axvline(val, color="red")
        plt.savefig(output)

    # NOTE(review): in the non-AUC branch these print the min/max of the
    # frequency counts (values was rebound), not of the metric values —
    # behavior kept identical to the original.
    print(min(values))
    print(max(values))
Esempio n. 19
0
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Score molecules from a CSV of per-fragment descriptor rows.

        Each descriptor row matching a row in the model data adds
        active_parameter, otherwise subtracts inactive_parameter; a
        molecule's score is its per-fragment average. Writes one
        {"name", "score"} JSON object per line to output_file.
        (Fix: the running total was named `sum`, shadowing the builtin.)
        """
        name_num = _read_molecules(fragments_file)
        inputoutput_utils.create_parent_directory(output_file)
        active_parameter = int(
            model_configuration["configuration"]["active_parameter"])
        inactive_parameter = int(
            model_configuration["configuration"]["inactive_parameter"])
        with open(output_file, "w") as streamo:
            first_write = True
            with open(descriptors_file, "r") as stream:
                next(stream)  # skip the CSV header row
                counter = 0
                molecule_num = 0
                total = 0
                for line in stream:
                    # NOTE(review): the last column keeps its trailing
                    # newline from the raw split — presumably the model
                    # data rows were produced the same way; confirm.
                    parts = line.split(",")[2:]
                    found = any(descriptors == parts for descriptors
                                in model_configuration["data"])
                    if found:
                        total += active_parameter
                    else:
                        total -= inactive_parameter
                    counter += 1
                    # Emit a score once all fragments of the current
                    # molecule have been consumed.
                    if counter == name_num[molecule_num]["fragments"]:
                        score = {
                            "name": name_num[molecule_num]["molecule"],
                            "score": total / counter
                        }
                        counter = 0
                        total = 0
                        molecule_num += 1
                        if first_write:
                            first_write = False
                        else:
                            streamo.write("\n")
                        json.dump(score, streamo)
Esempio n. 20
0
 def score_model(self, model_configuration: dict, fragments_file: str,
                 descriptors_file: str, output_file: str):
     """Score each molecule by its best similarity against the actives,
     after mapping fragment indexes through the configured groups."""
     inputoutput_utils.create_parent_directory(output_file)
     groups = model_configuration["configuration"]["groups"]
     actives = model_configuration["data"]["active"]
     with open(output_file, "w", encoding="utf-8") as output_stream:
         with open(fragments_file, "r", encoding="utf-8") as input_stream:
             for position, raw_line in enumerate(input_stream):
                 molecule = json.loads(raw_line)
                 indexes = _add_indexes(molecule["fragments"], groups)
                 best = _compute_sim(actives, indexes)
                 if position > 0:
                     output_stream.write("\n")
                 json.dump({"name": molecule["name"], "score": best},
                           output_stream)
Esempio n. 21
0
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Score molecules by the best Tanimoto similarity of hashed
        Morgan (ECFP) fingerprints against the active molecules."""
        inputoutput_utils.create_parent_directory(output_file)
        configuration = model_configuration["configuration"]
        diameter = int(configuration["fragments"][0]["size"])
        # ECFP sizes are diameters; the Morgan generator takes a radius.
        if diameter % 2 == 1:
            print("Incorrect input, size must be even!")
            exit(1)
        radius = diameter // 2
        nbits = configuration["nbits"]

        def fingerprint_of(smiles: str):
            # SMILES may arrive wrapped in literal quotes; strip them.
            molecule = Chem.MolFromSmiles(smiles.strip("\""))
            return AllChem.GetHashedMorganFingerprint(molecule, radius,
                                                      nBits=nbits)

        active_fingerprints = [
            fingerprint_of(smiles)
            for smiles in model_configuration["data"]["active"]
        ]
        with open(output_file, "w", encoding="utf-8") as output_stream:
            with open(fragments_file, "r", encoding="utf-8") as input_stream:
                for position, raw_line in enumerate(input_stream):
                    molecule = json.loads(raw_line)
                    test_fingerprint = fingerprint_of(molecule["smiles"])
                    best = max(
                        DataStructs.TanimotoSimilarity(test_fingerprint,
                                                       reference)
                        for reference in active_fingerprints)
                    if position > 0:
                        output_stream.write("\n")
                    json.dump({"name": molecule["name"], "score": best},
                              output_stream)
Esempio n. 22
0
def _main():
    """Read the configuration and render the activity graph."""
    configuration = _read_configuration()
    output_file = configuration["output_file"]
    inputoutput_utils.create_parent_directory(output_file)
    _print_graph(configuration["input_activity"], output_file)
def _main():
    """End-to-end pipeline: extract fragments, compute descriptors,
    build the configured model, score the test set, and evaluate.

    All intermediate files are written under configuration["directory"].
    (Fixes: bare `except:` narrowed to `except Exception` so SystemExit /
    KeyboardInterrupt are not swallowed; boolean test uses `or` instead
    of the bitwise `|`.)
    """
    configuration = _read_configuration()

    with open(configuration["model_configuration"], "r", encoding="utf-8") as input_stream:
        model_configuration = json.load(input_stream)
    try:
        new_model = model_factory.create_model(model_configuration["model_name"])
    except Exception:
        print("Model does not exist!")
        exit(1)

    # Normalize optional configuration keys to concrete values.
    if "kekule" not in model_configuration:
        model_configuration["kekule"] = False
    else:
        model_configuration["kekule"] = bool(model_configuration["kekule"])
    if "isomeric" not in model_configuration:
        model_configuration["isomeric"] = False
    else:
        model_configuration["isomeric"] = bool(model_configuration["isomeric"])
    if "fragments" not in model_configuration:
        model_configuration["fragments"] = "ecfp.6"
    # Parse comma-separated "TYPE.SIZE" fragment specs ("ap" has no size).
    parsed_types = []
    for item in model_configuration["fragments"].split(","):
        item_split = item.split(".")
        if item_split[0] != "ap":
            if not len(item_split) == 2:
                logging.error("Invalid fragment type: %s", item)
                logging.info("Expected format {TYPE}.{SIZE} or ap")
                exit(1)
            parsed_types.append({
                "name": item_split[0],
                "size": int(item_split[1])
            })
        else:
            parsed_types.append({
                "name": item_split[0],
            })
    model_configuration["fragments"] = parsed_types

    extraction_options = {
        "kekule": model_configuration["kekule"],
        "isomeric": model_configuration["isomeric"],
        "fragments": model_configuration["fragments"]
    }
    input_files = [configuration["input_actives"], configuration["input_inactives"],
                   configuration["test"]]
    directory = configuration["directory"]
    fragments_output_files = [directory+"/fragmentsa.json", directory+"/fragmentsi.json",
                              directory+"/fragmentst.json"]
    for file in fragments_output_files:
        inputoutput_utils.create_parent_directory(file)
    extract_fragments.extract_fragments(input_files, configuration["input_type"],
                                        fragments_output_files, extraction_options)

    # run extract_descriptors
    descriptors_output_files = [directory+"/descriptorsa.csv", directory+"/descriptorsi.csv",
                                directory+"/descriptorst.csv"]
    for file in descriptors_output_files:
        inputoutput_utils.create_parent_directory(file)
    # These model types need the full descriptor set.
    if (model_configuration["model_name"] == "descriptors_model") or \
            ((model_configuration["model_name"] == "linear_regression_model")
             and (int(model_configuration["molecules"]) == 0)):
        compute_descriptors.compute_descriptors(fragments_output_files, descriptors_output_files,
                                                True)
    else:
        compute_descriptors.compute_descriptors(fragments_output_files, descriptors_output_files, False)

    # run create_model and score_molecules
    model = new_model.create_model(directory+"/fragmentsa.json", directory+"/fragmentsi.json",
                                   directory+"/descriptorsa.csv", directory+"/descriptorsi.csv",
                                   model_configuration)
    new_model.score_model(model, directory+"/fragmentst.json",
                          directory+"/descriptorst.csv", directory+"/score.json")

    # run add_activity
    activity = add_activity.read_activity(configuration["activity"])
    add_activity.add_activity_and_write_to_json(directory + "/score.json", activity,
                                                directory + "/activity.json")

    #  run compute_evaluation
    score_act = compute_evaluation.read_file_with_score_and_activity(directory + "/activity.json")
    activity = compute_evaluation.sort_activity(score_act)
    compute_evaluation.evaluation(activity, configuration["output"])
def _main():
    """Compare per-pair evaluation results against a baseline and bucket
    the winners into JSON-lines files by which metrics they beat.

    Reads:
      * ``input_fragments`` -- JSON lines; collects the distinct fragment
        indexes in order of first appearance.
      * ``baseline_output`` -- JSON lines; the *last* line's AUC/EF1/EF5
        values become the baseline.
      * ``input_directory`` -- one JSON-lines result file per fragment
        pair; the trailing digits of the file stem (after a 10-character
        prefix) select which pair the file belongs to.

    Appends one JSON line per qualifying result to the matching bucket
    files (aucef1ef5.json, aucef1.json, ...) and finally writes the
    baseline itself to baseline.json.
    """
    configuration = _read_configuration()

    # Distinct fragment indexes, preserving first-seen order.
    active_indexes = []
    with open(configuration["input_fragments"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            for fragment in line["fragments"]:
                if fragment["index"] not in active_indexes:
                    active_indexes.append(fragment["index"])

    # All unordered index pairs; result-file number -> pair mapping.
    pairs = [[active_indexes[i], active_indexes[j]]
             for i in range(len(active_indexes) - 1)
             for j in range(i + 1, len(active_indexes))]

    # Baseline metrics; if the file has several lines the last one wins.
    auc = 0
    ef1 = 0
    ef5 = 0
    with open(configuration["baseline_output"], "r",
              encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            auc = line["AUC"]
            ef1 = line["EF1"]
            ef5 = line["EF5"]

    inputoutput_utils.create_parent_directory(
        configuration["output_directory"] + "/0")
    _prepare_files(configuration["output_directory"])

    def _append(file_name: str, record: dict):
        # Append one JSON line to an output bucket file.
        with open(configuration["output_directory"] + "/" + file_name,
                  "a",
                  encoding="utf-8") as output_stream:
            json.dump(record, output_stream)
            output_stream.write("\n")

    onlyfiles = [
        f for f in listdir(configuration["input_directory"])
        if isfile(join(configuration["input_directory"], f))
    ]
    for file in onlyfiles:
        with open(configuration["input_directory"] + "/" + file,
                  "r",
                  encoding="utf-8") as input_stream:
            # Trailing digits of the stem select the evaluated pair.
            file_str = file.split(".")[0]
            num = int(file_str[10:])
            for new_line in input_stream:
                line = json.loads(new_line)
                output = {
                    "groups": [pairs[num]],
                    "AUC": line["AUC"],
                    "EF1": line["EF1"],
                    "EF5": line["EF5"]
                }
                # Compare each metric against the baseline once, then
                # fan out into every bucket the result qualifies for
                # (same bucket order as before the refactor).
                better_auc = line["AUC"] > auc
                better_ef1 = line["EF1"] > ef1
                better_ef5 = line["EF5"] > ef5
                if better_auc and better_ef1 and better_ef5:
                    _append("aucef1ef5.json", output)
                if better_auc and better_ef1:
                    _append("aucef1.json", output)
                if better_auc and better_ef5:
                    _append("aucef5.json", output)
                if better_ef1 and better_ef5:
                    _append("ef1ef5.json", output)
                if better_auc:
                    _append("auc.json", output)
                if better_ef5:
                    _append("ef5.json", output)
                if better_ef1:
                    _append("ef1.json", output)
                if better_auc or better_ef1 or better_ef5:
                    _append("greater.json", output)

    with open(configuration["output_directory"] + "/baseline.json",
              "w",
              encoding="utf-8") as output_stream:
        output = {"AUC": auc, "EF1": ef1, "EF5": ef5}
        json.dump(output, output_stream)
Esempio n. 25
0
def compute_descriptors(input_files: list,
                        output_files: list,
                        use_fragments: bool,
                        features_to_use=None):
    """Compute RDKit-based descriptors and write them as CSV, one output
    file per input file.

    :param input_files: JSON-lines files, one molecule record per line.
    :param output_files: CSV targets, parallel to ``input_files``.
    :param use_fragments: when True, descriptors are computed per fragment
        (SMILES taken from each record's "fragments" list, with the
        fragment index written as an extra column); otherwise one row per
        molecule from the record's "smiles" field.
    :param features_to_use: optional subset of feature names from _NAMES;
        None or [] selects the full set. (Default changed from a mutable
        ``[]`` to ``None`` to avoid the shared-mutable-default pitfall;
        callers passing ``[]`` behave the same as before.)
    """
    for output_file in output_files:
        inputoutput_utils.create_parent_directory(output_file)
    # Pick features to use.
    if features_to_use == [] or features_to_use is None:
        used_features_names = _NAMES
    else:
        used_features_names = features_to_use
    used_features_fnc = [
        _FUNCTIONS[_NAMES.index(name)] for name in used_features_names
    ]
    # Sanitize everything except kekulization, so aromatic SMILES are
    # accepted as-is.
    sanitize_operation = rdkit.Chem.SanitizeFlags.SANITIZE_ALL ^ \
                         rdkit.Chem.SanitizeFlags.SANITIZE_KEKULIZE
    number_of_invalid = 0
    count_molecules = 0
    for num, input_file in enumerate(input_files):
        with open(input_file, "r", encoding="utf-8") as input_stream:
            with open(output_files[num], "w", encoding="utf-8") as stream:
                _write_header(stream, use_fragments, used_features_names)
                for line in input_stream:
                    record = json.loads(line)
                    smiles_list = []
                    index_list = []
                    if use_fragments:
                        for fragment in record["fragments"]:
                            smiles_list.append(fragment["smiles"])
                            index_list.append(fragment["index"])
                    else:
                        smiles_list.append(record["smiles"])

                    for position, smiles in enumerate(smiles_list):
                        # SMILES column (quoted), optional fragment index.
                        stream.write("\"")
                        stream.write(smiles)
                        stream.write("\",")
                        if use_fragments:
                            stream.write(str(index_list[position]))
                            stream.write(",")
                        count_molecules += 1
                        # Construct molecule, compute and write properties.
                        molecule = rdkit.Chem.MolFromSmiles(str(smiles),
                                                            sanitize=False)
                        # BUG FIX: check for a failed parse BEFORE
                        # sanitizing -- SanitizeMol(None) raises, so the
                        # original None-guard (placed after the call) was
                        # unreachable.
                        if molecule is None:
                            logging.error("Invalid molecule detected: %s",
                                          smiles)
                            number_of_invalid += 1
                            continue
                        # Do not kekulize molecule.
                        rdkit.Chem.SanitizeMol(molecule,
                                               sanitizeOps=sanitize_operation)
                        stream.write(",".join(
                            [str(fnc(molecule)) for fnc in used_features_fnc]))
                        stream.write("\n")

    # Log and return summary.
    logging.info("Invalid molecules: %d/%d", number_of_invalid,
                 count_molecules)
Esempio n. 26
0
def _make_configuration_files(group_file: str, output_directory: str, model_name: str,
                              cpu_counts: int, cutoff_val: int) -> int:
    """Generate the next batch of model-configuration files from pairwise
    group merges and split them across ``cpu_counts`` files.

    Reads candidate group sets from ``group_file`` (one "groups" list per
    JSON line), fuses every intersecting pair into merged candidates,
    drops candidates that already exist in earlier configuration files,
    and writes the remainder as "configuration<run>_<i>.json" JSON-lines
    files under ``output_directory``/configurationfiles.

    Returns the run number used in the generated file names. If the
    previous run is still incomplete (fewer evaluation files than stored
    configurations), no new files are written: the previous run number is
    returned when the core count matches, otherwise the process exits
    telling the user to re-run with the original core count.
    """
    groups = []
    inputoutput_utils.create_parent_directory(output_directory + "/configurationfiles/0")
    inputoutput_utils.create_parent_directory(output_directory + "/evaluations/0")
    # Configuration files left over from previous runs.
    files = [f for f in listdir(output_directory + "/configurationfiles") if
                 isfile(join(output_directory + "/configurationfiles", f))]
    # Total number of stored configurations = JSON lines across all files.
    num_of_config = 0
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                num_of_config += 1
    # Run numbers parsed from "configuration<run>_<i>.json" names;
    # first_part[13:] skips the "configuration" prefix (13 characters).
    config_files = []
    for file in files:
        first_part = file.split("_")[0]
        config_files.append(int(first_part[13:]))
    if num_of_config == 0:
        maximal_num = 0
    else:
        maximal_num = max(config_files)
    evaluation_files = [f for f in listdir(output_directory + "/evaluations") if
                        isfile(join(output_directory + "/evaluations", f))]
    # Previous run not fully evaluated yet: resume it rather than start a
    # new batch. Resuming requires the same core count as before.
    if len(evaluation_files) != num_of_config:
        num_of_max_num = 0
        for item in config_files:
            if item == maximal_num:
                num_of_max_num += 1
        if num_of_max_num != cpu_counts:
            print("Please run the program as before on " + str(num_of_max_num) + " cores")
            exit(1)
        else:
            return maximal_num

    # Load the source group sets, one per JSON line.
    with open(group_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            groups.append(line["groups"])

    num_of_groups = len(groups)  # NOTE(review): currently unused
    # Configurations that already exist; merged candidates are only kept
    # if they are not already present here (_control_groups check below).
    group_list = []
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                group_list.append(line["groups"])

    # Merge every pair of group sets. While the two sets intersect, one
    # intersecting group from each side is removed and their union is
    # appended to groups1; the `fin` flag unwinds the nested loops after
    # each fusion because groups1/groups2 are mutated mid-iteration, and
    # the while re-checks for remaining intersections.
    for i in range(len(groups)-1):
        for j in range(i+1, len(groups)):
            groups1 = groups[i].copy()
            groups2 = groups[j].copy()
            is_intersected = False
            while _control_intersection(groups1, groups2):
                fin = False
                is_intersected = True
                for group1 in groups1:
                    for item in group1:
                        for group2 in groups2:
                            if item in group2:
                                groups1.remove(group1)
                                groups2.remove(group2)
                                groups1.append(_new_group(group1, group2))
                                fin = True
                                break

                        if fin:
                            break
                    if fin:
                        break
            if is_intersected:
                # Collapse any intersections *within* groups1, then carry
                # over whatever is left of groups2.
                groups11 = _one_group_intersection(groups1)
                new_group = groups11.copy()
                if groups2 != []:
                    new_group.extend(groups2)
                if _control_groups(group_list, new_group) is False:
                    group_list.append(new_group)
            else:
                # Disjoint sets: the candidate is simply their concatenation.
                if _control_groups(group_list, groups1 + groups2) is False:
                    group_list.append(groups1 + groups2)

    # Remove candidates that duplicate an already-stored configuration
    # (each stored "groups" value knocks out at most one candidate).
    for file in files:
        with open(output_directory + "/configurationfiles/" + file, "r", encoding="utf-8") as input_stream:
            for new_line in input_stream:
                line = json.loads(new_line)
                grupeto = line["groups"]
                for i in range(len(group_list)):
                    if grupeto == group_list[i]:
                        group_list.remove(grupeto)
                        break

    # Split the surviving candidates into cpu_counts slices; the last
    # slice absorbs the remainder.
    number = len(group_list) // cpu_counts
    ranges = []
    for i in range(cpu_counts):
        ranges.append(i * number)
    ranges.append(len(group_list))
    maximal_num += 1
    for i in range(cpu_counts):
        output_file = output_directory + "/configurationfiles/configuration" + str(maximal_num) + "_" + str(i) + ".json"
        first = True
        with open(output_file, "w", encoding="utf-8") as output_stream:
            for j in range(ranges[i], ranges[i+1]):
                # "cutoff" is only included when a cutoff value was given.
                if cutoff_val == -1:
                    model = {
                        "model_name": model_name,
                        "groups": group_list[j],
                        "evaluation": "evaluation" + str(maximal_num) + "_" + str(j) + ".json"
                    }
                else:
                    model = {
                        "model_name": model_name,
                        "cutoff": cutoff_val,
                        "groups": group_list[j],
                        "evaluation": "evaluation" + str(maximal_num) + "_" + str(j) + ".json"
                    }
                if first:
                    first = False
                else:
                    output_stream.write("\n")
                json.dump(model, output_stream)

    return maximal_num
Esempio n. 27
0
    def score_model(self, model_configuration: dict, fragments_file: str,
                    descriptors_file: str, output_file: str):
        """Fit a linear regression on the active/inactive training
        descriptors and write one {"name", "score"} JSON line per test
        molecule to ``output_file``.

        When configuration["molecules"] == 1 there is one descriptor row
        per molecule and predictions map 1:1 to the lines of
        ``fragments_file``; otherwise predictions are per fragment and a
        molecule's score is the mean of its fragments' predictions.
        """
        inputoutput_utils.create_parent_directory(output_file)
        reg = linear_model.LinearRegression()
        # Activity labels: 1 per active, 0 per inactive, in the same order
        # as the concatenated training descriptors below.
        actives = [
            1 for i in range(len(model_configuration["data"]["active"]))
        ]
        inactives = [
            0 for i in range(len(model_configuration["data"]["inactive"]))
        ]
        activity = actives + inactives

        reg.fit(
            model_configuration["data"]["active"] +
            model_configuration["data"]["inactive"], activity)
        test_descriptors = extract_descriptors(
            descriptors_file, model_configuration["configuration"])
        molecule_file = int(model_configuration["configuration"]["molecules"])
        prediction = reg.predict(test_descriptors)
        if molecule_file == 1:
            first_line = True
            with open(output_file, "w", encoding="utf-8") as output_stream:
                with open(fragments_file, "r",
                          encoding="utf-8") as input_stream:
                    for num_line, new_line in enumerate(input_stream):
                        line = json.loads(new_line)
                        score = {
                            "name": line["name"],
                            # BUG FIX: cast the numpy scalar to float --
                            # json.dump cannot serialize np.float64
                            # (matches the decision-tree score_model).
                            "score": float(prediction[num_line])
                        }
                        if first_line:
                            first_line = False
                        else:
                            output_stream.write("\n")
                        json.dump(score, output_stream)

        else:
            # num_of_fragment holds cumulative fragment counts, so
            # [num_of_fragment[i], num_of_fragment[i+1]) slices out the
            # predictions belonging to molecule i.
            num_of_fragment = [0]
            names_of_molecules = []
            with open(fragments_file, "r",
                      encoding="utf-8") as fragments_stream:
                suma = 0
                for new_line in fragments_stream:
                    line = json.loads(new_line)
                    fragment_length = len(line["fragments"])
                    suma += fragment_length
                    num_of_fragment.append(suma)
                    names_of_molecules.append(line["name"])
            first_line = True
            with open(output_file, "w", encoding="utf-8") as output_stream:
                for i in range(len(num_of_fragment) - 1):
                    prediction_of_molecule = prediction[
                        num_of_fragment[i]:num_of_fragment[i + 1]]
                    # Molecule score = mean of its fragments' predictions.
                    # NOTE(review): a molecule with zero fragments would
                    # divide by zero here -- presumed impossible upstream.
                    sim = sum(prediction_of_molecule) / len(
                        prediction_of_molecule)
                    # BUG FIX: cast to float for JSON serializability.
                    score = {"name": names_of_molecules[i],
                             "score": float(sim)}
                    if first_line:
                        first_line = False
                    else:
                        output_stream.write("\n")
                    json.dump(score, output_stream)