max_sim = 0 for active_indexes in model_configuration["data"]["active"]: intersection = _intersection_of_two_arrays(active_indexes, test_indexes) sim = len(intersection) / ( len(test_indexes) + len(active_indexes) - len(intersection)) if sim > max_sim: max_sim = sim score = { "name": line["name"], "score": max_sim } if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def _intersection_of_two_arrays(arr1: list, arr2: list): and_arr = [] for item in arr1: for i in range(len(arr2)): if item == arr2[i]: and_arr.append(item) arr2 = arr2[:i] + arr2[(i+1):] break return and_arr register_model(MyNbitHashedApModel.model_name, lambda: MyNbitHashedApModel())
for group in model_configuration["configuration"][ "groups"]: if index in group: index = group[0] break if index not in test_active_indexes: test_active_indexes.append(index) max_sim = max([ _compute_sim(item, test_active_indexes) for item in model_configuration["data"]["active"] ]) score = {"name": line["name"], "score": max_sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def _compute_sim(active_fragments: list, test_fragments: list) -> list: summary = 0 for item in test_fragments: if item in active_fragments: summary += 1 sim = summary / (len(active_fragments) + len(test_fragments) - summary) return sim register_model(DeleteIndexGroupModel.model_name, lambda: DeleteIndexGroupModel())
molecule = Chem.MolFromSmiles(molecule_smiles) ecfp_fingerprint = AllChem.GetHashedMorganFingerprint(molecule, radius, nBits=nbits) active_molecules_ecfp.append(ecfp_fingerprint) first_line = True with open(output_file, "w", encoding="utf-8") as output_stream: with open(fragments_file, "r", encoding="utf-8") as input_stream: for new_line in input_stream: line = json.loads(new_line) test_molecule_input = line["smiles"] test_molecule_smiles = test_molecule_input.strip("\"") test_molecule = Chem.MolFromSmiles(test_molecule_smiles) test_mol_fingerprint = AllChem.GetHashedMorganFingerprint( test_molecule, radius, nBits=nbits) max_sim = max([ DataStructs.TanimotoSimilarity(test_mol_fingerprint, fingerprint) for fingerprint in active_molecules_ecfp ]) score = {"name": line["name"], "score": max_sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) register_model(NbitEcfpModelHashed.model_name, lambda: NbitEcfpModelHashed())
for item in fragments: if item["index"] not in molecule_indexes: molecule_indexes.append(item["index"]) for group in groups: founded_pos = False founded_neg = False for item in group: if (founded_pos is False) and (item in molecule_indexes): founded_pos = True elif (founded_neg is False) and (item not in molecule_indexes): founded_neg = True if founded_pos and founded_neg: break if founded_pos and founded_neg: for item in group: if item not in molecule_indexes: molecule_indexes.append(item) return molecule_indexes def _compute_sim(active_fragments: list, test_fragments: list) -> list: summary = 0 for item in test_fragments: if item in active_fragments: summary += 1 sim = summary / (len(active_fragments) + len(test_fragments) - summary) return sim register_model(AddGroupModel.model_name, lambda: AddGroupModel())
def save_to_json_file(self, output_file: str, model: dict):
    """Write ``model`` to ``output_file`` via ``inputoutput_utils``."""
    inputoutput_utils.save_to_json_file(output_file, model)


def score_model(self, model_configuration: dict, fragments_file: str,
                descriptors_file: str, output_file: str):
    """Score every molecule of ``fragments_file`` against the active indexes.

    Each input line is a JSON object with a ``"name"`` and a list of
    ``"fragments"``, each carrying an ``"index"``. The distinct fragment
    indexes of a molecule are compared with
    ``model_configuration["data"]["active"]`` and the score is
    ``shared / (len(active) + len(distinct) - shared)``. When that
    denominator is zero (no fragments and no active indexes) the score is
    ``0.0`` rather than a ZeroDivisionError.

    One ``{"name": ..., "score": ...}`` object is written per molecule,
    separated by newlines with no trailing newline.

    ``descriptors_file`` is accepted but not used here.
    """
    inputoutput_utils.create_parent_directory(output_file)
    # Looked up once instead of on every membership test.
    active_indexes = model_configuration["data"]["active"]
    first_line = True
    with open(output_file, "w", encoding="utf-8") as output_stream, \
            open(fragments_file, "r", encoding="utf-8") as input_stream:
        for new_line in input_stream:
            line = json.loads(new_line)
            # Distinct fragment indexes, in order of first appearance.
            test_active_indexes = []
            for fragment in line["fragments"]:
                if fragment["index"] not in test_active_indexes:
                    test_active_indexes.append(fragment["index"])
            summary = sum(
                1 for test_index in test_active_indexes
                if test_index in active_indexes)
            union_size = (len(active_indexes) + len(test_active_indexes)
                          - summary)
            sim = summary / union_size if union_size else 0.0
            score = {"name": line["name"], "score": sim}
            # The separator precedes every record except the first.
            if first_line:
                first_line = False
            else:
                output_stream.write("\n")
            json.dump(score, output_stream)


register_model(BaselineModel.model_name, lambda: BaselineModel())
molecule_smiles = active_molecule.strip("\"") molecule = Chem.MolFromSmiles(molecule_smiles) fcfp_fingerprint = AllChem.GetMorganFingerprintAsBitVect(molecule, radius, useFeatures=True, nBits=nbits) active_molecules_fcfp.append(fcfp_fingerprint) first_line = True with open(output_file, "w", encoding="utf-8") as output_stream: with open(fragments_file, "r", encoding="utf-8") as input_stream: for new_line in input_stream: line = json.loads(new_line) test_molecule_input = line["smiles"] test_molecule_smiles = test_molecule_input.strip("\"") test_molecule = Chem.MolFromSmiles(test_molecule_smiles) test_mol_fingerprint = AllChem.GetMorganFingerprintAsBitVect(test_molecule, radius, useFeatures=True, nBits=nbits) max_sim = max([DataStructs.TanimotoSimilarity(test_mol_fingerprint, fingerprint) for fingerprint in active_molecules_fcfp]) score = { "name": line["name"], "score": max_sim } if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) register_model(NbitFcfpModelVector.model_name, lambda: NbitFcfpModelVector())
molecule_smiles = active_molecule.strip("\"") molecule = Chem.MolFromSmiles(molecule_smiles) ap_fingerprint = Pairs.GetHashedAtomPairFingerprint(molecule, nBits=nbits) active_molecules_ap.append(ap_fingerprint) first_line = True with open(output_file, "w", encoding="utf-8") as output_stream: with open(fragments_file, "r", encoding="utf-8") as input_stream: for new_line in input_stream: line = json.loads(new_line) test_molecule_input = line["smiles"] test_molecule_smiles = test_molecule_input.strip("\"") test_molecule = Chem.MolFromSmiles(test_molecule_smiles) test_mol_fingerprint = Pairs.GetHashedAtomPairFingerprint( test_molecule, nBits=nbits) max_sim = max([ DataStructs.TanimotoSimilarity(test_mol_fingerprint, fingerprint) for fingerprint in active_molecules_ap ]) score = {"name": line["name"], "score": max_sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) register_model(NbitAPModel.model_name, lambda: NbitAPModel())
indexes[item] += 1 cut = num_of_active_mol * int(model_configuration["cutoff"]) / 100 cutoff_indexes = [] for item in indexes: if indexes[item] >= cut: cutoff_indexes.append(item) model = { "configuration": { "model_name": model_configuration["model_name"] }, "data": { "active": cutoff_indexes } } return model def save_to_json_file(self, output_file: str, model: dict): inputoutput_utils.save_to_json_file(output_file, model) def score_model(self, model_configuration: dict, fragments_file: str, descriptors_file: str, output_file: str): baseline_model = baseline.BaselineModel() baseline_model.score_model(model_configuration, fragments_file, descriptors_file, output_file) register_model(CutOffModel.model_name, lambda: CutOffModel())
is_in_inactives = True break if is_in_actives & is_in_inactives is False: suma += active_parameter elif is_in_inactives & is_in_actives is False: suma -= inactive_parameter else: suma += neutral_parameter score = { "name": name, "score": suma / len(molecule["fragments"]) } if first: first = False else: output_stream.write("\n") json.dump(score, output_stream) def _select_indexes(input_file: str): indexes = [] with open(input_file, "r") as stream: for line in stream: line_parts = line.split(",") indexes.append(line_parts[1]) indexes.pop(0) return indexes register_model(PosNegIndexModel.model_name, lambda: PosNegIndexModel())
test_active_indexes.append(group) return test_active_indexes def _compute_sim(active_fragments: list, test_fragments: list, groups: list) -> list: groups_len = 1 for item in groups: groups_len += len(item) summary = 0 for item in test_fragments: if item in active_fragments: summary += 1 for group in groups: if group in active_fragments: founded = False for item in group: if item in test_fragments: founded = True break if founded: summary += len(group) if groups_len == 0: groups_len = 1 sim = summary / (groups_len * (len(active_fragments) + len(test_fragments) - summary)) return sim register_model(GroupAndAddModel.model_name, lambda: GroupAndAddModel())
with open(fragments_file, "r", encoding="utf-8") as input_stream: for new_line in input_stream: line = json.loads(new_line) test_active_indexes = [] for fragment in line["fragments"]: if fragment["index"] not in test_active_indexes: test_active_indexes.append(fragment["index"]) max_sim = max([ _compute_sim(item, test_active_indexes) for item in model_configuration["data"]["active"] ]) score = {"name": line["name"], "score": max_sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def _compute_sim(active_fragments: list, test_fragments: list) -> list: summary = 0 for item in test_fragments: if item in active_fragments: summary += 1 sim = summary / (len(active_fragments) + len(test_fragments) - summary) return sim register_model(BaselineActivePairModel.model_name, lambda: BaselineActivePairModel())
num_of_fragment[i]:num_of_fragment[i + 1]] sim = sum(prediction_of_molecule) / len( prediction_of_molecule) score = {"name": names_of_molecules[i], "score": sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def extract_descriptors(input_file: str, model_configuration: dict) -> list: descriptors = [] with open(input_file, "r", encoding="utf-8") as stream: line = stream.readline() line_parts = line.split(",") if (((line_parts[1] == "index") & (int(model_configuration["molecules"]) == 1)) | ((line_parts[1] != "index") & (int(model_configuration["molecules"]) == 0))): print("Wrong input") exit(1) for line in stream: line_parts = line.split(",") descriptors.append(list(map(float, line_parts[1:]))) return descriptors register_model(LinearRegressionModel.model_name, lambda: LinearRegressionModel())
indexes.append(fragment["index"]) max_sim = 0 for active_indexes in model_configuration["data"]["active"]: intersection = _intersection_of_two_arrays(active_indexes, indexes) sim = len(intersection) / (len(indexes) + len(active_indexes) - len(intersection)) if sim > max_sim: max_sim = sim score = { "name": line["name"], "score": max_sim } if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def _intersection_of_two_arrays(arr1: list, arr2: list): and_arr = [] for item in arr1: for i in range(len(arr2)): if item == arr2[i]: and_arr.append(item) arr2 = arr2[:i] + arr2[(i+1):] break return and_arr register_model(ControlModel.model_name, lambda: ControlModel())
output_stream.write("\n") json.dump(score, output_stream) def _make_groups(indexes: list, groups: list) -> list: for group in groups: founded = True for item in group: if item not in indexes: founded = False break if founded: for item in group: indexes.remove(item) indexes.append(group) return indexes def _compute_sim(active_fragments: list, test_fragments: list) -> list: summary = 0 for item in test_fragments: if item in active_fragments: summary += 1 sim = summary / (len(active_fragments) + len(test_fragments) - summary) return sim register_model(CutoffActiveGroupModel.model_name, lambda: CutoffActiveGroupModel())
sum -= inactive_parameter counter += 1 if counter == name_num[molecule_num]["fragments"]: score = { "name": name_num[molecule_num]["molecule"], "score": sum / counter } counter = 0 sum = 0 molecule_num += 1 if first_write: first_write = False else: streamo.write("\n") json.dump(score, streamo) def _read_molecules(input_file: str): name_num = [] with open(input_file, "r") as stream: for line in stream: molecule = json.loads(line) name = molecule["name"] num_fragments = len(molecule["fragments"]) mol_frag = {"molecule": name, "fragments": num_fragments} name_num.append(mol_frag) return name_num register_model(DescriptorsModel.model_name, lambda: DescriptorsModel())
else: output_stream.write("\n") json.dump(score, output_stream) def _extract_names(input_file: str) -> list: names = [] with open(input_file, "r", encoding="utf-8") as stream: for new_line in stream: line = json.loads(new_line) names.append(line["name"]) return names def _extract_descriptors(input_file: str) -> list: descriptors = [] with open(input_file, "r", encoding="utf-8") as stream: line = stream.readline() line_parts = line.split(",") if line_parts[1] == "index": print("Wrong input, expected molecules not fragments") exit(1) for line in stream: line_parts = line.split(",") descriptors.append(list(map(float, line_parts[1:]))) return descriptors register_model(DecisionTreeClassifier.model_name, lambda: DecisionTreeClassifier())
model_configuration["configuration"]["active_parameter"]) inactive_parameter = int( model_configuration["configuration"]["inactive_parameter"]) with open(output_file, "w") as streamo: first = True with open(fragments_file, "r") as stream: for line in stream: molecule = json.loads(line) name = molecule["name"] suma = 0 for fragment in molecule["fragments"]: founded = False for index in model_configuration["data"]: if int(index) == int(fragment["index"]): founded = True break if founded: suma += active_parameter else: suma -= inactive_parameter sim = suma / len(molecule["fragments"]) score = {"name": name, "score": sim} if first: first = False else: streamo.write("\n") json.dump(score, streamo) register_model(PositiveIndexModel.model_name, lambda: PositiveIndexModel())
molecule = Chem.MolFromSmiles(molecule_smiles) fcfp_fingerprint = AllChem.GetMorganFingerprint(molecule, radius, useFeatures=True) active_molecules_fcfp.append(fcfp_fingerprint) first_line = True with open(output_file, "w", encoding="utf-8") as output_stream: with open(fragments_file, "r", encoding="utf-8") as input_stream: for new_line in input_stream: line = json.loads(new_line) test_molecule_input = line["smiles"] test_molecule_smiles = test_molecule_input.strip("\"") test_molecule = Chem.MolFromSmiles(test_molecule_smiles) test_mol_fingerprint = AllChem.GetMorganFingerprint( test_molecule, radius, useFeatures=True) max_sim = max([ DataStructs.TanimotoSimilarity(test_mol_fingerprint, fingerprint) for fingerprint in active_molecules_fcfp ]) score = {"name": line["name"], "score": max_sim} if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) register_model(RdkitFcfpModel.model_name, lambda: RdkitFcfpModel())
sim = summary / (len(model_configuration["data"]["active"]) + len(test_active_indexes) - summary) score = { "name": line["name"], "score": sim } if first_line: first_line = False else: output_stream.write("\n") json.dump(score, output_stream) def _make_groups(indexes: list, groups: list) -> list: for group in groups: founded = True for item in group: if item not in indexes: founded = False break if founded: for item in group: indexes.remove(item) indexes.append(group) return indexes register_model(EcfpGroupModel.model_name, lambda: EcfpGroupModel())