def process(fname):
    """Filter, annotate and rewrite the generated compounds stored in ``fname``.

    The file is expected to contain a JSON list of objects with a ``'smiles'``
    key, and its base name (minus ``.json``) must parse as an integer label.
    Each SMILES is validated and standardized with ``molvs``; entries that
    are ``None``, fail validation, or are already present in the
    module-level ``pubchem`` collection are dropped.  For every remaining
    SMILES an ATC code is predicted, the molecular formula is computed and
    a PNG depiction is saved under ``images_dir`` (named after the MD5 of
    the SMILES).  The file is finally overwritten with the resulting
    records.

    Args:
        fname: Path of the JSON file to read and then overwrite in place.

    Raises:
        ValueError: If the file's base name is not an integer.
    """
    label = int(os.path.basename(fname).replace('.json', ''))

    with open(fname, 'r') as f:
        data = json.load(f)

    ok = []
    for d in data:
        smi = d['smiles']
        if smi is None:
            continue

        # Any validation message disqualifies the entry.
        if molvs.validate_smiles(smi):
            continue

        smi = molvs.standardize_smiles(smi)

        # Skip compounds that are already known.
        if smi in pubchem:
            continue

        ok.append(smi)

    results = []
    # Only query the model when something survived the filters: predicting
    # on an empty batch can fail and would leave the file un-rewritten.
    if ok:
        atc_codes = [atc_lookup[i] for i in atc_model.predict(ok)]
        for smi, atc_code in zip(ok, atc_codes):
            mol = Chem.MolFromSmiles(smi)
            formula = CalcMolFormula(mol)

            # The image file name is derived from the SMILES itself, so the
            # same compound always maps to the same path.
            h = md5(smi.encode('utf8')).hexdigest()
            im = Draw.MolToImage(mol)
            im_path = os.path.join(images_dir, '{}.png'.format(h))
            im.save(im_path)

            results.append({
                'label': label,
                'smiles': smi,
                'formula': formula,
                'image': im_path,
                'atc_code': atc_code,
                'created_at': datetime.utcnow().isoformat()
            })

    # Save generated compounds (replaces the original file contents).
    with open(fname, 'w') as f:
        json.dump(results, f)
logFid.close() header = fid.readline() smilesDict = {} print "Might need to change in accordance with file format " print "**********Please check file format*********" idx = 0 for line in fid: idx = idx + 1 lineList = string.split(line, "\t") smiles = lineList[2] name = lineList[3] className = string.strip(lineList[4]) pIC50 = string.strip(lineList[5]) logFid = open(logFile, "a") #logFid.write("........... Processing "+smiles+" .............\n") info = validate_smiles(smiles) if info: logFid.write(str(info) + " " + smiles + " " + name + "\n") mol = Chem.MolFromSmiles(smiles) if not mol: logFid.write("Could not be transformed to rdkit object " + smiles + " " + name + "\n") stdzdSmiles = None else: stdzdSmiles = standardize(mol, name, texFile, verbose) if stdzdSmiles: smilesDict[name] = stdzdSmiles outFid = open(outFileName, "a") outFid.write( str(stdzdSmiles) + "\t" + name + "\t" + className + "\t" + pIC50 + "\n")
def process_data(self, csv_data, txt_file, data_size, smile_size, group_size):
    """Build encoded groups of similar SMILES from a CSV file.

    The CSV is first flattened into ``txt_file`` (one row per line, fields
    joined with ``$$$``) and then parsed back from that same file.  The
    first line (index 0) is always skipped; from the lines with index
    ``1..data_size`` that have more than five fields, field 13 is taken as
    the SMILES string.  SMILES for which ``validate_smiles`` reports
    anything, and SMILES that are empty or longer than ``smile_size``, are
    discarded.

    Every remaining molecule is fingerprinted and compared with all
    others; its ``group_size`` most similar molecules form a group.  A
    group is kept only if its top-ranked SMILES has not already led a
    previously kept group.

    Args:
        csv_data: Path of the input CSV file.
        txt_file: Path of the intermediate ``$$$``-joined dump; overwritten.
        data_size: Highest line index of the dump that is still read.
        smile_size: Maximum SMILES length that is kept.
        group_size: Number of most-similar molecules per group.

    Returns:
        A list of groups; each group is the list of
        ``self.smiles_encoder(smiles)`` results for its members, ordered
        by decreasing similarity.

    Raises:
        IndexError: If a selected line has more than five but fewer than
            fourteen fields.
    """
    # Flatten the CSV into the intermediate text file.
    with open(txt_file, "w") as my_output_file, \
            open(csv_data, "r") as my_input_file:
        my_output_file.writelines(
            "$$$".join(row) + '\n' for row in csv.reader(my_input_file))

    # Read back the file that was just written (not a hard-coded path).
    with open(txt_file, 'r') as myfile:
        lines = myfile.read().split("\n")

    drug_smiles = []
    for line_no, line in enumerate(lines):
        d = line.split("$$$")
        print(d)
        if len(d) > 5 and 0 < line_no <= data_size:
            drug_smiles.append(d[13])

    # Build a new list rather than popping from the list being iterated,
    # which would skip the entry following every removed one.
    drug_smiles = [s for s in drug_smiles if validate_smiles(s) == []]
    drug_smiles = [s for s in drug_smiles if 0 < len(s) <= smile_size]

    drug_mol = [Chem.MolFromSmiles(s) for s in drug_smiles]
    fps = [FingerprintMols.FingerprintMol(m) for m in drug_mol]

    Data_Final = []
    seen_leaders = set()
    for fp in fps:
        probs = [DataStructs.FingerprintSimilarity(fp, other) for other in fps]
        indexes = heapq.nlargest(
            group_size, range(len(probs)), probs.__getitem__)
        members = [drug_smiles[k] for k in indexes]
        encoded = [self.smiles_encoder(drug_smiles[k]) for k in indexes]

        # De-duplicate groups by their top-ranked (most similar) SMILES.
        if members and members[0] not in seen_leaders:
            seen_leaders.add(members[0])
            Data_Final.append(encoded)

    return Data_Final
drug_names.append(d[12]) drug_group.append(d[2]) drug_smiles.append(d[13]) c+=1 # for drug in data: # print(drug_names[0:3]) # print(drug_smiles[0:3]) # print(drug_group[0:3]) # drug_mol=[] #aplicando validate_smiles(drug_smiles[1406]) verifica-se que a molecula e invalida #remover todas as invalidas for i in drug_smiles: if validate_smiles(i) != []: drug_smiles.pop(drug_smiles.index(i)) drug_smiles = list(filter(lambda a: len(a) <= 100 and len(a) > 0, drug_smiles)) # print(validate_smiles(drug_smiles[1407])) # print(len(drug_smiles)) l=[] for j in drug_smiles: drug_mol.append(Chem.MolFromSmiles(j)) l.append(len(j)) # print(l) fps = [FingerprintMols.FingerprintMol(x) for x in drug_mol]