コード例 #1
0
def process(fname):
    """Filter, annotate and rewrite a JSON file of generated compounds.

    The file at ``fname`` is read as a JSON list of objects carrying a
    ``'smiles'`` key.  Entries are dropped when the SMILES is ``None``, when
    ``molvs.validate_smiles`` reports anything for it, or when its
    standardized form is found in ``pubchem``.  Every surviving SMILES gets a
    molecular formula, a rendered PNG in ``images_dir`` (named after the MD5
    hex digest of the SMILES) and an ATC code looked up from the prediction
    of ``atc_model``.

    The input file is then overwritten in place with the resulting records.

    Args:
        fname: Path to a ``<label>.json`` file; the base name with ``.json``
            removed must parse as an integer and is stored as ``'label'``.

    Raises:
        ValueError: If the file name does not yield an integer label.
    """
    label = int(os.path.basename(fname).replace('.json', ''))
    with open(fname, 'r') as f:
        data = json.load(f)

    ok = []
    for d in data:
        smi = d['smiles']
        if smi is None:
            continue

        # Validate SMILES: any reported message disqualifies the entry.
        if molvs.validate_smiles(smi):
            continue

        # Standardize SMILES so the lookup below compares canonical forms.
        smi = molvs.standardize_smiles(smi)

        # Skip compounds that already exist in PubChem.
        if smi in pubchem:
            continue

        ok.append(smi)

    results = []
    # Only query the model when something survived filtering: predicting on
    # an empty batch is not guaranteed to work, and the (empty) result file
    # must still be written in that case.
    if ok:
        atc_codes = [atc_lookup[i] for i in atc_model.predict(ok)]

        for smi, atc_code in zip(ok, atc_codes):
            mol = Chem.MolFromSmiles(smi)
            formula = CalcMolFormula(mol)

            # The image file name is derived from the SMILES itself, so the
            # same compound always maps to the same path.
            h = md5(smi.encode('utf8')).hexdigest()
            im_path = os.path.join(images_dir, '{}.png'.format(h))
            Draw.MolToImage(mol).save(im_path)

            results.append({
                'label': label,
                'smiles': smi,
                'formula': formula,
                'image': im_path,
                'atc_code': atc_code,
                'created_at': datetime.utcnow().isoformat()
            })

    # Save generated compounds (replaces the original file contents).
    with open(fname, 'w') as f:
        json.dump(results, f)
コード例 #2
0
 logFid.close()
 header = fid.readline()
 smilesDict = {}
 print "Might need to change in accordance with file format "
 print "**********Please check file format*********"
 idx = 0
 for line in fid:
     idx = idx + 1
     lineList = string.split(line, "\t")
     smiles = lineList[2]
     name = lineList[3]
     className = string.strip(lineList[4])
     pIC50 = string.strip(lineList[5])
     logFid = open(logFile, "a")
     #logFid.write("........... Processing "+smiles+" .............\n")
     info = validate_smiles(smiles)
     if info:
         logFid.write(str(info) + " " + smiles + " " + name + "\n")
     mol = Chem.MolFromSmiles(smiles)
     if not mol:
         logFid.write("Could not be transformed to rdkit object " + smiles +
                      " " + name + "\n")
         stdzdSmiles = None
     else:
         stdzdSmiles = standardize(mol, name, texFile, verbose)
     if stdzdSmiles:
         smilesDict[name] = stdzdSmiles
         outFid = open(outFileName, "a")
         outFid.write(
             str(stdzdSmiles) + "\t" + name + "\t" + className + "\t" +
             pIC50 + "\n")
コード例 #3
0
    def process_data(self, csv_data, txt_file, data_size, smile_size,
                     group_size):
        """Build encoded groups of mutually similar SMILES from a CSV file.

        The CSV is first flattened into ``txt_file`` (one row per line,
        fields joined by ``$$$``) and then read back from that same file.
        Column 13 of each accepted row is taken as the SMILES string
        (presumably the drug SMILES column - confirm against the dataset).
        SMILES that are empty, longer than ``smile_size`` or for which
        ``validate_smiles`` reports anything are discarded.

        For every remaining molecule the ``group_size`` most similar
        molecules (by ``DataStructs.FingerprintSimilarity``) are collected.
        A group is kept only if its top-ranked SMILES has not already led a
        previously kept group.

        Args:
            csv_data: Path of the input CSV file.
            txt_file: Path of the intermediate ``$$$``-separated text file;
                it is (over)written.
            data_size: Highest line index (header is index 0) considered.
            smile_size: Maximum accepted SMILES length.
            group_size: Number of neighbours per group.

        Returns:
            A list of groups; each group is the list of
            ``self.smiles_encoder`` outputs for its SMILES strings.
        """
        sep = "$$$"

        # Output is opened first, as before, so it is created/truncated even
        # if opening the input fails.
        with open(txt_file, "w") as flat_out, \
                open(csv_data, "r", newline="") as csv_in:
            for row in csv.reader(csv_in):
                flat_out.write(sep.join(row) + '\n')

        # Read back the file that was just written (not a fixed file name).
        with open(txt_file, 'r') as flat_in:
            lines = flat_in.read().split("\n")

        drug_smiles = []
        for c, line in enumerate(lines):
            d = line.split(sep)
            print(d)
            # c == 0 is the header line; short rows (e.g. the trailing empty
            # line) are ignored.
            if len(d) > 5 and 0 < c <= data_size:
                drug_smiles.append(d[13])

        # Build a new list instead of popping while iterating, which used to
        # skip the element following every removed one.
        drug_smiles = [
            smi for smi in drug_smiles
            if 0 < len(smi) <= smile_size and validate_smiles(smi) == []
        ]

        drug_mol = [Chem.MolFromSmiles(smi) for smi in drug_smiles]
        fps = [FingerprintMols.FingerprintMol(mol) for mol in drug_mol]

        seen_leaders = set()
        Data_Final = []

        for fp in fps:
            probs = [DataStructs.FingerprintSimilarity(fp, other)
                     for other in fps]
            indexes = heapq.nlargest(group_size, range(len(probs)),
                                     key=probs.__getitem__)

            group = [drug_smiles[k] for k in indexes]
            encoded = [self.smiles_encoder(smi) for smi in group]

            # Only the top-ranked SMILES decides whether the group is new.
            if group and group[0] not in seen_leaders:
                seen_leaders.add(group[0])
                Data_Final.append(encoded)

        return Data_Final
コード例 #4
0
            drug_names.append(d[12])
            drug_group.append(d[2])
            drug_smiles.append(d[13])
        c+=1  
    
#    for drug in data:
#        print(drug_names[0:3])
#        print(drug_smiles[0:3])
#        print(drug_group[0:3])
#    
    
    drug_mol=[]
    # applying validate_smiles(drug_smiles[1406]) shows that the molecule is invalid
    # remove all the invalid ones
    for i in drug_smiles:
        if validate_smiles(i) != []:
            drug_smiles.pop(drug_smiles.index(i))
            
    
    drug_smiles = list(filter(lambda a: len(a) <= 100 and len(a) > 0, drug_smiles))    
#    print(validate_smiles(drug_smiles[1407]))
#    print(len(drug_smiles))
    
    l=[]
    for j in drug_smiles:
        drug_mol.append(Chem.MolFromSmiles(j))
        l.append(len(j))
       
#    print(l)
    
    fps = [FingerprintMols.FingerprintMol(x) for x in drug_mol]