Beispiel #1
0
def generate_sdf(filename):
    """Load every molecule of an SDF file together with derived data.

    The atom list is taken from the first molecule of the file and reused
    when building the representation of every later molecule; only the
    coordinates are re-read for those.

    Parameters
    ----------
    filename
        Path handed to ``cheminfo.read_sdffile``.

    Returns
    -------
    tuple
        ``(molobjs, energies, coordinates, representations)`` -- four
        index-aligned lists in file order.

    Raises
    ------
    StopIteration
        Propagated from ``next()`` when the reader yields no molecule.
    """

    reader = cheminfo.read_sdffile(filename)

    # The leading molecule supplies the atoms shared by the whole file.
    head = next(reader)
    atoms, head_xyz = cheminfo.molobj_to_xyz(head)

    molobjs = [head]
    energies = [worker.get_energy(head)]
    coordinates = [head_xyz]
    representations = [sim.get_representation(atoms, head_xyz)]

    # Remaining molecules: same treatment, but reuse ``atoms``.
    for mol in reader:
        mol_energy = worker.get_energy(mol)
        mol_xyz = cheminfo.molobj_get_coordinates(mol)
        mol_repr = sim.get_representation(atoms, mol_xyz)

        molobjs.append(mol)
        energies.append(mol_energy)
        coordinates.append(mol_xyz)
        representations.append(mol_repr)

    return molobjs, energies, coordinates, representations
Beispiel #2
0
def merge_sdfs(filenames):
    """Merge the molecules of several SDF files, skipping duplicates.

    The first file that loads successfully is taken over unchanged. For
    every later file, ``merge_asymmetric`` is asked which of the new
    entries match the collection so far; an entry is appended only when
    its match list is empty.

    Reads the module-level ``args``: ``args.debug`` enables progress
    output and ``args.dump`` prints the merged collection as SDF text.

    Parameters
    ----------
    filenames
        Iterable of paths accepted by ``generate_sdf``.

    Returns
    -------
    None
    """

    molobjs = []
    energies = []
    coordinates = []
    representations = []
    atoms = []
    n_total = 0

    for filename in filenames:

        # Best effort: a file that cannot be loaded is skipped. Catch
        # ``Exception`` rather than everything so that KeyboardInterrupt
        # and SystemExit still propagate.
        try:
            molobjs_next, energies_next, coordinates_next, representations_next = generate_sdf(
                filename)
        except Exception:
            continue

        if not molobjs:
            # Nothing collected yet: adopt this file wholesale and take
            # its atom list as the reference for later comparisons.
            atoms, _ = cheminfo.molobj_to_xyz(molobjs_next[0])
            energies += energies_next
            coordinates += coordinates_next
            representations += representations_next
            molobjs += molobjs_next
            n_total += len(molobjs_next)
            continue

        if args.debug:
            print(" {:} = {:} confs".format(filename, len(molobjs_next)))

        idxs = merge_asymmetric(atoms, energies_next, energies,
                                representations_next, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):

            # A non-empty match list means the entry is already present.
            if len(idxl) > 0:
                continue

            energies.append(energies_next[i])
            coordinates.append(coordinates_next[i])
            representations.append(representations_next[i])
            molobjs.append(molobjs_next[i])
            n_new += 1

        # Keep the running total correct regardless of the debug flag.
        n_total += n_new

        if args.debug:
            print(" - new", n_new)
            print("total", n_total)

    if args.dump:
        sdfstr = "".join(cheminfo.molobj_to_sdfstr(molobj) for molobj in molobjs)
        print(sdfstr)

    return
def molobjs_to_xyzs(molobjs):
    """Split a sequence of molecule objects into atoms and coordinates.

    Each molecule is converted with ``cheminfo.molobj_to_xyz``.

    Returns
    -------
    tuple
        ``(mol_atoms, mol_coord)`` -- two lists, index-aligned with the
        input order. Both are empty when ``molobjs`` is empty.
    """

    converted = [cheminfo.molobj_to_xyz(molobj) for molobj in molobjs]

    mol_atoms = [atoms for atoms, _ in converted]
    mol_coord = [coord for _, coord in converted]

    return mol_atoms, mol_coord
Beispiel #4
0
def get_sdfcontent(sdffile, rtn_atoms=False):
    """Read the energies and coordinates of every molecule in an SDF file.

    Parameters
    ----------
    sdffile
        Path handed to ``cheminfo.read_sdffile``.
    rtn_atoms
        When true, additionally return the first molecule object and the
        atoms of the last molecule that was converted.

    Returns
    -------
    tuple
        ``(energies, coordinates)`` by default, or
        ``(molobjs[0], atoms, energies, coordinates)`` when ``rtn_atoms``
        is true.
    """

    molobjs = list(cheminfo.read_sdffile(sdffile))

    # Stays an empty string when the file holds no molecules.
    atoms = ""
    energies = []
    coordinates = []

    for molobj in molobjs:
        atoms, xyz = cheminfo.molobj_to_xyz(molobj)
        value = get_energy(molobj)

        coordinates.append(xyz)
        energies.append(value)

    if not rtn_atoms:
        return energies, coordinates

    return molobjs[0], atoms, energies, coordinates
Beispiel #5
0
def run_jobfile(molobjs, tordbs, filename, threads=0):
    """Run every job line of a text file against the given molecules.

    Parameters
    ----------
    molobjs
        Molecule objects; their coordinates are extracted up front.
    tordbs
        Passed through unchanged to the job runner.
    filename
        Text file with one job per line; lines are whitespace-stripped.
    threads
        When greater than zero the threaded runner is used with this many
        threads, otherwise the plain runner.

    Returns
    -------
    bool
        Always ``True``.
    """

    # Starting coordinates, one entry per molecule.
    origins = [xyz for _, xyz in map(cheminfo.molobj_to_xyz, molobjs)]

    with open(filename, 'r') as jobfile:
        lines = [raw.strip() for raw in jobfile]

    use_threads = threads > 0

    if use_threads:
        run_joblines_threads(origins, molobjs, tordbs, lines, threads=threads, dump=False)
    else:
        run_joblines(origins, molobjs, tordbs, lines, dump=False)

    return True
Beispiel #6
0
def main():
    """Compare Joback ``"Tm"`` estimates with stored reference properties.

    Loads ``properties`` and ``structures.sdf.gz`` from the scratch
    directory, runs ``thermo.joback.Joback`` on the SMILES of every
    molecule, prints the RMSE over the molecules that could be estimated
    and saves a scatter plot (coloured by heavy-atom count) to
    ``_fig_joback``.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='',
                        metavar="dir", default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)

    args = parser.parse_args()

    # Normalise the scratch directory so it can be used as a prefix.
    scratch = args.scratch
    if scratch[-1] != "/":
        scratch += "/"

    # Read properties
    properties = misc.load_npy(scratch + "properties")
    molecules = list(cheminfo.read_sdffile(scratch + "structures.sdf.gz"))

    heavy_atoms = []
    predictions = []
    errors = []

    # zip() limits the loop to the shorter of the two inputs.
    for mol, _ in zip(molecules, properties):

        smi = cheminfo.molobj_to_smiles(mol, remove_hs=True)
        J = thermo.joback.Joback(smi)
        status = J.status

        # Count atoms whose entry differs from 1.
        atoms, coord = cheminfo.molobj_to_xyz(mol)
        atoms = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(atoms))

        # A molecule fails either through the reported status or through
        # a TypeError raised while estimating.
        failed = "Did not match all atoms present" in status
        if not failed:
            try:
                estimate = J.estimate()
            except TypeError:
                failed = True

        if failed:
            errors.append(1)
            predictions.append(float("nan"))
            continue

        errors.append(0)

        # Only "Tm" is compared; "Tb" is still looked up, as before.
        T_b = estimate["Tb"]
        predictions.append(estimate["Tm"])

    errors = np.array(errors, dtype=int)

    idx_success, = np.where(errors == 0)

    # Restrict everything to the successfully estimated molecules.
    heavy_atoms = np.array(heavy_atoms)
    predictions = np.array(predictions)
    properties = np.array(properties)

    heavy_atoms = heavy_atoms[idx_success]
    predictions = predictions[idx_success]
    properties = properties[idx_success]

    print("total", errors.shape[0], "filter", idx_success.shape[0])
    print()
    print(rmse(properties, predictions))

    # Diagonal reference line plus the actual predictions.
    plt.plot(properties, properties, "-k")
    plt.scatter(properties, predictions, s=0.95, alpha=0.8, c=heavy_atoms)

    plt.xlabel("True")
    plt.ylabel("Predicted")

    plt.savefig("_fig_joback")
    plt.clf()

    return
Beispiel #7
0
def get_clockwork_conformations(molobj, torsions, resolution,
    atoms=None,
    debug=False,
    timings=False):
    """
    Collect the unique conformations for one torsion set and resolution.

    For every entry produced by
    ``clockwork.generate_clockwork_combinations`` the conformations are
    generated, entries whose state is non-zero are dropped, the remainder
    is de-duplicated with ``clean_representations`` and finally merged
    into the running collection with ``merge.merge_asymmetric``.

    Parameters
    ----------
    molobj
        Molecule object passed on to ``get_conformations``.
    torsions
        Torsions to scan; their count selects the combinations.
    resolution
        Resolution passed to the combination generator.
    atoms
        Atom list; read from ``molobj`` (``atom_type="int"``) when None.
    debug
        Accepted but not used by this function.
    timings
        Print timing per combination. Nothing is printed for the first
        combination because it skips the merge step.

    Returns
    -------
    tuple
        ``(end_energies, end_coordinates)`` as two lists.
    """

    n_torsions = len(torsions)

    if atoms is None:
        atoms, _ = cheminfo.molobj_to_xyz(molobj, atom_type="int")

    combinations = clockwork.generate_clockwork_combinations(resolution, n_torsions)

    # Running collection of unique conformations
    end_energies = []
    end_coordinates = []
    end_representations = []

    is_first = True

    for resolutions in combinations:

        tick = time.time()

        # Generate all conformations for this combination
        c_energies, c_coordinates, c_states = get_conformations(molobj, torsions, resolutions)

        n_generated = len(c_energies)

        # Keep only entries whose state equals zero
        converged = np.argwhere(c_states == 0).flatten()
        c_energies = c_energies[converged]
        c_coordinates = c_coordinates[converged]

        # One representation per remaining conformation
        c_representations = np.asarray(
            [sim.get_representation(atoms, xyz) for xyz in c_coordinates])

        # Remove duplicates within the new batch
        keep = clean_representations(atoms, c_energies, c_representations)

        c_energies = c_energies[keep]
        c_coordinates = c_coordinates[keep]
        c_representations = c_representations[keep]

        if is_first:
            # Nothing to compare against yet: take the batch as-is
            is_first = False
            end_energies.extend(c_energies)
            end_coordinates.extend(c_coordinates)
            end_representations.extend(c_representations)
            continue

        # Compare the new batch against everything collected so far
        matches = merge.merge_asymmetric(atoms,
            c_energies,
            end_energies,
            c_representations,
            end_representations)

        for i, match in enumerate(matches):

            # An empty match list marks a conformation not seen before
            if len(match) == 0:
                end_energies.append(c_energies[i])
                end_coordinates.append(c_coordinates[i])
                end_representations.append(c_representations[i])

        elapsed = time.time() - tick

        if timings:
            print("res time {:8.2f} cnf/sec - {:8.2f} tot sec".format(n_generated/elapsed, elapsed))

    return end_energies, end_coordinates
Beispiel #8
0
def converge_clockwork(molobj, tordb, max_cost=2):
    """
    Scan clockwork cost levels for a molecule and print timing statistics.

    For each selected (n_tor, resolution) entry of the cost list, every
    torsion combination is run through ``get_clockwork_conformations`` and
    a summary line is printed.

    Parameters
    ----------
    molobj
        Molecule object passed to ``get_clockwork_conformations``.
    tordb
        Indexable collection of torsions; its length is the total number
        of torsions.
    max_cost
        NOTE(review): the value passed in is ignored -- it is overwritten
        by the hard-coded assignments below.

    Notes
    -----
    The function is in a debugging state: the merge step is unreachable,
    and the function ends by calling ``quit()``, so it never returns to
    the caller.
    """

    # ``atoms`` is only referenced by the (unreachable) merge call below;
    # ``xyz`` is unused.
    atoms, xyz = cheminfo.molobj_to_xyz(molobj)

    total_torsions = len(tordb)
    print("total torsions", total_torsions)

    # TODO Cache this
    cost_input, cost_cost = clockwork.generate_costlist(total_torsions=total_torsions)

    # TODO cost_cost and costfunc

    # Only the last assignment of each name takes effect: offset = 1 and
    # max_cost = 7. This also discards the ``max_cost`` argument.
    offset = 6
    max_cost = 1
    offset = 1
    max_cost = 7
    # offset = 7
    # max_cost = 1

    # Walk the slice [offset, offset + max_cost) of the cost list.
    for (n_tor, resolution), cost in zip(cost_input[offset:offset+max_cost], cost_cost[offset:offset+max_cost]):

        start = time.time()

        # Iterate over torsion combinations
        combinations = clockwork.generate_torsion_combinations(total_torsions, n_tor)

        # Results collected for this cost level
        cost_result_energies = []
        cost_result_coordinates = []

        # Counter that is only incremented inside the unreachable merge loop
        C = 0

        for combination in combinations:

            # TODO Move this to function

            com_start = time.time()

            torsions = [tordb[i] for i in combination]

            result_energies, result_coordinates = get_clockwork_conformations(molobj, torsions, resolution)
            n_results = len(result_energies)
            # Computed but not used afterwards
            result_cost = [cost]*n_results

            com_end = time.time()

            # print("new confs", len(result_energies), "{:6.2f}".format(com_end-com_start))

            # Merge
            if len(cost_result_energies) == 0:

                # Collection still empty: adopt the results unchanged
                cost_result_energies += list(result_energies)
                cost_result_coordinates += list(result_coordinates)
                continue

            else:

                start_merge = time.time()

                # TODO Move this to function

                # NOTE(review): this ``continue`` skips the merge below, so
                # once the collection is non-empty all further results are
                # discarded. Everything up to the end of this ``else`` body
                # is unreachable.
                continue

                idxs = merge.merge_asymmetric(atoms,
                    result_energies,
                    cost_result_energies,
                    result_coordinates,
                    cost_result_coordinates, decimals=2, debug=True)

                for i, idx in enumerate(idxs):

                    C += 1

                    # Empty match list: keep the new result
                    if len(idx) == 0:
                        cost_result_energies.append(result_energies[i])
                        cost_result_coordinates.append(result_coordinates[i])

                end_merge = time.time()

            # Unreachable as well: both branches above end in ``continue``.
            print("total confs", len(cost_result_energies), "{:10.2f}".format(end_merge-start_merge))
            continue

        end = time.time()

        # Summary: torsion count, resolution, cost, number of stored
        # results, elapsed seconds and cost divided by elapsed seconds.
        print("conv", n_tor, resolution, cost, len(cost_result_energies), "tot: {:5.2f}".format(end-start), "per sec: {:5.2f}".format(cost/(end-start)))

    # NOTE(review): terminates the interpreter; the ``return`` below is
    # never reached.
    quit()

    return
def main():
    """Fit stored properties against a simple geometric descriptor.

    For every molecule in ``structures.sdf.gz`` the heavy-atom count, the
    convex-hull volume and the mean pairwise distance of the coordinates
    are computed. The mean distance is then used as the descriptor for a
    third-degree polynomial fit of the properties; the RMSE is printed and
    a plot is saved to ``i_can_member_it``.
    """

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--scratch', action='store', help='',
                        metavar="dir", default="_tmp_")
    parser.add_argument('-j', '--procs', action='store', help='pararallize',
                        metavar="int", default=0, type=int)

    args = parser.parse_args()

    # Normalise the scratch directory so it can be used as a prefix.
    scratch = args.scratch
    if scratch[-1] != "/":
        scratch += "/"

    properties = misc.load_npy(scratch + "properties")
    molecules = cheminfo.read_sdffile(scratch + "structures.sdf.gz")

    heavy_atoms = []
    distances = []
    volumes = []

    for mol in molecules:

        atoms, coord = cheminfo.molobj_to_xyz(mol)

        # Count atoms whose entry differs from 1.
        atoms = atoms[np.where(atoms != 1)]
        heavy_atoms.append(len(atoms))

        # "QJ" is handed to qhull as an option string.
        hull = ConvexHull(coord, qhull_options="QJ")
        volumes.append(hull.volume)

        # Mean of all pairwise distances between the coordinates.
        distances.append(np.mean(distance.pdist(coord)))

    heavy_atoms = np.array(heavy_atoms)
    volumes = np.array(volumes)
    distances = np.array(distances)

    # Descriptor used for the fit (the volumes are computed but unused).
    representation = distances

    # Polynomial fit of degree 3
    p = np.poly1d(np.polyfit(representation, properties, 3))

    results = p(representation)
    rmse_error = rmse(results, properties)

    print(rmse_error)

    # Data points coloured by heavy-atom count, with the fitted curve.
    plt.scatter(representation, properties, c=heavy_atoms, s=0.8)
    x_prop = np.linspace(min(representation), max(representation), 80)
    plt.plot(x_prop, p(x_prop), "k-")

    plt.savefig("i_can_member_it")
    plt.clf()

    return
Beispiel #10
0
def mergesdfs(sdflist):
    """
    Print similarity kernels between the first two SDF files of a list.

    All files are read, but only the first two are compared. The atoms of
    the very first molecule are used for every representation. Two kernels
    are printed, each over the first five entries of either file: one from
    ``sim.get_kernel`` and one from ``get_kernel_fchl``.
    """

    # Read every file into its own list of molecule objects
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]

    # Coordinates per file, per molecule
    coordinates_sdf = [
        [cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs]
        for molobjs in molobjs_list
    ]

    atoms, xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x, coordinates_y = coordinates_sdf[0], coordinates_sdf[1]

    # Kernel from the sim module; representations are built for every
    # entry and then cut down to the first five.
    full_x = [sim.get_representation(atoms, xyz_x) for xyz_x in coordinates_x]
    full_y = [sim.get_representation(atoms, xyz_y) for xyz_y in coordinates_y]
    representations19_x = np.asarray(full_x[:5])
    representations19_y = np.asarray(full_y[:5])

    nx = len(representations19_x)
    ny = len(representations19_y)

    similarity = sim.get_kernel(representations19_x, representations19_y,
                                [atoms] * nx, [atoms] * ny)
    print(similarity)

    # Same comparison through the fchl helper functions
    n_atoms = len(atoms)

    representations_x = get_representations_fchl(atoms, coordinates_x,
                                                 max_size=n_atoms)
    representations_x = np.asarray(representations_x[:5])

    representations_y = get_representations_fchl(atoms, coordinates_y,
                                                 max_size=n_atoms)
    representations_y = np.asarray(representations_y[:5])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    return
Beispiel #11
0
def main_folder():
    """Merge a family of cost-indexed SDF files and dump the result.

    The first ``--sdf`` argument is used as a filename prefix; files are
    named ``<prefix><a>_<b>.sdf``. The ``1_1`` file seeds the collection,
    then the files for the first 15 entries of the cost list are merged
    in with ``merge_asymmetric``; an entry is appended only when its match
    list is empty.

    Writes ``all.sdf``, ``costs.csv`` and an ``energies`` plot into
    ``_tmp_apentane_cum/``. Exits via ``quit()`` when ``--sdf`` is absent.
    """
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('-v', '--version', action='version', version="1.0")
    parser.add_argument('--sdf',
                        nargs="+",
                        action='store',
                        help='',
                        metavar='FILE')
    args = parser.parse_args()

    # TODO Merge results from redis

    if args.sdf is None:
        print("error: actually we need sdfs to merge")
        quit()

    dumpdir = "_tmp_apentane_cum/"

    # Template completed with the two numbers of a cost combination.
    filename = args.sdf[0] + "{:}_{:}" + ".sdf"

    molobjs, energies, coordinates, representations = generate_sdf(
        filename.format(1, 1))

    atoms, _ = cheminfo.molobj_to_xyz(molobjs[0])

    # costcombos, costs = clockwork.generate_costlist(total_torsions=28)
    costcombos, costs = clockwork.generate_costlist()

    n_total = len(molobjs)
    # Cost combination each collected molecule originated from.
    molcosts = [(1, 1)] * n_total

    print("start", n_total)

    for combo in costcombos[:15]:

        # Best effort: a combination whose file cannot be loaded is
        # skipped. Catch ``Exception`` rather than everything so that
        # KeyboardInterrupt and SystemExit still propagate.
        try:
            molobjs_new, energies_new, coordinates_new, representations_new = generate_sdf(
                filename.format(*combo))
        except Exception:
            continue

        print(" merge", len(molobjs_new))

        idxs = merge_asymmetric(atoms, energies_new, energies,
                                representations_new, representations)

        n_new = 0
        for i, idxl in enumerate(idxs):

            # A non-empty match list means the entry is already present.
            if len(idxl) > 0:
                continue

            energies.append(energies_new[i])
            coordinates.append(coordinates_new[i])
            representations.append(representations_new[i])
            molobjs.append(molobjs_new[i])

            n_new += 1

        molcosts += [combo] * n_new

        n_total += n_new
        print(" - new", n_new)
        print("total", n_total, combo)

    # Build the full text before opening the file so a conversion error
    # does not leave a truncated output behind.
    sdfstr = "".join(cheminfo.molobj_to_sdfstr(molobj) for molobj in molobjs)
    with open(dumpdir + "all.sdf", 'w') as f:
        f.write(sdfstr)

    # One "<a> <b>" line per collected molecule.
    hellodump = "".join("{:} {:}\n".format(*combo) for combo in molcosts)
    with open(dumpdir + "costs.csv", 'w') as f:
        f.write(hellodump)

    plt.plot(energies, 'k.')
    plt.yscale("log")
    plt.savefig(dumpdir + "energies")

    return
Beispiel #12
0
def overview_properties_pca():
    """Plot a kernel PCA of the subset structures coloured by property.

    Reads at most 500 molecules from ``data/sdf/subset_structures.sdf``
    and the first 500 values of ``data/sdf/subset_properties.csv``, builds
    one ``generate_fchl_acsf`` representation per molecule, computes a
    local kernel and saves the two-component ``kpca`` projection together
    with the kernel matrix to ``_tmp_pca_prop.png``.
    """

    elements = []

    with open('data/sdf/subset_properties.csv', 'r') as f:
        properties = np.array([float(row) for row in f.readlines()])

    molobjs = cheminfo.read_sdffile("data/sdf/subset_structures.sdf")

    mols_atoms = []
    mols_coord = []

    n_atoms = 0
    n_items = 500

    for count, molobj in enumerate(molobjs, start=1):

        atoms, coord = cheminfo.molobj_to_xyz(molobj)

        mols_atoms.append(atoms)
        mols_coord.append(coord)

        # Running, de-duplicated list of everything seen in ``atoms``
        elements += list(np.unique(atoms))
        elements = list(np.unique(elements))

        # Track the largest molecule; used as the padding size below
        n_atoms = max(n_atoms, len(atoms))

        if count == n_items:
            break

    properties = properties[:n_items]

    print(elements)
    print(n_atoms)
    print(len(mols_atoms))

    distance_cut = 20.0
    parameters = {
        "pad": n_atoms,
        'nRs2': 22,
        'nRs3': 17,
        'eta2': 0.41,
        'eta3': 0.97,
        'three_body_weight': 45.83,
        'three_body_decay': 2.39,
        'two_body_decay': 2.39,
        "rcut": distance_cut,
        "acut": distance_cut,
        "elements": elements,
    }

    representations = np.array([
        generate_fchl_acsf(atoms, coord, **parameters)
        for atoms, coord in zip(mols_atoms, mols_coord)
    ])

    sigma = 10.

    kernel = qml.kernels.get_local_kernel(representations, representations,
                                          mols_atoms, mols_atoms, sigma)

    print(kernel.shape)

    pca = kpca(kernel, n=2)

    # Top panel: projection coloured by property; bottom panel: kernel
    fig, axs = plt.subplots(2, 1, figsize=(5, 10))
    top, bottom = axs[0], axs[1]

    sc = top.scatter(*pca, c=properties)
    fig.colorbar(sc, ax=top)

    im = bottom.imshow(kernel)
    fig.colorbar(im, ax=bottom)

    fig.savefig("_tmp_pca_prop.png")

    return
def mergesdfs(sdflist):
    """
    Print three similarity kernels between the first two SDF files.

    All files are read, but only the first two are compared, using the
    first three entries of the first file and the first four of the
    second. The atoms of the very first molecule are used throughout. The
    kernels come from ``workkernel.get_kernel_from_objs``,
    ``get_kernel_fchl`` and ``get_kernel_fchl_``.
    """

    # Read every file into its own list of molecule objects
    molobjs_list = [list(cheminfo.read_sdffile(sdf)) for sdf in sdflist]

    # Coordinates per file, per molecule
    coordinates_sdf = [
        [cheminfo.molobj_get_coordinates(molobj) for molobj in molobjs]
        for molobjs in molobjs_list
    ]

    atoms, xyz = cheminfo.molobj_to_xyz(molobjs_list[0][0])

    coordinates_x, coordinates_y = coordinates_sdf[0], coordinates_sdf[1]

    # Kernel computed from representation objects
    sigmas = [0.8]
    parameters = {
        'alchemy': 'off',
        "cut_distance": 10**6,
        'kernel_args': {
            "sigma": sigmas
        },
    }

    def build_objs(coordinate_list):
        # One representation object per set of coordinates
        return [
            workkernel.FchlRepresentation(atoms, coordinates, **parameters)
            for coordinates in coordinate_list
        ]

    repObjs_x = build_objs(coordinates_x[:3])
    repObjs_y = build_objs(coordinates_y[:4])

    similarity = workkernel.get_kernel_from_objs(repObjs_x, repObjs_y,
                                                 **parameters)
    print("qml obj kernel:")
    print(similarity)

    # Same entries through the fchl helper functions, for comparison
    n_atoms = len(atoms)

    representations_x = get_representations_fchl(atoms, coordinates_x,
                                                 max_size=n_atoms)
    representations_x = np.asarray(representations_x[:3])

    representations_y = get_representations_fchl(atoms, coordinates_y,
                                                 max_size=n_atoms)
    representations_y = np.asarray(representations_y[:4])

    similarity = get_kernel_fchl(representations_x, representations_y)
    print("qml kernel:")
    print(similarity)

    similarity = get_kernel_fchl_(representations_x, representations_y)
    print("qml fst kernel:")
    print(similarity)

    return