Example #1
def main():
    args = get_arguments()
    data = pandas.read_hdf(args.infile, 'table')
    # keep only SMILES strings that fit in the fixed 120-character window
    keys = data[args.smiles_column].map(len) < 121

    if args.length <= len(keys):
        data = data[keys].sample(n=args.length)
    else:
        data = data[keys]

    # right-pad each SMILES string to 120 characters and split it into a list of chars
    structures = data[args.smiles_column].map(lambda x: list(x.ljust(120)))

    if args.property_column:
        properties = data[args.property_column][keys]

    del data

    # 80/20 split on the row index
    train_idx, test_idx = map(
        np.array, train_test_split(structures.index, test_size=0.20))

    # every character that occurs anywhere in the data becomes part of the one-hot alphabet
    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))

    # encode one padded SMILES row as a 120 x len(charset) one-hot matrix
    one_hot_encoded_fn = lambda row: map(
        lambda x: one_hot_array(x, len(charset)), one_hot_index(row, charset))

    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file,
                             dataset_name,
                             dataset,
                             dataset_shape,
                             chunk_size=1000,
                             apply_fn=None):
        new_data = h5file.create_dataset(dataset_name,
                                         dataset_shape,
                                         chunks=tuple([chunk_size] +
                                                      list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(
        h5f,
        'data_train',
        train_idx, (len(train_idx), 120, len(charset)),
        apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    create_chunk_dataset(
        h5f,
        'data_test',
        test_idx, (len(test_idx), 120, len(charset)),
        apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))

    if args.property_column:
        h5f.create_dataset('property_train', data=properties[train_idx])
        h5f.create_dataset('property_test', data=properties[test_idx])
    h5f.close()
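
This snippet (and the variants below) relies on helpers defined elsewhere in the module: one_hot_array, one_hot_index and chunk_iterator are not shown on this page. A minimal sketch of what the calls above appear to expect could look like the following; the exact implementations are assumptions, not taken from the source.

import numpy as np

def one_hot_array(i, n):
    # 1-of-n encoding of index i as a plain Python list
    return [int(ix == i) for ix in range(n)]

def one_hot_index(vec, charset):
    # position of each character of the padded SMILES string within charset
    return [charset.index(x) for x in vec]

def chunk_iterator(dataset, chunk_size=1000):
    # yield (indices, values) pairs so the HDF5 datasets can be filled piecewise
    chunk_indices = np.array_split(np.arange(len(dataset)),
                                   max(1, len(dataset) // chunk_size))
    for chunk_ixs in chunk_indices:
        yield (chunk_ixs, dataset[chunk_ixs])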
Example #2
def main():
    args = get_arguments()
    with open(args.infile, 'r') as f:
        data1 = f.readlines()
    with open(args.infile2, 'r') as f:
        data2 = f.readlines()
    with open(args.infile3, 'r') as f:
        data3 = f.readlines()
    len_train = len(data1)
    len_test = len(data2) + len(data3)

    # concatenate all lines; the first len_train rows are training, the remainder test
    data = np.array(data1 + data2 + data3)
    train_idx = np.arange(len_train)
    np.random.shuffle(train_idx)
    test_idx = np.arange(len_train, len_train + len_test)
    np.random.shuffle(test_idx)

    if args.length < len(data):
        data = np.random.choice(data, args.length)

    charset = list(reduce(lambda x, y: set(y) | x, data, set()))
    one_hot_encoded_fn = lambda row: map(
        lambda x: one_hot_array(x, len(charset)), one_hot_index(row, charset))
    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file,
                             dataset_name,
                             dataset,
                             dataset_shape,
                             chunk_size=1000,
                             apply_fn=None):
        new_data = h5file.create_dataset(dataset_name,
                                         dataset_shape,
                                         chunks=tuple([chunk_size] +
                                                      list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(
        h5f,
        'data_train',
        train_idx, (len(train_idx), MAX_LEN_STR, len(charset)),
        apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, data[ch])))
    create_chunk_dataset(
        h5f,
        'data_test',
        test_idx, (len(test_idx), MAX_LEN_STR, len(charset)),
        apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, data[ch])))

    h5f.close()
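
get_arguments() is also defined outside the snippet, and MAX_LEN_STR is assumed to be a module-level constant giving the padded string width. For this variant the parser would need three input files plus the output path and sample length; a plausible argparse sketch (argument names inferred from the attribute accesses above, default value a guess) is:

import argparse

def get_arguments():
    parser = argparse.ArgumentParser(description='Prepare SMILES text files for training')
    parser.add_argument('infile', type=str, help='training set text file')
    parser.add_argument('infile2', type=str, help='first test set text file')
    parser.add_argument('infile3', type=str, help='second test set text file')
    parser.add_argument('outfile', type=str, help='output HDF5 file')
    parser.add_argument('--length', type=int, default=100000,
                        help='maximum number of rows to keep')
    return parser.parse_args()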
Example #3
def main():
    args = get_arguments()
    data = pandas.read_hdf(args.infile, 'table')
    keys = data[args.smiles_column].map(len) < 121

    if args.length <= len(keys):
        data = data[keys].sample(n = args.length)
    else:
        data = data[keys]

    structures = data[args.smiles_column].map(lambda x: list(x.ljust(120)))

    if args.property_column:
        properties = data[args.property_column][keys]

    del data

    train_idx, test_idx = map(np.array,
                              train_test_split(structures.index, test_size = 0.20))

    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))

    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                                one_hot_index(row, charset))

    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data = charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size]+list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn,
                                                          structures[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn,
                                                          structures[ch])))

    if args.property_column:
        h5f.create_dataset('property_train', data = properties[train_idx])
        h5f.create_dataset('property_test', data = properties[test_idx])
    h5f.close()
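
These preprocessing variants are written for Python 2, where map() returns a list and reduce() is a builtin. Under Python 3 the same code needs two small adjustments (names below come from the snippet above); Example #6 further down applies the same list() wrapping to the interpolation helper:

from functools import reduce  # reduce is no longer a builtin in Python 3

# map() is lazy in Python 3, so both levels have to be materialised
# before numpy sees them:
one_hot_encoded_fn = lambda row: list(map(lambda x: one_hot_array(x, len(charset)),
                                          one_hot_index(row, charset)))
apply_fn = lambda ch: np.array(list(map(one_hot_encoded_fn, structures[ch])))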
Example #4
def main():
    #args = get_arguments()
    data = pandas.read_hdf('data/smiles_50k.h5', 'table')
    keys = data[SMILES_COL_NAME].map(len) < 121

    if MAX_NUM_ROWS <= len(keys):
        data = data[keys].sample(n = MAX_NUM_ROWS)
    else:
        data = data[keys]

    structures = data[SMILES_COL_NAME].map(lambda x: list(x.ljust(120)))


    del data

    train_idx, test_idx = map(np.array,
                              train_test_split(structures.index, test_size = 0.20))

    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))

    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                                one_hot_index(row, charset))

    h5f = h5py.File('data/processed.h5', 'w')
    h5f.create_dataset('charset', data = charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size]+list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn,
                                                          structures[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn,
                                                          structures[ch])))
    h5f.close()
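
This variant drops argument parsing and instead hardcodes the input path and relies on module-level constants. The imports and constants it assumes would look roughly like the following (the values are only illustrative, and the one-hot helpers are the ones sketched after Example #1):

from functools import reduce  # works on Python 2 and 3

import h5py
import numpy as np
import pandas
from sklearn.model_selection import train_test_split

MAX_NUM_ROWS = 500000          # illustrative cap on the number of sampled rows
SMILES_COL_NAME = 'structure'  # assumed name of the SMILES column in the HDF5 table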
Example #5
def interpolate(source, dest, steps, charset, model, latent_dim, width):
    source_just = source.ljust(width)
    dest_just = dest.ljust(width)
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                                one_hot_index(row, charset))
    source_encoded = numpy.array(map(one_hot_encoded_fn, source_just))
    source_x_latent = model.encoder.predict(source_encoded.reshape(1, width, len(charset)))
    dest_encoded = numpy.array(map(one_hot_encoded_fn, dest_just))
    dest_x_latent = model.encoder.predict(dest_encoded.reshape(1, width, len(charset)))

    step = (dest_x_latent - source_x_latent)/float(steps)
    results = []
    for i in range(steps):
        item = source_x_latent + (step  * i)
        sampled = model.decoder.predict(item.reshape(1, latent_dim)).argmax(axis=2)[0]
        sampled = decode_smiles_from_indexes(sampled, charset)
        results.append( (i, item, sampled) )

    return results
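
decode_smiles_from_indexes is another helper that is not shown here. Given how it is called, a plausible sketch is simply the inverse of the encoding step: pick the charset character for each argmax index and strip the right padding.

def decode_smiles_from_indexes(vec, charset):
    # vec holds one charset index per position of the fixed-width string;
    # join the corresponding characters and strip the right padding
    return ''.join(charset[i] for i in vec).strip()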
Example #6
def interpolate(source, dest, steps, charset, model, latent_dim, width):
    source_just = source.ljust(width)
    dest_just = dest.ljust(width)
    one_hot_encoded_fn = lambda row: list(map(lambda x: one_hot_array(x, len(charset)),
                                                one_hot_index(row, charset)))
    source_encoded = numpy.array(list(map(one_hot_encoded_fn, source_just)))
    source_x_latent = model.encoder.predict(source_encoded.reshape(1, width, len(charset)))
    dest_encoded = numpy.array(list(map(one_hot_encoded_fn, dest_just)))
    dest_x_latent = model.encoder.predict(dest_encoded.reshape(1, width, len(charset)))

    step = (dest_x_latent - source_x_latent)/float(steps)
    results = []
    for i in range(steps):
        item = source_x_latent + (step  * i)
        sampled = model.decoder.predict(item.reshape(1, latent_dim)).argmax(axis=2)[0]
        sampled = decode_smiles_from_indexes(sampled, charset)
        results.append( (i, item, sampled) )

    return results
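
A hypothetical call site for interpolate could look like the following. The model class, weights file and SMILES strings are placeholders, not taken from the snippet; the only assumption that matters is that the model object exposes .encoder and .decoder as used above.

# charset would normally be read back from the 'charset' dataset written by
# the preprocessing examples above
model = MoleculeVAE()                          # assumed wrapper exposing .encoder / .decoder / .load
model.load(charset, 'model.h5', latent_rep_size=292)

results = interpolate('c1ccccc1',              # source SMILES (benzene)
                      'CC(=O)Oc1ccccc1C(=O)O', # destination SMILES (aspirin)
                      steps=10, charset=charset, model=model,
                      latent_dim=292, width=120)
for i, latent_point, smiles in results:
    print(i, smiles)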
Example #7
def one_hot_encoded_fn(row):
    return map(lambda x: one_hot_array(x, len(charset)), one_hot_index(row, charset))
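
On its own this fragment only makes sense together with a charset and the two one-hot helpers it closes over. A tiny self-contained demo of what it produces, using stand-in helpers and a toy charset (all assumptions), could be:

import numpy as np

def one_hot_array(i, n):
    return [int(ix == i) for ix in range(n)]

def one_hot_index(vec, charset):
    return [charset.index(x) for x in vec]

charset = [' ', 'C', 'N', 'O', '(', ')', '=', '1']

def one_hot_encoded_fn(row):
    # list(...) keeps the result concrete under Python 3 as well
    return list(map(lambda x: one_hot_array(x, len(charset)),
                    one_hot_index(row, charset)))

rows = [list('C(=O)O'.ljust(8)), list('CCN'.ljust(8))]
encoded = np.array([one_hot_encoded_fn(row) for row in rows])
print(encoded.shape)  # (2, 8, 8): rows x padded width x charset size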