def main():
    args = get_arguments()
    data = pandas.read_hdf(args.infile, 'table')

    # Keep only SMILES strings shorter than 121 characters, then subsample if requested.
    keys = data[args.smiles_column].map(len) < 121
    if args.length <= len(keys):
        data = data[keys].sample(n=args.length)
    else:
        data = data[keys]

    # Pad every SMILES string to a fixed width of 120 characters.
    structures = data[args.smiles_column].map(lambda x: list(x.ljust(120)))

    if args.property_column:
        properties = data[args.property_column][keys]

    del data

    # 80/20 train/test split over the remaining row indices.
    train_idx, test_idx = map(np.array,
                              train_test_split(structures.index, test_size=0.20))

    # Character vocabulary observed across all padded SMILES strings.
    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                         one_hot_index(row, charset))

    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size] + list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    # One-hot encode the padded SMILES strings in chunks and write them to HDF5.
    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))

    if args.property_column:
        h5f.create_dataset('property_train', data=properties[train_idx])
        h5f.create_dataset('property_test', data=properties[test_idx])
    h5f.close()
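# The main() variants in this file call get_arguments(), which is not shown here.
# Below is a minimal argparse sketch consistent with the attributes accessed above
# (infile, outfile, length, smiles_column, property_column); the flag names and
# defaults are assumptions, not the original CLI. The variant further below
# additionally expects infile2 and infile3.
import argparse

def get_arguments():
    parser = argparse.ArgumentParser(description='Prepare SMILES data for training.')
    parser.add_argument('infile', type=str, help='Input HDF5 file with a "table" dataset.')
    parser.add_argument('outfile', type=str, help='Output HDF5 file for the one-hot data.')
    parser.add_argument('--length', type=int, default=500000,
                        help='Maximum number of rows to keep.')
    parser.add_argument('--smiles_column', type=str, default='structure',
                        help='Name of the column holding the SMILES strings.')
    parser.add_argument('--property_column', type=str, default=None,
                        help='Optional column with a property value to carry along.')
    return parser.parse_args()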
def main():
    args = get_arguments()

    # Read the three input text files; the first holds the training strings,
    # the other two together form the test set.
    data1 = open(args.infile, 'r').readlines()
    data2 = open(args.infile2, 'r').readlines()
    data3 = open(args.infile3, 'r').readlines()

    len_train = len(data1)
    len_test = len(data2) + len(data3)
    data = np.array(data1 + data2 + data3)

    train_idx = np.arange(len_train)
    np.random.shuffle(train_idx)
    test_idx = np.arange(len_train, len_train + len_test)
    np.random.shuffle(test_idx)

    # Optionally subsample the combined data down to args.length strings.
    if args.length < len(data):
        data = np.random.choice(data, args.length)

    charset = list(reduce(lambda x, y: set(y) | x, data, set()))
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                         one_hot_index(row, charset))

    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size] + list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), MAX_LEN_STR, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, data[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), MAX_LEN_STR, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, data[ch])))
    h5f.close()
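# Module-level constants referenced by the preprocessing variants in this file
# (MAX_LEN_STR above, MAX_NUM_ROWS and SMILES_COL_NAME below). They are not
# defined in the snippets; the concrete values here are illustrative assumptions.
MAX_LEN_STR = 120              # fixed padded length of each SMILES string
MAX_NUM_ROWS = 500000          # cap on the number of rows sampled from the input table
SMILES_COL_NAME = 'structure'  # column in the HDF5 table that holds the SMILES strings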
def main():
    args = get_arguments()
    data = pandas.read_hdf(args.infile, 'table')
    keys = data[args.smiles_column].map(len) < 121
    if args.length <= len(keys):
        data = data[keys].sample(n=args.length)
    else:
        data = data[keys]
    structures = data[args.smiles_column].map(lambda x: list(x.ljust(120)))
    if args.property_column:
        properties = data[args.property_column][keys]
    del data
    train_idx, test_idx = map(np.array,
                              train_test_split(structures.index, test_size=0.20))
    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                         one_hot_index(row, charset))
    h5f = h5py.File(args.outfile, 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size] + list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    if args.property_column:
        h5f.create_dataset('property_train', data=properties[train_idx])
        h5f.create_dataset('property_test', data=properties[test_idx])
    h5f.close()
def main():
    # args = get_arguments()
    data = pandas.read_hdf('data/smiles_50k.h5', 'table')
    keys = data[SMILES_COL_NAME].map(len) < 121
    if MAX_NUM_ROWS <= len(keys):
        data = data[keys].sample(n=MAX_NUM_ROWS)
    else:
        data = data[keys]
    structures = data[SMILES_COL_NAME].map(lambda x: list(x.ljust(120)))
    del data
    train_idx, test_idx = map(np.array,
                              train_test_split(structures.index, test_size=0.20))
    charset = list(reduce(lambda x, y: set(y) | x, structures, set()))
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                         one_hot_index(row, charset))
    h5f = h5py.File('data/processed.h5', 'w')
    h5f.create_dataset('charset', data=charset)

    def create_chunk_dataset(h5file, dataset_name, dataset, dataset_shape,
                             chunk_size=1000, apply_fn=None):
        new_data = h5file.create_dataset(dataset_name, dataset_shape,
                                         chunks=tuple([chunk_size] + list(dataset_shape[1:])))
        for (chunk_ixs, chunk) in chunk_iterator(dataset):
            if not apply_fn:
                new_data[chunk_ixs, ...] = chunk
            else:
                new_data[chunk_ixs, ...] = apply_fn(chunk)

    create_chunk_dataset(h5f, 'data_train', train_idx,
                         (len(train_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    create_chunk_dataset(h5f, 'data_test', test_idx,
                         (len(test_idx), 120, len(charset)),
                         apply_fn=lambda ch: np.array(map(one_hot_encoded_fn, structures[ch])))
    h5f.close()
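# Every function in this file relies on the helpers one_hot_array, one_hot_index,
# chunk_iterator and decode_smiles_from_indexes, which are not shown. The sketch
# below is consistent with how they are called (Python 3 style, list-returning),
# but it is an assumption rather than the original definitions.
import numpy as np

def one_hot_array(i, n):
    # Length-n binary vector with a 1 at position i.
    return [int(ix == i) for ix in range(n)]

def one_hot_index(vec, charset):
    # Map each character of a padded SMILES string to its index in the charset.
    return [charset.index(char) for char in vec]

def decode_smiles_from_indexes(vec, charset):
    # Inverse of one_hot_index: turn a sequence of charset indices back into a string.
    return ''.join(charset[i] for i in vec).strip()

def chunk_iterator(dataset, chunk_size=1000):
    # Yield (positions, values) pairs over the dataset in chunks of at most chunk_size rows.
    indices = np.arange(len(dataset))
    for start in range(0, len(dataset), chunk_size):
        chunk_ixs = indices[start:start + chunk_size]
        yield (chunk_ixs, dataset[chunk_ixs])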
def interpolate(source, dest, steps, charset, model, latent_dim, width):
    source_just = source.ljust(width)
    dest_just = dest.ljust(width)
    one_hot_encoded_fn = lambda row: map(lambda x: one_hot_array(x, len(charset)),
                                         one_hot_index(row, charset))
    source_encoded = numpy.array(map(one_hot_encoded_fn, source_just))
    source_x_latent = model.encoder.predict(source_encoded.reshape(1, width, len(charset)))
    dest_encoded = numpy.array(map(one_hot_encoded_fn, dest_just))
    dest_x_latent = model.encoder.predict(dest_encoded.reshape(1, width, len(charset)))
    step = (dest_x_latent - source_x_latent) / float(steps)
    results = []
    for i in range(steps):
        item = source_x_latent + (step * i)
        sampled = model.decoder.predict(item.reshape(1, latent_dim)).argmax(axis=2)[0]
        sampled = decode_smiles_from_indexes(sampled, charset)
        results.append((i, item, sampled))
    return results
def interpolate(source, dest, steps, charset, model, latent_dim, width):
    # Pad both SMILES strings to the fixed input width and one-hot encode them.
    source_just = source.ljust(width)
    dest_just = dest.ljust(width)
    one_hot_encoded_fn = lambda row: list(map(lambda x: one_hot_array(x, len(charset)),
                                              one_hot_index(row, charset)))
    source_encoded = numpy.array(list(map(one_hot_encoded_fn, source_just)))
    source_x_latent = model.encoder.predict(source_encoded.reshape(1, width, len(charset)))
    dest_encoded = numpy.array(list(map(one_hot_encoded_fn, dest_just)))
    dest_x_latent = model.encoder.predict(dest_encoded.reshape(1, width, len(charset)))

    # Walk linearly from the source latent vector towards the destination and
    # decode each intermediate point back to a SMILES string.
    step = (dest_x_latent - source_x_latent) / float(steps)
    results = []
    for i in range(steps):
        item = source_x_latent + (step * i)
        sampled = model.decoder.predict(item.reshape(1, latent_dim)).argmax(axis=2)[0]
        sampled = decode_smiles_from_indexes(sampled, charset)
        results.append((i, item, sampled))
    return results
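# Hedged usage sketch for interpolate(). It assumes a trained autoencoder object
# exposing Keras-style .encoder and .decoder sub-models (as the attribute accesses
# above imply) and a charset written by the preprocessing step. The file names,
# the load_trained_autoencoder() loader and the latent size are placeholders,
# not part of the snippets above.
import h5py

LATENT_DIM = 292   # assumed latent size
WIDTH = 120        # padded SMILES width used during preprocessing

with h5py.File('data/processed.h5', 'r') as h5f:
    charset = [c.decode('utf-8') if isinstance(c, bytes) else c
               for c in h5f['charset'][:]]

model = load_trained_autoencoder(charset, 'model_weights.h5')  # hypothetical loader

for i, latent, smiles in interpolate('CCO', 'c1ccccc1', steps=10,
                                     charset=charset, model=model,
                                     latent_dim=LATENT_DIM, width=WIDTH):
    print(i, smiles)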
def one_hot_encoded_fn(row):
    # `charset` is taken from the enclosing scope.
    return map(lambda x: one_hot_array(x, len(charset)), one_hot_index(row, charset))
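# To make the shape of the encoding concrete, a small worked example using the
# helper sketches above; the five-character charset and the short SMILES string
# are made up for illustration.
import numpy as np

charset = ['C', 'O', '(', ')', ' ']   # toy charset, includes the pad character
row = list('CCO'.ljust(5))            # 'CCO  ' padded to width 5

encoded = np.array(list(map(lambda x: one_hot_array(x, len(charset)),
                            one_hot_index(row, charset))))
print(encoded.shape)   # (5, 5): one one-hot row per padded character
print(encoded[0])      # [1 0 0 0 0] -> 'C'
print(encoded[-1])     # [0 0 0 0 1] -> the pad character ' '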