def main():
    """Build a tiling grammar from a folder of input variations and export the valid strings to HDF5."""
    args = get_arguments()

    initial_file_list = []
    process_folder(args.in_folder, initial_file_list)
    if len(initial_file_list) == 0:
        print("Did not find a valid input file in " + args.in_folder)
        return
    if len(initial_file_list) == 1:
        initial_file_list.append(initial_file_list[0])
    else:
        initial_file_list = sorted(initial_file_list)

    inputA = initial_file_list[0]
    inputB = initial_file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    print("max # neighbors: " + str(tile_grammar.max_degree()))
    tile_grammar.store(args.out_grammarpath)

    if args.fix_variations:
        print("fixing variations...")
        fix_variations(args.in_folder, [], inputA, inputB)
        print("removing duplicates...")
        remove_duplicates(tile_grammar, args.in_folder, inputA, inputB, initial_smiles_strings)

    smiles_strings = []
    for i in range(args.num_iterations):
        current_file_list = []
        process_folder(args.in_folder, current_file_list)
        print("Current # of variations: " + str(len(current_file_list)))
        if len(current_file_list) == 1:
            current_file_list.append(current_file_list[0])

        augment_folder(current_file_list, smiles_strings)
        smiles_strings = list(set(smiles_strings))

        if args.fix_variations:
            print("fixing variations...")
            fix_variations(args.in_folder, current_file_list, inputA, inputB)
            print("removing duplicates...")
            remove_duplicates(tile_grammar, args.in_folder, inputA, inputB, initial_smiles_strings)

        print("Iteration " + str(i) + " # of strings: " + str(len(smiles_strings)))

    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.out_grammarpath)

    valid_strings = [w for w in smiles_strings if len(str(w)) > 0 and loaded_grammar.check_word(w)]
    print("# valid strings: " + str(len(valid_strings)))

    df = pandas.DataFrame({args.smiles_column: valid_strings})
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)
def decoder_rnd(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        mu, sigma = 0, 0.01
        z_sample = np.random.normal(mu, sigma, latent_dim)
        decoded_rnd_sample = model.decoder.predict(
            z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
        char_rnd_sample = decode_smiles_from_indexes(decoded_rnd_sample, charset)
        if tiling_grammar.check_word(char_rnd_sample):
            print("random point: " + char_rnd_sample)
            print("-----------------------------------------------------------------------")
def load_input(args):
    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)
    data_train, data_test, charset = load_dataset(args.input_data)

    model = TilingVAE()
    if args.model_type == 'lstm':
        model = Tiling_LSTM_VAE()
    elif args.model_type == 'lstm_':
        model = Tiling_LSTM_VAE_XL()

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = np.append(data_train, data_test, axis=0)
    latent_data = model.encoder.predict(data)
    return model, tiling_grammar, latent_data, charset
def main():
    args = get_arguments()

    initial_smiles_strings = []
    process_folder(args.in_folder, initial_smiles_strings)
    initial_smiles_strings = list(set(initial_smiles_strings))
    print("# initial strings: " + str(len(initial_smiles_strings)))

    length_variations = []
    for word in initial_smiles_strings:
        str_len_variation_generator(length_variations, word)
    print("# length variations: " + str(len(length_variations)))

    char_variations = []
    for word in length_variations:
        str_char_variation_generator(char_variations, word)
    print("# char variations: " + str(len(char_variations)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    output_strings = []
    for word in char_variations:
        if tile_grammar.check_word(word):
            output_strings.append(word)
    tile_grammar.store(args.out_grammarpath)

    print("# all valid variations: " + str(len(output_strings)))

    df = pandas.DataFrame({args.smiles_column: output_strings})
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)
def decoder_lerp(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_ids = np.random.randint(0, len(data), 2)

        decoded_data_0 = model.decoder.predict(
            data[sample_ids[0]].reshape(1, latent_dim)).argmax(axis=2)[0]
        char_data_0 = decode_smiles_from_indexes(decoded_data_0, charset)

        decoded_data_1 = model.decoder.predict(
            data[sample_ids[1]].reshape(1, latent_dim)).argmax(axis=2)[0]
        char_data_1 = decode_smiles_from_indexes(decoded_data_1, charset)

        if not (tiling_grammar.check_word(char_data_0) and tiling_grammar.check_word(char_data_1)):
            continue
        if args.require_cycle and char_data_0.find("0") == -1 and char_data_1.find("0") == -1:
            continue

        print("-----------------------------------------------------------------------")
        print("data point 0.0: " + char_data_0)
        for k in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            for step_size in [0.0001, 0.001, 0.01, 0.02, 0.05, 0.075]:
                rnd_offset = np.array([np.random.random(latent_dim)]) * step_size
                z_sample = (1.0 - k) * data[sample_ids[0]] + k * data[sample_ids[1]] + rnd_offset
                decoded_sample_k = model.decoder.predict(
                    z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
                char_sample_k = decode_smiles_from_indexes(decoded_sample_k, charset)
                if (char_sample_k != char_data_0
                        and char_sample_k != char_data_1
                        and tiling_grammar.check_word(char_sample_k)):
                    print("data point " + str(k) + ": " + char_sample_k
                          + " (rnd offset = " + str(step_size) + ")")
                    break
        print("data point 1.0: " + char_data_1)
        print("-----------------------------------------------------------------------")
def main():
    args = get_arguments()
    in_smiles_string = args.in_string

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.in_grammar):
        tiling_grammar.load(args.in_grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.in_grammar)

    success = str_to_file(args.in_folder, in_smiles_string, tiling_grammar)
    if not success:
        print("Did not find " + in_smiles_string)
def decoder_nbr(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_id = np.random.randint(0, len(data))
        decoded_data = model.decoder.predict(
            data[sample_id].reshape(1, latent_dim)).argmax(axis=2)[0]
        char_data = decode_smiles_from_indexes(decoded_data, charset)

        if not tiling_grammar.check_word(char_data):
            continue
        if args.require_cycle and char_data.find("0") == -1:
            continue

        for step_size in [0.0001, 0.001, 0.01, 0.02, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25]:
            z_sample = np.array([np.random.random(latent_dim)]) * step_size
            z_sample += data[sample_id]
            decoded_sample = model.decoder.predict(
                z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
            char_sample = decode_smiles_from_indexes(decoded_sample, charset)
            if char_sample != char_data and tiling_grammar.check_word(char_sample):
                print("data point : " + char_data)
                print("offset point: " + char_sample)
                print("offset magnitude: " + str(step_size))
                print("-----------------------------------------------------------------------")
                break
def main():
    args = get_arguments()

    data = pandas.read_hdf(args.infile, "table")
    keys = data[args.smiles_column].map(len) < 121
    if args.length <= len(keys):
        data = data[keys].sample(n=args.length)
    else:
        data = data[keys]
    # positional access below requires a contiguous 0..n-1 index
    data = data.reset_index(drop=True)

    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.ingrammar)

    num_data_points = len(data[args.smiles_column])
    vec_dims = len(loaded_grammar.charset) + loaded_grammar.max_degree()
    structures_one_hot = np.zeros((num_data_points, 120, vec_dims))
    for s in range(num_data_points):
        structures_one_hot[s] = loaded_grammar.encode_to_one_hot(
            data[args.smiles_column][s], 120)

    if args.property_column:
        # assumption: per-sample properties are taken from the property column of the input table
        properties = data[args.property_column].values
    del data

    #data_train, data_test = train_test_split(structures_one_hot, test_size=0.20)
    train_idx, test_idx = train_test_split(np.arange(structures_one_hot.shape[0]),
                                           test_size=0.20)

    h5f = h5py.File(args.outfile, "w")
    h5f.create_dataset("connectivity_dims", data=loaded_grammar.max_degree())
    h5f.create_dataset("charset", data=loaded_grammar.charset)
    h5f.create_dataset("data_train",
                       data=structures_one_hot[train_idx],
                       chunks=(200, 120, vec_dims))
    h5f.create_dataset("data_test",
                       data=structures_one_hot[test_idx],
                       chunks=(200, 120, vec_dims))
    if args.property_column:
        h5f.create_dataset("property_train", data=properties[train_idx])
        h5f.create_dataset("property_test", data=properties[test_idx])
    h5f.close()
def decoder_path(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_ids = np.random.randint(0, len(data), 2)

        decoded_data_0 = model.decoder.predict(
            data[sample_ids[0]].reshape(1, latent_dim)).argmax(axis=2)[0]
        char_data_0 = decode_smiles_from_indexes(decoded_data_0, charset)

        decoded_data_1 = model.decoder.predict(
            data[sample_ids[1]].reshape(1, latent_dim)).argmax(axis=2)[0]
        char_data_1 = decode_smiles_from_indexes(decoded_data_1, charset)

        if not (tiling_grammar.check_word(char_data_0) and tiling_grammar.check_word(char_data_1)):
            continue
        if args.require_cycle and char_data_0.find("0") == -1 and char_data_1.find("0") == -1:
            continue

        print("---------------------sample " + str(i) + "------------------------------------------")
        print("data point 0.0: " + char_data_0)

        path_ids = [sample_ids[0]]
        path_ids = _gen_latent_path(data, sample_ids[0], sample_ids[1], waypoints=path_ids)
        path_ids.append(sample_ids[1])

        for p in range(len(path_ids) - 1):
            decoded_data_p = model.decoder.predict(
                data[path_ids[p + 1]].reshape(1, latent_dim)).argmax(axis=2)[0]
            char_data_p = decode_smiles_from_indexes(decoded_data_p, charset)
            if not tiling_grammar.check_word(char_data_p):
                continue

            for k in [0.2, 0.4, 0.6, 0.8]:
                current_distance = np.linalg.norm(data[path_ids[p]] - data[path_ids[p + 1]])
                rnd_offset = np.array([np.random.random(latent_dim)]) * 0.1 * current_distance
                z_sample = (1.0 - k) * data[path_ids[p]] + k * data[path_ids[p + 1]] + rnd_offset
                decoded_sample_k = model.decoder.predict(
                    z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
                char_sample_k = decode_smiles_from_indexes(decoded_sample_k, charset)
                if (char_sample_k != char_data_0
                        and char_sample_k != char_data_1
                        and char_sample_k != char_data_p
                        and tiling_grammar.check_word(char_sample_k)):
                    print("sample point " + str(k) + ": " + char_sample_k
                          + " (rnd offset = " + str(0.1 * current_distance) + ")")
                    break

            if p < len(path_ids) - 2:
                print("path waypoint " + str(p + 1) + ": " + char_data_p)

        print("data point 1.0: " + char_data_1)
        print("-----------------------------------------------------------------------")
def main():
    args = get_arguments()
    data_train, data_test, charset = load_dataset(args.data)
    word_length = data_train.shape[1]
    print("----------- max word length is ", word_length, " -----------------")

    if os.path.isfile(args.grammar):
        model = Tiling_Triplet_LSTM_VAE()
    elif args.type == 'lstm':
        model = Tiling_LSTM_VAE()
    elif args.type == 'lstm_large':
        model = Tiling_LSTM_VAE_XL()
    elif args.type == 'simple':
        model = TilingVAE()
    else:
        model = Tiling_LSTM_VAE()

    if os.path.isfile(args.model):
        model.load(charset, args.model, max_w_length=word_length, latent_rep_size=args.latent_dim)
    else:
        model.create(charset, max_length=word_length, latent_rep_size=args.latent_dim)

    print("available metrics: ", model.autoencoder.metrics_names)

    checkpointer = ModelCheckpoint(monitor='val_loss',
                                   filepath=args.model,
                                   verbose=1,
                                   mode='min',
                                   save_best_only=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.00000001)

    filename, ext = os.path.splitext(args.model)
    plot_model(model.autoencoder, to_file=filename + '_nn.pdf', show_shapes=True)
    csv_logger = CSVLogger(filename + '_training.log', append=True)

    plot = PlotLearning()
    plot.set_filename(filename)

    if os.path.isfile(args.grammar):
        # triplet training: shuffle positive/negative examples according to the tiling grammar
        tiling_grammar = grammar.TilingGrammar([])
        tiling_grammar.load(args.grammar)
        tri_shuffle = TriplesShuffle(train=data_train,
                                     test=data_test,
                                     charset=charset,
                                     tile_grammar=tiling_grammar)
        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.2,
                                      patience=3,
                                      min_lr=0.0000000001)
        history = model.autoencoder.fit(
            {
                'main_input': tri_shuffle.train_x,
                'positive_input': tri_shuffle.train_y,
                'negative_input': tri_shuffle.train_z
            },
            tri_shuffle.train_x,
            shuffle=True,
            epochs=args.epochs,
            batch_size=args.batch_size,
            callbacks=[tri_shuffle, checkpointer, reduce_lr, plot, csv_logger],
            validation_data=({
                'main_input': tri_shuffle.test_x,
                'positive_input': tri_shuffle.test_y,
                'negative_input': tri_shuffle.test_z
            }, tri_shuffle.test_x))
    else:
        history = model.autoencoder.fit(
            data_train,
            data_train,
            shuffle=True,
            epochs=args.epochs,
            batch_size=args.batch_size,
            callbacks=[checkpointer, reduce_lr, plot, csv_logger],
            validation_data=(data_test, data_test))
def sample_path_from_strings(args):
    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)
    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = pandas.read_hdf(args.input_data, 'table')
    words = data["structure"]

    if not os.path.isfile(args.latent_graph):
        raise ValueError("Search graph file %s doesn't exist" % args.latent_graph)
    search_graph = nx.read_graphml(args.latent_graph)
    node_list = [int(x) for x in list(search_graph.nodes)]

    path_weights = []
    for i in range(args.num_samples):
        samples = np.random.randint(0, len(node_list), 2)
        sample_ids = [node_list[samples[0]], node_list[samples[1]]]
        char_data_0 = words[sample_ids[0]]
        char_data_1 = words[sample_ids[1]]

        print("---------------------path sample " + str(i) + "------------------------------------------")
        if not (tiling_grammar.check_word(char_data_0) and tiling_grammar.check_word(char_data_1)):
            print("invalid words")
            continue

        try:
            shortest_path = nx.shortest_path(search_graph,
                                             source=str(sample_ids[0]),
                                             target=str(sample_ids[1]),
                                             weight='weight')
        except nx.exception.NetworkXNoPath:
            print("no path between sample nodes")
            continue

        if len(shortest_path) < 5:
            print("path too short")
            continue

        decoded_words = [char_data_0]
        valid_words = [True]
        for pt_id in shortest_path[1:-1]:
            word = words[int(pt_id)]
            if word not in decoded_words:
                decoded_words.append(word)
                valid_words.append(True)
        decoded_words.append(char_data_1)
        valid_words.append(True)

        if valid_words.count(True) < 5:
            print("too few valid words")
            continue

        decoded_valid_words = [w for w, flag in zip(decoded_words, valid_words) if flag]
        edge_weights = [
            tiling_grammar.word_similarity(w1, w2)
            for w1, w2 in zip(decoded_valid_words[:-1], decoded_valid_words[1:])
        ]

        file_name_0 = "?"
        file_name_1 = "?"
        if args.folder_name != "":
            found0, file_name_0 = str_to_file(args.folder_name, char_data_0, tiling_grammar)
            found1, file_name_1 = str_to_file(args.folder_name, char_data_1, tiling_grammar)

        print("start :", decoded_words[0], " file: ", file_name_0)
        file_name_w = "?"
        for w, flag in list(zip(decoded_words, valid_words))[1:-1]:
            if flag:
                if args.folder_name != "":
                    found, file_name_w = str_to_file(args.folder_name, w, tiling_grammar)
                    if found:
                        print("valid :", w, " file: ", file_name_w)
                    else:
                        print("valid :", w, " closest file: ", file_name_w)
                else:
                    print("valid :", w)
            else:
                print("invalid:", w)
        print("end :", decoded_words[-1], " file: ", file_name_1)

        path_weights.append(sum(edge_weights))
        print("edge weights: ", edge_weights)
        print("----------------------------------------------------------------------------------")

    print("----------------------------------------------------------------------------------")
    print("average accumulated path weight: ", sum(path_weights) / len(path_weights))
    print("max accumulated path weight: ", max(path_weights))
    print("----------------------------------------------------------------------------------")
def build_graph_from_strings(args):
    if args.graph_degree >= args.graph_size:
        raise ValueError("Requested graph degree %s larger than graph size %s" %
                         (args.graph_degree, args.graph_size))
    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)
    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = pandas.read_hdf(args.input_data, 'table')
    print("Number of SMILES strings: ", len(data))
    if args.graph_size <= len(data):
        data = data.sample(n=args.graph_size)
    words = data["structure"]
    tmp_ids = data.index.tolist()
    selected_ids = [int(x) for x in tmp_ids]

    # setup progress bar
    sys.stdout.write("Inserting graph nodes [%s]" % (" " * 10))
    sys.stdout.flush()
    sys.stdout.write("\b" * (10 + 1))  # return to start of line, after '['

    search_graph = nx.Graph()

    #graph nodes
    search_graph.add_nodes_from(selected_ids)

    #graph edges
    for i, idx in enumerate(selected_ids):
        if i % (len(selected_ids) // 10) == len(selected_ids) // 10 - 1:
            sys.stdout.write("#")
            sys.stdout.flush()

        #add an edge to each similar word
        for j, idy in enumerate(selected_ids):
            if tiling_grammar.similar_words(words[idx], words[idy]):
                search_graph.add_edge(idx, idy, weight=0.0)

        #connect to k-nearest points in "string" space
        dist_id_pairs = []
        for j in range(len(selected_ids)):
            idy = selected_ids[j]
            if idx == idy:
                continue
            dist = tiling_grammar.word_similarity(words[idx], words[idy])
            dist_id_pairs.append((dist, idy))
            # periodically prune the candidate list to the k best
            if len(dist_id_pairs) % args.graph_degree == 0:
                dist_id_pairs = sorted(dist_id_pairs)
                dist_id_pairs = dist_id_pairs[:args.graph_degree]
        dist_id_pairs = sorted(dist_id_pairs)
        dist_id_pairs = dist_id_pairs[:args.graph_degree]

        for d, idy in dist_id_pairs:
            similarity = tiling_grammar.word_similarity(words[idx], words[idy])
            search_graph.add_edge(idx, idy, weight=similarity)

    sys.stdout.write("\n")

    print("number of connected components before augmentation: ",
          nx.number_connected_components(search_graph))
    complement = list(nx.k_edge_augmentation(search_graph, k=1, partial=True))
    for (n_i, n_j) in complement:
        similarity = tiling_grammar.word_similarity(words[int(n_i)], words[int(n_j)])
        search_graph.add_edge(n_i, n_j, weight=similarity)

    nx.write_graphml(search_graph, args.latent_graph)
def main():
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)
    inputA = file_list[0]
    inputB = file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))
    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, node_types = shape_graph.categorize_edges(file_list[:100], tile_grammar,
                                                               args.out_plot)

    str_node_ids = str(obj_tools.obj2strings_ids(inputA))
    str_node_ids_list = str_node_ids.split("\n")
    # first half of the lines are SMILES strings, second half the per-character node ids
    smiles_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
    node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]
    node_ids = []
    for node_list in node_ids_list:
        node_ids.append([int(i) for i in node_list.split(" ")])

    graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(inputA))

    edge_categories = shape_graph.smiles_to_edge_categories(smiles_strings[0], node_ids[0],
                                                            cluster_centers, graph_edges,
                                                            tile_grammar)

    print("smiles string len: ", len(smiles_strings[0]))
    print(smiles_strings[0])
    print("edge categories len: ", len(edge_categories))
    print(edge_categories)

    dummy_node_id = len(node_ids[0])
    padded_node_ids = []
    num_nodes = 0
    for char_id, _ in enumerate(smiles_strings[0]):
        if smiles_strings[0][char_id] in tile_grammar.charset:
            padded_node_ids.append(node_ids[0][num_nodes])
            num_nodes += 1
        else:
            padded_node_ids.append(dummy_node_id)
    padded_node_ids.append(dummy_node_id)  #ensure at least one occurrence

    smiles_variants, node_lists = smiles_variations(smiles_strings[0], padded_node_ids,
                                                    tile_grammar, 2)
    print("smiles variants:")
    print(smiles_variants)
    print("node lists:")
    print(node_lists)

    #print("cluster centers:")
    #print(cluster_centers)

    edge_list = tile_grammar.smiles_to_edges(smiles_strings[0], padded_node_ids)
    print("edge list:")
    print(edge_list)

    all_edge_categories, all_edges = shape_graph.smiles_to_all_edge_categories(
        smiles_strings[0], node_ids[0], cluster_centers, graph_edges, tile_grammar)
    if len(all_edge_categories) != len(all_edges):
        print("Error, mismatching number of edges", len(all_edges),
              "and edge categories", len(all_edge_categories))

    output_str = ""
    for edge in all_edges:
        output_str += str(edge[0]) + " "
    output_str += "\n"
    for edge in all_edges:
        output_str += str(edge[1]) + " "
    output_str += "\n"
    for categ in all_edge_categories:
        output_str += str(categ) + " "
    output_str += "\n"

    print("graph embedding output string:")
    print(output_str)
def main():
    args = get_arguments()

    file_list = process_folder(args.in_folder)
    file_list = sorted(file_list)
    input_a = file_list[0]
    input_b = file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(input_a)))
    initial_smiles_strings.append(str(obj_tools.obj2string(input_b)))
    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, _ = shape_graph.categorize_edges(file_list[:100], tile_grammar, args.plot)

    num_categories = 0
    categories_prefix = [0]
    for clusters in cluster_centers:
        num_categories += clusters.shape[0]
        categories_prefix.append(num_categories)
    tile_grammar.set_categories_prefix(categories_prefix)
    tile_grammar.store(args.out_grammarpath)

    smiles_strings = []
    edge_categories = []
    edge_cat_min = []
    edge_cat_max = []
    for file_name in file_list:
        str_node_ids = str(obj_tools.obj2strings_ids(file_name))
        if str_node_ids == '':
            continue
        str_node_ids_list = str_node_ids.split("\n")
        # first half of the lines are SMILES strings, second half the per-character node ids
        initial_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
        node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]

        current_strings = []
        if args.remove_cycles:
            for elem in initial_strings:
                current_strings.append(
                    re.sub("[" + tile_grammar.DIGITS + tile_grammar.NUM_DELIMITER + "]", "", elem))
        else:
            current_strings = initial_strings

        node_ids = []
        for node_list in node_ids_list:
            node_ids.append([int(i) for i in node_list.split(" ")])

        graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(file_name))

        for i, _ in enumerate(current_strings):
            dummy_node_id = len(node_ids[0])
            padded_node_ids = []
            num_nodes = 0
            for char_id, _ in enumerate(current_strings[i]):
                if current_strings[i][char_id] in tile_grammar.charset:
                    padded_node_ids.append(node_ids[0][num_nodes])
                    num_nodes += 1
                else:
                    padded_node_ids.append(dummy_node_id)
            padded_node_ids.append(dummy_node_id)  #ensure at least one occurrence

            variant_strings, variant_nodes = smiles_variations(current_strings[i], padded_node_ids,
                                                               tile_grammar, args.num_variations)
            for word, padded_nodes in zip(variant_strings, variant_nodes):
                nodes = [x for x in padded_nodes if x != dummy_node_id]
                if not args.remove_cycles and not tile_grammar.check_word(word):
                    continue
                if len(str(word)) <= MAX_WORD_LENGTH and len(str(word)) > 0 and word not in smiles_strings:
                    smiles_strings.append(word)

                    current_categories = shape_graph.smiles_to_edge_categories(
                        word, nodes, cluster_centers, graph_edges, tile_grammar)
                    categories_str = ""
                    for cat in current_categories:
                        categories_str += str(cat) + " "
                    edge_categories.append(categories_str[:-1])

                    if len(current_categories) > len(word):
                        print("wrong number of edge categories: ", len(current_categories),
                              " instead of ", len(word))
                        print(word)
                        print(current_categories)

                    category_bounds = tile_grammar.smiles_to_categories_bounds(word)
                    min_bound_str = ""
                    max_bound_str = ""
                    for bounds in category_bounds:
                        min_bound_str += str(bounds[0]) + " "
                        max_bound_str += str(bounds[1]) + " "
                    edge_cat_min.append(min_bound_str[:-1])
                    edge_cat_max.append(max_bound_str[:-1])

    print("# items: " + str(len(smiles_strings)))

    df = pandas.DataFrame({
        args.smiles_column: smiles_strings,
        args.categories_column: edge_categories,
        MIN_BOUND_COL_NAME: edge_cat_min,
        MAX_BOUND_COL_NAME: edge_cat_max
    })
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)