# NOTE: the functions below come from several separate scripts; each assumes
# module-level imports along the lines of:
#   import os, re, sys
#   import pandas
#   import obj_tools, grammar, shape_graph

def main():
    args = get_arguments()

    initial_file_list = []
    process_folder(args.in_folder, initial_file_list)
    if len(initial_file_list) == 0:
        print("Did not find a valid input file in " + args.in_folder)
        exit()
    if len(initial_file_list) == 1:
        # Duplicate the single input so that inputA and inputB are both defined.
        initial_file_list.append(initial_file_list[0])
    else:
        initial_file_list = sorted(initial_file_list)
    inputA = initial_file_list[0]
    inputB = initial_file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    print("max # neighbors: " + str(tile_grammar.max_degree()))
    tile_grammar.store(args.out_grammarpath)

    if args.fix_variations:
        print("fixing variations...")
        fix_variations(args.in_folder, [], inputA, inputB)
        print("removing duplicates...")
        remove_duplicates(tile_grammar, args.in_folder, inputA, inputB,
                          initial_smiles_strings)

    smiles_strings = []
    for i in range(args.num_iterations):
        current_file_list = []
        process_folder(args.in_folder, current_file_list)
        print("Current # of variations: " + str(len(current_file_list)))
        if len(current_file_list) == 1:
            current_file_list.append(current_file_list[0])
        augment_folder(current_file_list, smiles_strings)
        smiles_strings = list(set(smiles_strings))  # deduplicate
        if args.fix_variations:
            print("fixing variations...")
            fix_variations(args.in_folder, current_file_list, inputA, inputB)
            print("removing duplicates...")
            remove_duplicates(tile_grammar, args.in_folder, inputA, inputB,
                              initial_smiles_strings)
        print("Iteration " + str(i) + " # of strings: " + str(len(smiles_strings)))

    # Reload the stored grammar and keep only non-empty words it accepts.
    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.out_grammarpath)
    valid_strings = []
    for w in smiles_strings:
        if loaded_grammar.check_word(w) and len(str(w)) > 0:
            valid_strings.append(w)
    print("# valid strings: " + str(len(valid_strings)))

    df = pandas.DataFrame({args.smiles_column: valid_strings})
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)
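# Illustrative sketch (not part of the original scripts): reading back the
# dataset written by main() above. pandas.read_hdf with the same "table" key
# is the counterpart of the df.to_hdf call; the column name is whatever was
# passed as args.smiles_column, so the "str" default below is an assumption.
def load_augmented_dataset(path, smiles_column="str"):
    import pandas
    df = pandas.read_hdf(path, "table")
    return df[smiles_column].tolist()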
def remove_duplicates(tile_grammar, folder_name, inputA, inputB, word_list=None):
    # Recursively walk folder_name and delete generated .obj variations whose
    # string is invalid, too long, or too similar to an already seen word.
    if word_list is None:
        word_list = []
    current_words = []
    for old_str in word_list:
        current_words.append(old_str)
    for item_name in os.listdir(folder_name):
        subfolder_name = os.path.join(folder_name, item_name)
        if os.path.isdir(subfolder_name):
            remove_duplicates(tile_grammar, subfolder_name, inputA, inputB, word_list)
        file_path = folder_name + "/" + item_name
        if (file_path != inputA and file_path != inputB
                and not item_name.endswith("_coll_graph.obj")
                and item_name.endswith(".obj")):
            current_str = obj_tools.obj2string(file_path)
            base_path, _ = os.path.splitext(file_path)
            os.remove(base_path + "_coll_graph.obj")
            os.remove(base_path + "_coll_graph.mtl")
            if len(current_str) > 8 * MAX_WORD_LENGTH or not tile_grammar.check_word(current_str):
                os.remove(file_path)
                os.remove(base_path + ".mtl")
                continue
            current_words.append(current_str)
            for i in range(len(current_words) - 1):
                if tile_grammar.similar_words(current_words[i], current_str):
                    os.remove(file_path)
                    os.remove(base_path + ".mtl")
                    current_words.pop()  # drop the near-duplicate we just appended
                    break
def main():
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)
    inputA = file_list[0]
    inputB = file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))

    tile_grammar = TilingGrammar([])
    if os.path.isfile(args.grammar):
        tile_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)
    if TREE_GRAMMAR:
        tile_grammar.convert_to_tree_grammar()

    cluster_centers, node_types = shape_graph.categorize_edges(
        file_list[:100], tile_grammar)

    # Serialize graph A as three whitespace-separated lines:
    # edge source node ids, edge target node ids, per-edge category ids.
    all_edge_categories_a, all_edges_a = file_to_graph_with_categories(
        inputA, cluster_centers, tile_grammar)
    output_str_a = ""
    for edge in all_edges_a:
        output_str_a += str(edge[0]) + " "
    output_str_a += "\n"
    for edge in all_edges_a:
        output_str_a += str(edge[1]) + " "
    output_str_a += "\n"
    for categ in all_edge_categories_a:
        output_str_a += str(categ) + " "
    output_str_a += "\n"

    # Collect the (category, reverse category) pairs observed in the inputs;
    # this assumes every edge is also stored in its reverse direction.
    category_pairs = set()
    for edge, cat in zip(all_edges_a, all_edge_categories_a):
        reverse_edge = [edge[1], edge[0]]
        reverse_cat = all_edge_categories_a[all_edges_a.index(reverse_edge)]
        category_pairs.add((cat, reverse_cat))

    all_edge_categories_b, all_edges_b = file_to_graph_with_categories(
        inputB, cluster_centers, tile_grammar)
    output_str_b = ""
    for edge in all_edges_b:
        output_str_b += str(edge[0]) + " "
    output_str_b += "\n"
    for edge in all_edges_b:
        output_str_b += str(edge[1]) + " "
    output_str_b += "\n"
    for categ in all_edge_categories_b:
        output_str_b += str(categ) + " "
    output_str_b += "\n"

    for edge, cat in zip(all_edges_b, all_edge_categories_b):
        reverse_edge = [edge[1], edge[0]]
        reverse_cat = all_edge_categories_b[all_edges_b.index(reverse_edge)]
        category_pairs.add((cat, reverse_cat))

    data_train, categories_train, masks_train, data_test, categories_test, \
        masks_test, charset, charset_cats = load_categories_dataset(args.data)
    num_encoder_tokens = len(charset)
    num_decoder_tokens = len(charset_cats)

    model = Seq2SeqRNN()
    if os.path.isfile(args.model):
        model.load(charset, charset_cats, args.model, lstm_size=LSTM_SIZE)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    # Set up a simple text progress bar.
    sys.stdout.write("[%s]" % (" " * args.num_attempts))
    sys.stdout.flush()
    sys.stdout.write("\b" * (args.num_attempts + 1))  # return to start of line, after '['

    for num_attempts in range(0, args.num_attempts):
        target_edge_categories, target_edges = decode_graph(
            model,
            tile_grammar,
            charset,
            args.in_word,
            max_length=data_train.shape[1],
            num_variants=32)

        # Disabled repair pass: substitute predicted category pairs that were
        # never observed in the input shapes.
        # for edge, cat in zip(target_edges, target_edge_categories):
        #     reverse_edge = [edge[1], edge[0]]
        #     reverse_cat = target_edge_categories[target_edges.index(reverse_edge)]
        #     if (cat, reverse_cat) not in category_pairs:
        #         for pair in category_pairs:
        #             if pair[0] == cat:
        #                 node_id = edge[1]
        #                 per_node_cats = [edge_cat[1] for edge_cat in zip(target_edges, target_edge_categories) if edge_cat[0][0] == node_id]
        #                 if pair[1] not in per_node_cats:
        #                     target_edge_categories[target_edges.index(reverse_edge)] = pair[1]
        #                     break
        #             elif pair[1] == reverse_cat:
        #                 if pair[0] == cat:
        #                     node_id = edge[0]
        #                     per_node_cats = [edge_cat[1] for edge_cat in zip(target_edges, target_edge_categories) if edge_cat[0][0] == node_id]
        #                     if pair[0] not in per_node_cats:
        #                         target_edge_categories[target_edges.index(edge)] = pair[0]
        #                         break
        #target_edge_categories, target_edges = file_to_graph_with_categories(random.choice(file_list), cluster_centers, tile_grammar)

        target_str = output_str_a + output_str_b
        for edge in target_edges:
            target_str += str(edge[0]) + " "
        target_str += "\n"
        for edge in target_edges:
            target_str += str(edge[1]) + " "
        target_str += "\n"
        for categ in target_edge_categories:
            target_str += str(categ) + " "
        target_str += "\n"
        #target_str = output_str_a + output_str_b + output_str_b

        filename, ext = os.path.splitext(args.out)
        filename += "_" + str(num_attempts)
        result = obj_tools.string2obj(inputA, inputB, target_str, filename)
        if result == 0:
            sys.stdout.write("\n")
            print("Successful attempt with target string: ")
            print(target_str)
            break
        elif result == 1:
            sys.stdout.write("\n")
            print("Successful embedding not strictly according to the target string: ")
            print(target_str)
            break
        sys.stdout.write("#")
        sys.stdout.flush()
def main():
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)
    inputA = file_list[0]
    inputB = file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    cluster_centers, node_types = shape_graph.categorize_edges(
        file_list[:100], tile_grammar, args.out_plot)

    # obj2strings_ids returns strings in the first half of the lines and the
    # corresponding node ids in the second half.
    str_node_ids = str(obj_tools.obj2strings_ids(inputA))
    str_node_ids_list = str_node_ids.split("\n")
    smiles_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
    node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]

    node_ids = []
    for node_list in node_ids_list:
        node_ids.append([int(i) for i in node_list.split(" ")])

    graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(inputA))
    edge_categories = shape_graph.smiles_to_edge_categories(
        smiles_strings[0], node_ids[0], cluster_centers, graph_edges, tile_grammar)

    print("smiles string len: ", len(smiles_strings[0]))
    print(smiles_strings[0])
    print("edge categories len: ", len(edge_categories))
    print(edge_categories)

    # Pad the node ids so that every character of the string maps to a node id;
    # characters outside the grammar charset map to a dummy id.
    dummy_node_id = len(node_ids[0])
    padded_node_ids = []
    num_nodes = 0
    for char_id, _ in enumerate(smiles_strings[0]):
        if smiles_strings[0][char_id] in tile_grammar.charset:
            padded_node_ids.append(node_ids[0][num_nodes])
            num_nodes += 1
        else:
            padded_node_ids.append(dummy_node_id)
    padded_node_ids.append(dummy_node_id)  # ensure at least one occurrence

    smiles_variants, node_lists = smiles_variations(
        smiles_strings[0], padded_node_ids, tile_grammar, 2)
    print("smiles variants:")
    print(smiles_variants)
    print("node lists:")
    print(node_lists)
    #print("cluster centers:")
    #print(cluster_centers)

    edge_list = tile_grammar.smiles_to_edges(smiles_strings[0], padded_node_ids)
    print("edge list:")
    print(edge_list)

    all_edge_categories, all_edges = shape_graph.smiles_to_all_edge_categories(
        smiles_strings[0], node_ids[0], cluster_centers, graph_edges, tile_grammar)
    if len(all_edge_categories) != len(all_edges):
        print("Error, mismatching number of edges", len(all_edges),
              "and edge categories", len(all_edge_categories))

    output_str = ""
    for edge in all_edges:
        output_str += str(edge[0]) + " "
    output_str += "\n"
    for edge in all_edges:
        output_str += str(edge[1]) + " "
    output_str += "\n"
    for categ in all_edge_categories:
        output_str += str(categ) + " "
    output_str += "\n"
    print("graph embedding output string:")
    print(output_str)
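# Sketch only: the three-line edge serialization above is rebuilt inline in
# several places; it could be factored into a helper like this (hypothetical
# name, equivalent to the inline loops up to trailing spaces):
def edges_to_string(edges, categories):
    lines = [
        " ".join(str(edge[0]) for edge in edges),   # edge source node ids
        " ".join(str(edge[1]) for edge in edges),   # edge target node ids
        " ".join(str(categ) for categ in categories),  # per-edge category ids
    ]
    return "\n".join(lines) + "\n"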
def main():
    args = get_arguments()

    file_list = process_folder(args.in_folder)
    file_list = sorted(file_list)
    input_a = file_list[0]
    input_b = file_list[-1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(input_a)))
    initial_smiles_strings.append(str(obj_tools.obj2string(input_b)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    cluster_centers, _ = shape_graph.categorize_edges(file_list[:100],
                                                      tile_grammar, args.plot)

    # Prefix sums over the per-cluster category counts, used to offset
    # category ids per node type.
    num_categories = 0
    categories_prefix = [0]
    for clusters in cluster_centers:
        num_categories += clusters.shape[0]
        categories_prefix.append(num_categories)
    tile_grammar.set_categories_prefix(categories_prefix)
    tile_grammar.store(args.out_grammarpath)

    smiles_strings = []
    edge_categories = []
    edge_cat_min = []
    edge_cat_max = []
    for file_name in file_list:
        str_node_ids = str(obj_tools.obj2strings_ids(file_name))
        if str_node_ids == '':
            continue
        # First half of the lines are strings, second half are node ids.
        str_node_ids_list = str_node_ids.split("\n")
        initial_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
        node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]

        current_strings = []
        if args.remove_cycles:
            # Strip cycle digits and number delimiters from the strings.
            for elem in initial_strings:
                current_strings.append(
                    re.sub("[" + tile_grammar.DIGITS + tile_grammar.NUM_DELIMITER + "]",
                           "", elem))
        else:
            current_strings = initial_strings

        node_ids = []
        for node_list in node_ids_list:
            node_ids.append([int(i) for i in node_list.split(" ")])

        graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(file_name))

        for i, _ in enumerate(current_strings):
            # Pad the node ids so that every character maps to a node id;
            # characters outside the grammar charset map to a dummy id.
            dummy_node_id = len(node_ids[0])
            padded_node_ids = []
            num_nodes = 0
            for char_id, _ in enumerate(current_strings[i]):
                if current_strings[i][char_id] in tile_grammar.charset:
                    padded_node_ids.append(node_ids[0][num_nodes])
                    num_nodes += 1
                else:
                    padded_node_ids.append(dummy_node_id)
            padded_node_ids.append(dummy_node_id)  # ensure at least one occurrence

            variant_strings, variant_nodes = smiles_variations(
                current_strings[i], padded_node_ids, tile_grammar,
                args.num_variations)
            for word, padded_nodes in zip(variant_strings, variant_nodes):
                nodes = [x for x in padded_nodes if x != dummy_node_id]
                if not args.remove_cycles and not tile_grammar.check_word(word):
                    continue
                if (len(str(word)) <= MAX_WORD_LENGTH and len(str(word)) > 0
                        and word not in smiles_strings):
                    smiles_strings.append(word)
                    current_categories = shape_graph.smiles_to_edge_categories(
                        word, nodes, cluster_centers, graph_edges, tile_grammar)
                    categories_str = ""
                    for cat in current_categories:
                        categories_str += str(cat) + " "
                    edge_categories.append(categories_str[:-1])
                    if len(current_categories) > len(word):
                        print("wrong number of edge categories: ",
                              len(current_categories), " instead of ", len(word))
                        print(word)
                        print(current_categories)
                    category_bounds = tile_grammar.smiles_to_categories_bounds(word)
                    min_bound_str = ""
                    max_bound_str = ""
                    for bounds in category_bounds:
                        min_bound_str += str(bounds[0]) + " "
                        max_bound_str += str(bounds[1]) + " "
                    edge_cat_min.append(min_bound_str[:-1])
                    edge_cat_max.append(max_bound_str[:-1])

    print("# items: " + str(len(smiles_strings)))
    df = pandas.DataFrame({
        args.smiles_column: smiles_strings,
        args.categories_column: edge_categories,
        MIN_BOUND_COL_NAME: edge_cat_min,
        MAX_BOUND_COL_NAME: edge_cat_max
    })
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)
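# Each of these scripts would normally end with the standard entry-point guard
# so that its main() runs when the file is executed directly:
if __name__ == "__main__":
    main()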