Example #1
def main():
    args = get_arguments()
    
    initial_file_list = []
    process_folder(args.in_folder, initial_file_list)
    if len(initial_file_list) == 0:
        print("Did not find a valid input file in " + args.in_folder)
        exit()

    if len(initial_file_list) == 1:
        initial_file_list.append(initial_file_list[0])
    else:
        initial_file_list = sorted(initial_file_list)

    inputA = initial_file_list[0]
    inputB = initial_file_list[len(initial_file_list) - 1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    print("max # neighbors: " + str(tile_grammar.max_degree()))
    tile_grammar.store(args.out_grammarpath)

    if args.fix_variations:
        print("fixing variations...")
        fix_variations(args.in_folder, [], inputA, inputB)

    print("removing duplicates...")
    remove_duplicates(tile_grammar, args.in_folder, inputA, inputB, initial_smiles_strings)

    smiles_strings = []
    for i in range(args.num_iterations):
        current_file_list = []
        process_folder(args.in_folder, current_file_list)
        print("Current # of variations: " + str(len(current_file_list)))
        if len(current_file_list) == 1:
            current_file_list.append(current_file_list[0])    
        augment_folder(current_file_list, smiles_strings)
        smiles_strings = list(set(smiles_strings))
        if args.fix_variations:
            print("fixing variations...")
            fix_variations(args.in_folder, current_file_list, inputA, inputB)
        print("removing duplicates...")
        remove_duplicates(tile_grammar, args.in_folder, inputA, inputB, initial_smiles_strings)
        print("Iteration " + str(i) + " # of strings: " + str(len(smiles_strings)))

    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.out_grammarpath)
    
    valid_strings = []
    for w in smiles_strings:
        if loaded_grammar.check_word(w) and len(str(w)) > 0:
            valid_strings.append(w)

    print("# valid strings: " + str(len(valid_strings)))
    df = pandas.DataFrame({args.smiles_column : valid_strings})
    df.to_hdf(args.out_filepath, "table", format = "table", data_columns = True)
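The table written above can be read back with pandas.read_hdf; a minimal sketch, assuming hypothetical values for the output path and SMILES column name:

import pandas

# Hypothetical values; substitute the arguments passed to the script above.
out_filepath = "tiling_strings.h5"
smiles_column = "structure"

# to_hdf(..., "table", format="table") round-trips through read_hdf with the same key.
df = pandas.read_hdf(out_filepath, "table")
valid_strings = df[smiles_column].tolist()
print("# valid strings read back: " + str(len(valid_strings)))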
Example #2
def decoder_rnd(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        mu, sigma = 0, 0.01
        z_sample = np.random.normal(mu, sigma, latent_dim)
        decoded_rnd_sample = model.decoder.predict(
            z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
        char_rnd_sample = decode_smiles_from_indexes(decoded_rnd_sample,
                                                     charset)
        if tiling_grammar.check_word(char_rnd_sample):
            print("random point: " + char_rnd_sample)
            print(
                "-----------------------------------------------------------------------"
            )
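All of the decoder examples turn model.decoder.predict(...).argmax(axis=2)[0] into a string via decode_smiles_from_indexes. Its implementation is not shown here; a plausible minimal stand-in, assuming charset is a sequence of single-character strings and padding decodes to whitespace (the charset contents below are illustrative):

import numpy as np

def decode_smiles_from_indexes(vec, charset):
    # Map each argmax index to its character and strip the padding.
    return "".join(charset[i] for i in vec).strip()

# Illustrative only: a fake 1 x length x charset-size probability tensor.
charset = [" ", "C", "D", "E", "0", "(", ")"]
probs = np.random.random((1, 8, len(charset)))
indexes = probs.argmax(axis=2)[0]
print(decode_smiles_from_indexes(indexes, charset))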
Example #3
def load_input(args):
    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)

    data_train, data_test, charset = load_dataset(args.input_data)

    model = TilingVAE()
    if args.model_type == 'lstm':
        model = Tiling_LSTM_VAE()
    elif args.model_type == 'lstm_':
        model = Tiling_LSTM_VAE_XL()

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=args.latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = np.append(data_train, data_test, axis=0)
    latent_data = model.encoder.predict(data)

    return model, tiling_grammar, latent_data, charset
Example #4
def main():
    args = get_arguments()

    initial_smiles_strings = []
    process_folder(args.in_folder, initial_smiles_strings)
    initial_smiles_strings = list(set(initial_smiles_strings))
    print("# initial strings: " + str(len(initial_smiles_strings)))

    length_variations = []
    for word in initial_smiles_strings:
        str_len_variation_generator(length_variations, word)

    print("# length variations: " + str(len(length_variations)))

    char_variations = []
    for word in length_variations:
        str_char_variation_generator(char_variations, word)

    print("# char variations: " + str(len(char_variations)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)
    output_strings = []
    for word in char_variations:
        if tile_grammar.check_word(word):
            output_strings.append(word)

    tile_grammar.store(args.out_grammarpath)
    print("# all valid variations: " + str(len(char_variations)))

    df = pandas.DataFrame({args.smiles_column: output_strings})
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)
Example #5
def decoder_lerp(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_ids = np.random.randint(0, len(data), 2)

        decoded_data_0 = model.decoder.predict(data[sample_ids[0]].reshape(
            1, latent_dim)).argmax(axis=2)[0]
        char_data_0 = decode_smiles_from_indexes(decoded_data_0, charset)

        decoded_data_1 = model.decoder.predict(data[sample_ids[1]].reshape(
            1, latent_dim)).argmax(axis=2)[0]
        char_data_1 = decode_smiles_from_indexes(decoded_data_1, charset)
        if not (tiling_grammar.check_word(char_data_0)
                and tiling_grammar.check_word(char_data_1)):
            continue

        if args.require_cycle and char_data_0.find(
                "0") == -1 and char_data_1.find("0") == -1:
            continue

        print(
            "-----------------------------------------------------------------------"
        )
        print("data point 0.0: " + char_data_0)

        for k in [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]:
            for step_size in [0.0001, 0.001, 0.01, 0.02, 0.05, 0.075]:
                rnd_offset = np.array([np.random.random(latent_dim)
                                       ]) * step_size
                z_sample = (1.0 - k) * data[sample_ids[0]] + k * data[
                    sample_ids[1]] + rnd_offset
                decoded_sample_k = model.decoder.predict(
                    z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
                char_sample_k = decode_smiles_from_indexes(
                    decoded_sample_k, charset)
                if (char_sample_k != char_data_0
                        and char_sample_k != char_data_1
                        and tiling_grammar.check_word(char_sample_k)):
                    print("data point " + str(k) + ": " + char_sample_k +
                          " (rnd offset = " + str(step_size) + ")")
                    break
        print("data point 1.0: " + char_data_1)
        print(
            "-----------------------------------------------------------------------"
        )
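The interpolation in decoder_lerp is a plain linear blend of two latent vectors plus a small random offset; as a standalone sketch (the helper name and latent dimension here are illustrative, not part of the scripts above):

import numpy as np

def lerp_with_noise(z0, z1, k, step_size):
    # Linear blend of two latent vectors plus a small uniform random offset,
    # mirroring the interpolation step used in decoder_lerp.
    return (1.0 - k) * z0 + k * z1 + np.random.random(z0.shape) * step_size

# Illustrative latent dimension; the scripts take it from args.latent_dim.
z0 = np.random.normal(0.0, 1.0, 64)
z1 = np.random.normal(0.0, 1.0, 64)
z_sample = lerp_with_noise(z0, z1, 0.5, 0.01)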
Example #6
def main():
    args = get_arguments()

    in_smiles_string = args.in_string

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.in_grammar):
        tiling_grammar.load(args.in_grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.in_grammar)

    success = str_to_file(args.in_folder, in_smiles_string, tiling_grammar)
    if not success:
        print("Did not find " + in_smiles_string)
Example #7
def decoder_nbr(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_id = np.random.randint(0, len(data))
        decoded_data = model.decoder.predict(data[sample_id].reshape(
            1, latent_dim)).argmax(axis=2)[0]
        char_data = decode_smiles_from_indexes(decoded_data, charset)
        if not tiling_grammar.check_word(char_data):
            continue
        if args.require_cycle and char_data.find("0") == -1:
            continue

        for step_size in [
                0.0001, 0.001, 0.01, 0.02, 0.05, 0.075, 0.1, 0.15, 0.2, 0.25
        ]:
            z_sample = np.array([np.random.random(latent_dim)]) * step_size
            z_sample += data[sample_id]
            decoded_sample = model.decoder.predict(
                z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
            char_sample = decode_smiles_from_indexes(decoded_sample, charset)
            if (char_sample != char_data
                    and tiling_grammar.check_word(char_sample)):
                print("data point  : " + char_data)
                print("offset point: " + char_sample)
                print("offset magnitude: " + str(step_size))
                print(
                    "-----------------------------------------------------------------------"
                )
                break
Example #8
def main():
    args = get_arguments()
    data = pandas.read_hdf(args.infile, "table")
    keys = data[args.smiles_column].map(len) < 121

    if args.length <= len(keys):
        data = data[keys].sample(n=args.length)
    else:
        data = data[keys]

    loaded_grammar = grammar.TilingGrammar([])
    loaded_grammar.load(args.ingrammar)

    num_data_points = len(data[args.smiles_column])
    vec_dims = len(loaded_grammar.charset) + loaded_grammar.max_degree()

    structures_one_hot = np.zeros((num_data_points, 120, vec_dims))
    for s in range(num_data_points):
        structures_one_hot[s] = loaded_grammar.encode_to_one_hot(
            data[args.smiles_column].iloc[s], 120)

    del data

    #data_train, data_test = train_test_split(structures_one_hot, test_size = 0.20)
    train_idx, test_idx = train_test_split(np.arange(structures_one_hot.shape[0]),
                                           test_size=0.20)

    h5f = h5py.File(args.outfile, "w")
    h5f.create_dataset("connectivity_dims", data=loaded_grammar.max_degree())
    h5f.create_dataset("charset", data=loaded_grammar.charset)
    h5f.create_dataset("data_train",
                       data=structures_one_hot[train_idx],
                       chunks=(200, 120, vec_dims))
    h5f.create_dataset("data_test",
                       data=structures_one_hot[test_idx],
                       chunks=(200, 120, vec_dims))

    if args.property_column:
        h5f.create_dataset("property_train", data=properties[train_idx])
        h5f.create_dataset("property_test", data=properties[test_idx])
    h5f.close()
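The datasets written above can be read back with h5py; a sketch of what a matching load_dataset helper might look like (the helper name and return order are assumptions based on how it is called in the other examples):

import h5py

def load_dataset(filename):
    # Read back the train/test one-hot tensors and the charset written above.
    with h5py.File(filename, "r") as h5f:
        data_train = h5f["data_train"][:]
        data_test = h5f["data_test"][:]
        charset = list(h5f["charset"][:])
    return data_train, data_test, charset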
Example #9
def decoder_path(args, model):
    latent_dim = args.latent_dim
    data, charset = read_latent_data(args.data)

    if os.path.isfile(args.model):
        model.load(charset, args.model, latent_rep_size=latent_dim)
    else:
        raise ValueError("Model file %s doesn't exist" % args.model)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    for i in range(args.samples):
        sample_ids = np.random.randint(0, len(data), 2)

        decoded_data_0 = model.decoder.predict(data[sample_ids[0]].reshape(
            1, latent_dim)).argmax(axis=2)[0]
        char_data_0 = decode_smiles_from_indexes(decoded_data_0, charset)

        decoded_data_1 = model.decoder.predict(data[sample_ids[1]].reshape(
            1, latent_dim)).argmax(axis=2)[0]
        char_data_1 = decode_smiles_from_indexes(decoded_data_1, charset)
        if not (tiling_grammar.check_word(char_data_0)
                and tiling_grammar.check_word(char_data_1)):
            continue

        if args.require_cycle and char_data_0.find(
                "0") == -1 and char_data_1.find("0") == -1:
            continue

        print("---------------------sample " + str(i) +
              "------------------------------------------")
        print("data point  0.0: " + char_data_0)

        path_ids = []
        path_ids.append(sample_ids[0])
        path_ids = _gen_latent_path(data,
                                    sample_ids[0],
                                    sample_ids[1],
                                    waypoints=path_ids)
        path_ids.append(sample_ids[1])

        for p in range(len(path_ids) - 1):
            decoded_data_p = model.decoder.predict(
                data[path_ids[p + 1]].reshape(1, latent_dim)).argmax(axis=2)[0]
            char_data_p = decode_smiles_from_indexes(decoded_data_p, charset)
            if not tiling_grammar.check_word(char_data_p):
                continue

            for k in [0.2, 0.4, 0.6, 0.8]:
                current_distance = np.linalg.norm(data[path_ids[p]] -
                                                  data[path_ids[p + 1]])
                rnd_offset = np.array([np.random.random(latent_dim)
                                       ]) * 0.1 * current_distance
                z_sample = (1.0 - k) * data[path_ids[p]] + k * data[path_ids[
                    p + 1]] + rnd_offset
                decoded_sample_k = model.decoder.predict(
                    z_sample.reshape(1, latent_dim)).argmax(axis=2)[0]
                char_sample_k = decode_smiles_from_indexes(
                    decoded_sample_k, charset)
                if (char_sample_k != char_data_0
                        and char_sample_k != char_data_1
                        and char_sample_k != char_data_p
                        and tiling_grammar.check_word(char_sample_k)):
                    print("sample point " + str(k) + ": " + char_sample_k +
                          " (rnd offset = " + str(0.1 * current_distance) +
                          ")")
                    break

            if p < len(path_ids) - 2:
                print("path waypoint " + str(p + 1) + ": " + char_data_p)

        print("data point   1.0: " + char_data_1)
        print(
            "-----------------------------------------------------------------------"
        )
Example #10
def main():
    args = get_arguments()
    data_train, data_test, charset = load_dataset(args.data)

    word_length = data_train.shape[1]
    print("----------- max word length is ", word_length, " -----------------")

    #print ("Grammar characters: ")
    #print (charset)
    #print("data dtype is " + str(data_train.dtype))
    #print("vector dtype is " + str(data_train[0].dtype))
    #print("vector shape is " + str(data_train[0].shape))

    #for i in range(1):
    #    sample_id = np.random.randint(0, len(data_train))
    #    exaple = data_train[sample_id]
    #    print("training vector " + str(sample_id) + ":")
    #    print(exaple)

    #return

    if os.path.isfile(args.grammar):
        model = Tiling_Triplet_LSTM_VAE()
    elif args.type == 'lstm':
        model = Tiling_LSTM_VAE()
    elif args.type == 'lstm_large':
        model = Tiling_LSTM_VAE_XL()
    elif args.type == 'simple':
        model = TilingVAE()
    else:
        model = Tiling_LSTM_VAE()

    if os.path.isfile(args.model):
        model.load(charset,
                   args.model,
                   max_w_length=word_length,
                   latent_rep_size=args.latent_dim)
    else:
        model.create(charset,
                     max_length=word_length,
                     latent_rep_size=args.latent_dim)

    print("available metrics: ", model.autoencoder.metrics_names)

    checkpointer = ModelCheckpoint(monitor='val_loss',
                                   filepath=args.model,
                                   verbose=1,
                                   mode='min',
                                   save_best_only=True)

    reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                  factor=0.2,
                                  patience=3,
                                  min_lr=0.00000001)

    filename, ext = os.path.splitext(args.model)
    plot_model(model.autoencoder,
               to_file=filename + '_nn.pdf',
               show_shapes=True)

    csv_logger = CSVLogger(filename + '_training.log', append=True)

    plot = PlotLearning()
    plot.set_filename(filename)

    if os.path.isfile(args.grammar):
        tiling_grammar = grammar.TilingGrammar([])
        tiling_grammar.load(args.grammar)

        tri_shuffle = TriplesShuffle(train=data_train,
                                     test=data_test,
                                     charset=charset,
                                     tile_grammar=tiling_grammar)

        reduce_lr = ReduceLROnPlateau(monitor='val_loss',
                                      factor=0.2,
                                      patience=3,
                                      min_lr=0.0000000001)

        history = model.autoencoder.fit(
            {
                'main_input': tri_shuffle.train_x,
                'positive_input': tri_shuffle.train_y,
                'negative_input': tri_shuffle.train_z
            },
            tri_shuffle.train_x,
            shuffle=True,
            epochs=args.epochs,
            batch_size=args.batch_size,
            callbacks=[tri_shuffle, checkpointer, reduce_lr, plot, csv_logger],
            validation_data=({
                'main_input': tri_shuffle.test_x,
                'positive_input': tri_shuffle.test_y,
                'negative_input': tri_shuffle.test_z
            }, tri_shuffle.test_x))

    else:

        history = model.autoencoder.fit(
            data_train,
            data_train,
            shuffle=True,
            epochs=args.epochs,
            batch_size=args.batch_size,
            callbacks=[checkpointer, reduce_lr, plot, csv_logger],
            validation_data=(data_test, data_test))
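The CSVLogger above appends one row per epoch to <model name>_training.log; a minimal sketch for inspecting the logged losses afterwards (the model path is hypothetical):

import os
import pandas

model_path = "tiling_vae.h5"  # hypothetical value of args.model
filename, _ = os.path.splitext(model_path)

# Keras CSVLogger writes an 'epoch' column plus one column per logged metric.
log = pandas.read_csv(filename + "_training.log")
print(log[["epoch", "loss", "val_loss"]].tail())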
Example #11
def sample_path_from_strings(args):

    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = pandas.read_hdf(args.input_data, 'table')
    words = data["structure"]

    if not os.path.isfile(args.latent_graph):
        raise ValueError("Search graph file %s doesn't exist" %
                         args.latent_graph)

    search_graph = nx.read_graphml(args.latent_graph)
    node_list = [int(x) for x in list(search_graph.nodes)]

    path_weights = []
    for i in range(args.num_samples):
        samples = np.random.randint(0, len(node_list), 2)
        sample_ids = [node_list[samples[0]], node_list[samples[1]]]

        char_data_0 = words[sample_ids[0]]
        char_data_1 = words[sample_ids[1]]

        print("---------------------path sample " + str(i) +
              "------------------------------------------")
        if not (tiling_grammar.check_word(char_data_0)
                and tiling_grammar.check_word(char_data_1)):
            print("invalid words")
            continue

        try:
            shortest_path = nx.shortest_path(search_graph,
                                             source=str(sample_ids[0]),
                                             target=str(sample_ids[1]),
                                             weight='weight')
        except nx.exception.NetworkXNoPath:
            print("no path between sample nodes")
            continue

        if len(shortest_path) < 5:
            print("path too short")
            continue

        decoded_words = [char_data_0]
        valid_words = [True]

        for pt_id in shortest_path[1:-1]:
            word = words[int(pt_id)]
            if word not in decoded_words:
                decoded_words.append(word)
                valid_words.append(True)

        decoded_words.append(char_data_1)
        valid_words.append(True)

        if valid_words.count(True) < 5:
            print("too few valid words")
            continue

        decoded_valid_words = [
            w for w, flag in zip(decoded_words, valid_words) if flag
        ]
        edge_weights = [
            tiling_grammar.word_similarity(w1, w2) for w1, w2 in zip(
                decoded_valid_words[:-1], decoded_valid_words[1:])
        ]

        file_name_0 = "?"
        file_name_1 = "?"
        if args.folder_name != "":
            found0, file_name_0 = str_to_file(args.folder_name, char_data_0,
                                              tiling_grammar)
            found1, file_name_1 = str_to_file(args.folder_name, char_data_1,
                                              tiling_grammar)

        #print("---------------------path sample " + str(i) + "------------------------------------------")
        print("start  :", decoded_words[0], " file: ", file_name_0)
        file_name_w = "?"
        for w, flag in zip(decoded_words, valid_words)[1:-1]:
            if flag:
                if args.folder_name != "":
                    found, file_name_w = str_to_file(args.folder_name, w,
                                                     tiling_grammar)
                    if found:
                        print("valid  :", w, " file: ", file_name_w)
                    else:
                        print("valid  :", w, " closest file: ", file_name_w)
                else:
                    print("valid  :", w)
            else:
                print("invalid:", w)
        print("end    :", decoded_words[-1], " file: ", file_name_1)
        path_weights.append(sum(edge_weights))
        print("edge weights: ", edge_weights)
        print(
            "----------------------------------------------------------------------------------"
        )

    print(
        "----------------------------------------------------------------------------------"
    )
    print("average accumulated path weight: ",
          sum(path_weights) / len(path_weights))
    print("max accumulated path weight: ", max(path_weights))
    print(
        "----------------------------------------------------------------------------------"
    )
Example #12
def build_graph_from_strings(args):

    if args.graph_degree >= args.graph_size:
        raise ValueError(
            "Requested graph degree %s larger than graph size %s" %
            (args.graph_degree, args.graph_size))

    if not os.path.isfile(args.input_data):
        raise ValueError("Input file %s doesn't exist" % args.input_data)

    tiling_grammar = grammar.TilingGrammar([])
    if os.path.isfile(args.grammar):
        tiling_grammar.load(args.grammar)
    else:
        raise ValueError("Grammar file %s doesn't exist" % args.grammar)

    if TREE_GRAMMAR:
        tiling_grammar.convert_to_tree_grammar()

    data = pandas.read_hdf(args.input_data, 'table')
    print("Number of SMILES strings: ", len(data))

    if args.graph_size <= len(data):
        data = data.sample(n=args.graph_size)

    words = data["structure"]

    tmp_ids = data.index.tolist()
    selected_ids = [int(x) for x in tmp_ids]

    # setup toolbar
    sys.stdout.write("Inserting graph nodes [%s]" % (" " * 10))
    sys.stdout.flush()
    sys.stdout.write("\b" * (10 + 1))  # return to start of line, after '['

    search_graph = nx.Graph()
    #graph nodes
    search_graph.add_nodes_from(selected_ids)
    #graph edges
    for i, idx in enumerate(selected_ids):
        if i % (len(selected_ids) // 10) == len(selected_ids) // 10 - 1:
            sys.stdout.write("#")
            sys.stdout.flush()
        #add an edge to each similar word
        for j, idy in enumerate(selected_ids):
            if tiling_grammar.similar_words(words[idx], words[idy]):
                search_graph.add_edge(idx, idy, weight=0.0)

        #connect to k-nearest points in "string" space
        dist_id_pairs = []
        for j in range(len(selected_ids)):
            idy = selected_ids[j]
            if idx == idy:
                continue
            dist = tiling_grammar.word_similarity(words[idx], words[idy])
            dist_id_pairs.append((dist, idy))
            if len(dist_id_pairs) % args.graph_degree == 0:
                dist_id_pairs = sorted(dist_id_pairs)
                dist_id_pairs = dist_id_pairs[:args.graph_degree]

        dist_id_pairs = sorted(dist_id_pairs)
        dist_id_pairs = dist_id_pairs[:args.graph_degree]

        for d, idy in dist_id_pairs:
            similarity = tiling_grammar.word_similarity(words[idx], words[idy])
            search_graph.add_edge(idx, idy, weight=similarity)

    sys.stdout.write("\n")

    print("number of connected components before augmentation: ",
          nx.number_connected_components(search_graph))

    complement = list(nx.k_edge_augmentation(search_graph, k=1, partial=True))
    for (n_i, n_j) in complement:
        similarity = tiling_grammar.word_similarity(words[int(n_i)],
                                                    words[int(n_j)])
        search_graph.add_edge(n_i, n_j, weight=similarity)

    nx.write_graphml(search_graph, args.latent_graph)
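Note that GraphML stores node identifiers as strings, which is why the path-sampling example casts them with int() and str(); a minimal read-back sketch (the file name is hypothetical):

import networkx as nx

latent_graph = "latent_graph.graphml"  # hypothetical value of args.latent_graph
search_graph = nx.read_graphml(latent_graph)

# Node ids come back as strings; cast to int when indexing the original data.
node_list = [int(x) for x in search_graph.nodes]
print("nodes:", len(node_list),
      "components:", nx.number_connected_components(search_graph))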
Example #13
def main():
    args = get_arguments()

    file_list = []
    process_folder(args.in_folder, file_list)

    inputA = file_list[0]
    inputB = file_list[len(file_list) - 1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(inputA)))
    initial_smiles_strings.append(str(obj_tools.obj2string(inputB)))
    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, node_types = shape_graph.categorize_edges(
        file_list[:100], tile_grammar, args.out_plot)

    str_node_ids = str(obj_tools.obj2strings_ids(inputA))
    str_node_ids_list = str_node_ids.split("\n")
    smiles_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
    node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]

    node_ids = []
    for node_list in node_ids_list:
        node_ids.append([int(i) for i in node_list.split(" ")])

    graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(inputA))

    edge_categories = shape_graph.smiles_to_edge_categories(
        smiles_strings[0], node_ids[0], cluster_centers, graph_edges,
        tile_grammar)

    print("smiles string len: ", len(smiles_strings[0]))
    print(smiles_strings[0])
    print("edge categories len: ", len(edge_categories))
    print(edge_categories)

    dummy_node_id = len(node_ids[0])

    padded_node_ids = []
    num_nodes = 0
    for char_id, _ in enumerate(smiles_strings[0]):
        if smiles_strings[0][char_id] in tile_grammar.charset:
            padded_node_ids.append(node_ids[0][num_nodes])
            num_nodes += 1
        else:
            padded_node_ids.append(dummy_node_id)
    padded_node_ids.append(dummy_node_id)  #ensure at least one occurrence

    smiles_variants, node_lists = smiles_variations(smiles_strings[0],
                                                    padded_node_ids,
                                                    tile_grammar, 2)
    print("smiles variants:")
    print(smiles_variants)

    print("node lists:")
    print(node_lists)

    #print("cluster centers:")
    #print(cluster_centers)

    edge_list = tile_grammar.smiles_to_edges(smiles_strings[0],
                                             padded_node_ids)
    print("edge list:")
    print(edge_list)

    all_edge_categories, all_edges = shape_graph.smiles_to_all_edge_categories(
        smiles_strings[0], node_ids[0], cluster_centers, graph_edges,
        tile_grammar)

    if len(all_edge_categories) != len(all_edges):
        print("Error, mismatching number of edges", len(all_edges),
              "and edge categories", len(all_edge_categories))

    output_str = ""
    for edge in all_edges:
        output_str += str(edge[0]) + " "
    output_str += "\n"
    for edge in all_edges:
        output_str += str(edge[1]) + " "
    output_str += "\n"
    for categ in all_edge_categories:
        output_str += str(categ) + " "
    output_str += "\n"

    print("graph embedding output string:")
    print(output_str)
Example #14
def main():
    args = get_arguments()
    file_list = process_folder(args.in_folder)
    file_list = sorted(file_list)

    input_a = file_list[0]
    input_b = file_list[len(file_list) - 1]

    initial_smiles_strings = []
    initial_smiles_strings.append(str(obj_tools.obj2string(input_a)))
    initial_smiles_strings.append(str(obj_tools.obj2string(input_b)))

    tile_grammar = grammar.TilingGrammar(initial_smiles_strings)

    cluster_centers, _ = shape_graph.categorize_edges(file_list[:100],
                                                      tile_grammar, args.plot)

    num_categories = 0
    categories_prefix = [0]
    for clusters in cluster_centers:
        num_categories += clusters.shape[0]
        categories_prefix.append(num_categories)

    tile_grammar.set_categories_prefix(categories_prefix)
    tile_grammar.store(args.out_grammarpath)

    smiles_strings = []
    edge_categories = []
    edge_cat_min = []
    edge_cat_max = []

    for file_name in file_list:
        str_node_ids = str(obj_tools.obj2strings_ids(file_name))
        if str_node_ids == '':
            continue
        str_node_ids_list = str_node_ids.split("\n")
        initial_strings = str_node_ids_list[:len(str_node_ids_list) // 2]
        node_ids_list = str_node_ids_list[len(str_node_ids_list) // 2:]

        current_strings = []
        if args.remove_cycles:
            for elem in initial_strings:
                current_strings.append(
                    re.sub(
                        "[" + tile_grammar.DIGITS +
                        tile_grammar.NUM_DELIMITER + "]", "", elem))
        else:
            current_strings = initial_strings

        node_ids = []
        for node_list in node_ids_list:
            node_ids.append([int(i) for i in node_list.split(" ")])

        graph_edges = shape_graph.ShapeGraph(obj_tools.obj2graph(file_name))

        for i, _ in enumerate(current_strings):
            dummy_node_id = len(node_ids[0])

            padded_node_ids = []
            num_nodes = 0
            for char_id, _ in enumerate(current_strings[i]):
                if current_strings[i][char_id] in tile_grammar.charset:
                    padded_node_ids.append(node_ids[0][num_nodes])
                    num_nodes += 1
                else:
                    padded_node_ids.append(dummy_node_id)
            padded_node_ids.append(
                dummy_node_id)  #ensure at least one occurrence

            variant_strings, variant_nodes = smiles_variations(
                current_strings[i], padded_node_ids, tile_grammar,
                args.num_variations)
            for word, padded_nodes in zip(variant_strings, variant_nodes):
                nodes = [x for x in padded_nodes if x != dummy_node_id]
                if not args.remove_cycles and not tile_grammar.check_word(
                        word):
                    continue
                if len(str(word)) <= MAX_WORD_LENGTH and len(
                        str(word)) > 0 and word not in smiles_strings:
                    smiles_strings.append(word)
                    current_categories = shape_graph.smiles_to_edge_categories(
                        word, nodes, cluster_centers, graph_edges,
                        tile_grammar)
                    categories_str = ""
                    for cat in current_categories:
                        categories_str += str(cat) + " "
                    edge_categories.append(categories_str[:-1])

                    if len(current_categories) > len(word):
                        print("wrong number of edge categories: ",
                              len(current_categories), " instead of ",
                              len(word))
                        print(word)
                        print(current_categories)

                    category_bounds = tile_grammar.smiles_to_categories_bounds(
                        word)
                    min_bound_str = ""
                    max_bound_str = ""
                    for bounds in category_bounds:
                        min_bound_str += str(bounds[0]) + " "
                        max_bound_str += str(bounds[1]) + " "
                    edge_cat_min.append(min_bound_str[:-1])
                    edge_cat_max.append(max_bound_str[:-1])

    print("# items: " + str(len(smiles_strings)))

    df = pandas.DataFrame({
        args.smiles_column: smiles_strings,
        args.categories_column: edge_categories,
        MIN_BOUND_COL_NAME: edge_cat_min,
        MAX_BOUND_COL_NAME: edge_cat_max
    })
    df.to_hdf(args.out_filepath, "table", format="table", data_columns=True)