def read_dataset(file_path):
    """Load the dataset at *file_path* via data_utils.read_dataset.

    Exits the process with status 1 when the file does not exist,
    preserving the original CLI-style failure behavior.
    """
    if os.path.exists(file_path):
        return data_utils.read_dataset(file_path)
    # BUG FIX: error messages belong on stderr, not stdout, so they do
    # not pollute piped/redirected program output.
    print("No file found: {0}".format(file_path), file=sys.stderr)
    sys.exit(1)
def main(argv):
    """Entry point: load the molecule dataset, build vocabularies, then
    either sample from or train the model depending on FLAGS.sample.
    """
    tf.set_random_seed(FLAGS.seed)
    # Create the experiment directory on first run.
    if not tf.gfile.IsDirectory(FLAGS.exp_dir):
        tf.gfile.MakeDirs(FLAGS.exp_dir)
        print('Made directory')
    train_set, val_set, _ = mdu.read_dataset(FLAGS.dataset)
    molecule_mapping = mdu.read_molecule_mapping_for_set(FLAGS.dataset)
    # Inverse lookup: molecule symbol -> integer index
    # (enumerate yields (index, symbol) pairs).
    inv_mol_mapping = {v: k for k, v in enumerate(molecule_mapping)}
    # ZINC-style datasets additionally carry bond/stereo information.
    if FLAGS.dataset.startswith('zinc'):
        bond_mapping = mdu.read_bond_mapping_for_set(FLAGS.dataset)
        # Bond keys are '%d_%d'-formatted tuples — presumably a pair of
        # bond attributes; TODO confirm against mdu's mapping format.
        inv_bond_mapping = {('%d_%d' % v): k for k, v in enumerate(bond_mapping)}
        stereo = True
    else:
        bond_mapping = None
        inv_bond_mapping = None
        stereo = False
    # unique set of training data, used for evaluation
    train_set_unique = set(train_set)
    # Re-read the splits as molecule graphs; this overwrites the raw
    # train/val sets loaded above.
    train_set, val_set, _ = mdu.read_molecule_graphs_set(FLAGS.dataset)
    n_node_types = len(molecule_mapping)
    print(n_node_types)
    # +1 because get_max_edge_type returns the largest 0-based type index.
    n_edge_types = mdu.get_max_edge_type(train_set) + 1
    max_n_nodes = max(len(m.atoms) for m in train_set)
    # Wrap both splits in shuffling batch iterators.
    train_set = mdu.Dataset(train_set, FLAGS.batch_size, shuffle=True)
    val_set = mdu.Dataset(val_set, FLAGS.batch_size, shuffle=True)
    # n_node_types: number of node types (assumed categorical)
    # n_edge_types: number of edge types/ labels
    model_hparams = hparams.get_hparams_ChEMBL()
    print('Number of node/edge types: ', n_node_types, n_edge_types)
    print('Inside train function now...')
    # NOTE(review): device is hard-coded to the second GPU — confirm
    # this is intended rather than configurable.
    with tf.device('/gpu:1'):
        if FLAGS.sample:
            sample(model_hparams, train_set, val_set,
                   eval_every=FLAGS.eval_every, exp_dir=FLAGS.exp_dir,
                   summary_writer=None, n_node_types=n_node_types,
                   n_edge_types=n_edge_types)
        else:
            train(model_hparams, train_set, val_set,
                  eval_every=FLAGS.eval_every, exp_dir=FLAGS.exp_dir,
                  summary_writer=None, n_node_types=n_node_types,
                  n_edge_types=n_edge_types)
def __init__(self, conll_file_path, surface_char2id=None, lemma_char2id=None, morph_tag2id=None, transformation2id=None, mode='train', max_sentences=0):
    """Build a ConllDataset from a CoNLL file.

    Arguments:
        conll_file_path (str): path of the CoNLL file to load
        surface_char2id (dict): surface-character vocab; built from the
            data when None
        lemma_char2id (dict): lemma-character vocab; built when None
        morph_tag2id (dict): morphological-tag vocab; built when None
        transformation2id (dict): transformation vocab; built when None
        mode (str): 'train' updates the vocab dicts; 'test' leaves them
        max_sentences (int): cap on loaded sentences; 0 means no cap
    """
    self.sentences = read_dataset(conll_file_path)
    # Optionally truncate the corpus (0 disables the limit).
    if 0 < max_sentences < len(self.sentences):
        self.sentences = self.sentences[:max_sentences]

    def seed_vocab(*specials):
        # Fresh vocab whose ids follow the order of the special tokens.
        return {token: position for position, token in enumerate(specials)}

    # Reuse caller-supplied vocabs when given; otherwise start each one
    # from its special tokens only (same id ordering as before).
    self.surface_char2id = surface_char2id or seed_vocab(
        self.PAD_token, self.EOS_token)
    self.lemma_char2id = lemma_char2id or seed_vocab(
        self.PAD_token, self.EOS_token, self.START_TAG)
    self.transformation2id = transformation2id or seed_vocab(self.PAD_token)
    self.morph_tag2id = morph_tag2id or seed_vocab(
        self.PAD_token, self.EOS_token, self.START_TAG)

    self.mode = mode
    if mode == 'train':
        self.create_vocabs()
def visualize_scale(count_path, scale_path, dataset_dir):
    """Plot per-word scale values ordered by descending word count."""
    from data_utils import read_dataset
    from mlutils.exp import yaml_load
    from matplotlib import pyplot as plt

    _, _, vocab = read_dataset(dataset_dir)
    scale = np.load(scale_path)
    # A 2-D scale array: keep only its first row.
    if scale.ndim == 2:
        scale = scale[0]

    counts = yaml_load(count_path)
    # Most frequent words first (stable descending sort on the count).
    ranked = sorted(counts.items(), key=lambda item: item[1], reverse=True)
    indices = [vocab.stoi[word] for word, _ in ranked]
    plt.plot(scale[indices])
def recover_topic_embedding(topic_word_paths, embedding_path, dataset_dir):
    """Evaluate the WETC of topics generated by NPMI metric."""
    from data_utils import read_dataset

    assert isinstance(topic_word_paths, list), 'Multiple paths should be specified.'
    _, _, vocab = read_dataset(dataset_dir)
    embedding = np.load(embedding_path)

    scores = []
    for path in topic_word_paths:
        # One WETC score per topic line: look up each word's embedding
        # row and score the resulting matrix.
        with open(path) as handle:
            per_file = [
                wetc(embedding[[int(vocab.stoi[token]) for token in line.split()]])
                for line in handle
            ]
        scores.append(per_file)
    return np.array(scores)
import tensorflow as tf
import numpy as np
import pickle
import time
import data_utils
import matplotlib.pyplot as plt

# Training configuration constants.
SAVE = "PATH"  # NOTE(review): placeholder output directory — set before running.
TRAIN_SIZE = 200
LATENT_DIMENSIONS = 256
WORD_EMBEDDING_SIZE = 100
BATCH_SIZE = 64
LR = 5e-3
STEPS = 500

# Load the parallel English/Bengali corpus plus its vocab lookup tables.
# NOTE(review): paths are joined with a literal backslash, so this script
# is Windows-only as written — confirm.
X, Y, en_word2idx, en_idx2word, en_vocab, bn_word2idx, bn_idx2word, bn_vocab = data_utils.read_dataset(
    SAVE + "\\" + 'data.pkl')

# Persist the vocab lookup tables for later reuse (e.g. inference).
with open(SAVE + "\\" + "en_word2idx.pkl", "wb") as a:
    pickle.dump(en_word2idx, a)
with open(SAVE + "\\" + "bn_word2idx.pkl", "wb") as b:
    pickle.dump(bn_word2idx, b)
with open(SAVE + "\\" + "bn_idx2word.pkl", "wb") as c:
    pickle.dump(bn_idx2word, c)


def data_padding(x, y, length=15):
    # Right-pad each source sentence to `length` tokens; wrap each target
    # with <go>/<eos> and pad it too (targets end up `length` + 2 long).
    # Mutates both lists in place.
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
        y[i] = [bn_word2idx['<go>']] + y[i] + [
            bn_word2idx['<eos>']
        ] + (length - len(y[i])) * [bn_word2idx['<pad>']]
# import dependencies import tensorflow as tf import numpy as np from sklearn.model_selection import train_test_split import time import data_utils import matplotlib.pyplot as plt from nltk.translate.bleu_score import sentence_bleu # read dataset X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = data_utils.read_dataset('data.pkl') # inspect data print 'Sentence in English - encoded:', X[0] print 'Sentence in German - encoded:', Y[0] print 'Decoded:\n------------------------' for i in range(len(X[1])): print en_idx2word[X[1][i]], print '\n' for i in range(len(Y[1])): print de_idx2word[Y[1][i]], # data processing # data padding def data_padding(x, y, length = 15): for i in range(len(x)): x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
def f(sentences, en_vocab):
    """Load the EN/ZH parallel data and print the English vocabulary.

    NOTE(review): the unpacked names here are locals of this function,
    so the module-level code below that reads X and Y raises NameError
    unless they are defined elsewhere — confirm intended scoping.
    """
    X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab = data_utils.read_dataset('en_n_zh.pkl')
    print(en_vocab)


# In[3]:


# data processing
def replace_sentence_with_unk(sentence, en_vocab):
    """Replace, in place, every token missing from en_vocab with '<ukn>'."""
    for sent in sentence:
        for pos in range(len(sent)):
            if sent[pos] not in en_vocab:
                sent[pos] = '<ukn>'


# data padding
def data_padding(x, y=None, length=16):
    """Right-pad each sequence in x (and in y, when given) to `length`.

    Mutates the lists in place.  NOTE(review): relies on module-level
    en_word2idx / zh_word2idx for the '<pad>' ids; neither is defined at
    module scope in this chunk — confirm where they come from.
    """
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
        if y is not None:
            y[i] = y[i] + (length - len(y[i])) * [zh_word2idx['<pad>']]


import random


def generate_useless_sentence(X, vocab):
    """Return len(X) random 'noise' sentences.

    Each noise sentence copies the length of a randomly chosen real
    sentence and fills it with random token ids in [2, len(vocab)].
    """
    useless_data = []
    for _ in range(len(X)):
        sentence_length = len(X[random.randint(0, len(X) - 1)])
        useless_data.append(
            [random.randint(2, len(vocab)) for _ in range(sentence_length)])
    return useless_data


def mix_data(source_sentences, useless_sentences):
    """Randomly interleave real and noise sentences.

    Returns (data, labels): label 1 marks a real sentence, 0 marks noise.
    Both input lists are consumed (emptied) as a side effect.

    NOTE(review): a coin flip that lands on an already-empty side wastes
    one of the fixed number of iterations, so tail items of the other
    list can be dropped — kept as-is to preserve behavior; confirm.
    """
    over_all_data = []
    label_of_mix_data = []
    for _ in range(len(source_sentences) + len(useless_sentences)):
        # BUG FIX: compare integers with ==, not `is` — identity tests on
        # int literals only work by accident (CPython small-int caching)
        # and raise a SyntaxWarning on modern Pythons.
        if random.randint(0, 1) == 0:
            if len(source_sentences) == 0:
                continue
            over_all_data.append(source_sentences[0])
            label_of_mix_data.append(1)
            del source_sentences[0]
        else:
            if len(useless_sentences) == 0:
                continue
            over_all_data.append(useless_sentences[0])
            label_of_mix_data.append(0)
            del useless_sentences[0]
    return over_all_data, label_of_mix_data


def process_data(X, en_vocab, Y, zh_vocab):
    """Build padded, labeled real-vs-noise datasets for both languages."""
    en_useless = generate_useless_sentence(X, en_vocab)
    zh_useless = generate_useless_sentence(Y, zh_vocab)
    en_all, en_label = mix_data(X, en_useless)
    zh_all, zh_label = mix_data(Y, zh_useless)
    data_padding(en_all)
    data_padding(zh_all)
    return en_all, en_label, zh_all, zh_label


en_all, en_label, zh_all, zh_label = process_data(X, en_vocab, Y, zh_vocab)
en_all_train, en_all_test, en_label_train, en_label_test = train_test_split(
    en_all, en_label, test_size=0.1)


# In[4]:
# Network dimensions for a small classifier over 16-dim sentence encodings.
n_inputs = 16  # MNIST
n_hidden0 = 32
n_hidden1 = 64
n_hidden2 = 32
#n_hidden3 = 64
#n_hidden4 = 32
n_outputs = 2  # two classes: real sentence vs. noise

# In[5]:

#reset_graph()
# Feature and label placeholders (TF1 graph mode).
X_var = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
# NOTE(review): shape=(None) is just None, not a 1-tuple — confirm the
# intended label-batch shape.
Y_var = tf.placeholder(tf.int64, shape=(None), name="y")

# In[6]:

# Three fully-connected ReLU layers followed by a linear output layer.
with tf.name_scope("dnn"):
    hidden0 = tf.layers.dense(X_var, n_hidden0, name="hidden0",
                              activation=tf.nn.relu)
    hidden1 = tf.layers.dense(hidden0, n_hidden1, name="hidden1",
                              activation=tf.nn.relu)
    hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2",
                              activation=tf.nn.relu)
    #hidden3 = tf.layers.dense(hidden2,n_hidden3,name="hidden3",
    #                          activation=tf.nn.relu)
    #hidden4 = tf.layers.dense(hidden3,n_hidden4,name='hidden4',
    #                          activation=tf.nn.relu)
    logits = tf.layers.dense(hidden2, n_outputs, name="outputs")

# In[7]:

with tf.name_scope("loss"):
    # Mean cross-entropy over the batch.
    xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y_var, logits=logits)
    loss = tf.reduce_mean(xentropy, name="loss")
    # Mean predicted probability of class 1 (column 1 of the softmax).
    softmax=tf.nn.softmax(logits=logits)
    softmax_mean=tf.reduce_mean(tf.slice(softmax,[0,1],[-1,1]))

# In[8]:

learning_rate = 0.0001

with tf.name_scope("train"):
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    training_op = optimizer.minimize(loss)

# In[9]:

#X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab

# In[10]:

with tf.name_scope("eval"):
    # Accuracy: fraction of rows whose top logit matches the label.
    correct = tf.nn.in_top_k(logits, Y_var, 1)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

# In[11]:

#print(X)
#print([zh_idx2word[id] for id in range(len(zh_vocab))])

# In[12]:

#useless_data=generate_useless_sentence(X,zh_word2idx,zh_idx2word,zh_vocab)

# In[13]:

#[zh_idx2word[id] for id in useless_data[0]]
#print(len(useless_data[0]))

# In[14]:

#[random.randint(0,1) for x in range(100)]

# In[15]:

n_epochs = 1000001
batch_size = 50
# en_all/en_label are expected at module scope (built elsewhere in the file).
n_batches = int(np.ceil(len(en_all) / batch_size))
init = tf.global_variables_initializer()

# In[16]:

def next_batch(en_all,en_vocab,batch_size):
    # Endless batch generator over (en_all, en_label) slices.
    # NOTE(review): en_label is read from module scope rather than being
    # a parameter, and `<=` lets current_position reach len(en_all),
    # yielding an empty batch before the position wraps — confirm intent.
    # NOTE(review): the original formatting was collapsed; the increment
    # is reconstructed at loop level (the generator would never advance
    # otherwise) — confirm against the original file.
    current_position=0
    while 1:
        if current_position <= len(en_all):
            yield en_all[current_position:current_position+batch_size],en_label[current_position: current_position+batch_size]
        else:
            current_position=0
            yield en_all[current_position:current_position+batch_size],en_label[current_position: current_position+batch_size]
        current_position+=batch_size

# In[17]:

get=next_batch(en_all,en_vocab,batch_size)
print(len(en_all_train))
print(batch_size)
checkpoint_path = "/tmp/check_sentence.ckpt"
saver = tf.train.Saver()

# In[18]:

# NOTE(review): the triple-quote below opens a block string whose closing
# delimiter lies beyond this chunk.
'''
help='use vanilla SGD instead of Adam', action='store_true')
    return parser.parse_args()


# NOTE(review): the argument-parser definition above and the
# meta_evaluate(...) call below are both truncated at this chunk's edges.
if __name__ == '__main__':
    args = parse_args()
    ckpt_path = os.path.join(args.basedir, args.expname)
    assert os.path.exists(ckpt_path)
    # Seed numpy, TF and random for reproducibility when requested.
    if args.random_seed is not None:
        print('Fixing random seed', args.random_seed)
        np.random.seed(args.random_seed)
        tf.compat.v1.set_random_seed(args.random_seed)
        # NOTE(review): reads args.seed while everything else uses
        # args.random_seed — likely a typo; confirm and align.
        random.seed(args.seed)
        # TODO seed everything
    train_set, test_set = read_dataset(args.metadatadir)
    render_kwargs_train, render_kwargs_test, start, grad_vars, models =\
        create_nerf(args)
    # optimizer = tf.keras.optimizers.Adam(args.learning_rate, beta_1=0)
    test_writer = SummaryWriter(ckpt_path + '/test')
    loss_dict = meta_evaluate(models, metalearning_iter=start,
                              test_scenes=test_set,
                              N_importance=args.N_importance,
                              half_res=args.half_res,
                              testskip=args.testskip,
                              white_bkgd=args.white_bkgd,
                              log_fn=print,
# dependencies import tensorflow as tf import numpy as np from sklearn.model_selection import train_test_split import time import data_utils #import matplotlib.pyplot as plt ''' # In[2]: def f(): print('1') ''' X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab = data_utils.read_dataset('en_n_zh.pkl') print(en_vocab) # In[3]: # data processing def replace_sentence_with_unk(sentence,en_vocab): for x in sentence: for y in range(len(x)): if x[y] not in en_vocab: x[y]='<ukn>' # data padding def data_padding(x, y=None, length = 16): for i in range(len(x)):
# CLI for the USL-H training script: choose a metric, point at the
# datasets, and set optimization hyper-parameters.
parser = argparse.ArgumentParser(description='USL-H training script')
parser.add_argument('--metric', type=str, required=True,
                    help='Choose a metric to train. VUP|NUP|MLM')
parser.add_argument('--weight-path', type=str, default='',
                    help='Path to directory that stores the weight')
# Dataset
parser.add_argument('--train-ctx-path', type=str,
                    help='Path to context training set')
parser.add_argument('--train-res-path', type=str, required=True,
                    help='Path to response training set')
parser.add_argument('--valid-ctx-path', type=str,
                    help='Path to context validation set')
parser.add_argument('--valid-res-path', type=str, required=True,
                    help='Path to response validation set')
parser.add_argument('--batch-size', type=int, default=16,
                    help='samples per batches')
parser.add_argument('--max-epochs', type=int, default=1,
                    help='number of epoches to train')
parser.add_argument('--num-workers', type=int, default=1,
                    help='number of worker for dataset')
parser.add_argument('--ctx-token-len', type=int, default=25,
                    help='number of tokens for context')
parser.add_argument('--res-token-len', type=int, default=25,
                    help='number of tokens for response')
# Modeling
parser.add_argument('--dropout', type=float, default=0.2,
                    help='dropout rate')
parser.add_argument('--lr', type=float, default=1e-5,
                    help='learning rate')
parser.add_argument('--weight-decay', type=float, default=1e-5,
                    help='L2 regularization')
args = parser.parse_args()

# Context sets are optional (guarded on the flag); response sets are
# required=True above, so read_dataset is always called for them.
train_ctx = read_dataset(args.train_ctx_path) if args.train_ctx_path else None
train_res = read_dataset(args.train_res_path)
valid_ctx = read_dataset(args.valid_ctx_path) if args.valid_ctx_path else None
valid_res = read_dataset(args.valid_res_path)

train(args, train_ctx, train_res, valid_ctx, valid_res)
print ("[!] done")
# CLI for computing min/max MLM scores, later used for score normalization.
parser = argparse.ArgumentParser(
    description='Calculating min and max of MLM for normalization')
parser.add_argument('--weight-path', type=str, default='./checkpoints',
                    help='Path to directory that stores the weight')
parser.add_argument('--data-path', type=str, required=True,
                    help='Path to the directory of training set')
parser.add_argument('--output-path', type=str, default='mlm_minmax_score.json',
                    help='Output path for the min max values')
args = parser.parse_args()

xdata = read_dataset(args.data_path)

# Restore the scorer from its checkpoint and switch to inference mode.
model = MLMScorer.load_from_checkpoint(
    checkpoint_path=args.weight_path).to(device)
model.eval()
print('[!] loading model complete')

scores = calc_minmax(model, xdata)
print('[!] normalizing complete')

# BUG FIX: the `with` block already closes the file on exit; the
# original's explicit f.close() inside the block was redundant.
with open(args.output_path, 'w') as f:
    f.write(json.dumps(scores, indent=4))
print('[!] complete')
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import data_utils
import matplotlib.pyplot as plt

# read dataset: parallel Spanish/English corpora plus vocab lookup tables.
X, Y, spanish_word2idx, spanish_idx2word, spanish_vocab, english_word2idx, english_idx2word, english_vocab = data_utils.read_dataset('./data.pkl')


# Data padding
def data_padding(x, y, length=15):
    """Right-pad source sentences to `length`; prefix targets with <go>
    and pad them too.  Mutates both lists in place.
    """
    # BUG FIX: the original iterated over the *global* X
    # (range(len(X))) instead of the parameter x — it only worked by
    # coincidence because the function was called with X itself.
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [spanish_word2idx['<pad>']]
        # NOTE(review): no '<eos>' is appended even though the English
        # vocab size below budgets for one — confirm with the author.
        y[i] = [english_word2idx['<go>']] + y[i] + (length - len(y[i])) * [english_word2idx['<pad>']]


data_padding(X, Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1)
# Free the full corpora once the split copies exist.
del X
del Y

# Build the model
input_sequence_length = 15
output_sequence_length = 16
spanish_vocab_size = len(spanish_vocab) + 2  # + <pad>, <unk>
english_vocab_size = len(english_vocab) + 4  # + <pad>, <eos>, <go>

# Placeholders
# @Author : mrobotor ([email protected]) # @Link : http://darklunar.ml # @Version : $Id$ import os import tensorflow as tf import numpy as np import matplotlib.pyplot as plt from conv_vae import ConvVAE from data_utils import read_dataset import time from scipy.stats import norm # data processing # read data set train_ds, valid_ds = read_dataset('./imgdata', test_size = 0.097) print(train_ds.images().shape) print((train_ds.images().nbytes + valid_ds.images().nbytes) / (1024.0 * 1024.0), 'MB') latent_dim = 10 batch_size = 50 # let's create ConvVAE cvae = ConvVAE(latent_dim, batch_size) # let's train ConvVAE num_epochs = 15 interval = 200 saver = tf.train.Saver(max_to_keep = 2)