Example 1
import os
import sys

import data_utils


def read_dataset(file_path):
    """Return the dataset at file_path, exiting with status 1 if the file is missing."""
    if os.path.exists(file_path):
        return data_utils.read_dataset(file_path)
    else:
        print("No file found: {0}".format(file_path))
        sys.exit(1)
Example 2
def main(argv):
    tf.set_random_seed(FLAGS.seed)
    if not tf.gfile.IsDirectory(FLAGS.exp_dir):
        tf.gfile.MakeDirs(FLAGS.exp_dir)
        print('Made directory:', FLAGS.exp_dir)

    train_set, val_set, _ = mdu.read_dataset(FLAGS.dataset)
    molecule_mapping = mdu.read_molecule_mapping_for_set(FLAGS.dataset)
    inv_mol_mapping = {v: k for k, v in enumerate(molecule_mapping)}

    if FLAGS.dataset.startswith('zinc'):
        bond_mapping = mdu.read_bond_mapping_for_set(FLAGS.dataset)
        inv_bond_mapping = {('%d_%d' % v): k
                            for k, v in enumerate(bond_mapping)}
        stereo = True
    else:
        bond_mapping = None
        inv_bond_mapping = None
        stereo = False

    # unique set of training data, used for evaluation
    train_set_unique = set(train_set)
    train_set, val_set, _ = mdu.read_molecule_graphs_set(FLAGS.dataset)

    n_node_types = len(molecule_mapping)
    print(n_node_types)
    n_edge_types = mdu.get_max_edge_type(train_set) + 1
    max_n_nodes = max(len(m.atoms) for m in train_set)

    train_set = mdu.Dataset(train_set, FLAGS.batch_size, shuffle=True)
    val_set = mdu.Dataset(val_set, FLAGS.batch_size, shuffle=True)

    # n_node_types: number of node types (assumed categorical)
    # n_edge_types: number of edge types/ labels
    model_hparams = hparams.get_hparams_ChEMBL()
    print('Number of node/edge types: ', n_node_types, n_edge_types)
    print('Inside train function now...')
    with tf.device('/gpu:1'):
        if FLAGS.sample:
            sample(model_hparams,
                   train_set,
                   val_set,
                   eval_every=FLAGS.eval_every,
                   exp_dir=FLAGS.exp_dir,
                   summary_writer=None,
                   n_node_types=n_node_types,
                   n_edge_types=n_edge_types)
        else:
            train(model_hparams,
                  train_set,
                  val_set,
                  eval_every=FLAGS.eval_every,
                  exp_dir=FLAGS.exp_dir,
                  summary_writer=None,
                  n_node_types=n_node_types,
                  n_edge_types=n_edge_types)
Example 3
    def __init__(self,
                 conll_file_path,
                 surface_char2id=None,
                 lemma_char2id=None,
                 morph_tag2id=None,
                 transformation2id=None,
                 mode='train',
                 max_sentences=0):
        """Initialize ConllDataset.

        Arguments:
            conll_file_path (str): Path to the CoNLL-format file.
            surface_char2id (dict): Default is None; if None, it is built from the given data.
            lemma_char2id (dict): Default is None; if None, it is built from the given data.
            morph_tag2id (dict): Default is None; if None, it is built from the given data.
            transformation2id (dict): Default is None; if None, it is built from the given data.
            mode (str): 'train' or 'test'. If 'test', the vocab dicts are not updated.
            max_sentences (int): Maximum number of sentences to load into the dataset.
                Default is 0, which means no limit.
        """
        self.sentences = read_dataset(conll_file_path)
        if 0 < max_sentences < len(self.sentences):
            self.sentences = self.sentences[:max_sentences]
        if surface_char2id:
            self.surface_char2id = surface_char2id
        else:
            self.surface_char2id = dict()
            self.surface_char2id[self.PAD_token] = len(self.surface_char2id)
            self.surface_char2id[self.EOS_token] = len(self.surface_char2id)
        if lemma_char2id:
            self.lemma_char2id = lemma_char2id
        else:
            self.lemma_char2id = dict()
            self.lemma_char2id[self.PAD_token] = len(self.lemma_char2id)
            self.lemma_char2id[self.EOS_token] = len(self.lemma_char2id)
            self.lemma_char2id[self.START_TAG] = len(self.lemma_char2id)

        if transformation2id:
            self.transformation2id = transformation2id
        else:
            self.transformation2id = dict()
            self.transformation2id[self.PAD_token] = len(
                self.transformation2id)

        if morph_tag2id:
            self.morph_tag2id = morph_tag2id
        else:
            self.morph_tag2id = dict()
            self.morph_tag2id[self.PAD_token] = len(self.morph_tag2id)
            self.morph_tag2id[self.EOS_token] = len(self.morph_tag2id)
            self.morph_tag2id[self.START_TAG] = len(self.morph_tag2id)
        self.mode = mode
        if mode == 'train':
            self.create_vocabs()
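
A minimal usage sketch for the constructor above, assuming ConllDataset is the class this __init__ belongs to and that PAD_token, EOS_token, START_TAG, and create_vocabs are defined elsewhere on it; the file paths below are hypothetical.

train_data = ConllDataset('data/train.conll', mode='train')
# Reuse (freeze) the vocabularies built on the training data for the test split.
test_data = ConllDataset('data/test.conll',
                         surface_char2id=train_data.surface_char2id,
                         lemma_char2id=train_data.lemma_char2id,
                         morph_tag2id=train_data.morph_tag2id,
                         transformation2id=train_data.transformation2id,
                         mode='test')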
def visualize_scale(count_path, scale_path, dataset_dir):
    from data_utils import read_dataset
    from mlutils.exp import yaml_load
    from matplotlib import pyplot as plt
    _, _, vocab = read_dataset(dataset_dir)
    scale = np.load(scale_path)
    if len(scale.shape) == 2:
        scale = scale[0]
    count = yaml_load(count_path)
    kv = sorted(count.items(), key=lambda x: -x[1])
    s = scale[[vocab.stoi[w[0]] for w in kv]]
    # scale = np.array([scale[vocab.stoi[w[0]]] for w in kv])
    plt.plot(s)
def recover_topic_embedding(topic_word_paths, embedding_path, dataset_dir):
    """Evaluate the WETC of topics generated by NPMI metric."""
    from data_utils import read_dataset
    assert isinstance(topic_word_paths,
                      list), 'topic_word_paths must be a list of paths.'
    _, _, vocab = read_dataset(dataset_dir)
    embedding = np.load(embedding_path)
    scores = []
    for p in topic_word_paths:
        with open(p) as f:
            r = []
            for line in f:
                idx = [int(vocab.stoi[w]) for w in line.split()]
                r.append(wetc(embedding[idx]))
            scores.append(r)
    return np.array(scores)
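
A hedged usage sketch for recover_topic_embedding above; the file paths are hypothetical, and wetc is assumed to be a word-embedding topic-coherence scorer imported elsewhere in this module.

# Each topic file is assumed to hold one topic per line as space-separated words.
scores = recover_topic_embedding(['runs/topics_epoch10.txt', 'runs/topics_epoch20.txt'],
                                 'embedding.npy', './dataset')
print(scores.mean(axis=1))  # mean WETC per topic file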
Example 6
import tensorflow as tf
import numpy as np
import pickle
import time
import data_utils
import matplotlib.pyplot as plt

SAVE = "PATH"
TRAIN_SIZE = 200
LATENT_DIMENSIONS = 256
WORD_EMBEDDING_SIZE = 100
BATCH_SIZE = 64
LR = 5e-3
STEPS = 500

X, Y, en_word2idx, en_idx2word, en_vocab, bn_word2idx, bn_idx2word, bn_vocab = data_utils.read_dataset(
    SAVE + "\\" + 'data.pkl')

with open(SAVE + "\\" + "en_word2idx.pkl", "wb") as a:
    pickle.dump(en_word2idx, a)
with open(SAVE + "\\" + "bn_word2idx.pkl", "wb") as b:
    pickle.dump(bn_word2idx, b)
with open(SAVE + "\\" + "bn_idx2word.pkl", "wb") as c:
    pickle.dump(bn_idx2word, c)


def data_padding(x, y, length=15):
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
        y[i] = [bn_word2idx['<go>']] + y[i] + [
            bn_word2idx['<eos>']
        ] + (length - len(y[i])) * [bn_word2idx['<pad>']]
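
A small worked example of what data_padding above produces; the concrete token ids below are hypothetical stand-ins for the entries of en_word2idx and bn_word2idx.

# Suppose (hypothetically) en_word2idx['<pad>'] == 0, bn_word2idx['<go>'] == 1,
# bn_word2idx['<eos>'] == 2 and bn_word2idx['<pad>'] == 0.
x = [[5, 7, 9]]
y = [[11, 12]]
data_padding(x, y, length=5)
# x[0] -> [5, 7, 9, 0, 0]            source padded on the right to length 5
# y[0] -> [1, 11, 12, 2, 0, 0, 0]    <go> + tokens + <eos> + <pad> * (length - len)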
# import dependencies
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import data_utils
import matplotlib.pyplot as plt
from nltk.translate.bleu_score import sentence_bleu

# read dataset
X, Y, en_word2idx, en_idx2word, en_vocab, de_word2idx, de_idx2word, de_vocab = data_utils.read_dataset('data.pkl')

# inspect data
print('Sentence in English - encoded:', X[0])
print('Sentence in German - encoded:', Y[0])
print('Decoded:\n------------------------')

for i in range(len(X[1])):
    print(en_idx2word[X[1][i]], end=' ')

print('\n')

for i in range(len(Y[1])):
    print(de_idx2word[Y[1][i]], end=' ')

# data processing

# data padding
def data_padding(x, y, length = 15):
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
Example 8
def f(sentences,en_vocab):
    X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab = data_utils.read_dataset('en_n_zh.pkl')
    print(en_vocab)


    # In[3]:


    # data processing
    def replace_sentence_with_unk(sentence,en_vocab):
        for x in sentence:
            for y in range(len(x)):
                if x[y]  not in en_vocab:
                    x[y]='<ukn>'
    # data padding
    def data_padding(x, y=None, length = 16):
        for i in range(len(x)):
            #x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
            #y[i] = [zh_word2idx['<go>']] + y[i] + [zh_word2idx['<eos>']] + (length-len(y[i])) * [zh_word2idx['<pad>']]
            x[i] = x[i] + (length - len(x[i])) * [en_word2idx['<pad>']]
            if y is not None:
                y[i] = y[i] + (length - len(y[i])) * [zh_word2idx['<pad>']]
    import random
    def generate_useless_sentence(X,vocab):
        useless_data=[]
        for i in range(len(X)):
            sentence_length=len(X[random.randint(0,len(X)-1)])
            temp_sentence=[random.randint(2,len(vocab)) for x in range(sentence_length)]
            useless_data.append(temp_sentence)
        return useless_data
    def mix_data(source_sentences,useless_sentences):
        over_all_data=[]
        label_of_mix_data=[]
        # Randomly interleave real and random ("useless") sentences,
        # labelling real ones 1 and random ones 0.
        for x in range(len(source_sentences) + len(useless_sentences)):
            if random.randint(0, 1) == 0:
                if len(source_sentences) == 0:
                    continue
                over_all_data.append(source_sentences[0])
                label_of_mix_data.append(1)
                del source_sentences[0]
            else:
                if len(useless_sentences) == 0:
                    continue
                over_all_data.append(useless_sentences[0])
                label_of_mix_data.append(0)
                del useless_sentences[0]
        return over_all_data, label_of_mix_data
    #data_padding(X, Y)
    #print(Y)
    def process_data(X,en_vocab,Y,zh_vocab):
        en_useless=generate_useless_sentence(X,en_vocab)
        zh_useless=generate_useless_sentence(Y,zh_vocab)
        en_all,en_label=mix_data(X,en_useless)
        zh_all,zh_label=mix_data(Y,zh_useless)
        data_padding(en_all)
        data_padding(zh_all)
        return en_all,en_label,zh_all,zh_label
    en_all,en_label,zh_all,zh_label=process_data(X,en_vocab,Y,zh_vocab)
    en_all_train,  en_all_test, en_label_train, en_label_test = train_test_split(en_all, en_label, test_size = 0.1)


    # In[4]:


    n_inputs = 16  # padded sentence length (matches data_padding length=16)
    n_hidden0 = 32
    n_hidden1 = 64
    n_hidden2 = 32
    #n_hidden3 = 64
    #n_hidden4 = 32
    n_outputs = 2


    # In[5]:


    #reset_graph()

    X_var = tf.placeholder(tf.float32, shape=(None, n_inputs), name="X")
    Y_var = tf.placeholder(tf.int64, shape=(None,), name="y")


    # In[6]:


    with tf.name_scope("dnn"):
        hidden0 = tf.layers.dense(X_var, n_hidden0, name="hidden0",
                                  activation=tf.nn.relu)
        hidden1 = tf.layers.dense(hidden0, n_hidden1, name="hidden1",
                                  activation=tf.nn.relu)
        hidden2 = tf.layers.dense(hidden1, n_hidden2, name="hidden2",
                                  activation=tf.nn.relu)
        #hidden3 = tf.layers.dense(hidden2,n_hidden3,name="hidden3",
                                # activation=tf.nn.relu)
        #hidden4 = tf.layers.dense(hidden3,n_hidden4,name='hidden4',
                                # activation=tf.nn.relu)
        logits = tf.layers.dense(hidden2, n_outputs, name="outputs")


    # In[7]:


    with tf.name_scope("loss"):
        xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=Y_var, logits=logits)
        loss = tf.reduce_mean(xentropy, name="loss")
        softmax=tf.nn.softmax(logits=logits)
        softmax_mean=tf.reduce_mean(tf.slice(softmax,[0,1],[-1,1]))


    # In[8]:


    learning_rate = 0.0001

    with tf.name_scope("train"):
        optimizer = tf.train.GradientDescentOptimizer(learning_rate)
        training_op = optimizer.minimize(loss)


    # In[9]:


    #X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab 


    # In[10]:


    with tf.name_scope("eval"):
        correct = tf.nn.in_top_k(logits, Y_var, 1)
        accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))


    # In[11]:


    #print(X)
    #print([zh_idx2word[id] for id in range(len(zh_vocab))])


    # In[12]:


    #useless_data=generate_useless_sentence(X,zh_word2idx,zh_idx2word,zh_vocab)


    # In[13]:


    #[zh_idx2word[id] for id in useless_data[0]]
    #print(len(useless_data[0]))


    # In[14]:


    #[random.randint(0,1) for x in range(100)]


    # In[15]:


    n_epochs = 1000001
    batch_size = 50
    n_batches = int(np.ceil(len(en_all) / batch_size))
    init = tf.global_variables_initializer()


    # In[16]:


    def next_batch(en_all, en_vocab, batch_size):
        # Endlessly cycle over the data, yielding (sentences, labels) slices of
        # size batch_size; en_label is taken from the enclosing scope.
        current_position = 0
        while True:
            if current_position > len(en_all):
                current_position = 0
            yield (en_all[current_position:current_position + batch_size],
                   en_label[current_position:current_position + batch_size])
            current_position += batch_size


    # In[17]:


    get=next_batch(en_all,en_vocab,batch_size)
    print(len(en_all_train))
    print(batch_size)
    checkpoint_path = "/tmp/check_sentence.ckpt"
    saver = tf.train.Saver()


    # In[18]:


    '''
Example 9
                        help='use vanilla SGD instead of Adam',
                        action='store_true')
    return parser.parse_args()


if __name__ == '__main__':
    args = parse_args()
    ckpt_path = os.path.join(args.basedir, args.expname)
    assert os.path.exists(ckpt_path)
    if args.random_seed is not None:
        print('Fixing random seed', args.random_seed)
        np.random.seed(args.random_seed)
        tf.compat.v1.set_random_seed(args.random_seed)
        random.seed(args.random_seed)
    # TODO seed everything
    train_set, test_set = read_dataset(args.metadatadir)

    render_kwargs_train, render_kwargs_test, start, grad_vars, models =\
        create_nerf(args)

    #
    optimizer = tf.keras.optimizers.Adam(args.learning_rate, beta_1=0)
    test_writer = SummaryWriter(ckpt_path + '/test')
    loss_dict = meta_evaluate(models,
                              metalearning_iter=start,
                              test_scenes=test_set,
                              N_importance=args.N_importance,
                              half_res=args.half_res,
                              testskip=args.testskip,
                              white_bkgd=args.white_bkgd,
                              log_fn=print,
# dependencies
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import time
import data_utils
#import matplotlib.pyplot as plt
'''

# In[2]:
def f():
    print('1')
'''

X, Y, en_word2idx, en_idx2word, en_vocab, zh_word2idx, zh_idx2word, zh_vocab = data_utils.read_dataset('en_n_zh.pkl')
print(en_vocab)


# In[3]:


# data processing
def replace_sentence_with_unk(sentence,en_vocab):
    for x in sentence:
        for y in range(len(x)):
            if x[y]  not in en_vocab:
                x[y]='<ukn>'
# data padding
def data_padding(x, y=None, length = 16):
    for i in range(len(x)):
Example 11
    parser = argparse.ArgumentParser(description='USL-H training script')
    parser.add_argument('--metric', type=str, required=True, help='Choose a metric to train. VUP|NUP|MLM')
    parser.add_argument('--weight-path', type=str, default='', help='Path to directory that stores the weight')

    # Dataset
    parser.add_argument('--train-ctx-path', type=str, help='Path to context training set')
    parser.add_argument('--train-res-path', type=str, required=True, help='Path to response training set')
    parser.add_argument('--valid-ctx-path', type=str, help='Path to context validation set')
    parser.add_argument('--valid-res-path', type=str, required=True, help='Path to response validation set')
    parser.add_argument('--batch-size', type=int, default=16, help='samples per batch')
    parser.add_argument('--max-epochs', type=int, default=1, help='number of epochs to train')
    parser.add_argument('--num-workers', type=int, default=1, help='number of workers for the dataset loader')
    parser.add_argument('--ctx-token-len', type=int, default=25, help='number of tokens for context')
    parser.add_argument('--res-token-len', type=int, default=25, help='number of tokens for response')

    # Modeling
    parser.add_argument('--dropout', type=float, default=0.2, help='dropout rate')
    parser.add_argument('--lr', type=float, default=1e-5, help='learning rate')
    parser.add_argument('--weight-decay', type=float, default=1e-5, help='L2 regularization')

    args = parser.parse_args()

    train_ctx = read_dataset(args.train_ctx_path) if args.train_ctx_path else None
    train_res = read_dataset(args.train_res_path)
    valid_ctx = read_dataset(args.valid_ctx_path) if args.valid_ctx_path else None
    valid_res = read_dataset(args.valid_res_path)

    train(args, train_ctx, train_res, valid_ctx, valid_res)
    print("[!] done")
Example 12
    parser = argparse.ArgumentParser(
        description='Calculate the min and max MLM scores for normalization')
    parser.add_argument('--weight-path',
                        type=str,
                        default='./checkpoints',
                        help='Path to directory that stores the weight')
    parser.add_argument('--data-path',
                        type=str,
                        required=True,
                        help='Path to the directory of training set')
    parser.add_argument('--output-path',
                        type=str,
                        default='mlm_minmax_score.json',
                        help='Output path for the min max values')

    args = parser.parse_args()
    xdata = read_dataset(args.data_path)

    model = MLMScorer.load_from_checkpoint(
        checkpoint_path=args.weight_path).to(device)
    model.eval()
    print('[!] loading model complete')

    scores = calc_minmax(model, xdata)
    print('[!] normalizing complete')

    with open(args.output_path, 'w') as f:
        json.dump(scores, f, indent=4)
    print('[!] complete')
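
A hedged sketch of how the saved values might later be used to min-max normalize a raw MLM score. The 'min'/'max' keys and the default file name are assumptions about the output of calc_minmax, not something this snippet confirms.

import json

def normalize_mlm_score(raw_score, minmax_path='mlm_minmax_score.json'):
    # Assumed layout of the JSON file: {"min": <float>, "max": <float>} (hypothetical keys).
    with open(minmax_path) as f:
        bounds = json.load(f)
    lo, hi = bounds['min'], bounds['max']
    # Clip to [0, 1] so scores outside the observed range stay in bounds.
    return min(max((raw_score - lo) / (hi - lo), 0.0), 1.0)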
Example 13
import tensorflow as tf
import numpy as np
from sklearn.model_selection import train_test_split
import data_utils
import matplotlib.pyplot as plt

# read dataset
X, Y, spanish_word2idx, spanish_idx2word, spanish_vocab, english_word2idx, english_idx2word, english_vocab = data_utils.read_dataset('./data.pkl')

# Data padding
def data_padding(x, y, length=15):
    for i in range(len(x)):
        x[i] = x[i] + (length - len(x[i])) * [spanish_word2idx['<pad>']]
        y[i] = [english_word2idx['<go>']] + y[i] + (length - len(y[i])) * [english_word2idx['<pad>']]

data_padding(X, Y)

X_train,  X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.1)

del X
del Y

# Build the model

input_sequence_length = 15
output_sequence_length = 16

spanish_vocab_size = len(spanish_vocab) + 2 # + <pad>, <unk>
english_vocab_size = len(english_vocab) + 4 # + <pad>, <unk>, <go>, <eos>

# Placeholders
Example 14
# @Author  : mrobotor ([email protected])
# @Link    : http://darklunar.ml
# @Version : $Id$

import os
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
from conv_vae import ConvVAE
from data_utils import read_dataset
import time
from scipy.stats import norm

# data processing
# read data set
train_ds, valid_ds = read_dataset('./imgdata', test_size = 0.097)
print(train_ds.images().shape)
print((train_ds.images().nbytes + valid_ds.images().nbytes) / (1024.0 * 1024.0), 'MB')

latent_dim = 10
batch_size = 50

# let's create ConvVAE
cvae = ConvVAE(latent_dim, batch_size)

# let's train ConvVAE
num_epochs = 15
interval = 200

saver = tf.train.Saver(max_to_keep = 2)