Example #1
    def export_model(self, out_dir='default'):
        if out_dir == 'default':
            out_dir = self.output_dictionary

        utils.save_pkl(self.embedding,
                       out_dir + config['TRAIN']['embedding_pkl'])
        utils.save_pkl(self.softmax_w,
                       out_dir + config['TRAIN']['softmax_w_pkl'])
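
Every example on this page funnels its output through utils.save_pkl, whose implementation is not shown. A minimal sketch, assuming it is a thin wrapper around the standard pickle module (the local flag seen in later examples is treated here as a hypothetical switch for skipping any remote upload):

import os
import pickle

def save_pkl(obj, path, local=True):
    # Sketch only: the real utils.save_pkl may also push the file to
    # remote storage when local is False.
    parent = os.path.dirname(path)
    if parent and not os.path.exists(parent):
        os.makedirs(parent)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)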
Example #2
    def train(self, learning_rate=0.01, print_step=1000, stop_threshold=0):
        losses = []
        aver_losses = []
        wa_scores = []
        if print_step == 0:
            print_step = self.n_batches

        for epoch in range(self.epochs):
            iteration = 0
            start = time.time()

            with open(self.filename, 'r') as f:
                reader = csv.reader(f)
                for row in reader:
                    # Print step
                    iteration += 1
                    if iteration % print_step == 0:
                        end = time.time()
                        print("Epochs: {}".format(_),
                              "Iteration: {}".format(iteration),
                              "Avg. Training loss: {:.4f}".format(np.mean(losses)),
                              "{:.4f} sec/ {} sample".format((end - start), self.batch_size * print_step))
                        aver_losses.append(np.mean(losses))
                        losses = []
                        start = time.time()

                    # Print word analogy
                    if iteration % (print_step * 10) == 0:
                        eval = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
                        wa_score = self.word_analogy.evaluate(eval, high_level_category=False, restrict_top_words=False)
                        wa_scores.append(wa_score['all'])

                        self.export_model(self.output_dictionary + 'step-{}/'.format(int(iteration)))

                    loss = self._train_one_sample(int(row[0]), int(row[1]), learning_rate)
                    losses.append(loss)

            eval = Embedding(np.array(self.E), self.int_to_vocab, self.vocab_to_int)
            wa_score = self.word_analogy.evaluate(eval, high_level_category=False, restrict_top_words=False)
            wa_scores.append(wa_score['all'])
            print('Epochs: {}, WA score: {}'.format(epoch, wa_score['all']))

            # Save step
            if epoch % 5 == 0:
                self.export_model(self.output_dictionary + 'step-{}/'.format(epoch))

        # export losses
        utils.save_pkl(aver_losses, self.output_dictionary + config['TRAIN']['loss_file'])
        utils.save_pkl(wa_scores, self.output_dictionary + config['TRAIN']['acc_file'])
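
The training file iterated over above is read with csv.reader and indexed as int(row[0]), int(row[1]), so it is presumably a headerless two-column CSV of integer (target id, context id) pairs, matching the column_names=['input', 'output'] used in Example #10. A tiny hypothetical sketch of writing such a file:

import csv

# Hypothetical skip-gram pairs: (target word id, context word id).
pairs = [(12, 7), (12, 93), (45, 3)]
with open('train_pairs.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerows(pairs)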
Example #3
    def _sample_contexts(self, from_file=True):
        if not from_file:
            samples = utils.sample_context(self.context_distribution,
                                           self.n_context_sample)
            return samples

        # Sample contexts
        if self.scope + 1 > len(self.contexts):
            for i in range(self.scope + 1 - len(self.contexts)):
                samples = utils.sample_context(self.context_distribution,
                                               self.n_context_sample)
                self.contexts.append(samples)

            # Save result back to pkl
            print('Uploading sample context file, scope: ', self.scope)
            utils.save_pkl(self.contexts, self.sample_contexts_file_name)

        return self.contexts[self.scope]
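
utils.sample_context is not defined in this snippet. A plausible sketch, assuming it draws n_context_sample context ids from a normalized categorical distribution such as the context_distribution arrays built in the later examples:

import numpy as np

def sample_context(context_distribution, n_context_sample):
    # Sketch only: draw context indices with probability proportional to
    # the (assumed normalized) distribution; the real helper may sample
    # with or without replacement.
    return np.random.choice(len(context_distribution),
                            size=n_context_sample,
                            p=context_distribution)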
Example #4
    def save_dicts(self):
        # make directories
        dict_path = self.output_path + config['PREPROCESS']['output_dict_path']
        if not os.path.exists(dict_path):
            os.makedirs(dict_path)

        # Save dictionaries
        utils.save_pkl(self.vocab_to_int, dict_path + config['PREPROCESS']['vocab_to_int'])
        utils.save_pkl(self.int_to_vocab, dict_path + config['PREPROCESS']['int_to_vocab'])
        utils.save_pkl(self.cont_to_int, dict_path + config['PREPROCESS']['cont_to_int'])
        utils.save_pkl(self.int_to_cont, dict_path + config['PREPROCESS']['int_to_cont'])
Example #5
    def export_model(self):
        utils.save_pkl(
            self.embedding,
            self.output_dictionary + config['TRAIN']['embedding_pkl'])
        utils.save_pkl(
            self.softmax_w,
            self.output_dictionary + config['TRAIN']['softmax_w_pkl'])
        utils.save_pkl(
            self.softmax_b,
            self.output_dictionary + config['TRAIN']['softmax_b_pkl'])
Example #6
    for i in range(args.continue_from, args.scope):
        w = data[i][0]
        c = data[i][1]

        length = model.snml_length_sampling(w, c, epochs=args.epochs)
        snml_lengths.append(length)

        # print process
        if (i + 1) % print_step == 0:
            end = time.time()
            print('Run {} step in: {:.4f} sec, snml length: {}'.format(i + 1, (end - start), sum(snml_lengths)))
            start = time.time()

        # save steps
        if (i + 1) % 1000 == 0:
            step_path = args.model + '{}-step/'.format(i + 1)
            filename = step_path + 'scope-{}-snml_length.pkl'.format(args.scope)
            utils.save_pkl(snml_lengths, filename)

    print('{} scope snml length: {}'.format(args.scope, sum(snml_lengths)))

    # Save result to file
    filename = args.model + 'scope-{}-snml_length.txt'.format(args.scope)
    output = open(filename, 'w')
    for i in snml_lengths:
        output.write(str(i) + '\n')
    output.close()

    # upload to gcs
    utils.upload_to_gcs(filename, force_update=True)
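
utils.upload_to_gcs is likewise project-specific. Assuming it wraps the official google-cloud-storage client with a bucket configured elsewhere, a hedged sketch could look like this (bucket_name is a hypothetical placeholder):

from google.cloud import storage

def upload_to_gcs(filename, bucket_name='my-snml-bucket', force_update=False):
    # Sketch only: the real helper presumably reads the bucket name from
    # the project config rather than taking it as an argument.
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(filename)
    if force_update or not blob.exists():
        blob.upload_from_filename(filename)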
Example #7
    print_step = 50000
    start = time.time()
    for i in range(args.continue_from, args.scope):
        w = data[i][0]
        c = data[i][1]

        length = model.snml_length(w, c, epochs=args.epochs)
        snml_lengths.append(length)

        # print process
        if (i + 1) % print_step == 0:
            end = time.time()
            print('Run {} step in: {:.4f} sec, snml length: {}'.format(
                i + 1, (end - start), sum(snml_lengths)))
            start = time.time()

        # save steps
        # if (i + 1) % 500000 == 0:
        #     step_path = args.model + '{}-step/'.format(i + 1)
        #     filename = step_path + 'scope-{}-snml_length.pkl'.format(args.scope)
        #     utils.save_pkl(snml_lengths, filename)

    print('{} scope snml length: {}'.format(args.scope, sum(snml_lengths)))

    # Save result to file
    filename = args.model + 'scope-{}-snml_length.pkl'.format(args.scope)
    utils.save_pkl(snml_lengths, filename, local=True)

    # upload to gcs
    # utils.upload_to_gcs(filename, force_update=True)
Example #8
    epochs = 16
    dim = '200'

    # read snml train file
    data = np.genfromtxt('../../../data/text8/scope.csv',
                         delimiter=',').astype(int)

    loss_list = []
    n_sample = 2000
    model = Model('../../../output/text8/momentum/snml/1/' + dim + 'dim/',
                  '../../../data/text8/contexts/',
                  n_context_sample=3000,
                  learning_rate=0.0004)

    for i in range(n_sample):
        w, c = int(data[i][0]), int(data[i][1])

        ps_a = -np.log(
            model.train_one_sample(w, c, epochs=epochs, update_weight=True))
        loss_list.append(ps_a)

        if i % 100 == 0:
            print('{} th loop'.format(i))

    utils.save_pkl(loss_list,
                   '../../../output/text8/momentum/test/4.pkl',
                   local=True)
Example #9
    parser = argparse.ArgumentParser()
    parser.add_argument('--data_path',
                        default='../../data/processed data/split/',
                        type=str)
    args = parser.parse_args()

    print('Reading file...')
    contexts = []
    iteration = 0
    for file in os.listdir(args.data_path):
        iteration += 1
        if iteration % 1000 == 0:
            print('Importing ', file)
        data = np.genfromtxt(args.data_path + file, delimiter=',').astype(int)
        contexts.extend(data[:, 1])

    context_counts = Counter(contexts)
    n_context = len(context_counts)
    n_data = len(contexts)

    print('Making distribution...')
    context_distribution = np.zeros(n_context)
    for i in range(n_context):
        context_distribution[i] = context_counts[i] / n_data

    print('Saving file...')
    utils.save_pkl(context_distribution, 'context_distribution.pkl')

    print('Finished!')
    print('Saved: {} contexts / {} records'.format(n_context, n_data))
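
The saved context_distribution.pkl is consumed by the sampling helpers in the earlier examples, so a quick round-trip check is cheap insurance. A hypothetical snippet, assuming save_pkl wrote a plain pickle:

import pickle

# Hypothetical sanity check of the file written above.
with open('context_distribution.pkl', 'rb') as f:
    context_distribution = pickle.load(f)

print('{} contexts, probabilities sum to {:.6f}'.format(
    len(context_distribution), context_distribution.sum()))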
Example #10
    def train(self,
              n_sampled=200,
              epochs=1,
              batch_size=10000,
              print_step=1000):
        self.embedding_file = config['TRAIN']['embedding'].format(
            self.n_embedding, n_sampled, epochs, batch_size)

        # computation graph
        train_graph = tf.Graph()

        with train_graph.as_default():
            # training data
            dataset = tf.data.experimental.make_csv_dataset(
                self.data_path + config['TRAIN']['train_data'],
                batch_size=batch_size,
                column_names=['input', 'output'],
                header=False,
                num_epochs=epochs)
            datum = dataset.make_one_shot_iterator().get_next()
            inputs, labels = datum['input'], datum['output']

            # embedding layer
            embedding = tf.Variable(
                tf.random_uniform((self.n_vocab, self.n_embedding), -1, 1))
            embed = tf.nn.embedding_lookup(embedding, inputs)

            # softmax layer
            softmax_w = tf.Variable(
                tf.truncated_normal((self.n_context, self.n_embedding)))
            softmax_b = tf.Variable(tf.zeros(self.n_context))

            # Calculate the loss using negative sampling
            labels = tf.reshape(labels, [-1, 1])
            loss = tf.nn.sampled_softmax_loss(weights=softmax_w,
                                              biases=softmax_b,
                                              labels=labels,
                                              inputs=embed,
                                              num_sampled=n_sampled,
                                              num_classes=self.n_context)

            cost = tf.reduce_mean(loss)
            optimizer = tf.train.AdamOptimizer().minimize(cost)

        with tf.Session(graph=train_graph) as sess:
            iteration = 1
            loss = 0
            losses = []
            sess.run(tf.global_variables_initializer())

            try:
                start = time.time()
                while True:
                    train_loss, _ = sess.run([cost, optimizer])
                    loss += train_loss
                    losses.append(train_loss)

                    if iteration % print_step == 0:
                        end = time.time()
                        print(
                            "Iteration: {}".format(iteration),
                            "Avg. Training loss: {:.4f}".format(loss /
                                                                print_step),
                            "{:.4f} sec/ {} sample".format(
                                (end - start), batch_size * print_step))
                        loss = 0
                        start = time.time()

                    iteration += 1
            except tf.errors.OutOfRangeError:
                print("End of dataset")

            # export embedding matrix
            self.embedding = embedding.eval()
            self.softmax_w = softmax_w.eval()
            self.softmax_b = softmax_b.eval()

            # export losses
            utils.save_pkl(
                losses, self.output_dictionary + config['TRAIN']['loss_file'])
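
After training, the exported embedding pickle can be read back for a quick qualitative check. A hypothetical sketch, assuming the file holds a plain (n_vocab, n_embedding) NumPy array and that save_pkl wrote a standard pickle (the path below is a placeholder, not the configured embedding_pkl name):

import pickle
import numpy as np

with open('output/embedding.pkl', 'rb') as f:  # hypothetical path
    embedding = pickle.load(f)

def nearest(word_id, k=5):
    # Cosine similarity between one embedding row and every other row.
    v = embedding[word_id]
    sims = embedding @ v / (np.linalg.norm(embedding, axis=1)
                            * np.linalg.norm(v) + 1e-8)
    return np.argsort(-sims)[1:k + 1]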
Example #11
        loss_list.extend(m.get_loss_batch(w, c))

    return loss_list


if __name__ == "__main__":
    dims = [50, 100, 110, 120, 130, 140, 150, 160, 200, 300]

    # read snml train file
    data = np.genfromtxt('../../../../data/wiki/scope.csv',
                         delimiter=',').astype(int)
    n_sample = 3000000

    for dim in dims:
        print(dim)
        # full data
        model = Model(
            '../../../output/wiki/20200126/1/train2/{}dim/step-90/'.format(
                dim),
            '../../../data/wiki/contexts/',
            n_context_sample=3000,
            learning_rate=0.1)

        loss_list = get_loss_list_batch(model, data[299999:n_sample + 299999])
        save_pkl(
            loss_list,
            'C:\\Users/hungp/Downloads/information criteria on sg/wiki/20200126 snml/cv_{}_dim.pkl'
            .format(dim),
            local=True)
        # save_pkl(loss_list, 'cv_lines/cv_{}_dim.pkl'.format(dim), local=True)
Example #12
    with open(args.data_path + config['TRAIN']['train_data']) as fp:
        line = fp.readline()
        iteration = 0
        for line in fp:
            iteration += 1
            try:
                context = int(line.split(',')[1])
                contexts.append(context)
            except (ValueError, IndexError):
                print('Failed {}th line: {}'.format(iteration, line))
            finally:
                if iteration % 10000000 == 0:
                    print('Processed: {} lines'.format(iteration))

    context_counts = Counter(contexts)
    n_context = len(context_counts)
    n_data = len(contexts)

    print('Making distribution...')
    context_distribution = np.zeros(n_context)
    for i in range(n_context):
        context_distribution[i] = context_counts[i] / n_data

    print('Saving file...')
    utils.save_pkl(context_distribution, args.data_path + 'contexts/context_distribution.pkl')

    print('Finished!')
    print('Saved: {} contexts / {} records'.format(n_context, n_data))


Example #13
from multiprocessing import Pool
import multiprocessing
import utils.tools as utils

if __name__ == "__main__":
    words = [6581, 93, 4519, 506]
    contexts = [390, 1172, 1545, 22]

    model = ModelMomentum(
        '../../../output/text8/momentum/snml/48epochs/1/100dim/',
        '../../../data/text8/contexts/',
        n_context_sample=600)

    for i in range(len(words)):
        word = words[i]
        context = contexts[i]

        # Update all other context
        print('Start: ', word)

        # implement pools
        job_args = [(word, c, 48, 3000) for c in range(model.V_dash)]
        p = Pool(multiprocessing.cpu_count())
        probs = p.map(model._train_job, job_args)
        p.close()
        p.join()

        # save context's probs
        utils.save_pkl(
            probs, '../../../output/test/contexts_probs_{}.pkl'.format(word))
Example #14
    def train(self, print_step=1000, stop_threshold=0):
        iteration = 1
        loss = 0
        losses = []
        epoch_sum_loss = 0.
        last_epoch_loss = 999999.
        wa_scores = []
        if print_step == 0:
            print_step = self.n_batches

        try:
            start = time.time()
            while True:
                train_loss, _ = self.sess.run(
                    [self.full_cost, self.full_optimizer])
                # train_loss, _ = self.sess.run([self.cost, self.optimizer])
                loss += train_loss
                epoch_sum_loss += train_loss
                losses.append(train_loss)

                if iteration % print_step == 0:
                    end = time.time()
                    print(
                        "Iteration: {}".format(iteration),
                        "Avg. Training loss: {:.4f}".format(loss / print_step),
                        "{:.4f} sec/ {} sample".format(
                            (end - start), self.batch_size * print_step))
                    loss = 0
                    start = time.time()

                if iteration % self.n_batches == 0:
                    epochs = iteration / self.n_batches
                    epoch_loss = epoch_sum_loss / self.n_batches
                    epoch_sum_loss = 0
                    epoch_loss_diff = np.abs(epoch_loss - last_epoch_loss)
                    print('Epochs {} loss: {}'.format(epochs, epoch_loss))

                    # word analogy score
                    embedding = self.sess.run(self.embedding_g)
                    eval = Embedding(embedding, self.int_to_vocab,
                                     self.vocab_to_int)
                    wa_score = self.word_analogy.evaluate(
                        eval,
                        high_level_category=False,
                        restrict_top_words=False)
                    wa_scores.append(wa_score['all'])

                    # stop criteria
                    if epoch_loss_diff < stop_threshold:
                        self.epochs = iteration / self.n_batches
                        # output file
                        self.embedding_file = config['TRAIN'][
                            'embedding'].format(self.n_embedding,
                                                self.n_sampled,
                                                int(self.epochs),
                                                self.batch_size)
                        print('Loss diff: {}, stop training.'.format(
                            epoch_loss_diff))
                        print(self.output_dictionary + self.embedding_file)
                        break

                    # Save step
                    if epochs % 10 == 0:
                        self.embedding = self.sess.run(self.embedding_g)
                        self.softmax_w = self.sess.run(self.softmax_w_g)
                        self.export_model(self.output_dictionary +
                                          'step-{}/'.format(int(epochs)))

                    last_epoch_loss = epoch_loss

                iteration += 1
        except tf.errors.OutOfRangeError:
            print("End of dataset")

        # export embedding matrix
        self.embedding = self.sess.run(self.embedding_g)
        self.softmax_w = self.sess.run(self.softmax_w_g)

        # export losses
        utils.save_pkl(losses,
                       self.output_dictionary + config['TRAIN']['loss_file'])
        utils.save_pkl(wa_scores,
                       self.output_dictionary + config['TRAIN']['acc_file'])
Example #15
if __name__ == "__main__":
    # Data file
    raw_data_path = '../data/raw data/test.txt'
    context_to_dict_path = 'data/text8/dict/cont_to_int.dict'
    output_path = 'data/text8/contexts/distribution_from_raw.pkl'
    int_to_cont = load_pkl('data/text8/dict/int_to_cont.dict', local=True)

    # Load data
    with open(raw_data_path, encoding='utf-8') as f:
        words = f.read().split()

    # Load dict
    context_to_dict = load_pkl(context_to_dict_path, local=True)

    # Convert vocab to int
    context = []
    for word in words:
        if word in context_to_dict:
            context.append(context_to_dict[word])

    context_counts = Counter(context)
    n_context = len(context_to_dict)
    n_data = sum(list(context_counts.values()))

    context_distribution = np.zeros(n_context)
    for c, count in context_counts.items():
        context_distribution[c] = count / n_data

    context_distribution = np.array(context_distribution)
    save_pkl(context_distribution, output_path)
if __name__ == "__main__":
    context_path = '../notebooks/output/50-context-500000-data-18-questions/contexts/'
    n_context_sample = 50
    scope = 5000
    file_name = os.path.join(context_path,
                             'sample_contexts_{}.pkl'.format(n_context_sample))

    context_distribution = utils.load_pkl(context_path +
                                          'context_distribution.pkl')
    if os.path.exists(file_name):
        print('Load file')
        contexts = utils.load_pkl(file_name)
    else:
        contexts = []

    print('Current contexts: ', len(contexts))

    # Sample contexts
    if scope + 1 > len(contexts):
        for i in range(scope - len(contexts)):
            samples = utils.sample_context_uniform(len(context_distribution),
                                                   n_context_sample)
            contexts.append(samples)

    # Save result back to pkl
    utils.save_pkl(contexts, file_name)

    print(len(contexts))
    print(len(contexts[0]))
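
The last two snippets also rely on a load_pkl counterpart. Under the same assumption that save_pkl is a plain pickle wrapper, a minimal sketch:

import pickle

def load_pkl(path, local=True):
    # Sketch only: the real utils.load_pkl may first fetch the file from
    # remote storage when local is False.
    with open(path, 'rb') as f:
        return pickle.load(f)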