def syntactic_training_data(rng=data_rng, num_to_run=None, output='syntactic', print_freq=print_freq):
    training_block = ngram_reader.training_block(rng.random_sample())
    block_size = training_block.shape[0]
    for count in xrange(num_to_run or block_size):
        if count % print_freq == 0:
            sys.stdout.write('\r%s: ngram %d of %d' % (output, count, num_to_run or block_size))
            sys.stdout.flush()
        train_index = sample_cumulative_discrete_distribution(training_block[:, -1], rng=data_rng)
        correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(training_block[train_index], rng=data_rng)
        yield list(correct_symbols) + list(error_symbols)
Example 2
def syntactic_training_data(rng=data_rng,
                            num_to_run=None,
                            output='syntactic',
                            print_freq=print_freq):
    training_block = ngram_reader.training_block(rng.random_sample())
    block_size = training_block.shape[0]
    for count in xrange(num_to_run or block_size):
        if count % print_freq == 0:
            sys.stdout.write('\r%s: ngram %d of %d' %
                             (output, count, num_to_run or block_size))
            sys.stdout.flush()
        train_index = sample_cumulative_discrete_distribution(
            training_block[:, -1], rng=data_rng)
        correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(
            training_block[train_index], rng=data_rng)
        yield list(correct_symbols) + list(error_symbols)
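Both versions of this generator lean on a sample_cumulative_discrete_distribution helper whose implementation is not shown. A minimal sketch of what such a helper could look like, assuming the array it receives already holds running (unnormalized) cumulative counts, as the use of training_block[:, -1] and cumulative_word_frequencies suggests; the real helper may differ:

import numpy as np

def sample_cumulative_discrete_distribution(cumulative_counts, rng=np.random):
    # invert the CDF: draw uniformly in [0, total) and binary-search for the
    # first bin whose cumulative count exceeds the draw
    total = cumulative_counts[-1]
    u = rng.random_sample() * total
    return int(np.searchsorted(cumulative_counts, u, side='right'))

For example, sample_cumulative_discrete_distribution(np.cumsum([3, 1, 6])) returns index 2 roughly 60% of the time.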
Example 3
    def add_noise_to_symbols(self, symbols, column_index=None, rng=None, max_tries=5):
        seq_length = symbols.shape[0]

        if column_index is None:
            # default to corrupting the middle position of the sequence
            column_index = seq_length / 2

        tries = 0
        replacement_word = symbols[column_index]
        while tries < max_tries:
            tries += 1
            # resample by corpus frequency until we draw a different word with a non-zero index
            replacement_word = sample_cumulative_discrete_distribution(self.cumulative_word_frequencies)
            if replacement_word != 0 and replacement_word != symbols[column_index]:
                break
        assert replacement_word < self.vocab_size

        noisy = symbols.copy()
        noisy[column_index] = replacement_word
        return noisy
Example 4
    def add_noise_to_symbols(self,
                             symbols,
                             column_index=None,
                             rng=None,
                             max_tries=5):
        seq_length = symbols.shape[0]

        if column_index is None:
            column_index = seq_length / 2

        tries = 0
        replacement_word = symbols[column_index]
        while tries < max_tries:
            tries += 1
            replacement_word = sample_cumulative_discrete_distribution(
                self.cumulative_word_frequencies)
            if replacement_word != 0 and replacement_word != symbols[
                    column_index]:
                break
        assert replacement_word < self.vocab_size

        noisy = symbols.copy()
        noisy[column_index] = replacement_word
        return noisy
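The two add_noise_to_symbols listings depend on instance state (cumulative_word_frequencies, vocab_size), so they cannot be run in isolation. Below is a self-contained sketch of the same corruption idea; corrupt_middle_symbol is a hypothetical stand-in that samples the replacement uniformly rather than by corpus frequency, which the original does.

import numpy as np

def corrupt_middle_symbol(symbols, vocab_size, rng, max_tries=5):
    # replace the middle word of an n-gram with a different word; index 0 is
    # skipped, mirroring the != 0 check in add_noise_to_symbols above
    column_index = symbols.shape[0] // 2
    replacement = symbols[column_index]
    for _ in range(max_tries):
        candidate = rng.randint(1, vocab_size)
        if candidate != symbols[column_index]:
            replacement = candidate
            break
    noisy = symbols.copy()
    noisy[column_index] = replacement
    return noisy

rng = np.random.RandomState(0)
print(corrupt_middle_symbol(np.array([17, 4, 952, 8, 61]), vocab_size=5000, rng=rng))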
Example 5
        last_time = time.clock()
        model.increase_k()
        stats_for_k = {}

        if not args['dont_run_syntactic']:
            # syntactic update step
            augmented_costs = []
            costs = []
            for block_num in xrange(args['syntactic_blocks_to_run']):
                training_block = ngram_reader.training_block(data_rng.random_sample())
                block_size = training_block.shape[0]
                for count in xrange(block_size):
                    if count % print_freq == 0:
                        sys.stdout.write('\rk %i b%i: ngram %d of %d' % (model.k, block_num, count, block_size))
                        sys.stdout.flush()
                    train_index = sample_cumulative_discrete_distribution(training_block[:,-1], rng=data_rng)
                    correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(training_block[train_index], rng=data_rng)
                    augmented_cost, cost = model.update_w(*(list(correct_symbols) + list(error_symbols)))
                    if not np.isfinite(cost):
                        print 'single nan detected'
                        save_model('nan_dump.pkl.gz')
                        import IPython
                        IPython.embed()
                    augmented_costs.append(augmented_cost)
                    costs.append(cost)
                if args['syntactic_blocks_to_run'] > 1:
                    print
                    print '%i intermediate mean %f' % (block_num, np.mean(costs[-block_size:]))

            print
            if not np.isfinite(np.mean(costs)):
Example 6
            print 'syntactic mean score \t%f' % syn_validation_mean
            print 'syntactic mean weighted score \t%f' % syn_validation_weighted_mean

        # print 'time since block init: %f' % (time.clock() - last_time)

        # semantic update step
        if not args['dont_run_semantic']:
            this_count = 0
            augmented_costs = []
            costs = []
            for block_num in xrange(args['semantic_blocks_to_run']):
                for i in xrange(args['semantic_block_size']):
                    train_i = -1
                    while train_i not in indices_in_intersection:
                        train_i = sample_cumulative_discrete_distribution(
                            ngram_reader.cumulative_word_frequencies,
                            rng=data_rng)
                    for j in xrange(args['k_nearest']):
                        train_j = -1
                        while train_j not in indices_in_intersection:
                            train_j = sample_cumulative_discrete_distribution(
                                ngram_reader.cumulative_word_frequencies,
                                rng=data_rng)
                        if word_similarity.word_pairwise_sims[
                                train_i, train_j] == -np.inf:
                            continue
                        sim = word_similarity.word_pairwise_sims[train_i,
                                                                 train_j]
                        augmented_cost, cost = model.update_v(
                            train_i, train_j, sim)
                        augmented_costs.append(augmented_cost)
Example 7
            print 'validation:'
            print 'syntactic mean score \t%f' % syn_validation_mean
            print 'syntactic mean weighted score \t%f' % syn_validation_weighted_mean

        # print 'time since block init: %f' % (time.clock() - last_time)

        # semantic update step
        if not args['dont_run_semantic']:
            this_count = 0
            augmented_costs = []
            costs = []
            for block_num in xrange(args['semantic_blocks_to_run']):
                for i in xrange(args['semantic_block_size']):
                    train_i = -1
                    while train_i not in indices_in_intersection:
                        train_i = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                    for j in xrange(args['k_nearest']):
                        train_j = -1
                        while train_j not in indices_in_intersection:
                            train_j = sample_cumulative_discrete_distribution(ngram_reader.cumulative_word_frequencies, rng=data_rng)
                        if word_similarity.word_pairwise_sims[train_i, train_j] == -np.inf:
                            continue
                        sim = word_similarity.word_pairwise_sims[train_i, train_j]
                        augmented_cost, cost = model.update_v(train_i, train_j, sim)
                        augmented_costs.append(augmented_cost)
                        costs.append(cost)

                    if i % print_freq == 0:
                        sys.stdout.write('\r k %i: pair : %d / %d' % (model.k, i, args['semantic_block_size']))
                        sys.stdout.flush()
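The semantic step assumes a precomputed word_similarity.word_pairwise_sims matrix in which -inf marks pairs with no similarity available; how that matrix is built is not shown in these excerpts. One hypothetical way to assemble such a matrix from a sparse set of known pair scores (build_pairwise_sims and known_pairs are stand-ins, not part of the original code):

import numpy as np

def build_pairwise_sims(vocab_size, known_pairs):
    # known_pairs maps (i, j) index pairs to a similarity score; every other
    # entry stays at -inf so the training loop can skip it
    sims = np.full((vocab_size, vocab_size), -np.inf)
    for (i, j), score in known_pairs.items():
        sims[i, j] = score
        sims[j, i] = score  # similarities are treated as symmetric
    return sims

sims = build_pairwise_sims(5, {(0, 3): 0.8, (1, 2): 0.25})
assert sims[3, 0] == 0.8 and sims[0, 1] == -np.inf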
Example 8
        if not args['dont_run_syntactic']:
            # syntactic update step
            augmented_costs = []
            costs = []
            for block_num in xrange(args['syntactic_blocks_to_run']):
                training_block = ngram_reader.training_block(
                    data_rng.random_sample())
                block_size = training_block.shape[0]
                for count in xrange(block_size):
                    if count % print_freq == 0:
                        sys.stdout.write(
                            '\rk %i b%i: ngram %d of %d' %
                            (model.k, block_num, count, block_size))
                        sys.stdout.flush()
                    train_index = sample_cumulative_discrete_distribution(
                        training_block[:, -1], rng=data_rng)
                    correct_symbols, error_symbols, ngram_frequency = ngram_reader.contrastive_symbols_from_row(
                        training_block[train_index], rng=data_rng)
                    augmented_cost, cost = model.update_w(
                        *(list(correct_symbols) + list(error_symbols)))
                    if not np.isfinite(cost):
                        print 'single nan detected'
                        save_model('nan_dump.pkl.gz')
                        import IPython
                        IPython.embed()
                    augmented_costs.append(augmented_cost)
                    costs.append(cost)
                if args['syntactic_blocks_to_run'] > 1:
                    print
                    print '%i intermediate mean %f' % (
                        block_num, np.mean(costs[-block_size:]))
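Both training-loop listings checkpoint to nan_dump.pkl.gz and drop into IPython when a non-finite cost appears, but the save_model call itself is not shown. A plausible sketch of that guard as a reusable helper; the gzip-pickle serialization is only an assumption based on the .pkl.gz filename, and dump_model / check_finite are hypothetical names, not the original functions.

import gzip
import numpy as np
try:
    import cPickle as pickle  # Python 2, matching the surrounding code
except ImportError:
    import pickle

def dump_model(model, filename='nan_dump.pkl.gz'):
    with gzip.open(filename, 'wb') as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

def check_finite(model, cost):
    # on the first NaN/inf cost, checkpoint the model and open an
    # interactive shell so its parameters can be inspected in place
    if not np.isfinite(cost):
        dump_model(model)
        import IPython
        IPython.embed()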