Beispiel #1
0
def main(argv=None):
    """Preprocess the input corpus line by line and write it to the outfile.

    Depending on the parsed options, each input line has its digits
    substituted via ``re.sub``, is passed through ``nltk.clean_html``, is
    split with the loaded sentence splitter and is tokenized with
    ``tokenize``. The resulting parts are written one per line.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('preprocessing data')

    if args.amazon is True:
        reader = AmazonProductReviewCorpusReader(args.infile)
        line_iterator = reader.review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    # The splitter is only loaded (and only used below) when one was given.
    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for index, text in enumerate(line_iterator):
            log_iterations(log, index, 100000)

            if args.replace_digits:
                text = re.sub(r'\d', args.replace_digits, text,
                              count=0, flags=REGEX_FLAGS)

            if args.strip_html:
                text = nltk.clean_html(text)

            # From here on we always work on a list of parts.
            if args.sentence_splitter:
                sentences = sent_splitter.tokenize(text)
            else:
                sentences = [text]

            if args.tokenize:
                token_lists = [tokenize(sentence) for sentence in sentences]
                sentences = [' '.join(tokens) for tokens in token_lists]

            outfile.write('\n'.join(sentences))
            outfile.write('\n')

    log.info('finished')
Beispiel #2
0
def main(argv=None):
    """Run the preprocessing pipeline over the input and write the outfile.

    Each line produced by the selected reader optionally gets its digits
    replaced, is cleaned with ``nltk.clean_html``, is split by the loaded
    sentence splitter and is tokenized, as selected by the parsed options.
    Every resulting part ends up on its own output line.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('preprocessing data')

    if args.amazon is True:
        corpus = AmazonProductReviewCorpusReader(args.infile)
        line_iterator = corpus.review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    if args.sentence_splitter:
        # Only bound when requested; the loop checks the same option.
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for count, content in enumerate(line_iterator):
            log_iterations(log, count, 100000)

            if args.replace_digits:
                content = re.sub(r'\d', args.replace_digits, content,
                                 count=0, flags=REGEX_FLAGS)

            if args.strip_html:
                content = nltk.clean_html(content)

            if args.sentence_splitter:
                parts = sent_splitter.tokenize(content)
            else:
                parts = [content]

            if args.tokenize:
                tokenized_parts = [tokenize(part) for part in parts]
                output = u'\n'.join(
                    [u' '.join(tokens) for tokens in tokenized_parts])
            else:
                output = u'\n'.join(parts)

            outfile.write(output)
            outfile.write(u'\n')

    log.info('finished')
Beispiel #3
0
def main(argv=None):
    """Write the statistics of the input matrix to the output file.

    A header line is written first, followed by one formatted line for
    every tuple yielded by ``calc_matrix_statistics``. See argument parser
    description.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    arguments = sys.argv[1:] if argv is None else argv

    args = parser.parse_args(arguments)
    log.info('start parameters: ' + str(args))

    with utf8_file_open(args.outfile, 'w') as outfile:
        # Column names for the four values written per row below.
        outfile.write(u'mean max min std_dev\n')

        statistics = calc_matrix_statistics(args.infile)

        for row_number, row_stats in enumerate(statistics):
            log_iterations(log, row_number, 10000)
            outfile.write(u'%f %f %f %f\n' % row_stats)

    log.info('finished')
def main(argv=None):
    """Dump matrix statistics as whitespace separated text.

    The output starts with a header line; afterwards each tuple produced
    by ``calc_matrix_statistics`` for the input file is formatted into one
    line. See argument parser description.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    with utf8_file_open(args.outfile, 'w') as outfile:
        write = outfile.write
        write(u'mean max min std_dev\n')

        for counter, values in enumerate(calc_matrix_statistics(args.infile)):
            log_iterations(log, counter, 10000)
            # `values` must supply exactly the four numbers of the format.
            write(u'%f %f %f %f\n' % values)

    log.info('finished')
Beispiel #5
0
def main(argv=None):
    """Combine every line of file1 with every line of file2.

    For each pair one output line ``line1 + separator + line2`` is
    written. See argument parser description.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    # file2 is walked once per line of file1, so it is kept as a list.
    second_lines = list(file_line_generator(args.file2))

    log.info('combining files')

    with utf8_file_open(args.out_file, 'w') as outfile:
        first_lines = file_line_generator(args.file1)

        for index, first in enumerate(first_lines):
            log_iterations(log, index, 1000)

            for second in second_lines:
                outfile.write(first + args.separator + second + '\n')

    log.info('finished')
Beispiel #6
0
def main(argv=None):
    """Write all pairings of the lines of file1 and file2 to the out file.

    The lines of file2 are loaded up front; each line of file1 is then
    joined with each of them using the configured separator. See argument
    parser description.

    :param argv: argument list to parse; ``sys.argv[1:]`` is used when None
    """
    arguments = sys.argv[1:] if argv is None else argv

    args = parser.parse_args(arguments)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    file2_lines = list(file_line_generator(args.file2))

    log.info('combining files')

    with utf8_file_open(args.out_file, 'w') as outfile:

        for counter, left in enumerate(file_line_generator(args.file1)):
            log_iterations(log, counter, 1000)

            # Inner loop reuses the in-memory copy of file2 for every left line.
            for right in file2_lines:
                outfile.write(left + args.separator + right + u'\n')

    log.info('finished')
Beispiel #7
0
    def run(self):
        """Predict for every batch of ``self.predict_file`` and store results.

        For each batch yielded by ``self.next_batch`` the predictor method
        is called. Depending on the ``store_rank``, ``information``,
        ``store_argmax``, ``store_softmax``, ``perplexity`` and
        ``predictions`` flags, the ranks, the argmax vocabulary entry, the
        probabilities and/or the raw predictions are written to
        ``self.result_file``; a newline is written after every batch. With
        ``self.perplexity`` set, the perplexity over all counted examples
        is logged once all batches are processed.
        """
        vocab = dict(self.vocab)

        # Get a mapping from index to word
        vocab_entries = sort_dict_by_label(vocab)
        vocab_entries = zip(*vocab_entries)[0]
        log_probabs = 0.
        num_ppl_examples = 0
        num_examples = 0

        with utf8_file_open(self.result_file, 'w') as outfile:

            for batch, _ in self.next_batch(self.predict_file):
                log_iterations(log, num_examples, 10000)
                num_examples += len(batch)

                if self.perplexity:
                    batch = zip(*batch)
                    # Pass only the context, not the target word
                    predictions = self.predictor_method(batch[0])
                else:
                    # Keep the result: every later use of `predictions`
                    # would otherwise fail with a NameError in this mode.
                    predictions = self.predictor_method(batch)

                if self.store_softmax or self.store_rank or self.store_argmax \
                        or self.information or self.perplexity:
                    # NOTE(review): batch[1] is only the target column when
                    # the batch was transposed above (perplexity mode) --
                    # confirm what is expected here in the other modes.
                    sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                            self._calc_probabilities_from_similarity(batch[1], predictions[1])
                    num_ppl_examples += cur_num_ppl_examples

                if self.store_rank or self.information:
                    # rankdata sorts ascending, i.e., distances, but we have
                    # similarities, hence, 1-sm
                    ranks = rankdata(1 - sm, method='min').astype(int)

                    if self.store_rank:
                        outfile.write(ndarray_to_string(ranks))

                    if self.information:
                        hard_idx = vocab[u'hard']
                        # This column is emitted empty; the computation of
                        # the sorted unique ranks was dead code and is gone.
                        sorted_unique_ranks = ''
                        top_ten_entries = ' '.join([
                            vocab_entries[i] for i in np.argsort(1 - sm)[:10]
                        ])
                        # Single parenthesised expression: prints the same
                        # text with the print statement and the function.
                        print('#%d\t%s\t%s' % (ranks[hard_idx],
                                               sorted_unique_ranks,
                                               top_ten_entries))

                if self.store_argmax:
                    maximum = np.argmax(sm)
                    outfile.write(vocab_entries[maximum])

                if self.store_softmax:

                    if self.normalize_with_root:
                        sm = np.sqrt(sm)
                        sm = sm / np.linalg.norm(sm, 2, axis=-1)

                    outfile.write(ndarray_to_string(sm))

                if self.perplexity:

                    if self.save_word:
                        indices_in_predict_vocab = [
                            self.vocab_mapping[target] for target in batch[1]
                        ]
                        indices_in_original_vocab = [
                            self.vocab_mapping_list[i]
                            for i in indices_in_predict_vocab
                        ]
                        # Reverse lookup: find the word stored under index i.
                        words = [
                            self.vocab.keys()[self.vocab.values().index(i)]
                            for i in indices_in_original_vocab
                        ]

                        outfile.write(u'\n'.join(
                            "%s %s" % (x, y)
                            for x, y in zip(map(unicode, probabs), words)))
                    else:
                        outfile.write(u'\n'.join(map(unicode, probabs)))

                    # Skip NaN batch scores so they do not poison the sum.
                    # An identity test against np.nan misses NaNs that
                    # result from arithmetic, hence np.isnan.
                    if not np.isnan(cur_log_probabs):
                        log_probabs += cur_log_probabs

                if self.predictions:
                    outfile.write(ndarray_to_string(predictions[0][0]))

                outfile.write(u'\n')

        if self.perplexity:
            ppl = np.exp(-1. / (num_ppl_examples) * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
Beispiel #8
0
    def run(self):
        """Train the model batch by batch until ``early_exit`` asks to stop.

        Each iteration fetches the next batch, calls the model's trainer
        and accumulates cost and example counts, then updates the learning
        rate. When an epoch is finished, a fresh batch generator is created
        for ``self.train_file``. The model is dumped whenever
        ``dump_ready`` says so. Whenever ``validation_ready`` is true, the
        training statistics collected since the previous validation are
        logged and reset and ``validate`` is called. ``exit_train`` is
        called after the loop ends.
        """
        self.before_run_begins()

        self.start_time = time()
        example_count_since_validation = 0
        costs_since_validation = 0.0

        theano_processing = 0.0
        # Start of the current measuring window; reset after each validation.
        window_start = time()

        batch_generator = self._do_skip_examples()

        while True:
            log_iterations(log, self.train_total_batches, 10000)

            batch, epoch_finished = batch_generator.next()
            arguments = self.prepare_arguments(batch)

            call_start = time()
            output = self.model.trainer(*arguments)
            theano_processing += time() - call_start

            batch_cost = float(output[0])
            batch_size = len(batch)

            self.model.total_examples += batch_size
            self.model.total_costs += batch_cost
            self.train_total_batches += 1
            costs_since_validation += batch_cost
            example_count_since_validation += batch_size

            self.model.update_learning_rate(self.remaining())

            if epoch_finished:
                batch_generator = self.next_batch(self.train_file)
                self.epoch_finished()

            if self.dump_ready(epoch_finished):
                self.dump(self.model)

            early_stopping = False

            if self.validation_ready():
                # report training error
                elapsed = float(time() - window_start)
                avg_cost = (costs_since_validation /
                            float(example_count_since_validation))
                log.info('Average loss on %d example of the training set is %f',
                         example_count_since_validation, avg_cost)
                log.info('Speed of training is %f example/s',
                         example_count_since_validation / elapsed)
                log.info('Percentage of time spent by theano processing is %f',
                         theano_processing / elapsed)
                log.info('Processed %d so far.', self.model.total_examples)

                example_count_since_validation = 0
                costs_since_validation = 0.0

                early_stopping = self.validate()

                # Open a new measuring window.
                window_start = time()
                theano_processing = 0.0

            if self.early_exit(early_stopping):
                break

        self.exit_train()
Beispiel #9
0
    def run(self):
        """Run the predictor on ``self.predict_file`` and write the results.

        The predictor method is called for every batch yielded by
        ``self.next_batch``. Which values are written to
        ``self.result_file`` (ranks, argmax vocabulary entry,
        probabilities, raw predictions) is selected by the ``store_rank``,
        ``information``, ``store_argmax``, ``store_softmax``,
        ``perplexity`` and ``predictions`` flags; each batch is terminated
        by a newline. If ``self.perplexity`` is set, the perplexity over
        all counted examples is logged after the last batch.
        """
        vocab = dict(self.vocab)

        # Get a mapping from index to word
        vocab_entries = sort_dict_by_label(vocab)
        vocab_entries = zip(*vocab_entries)[0]
        log_probabs = 0.
        num_ppl_examples = 0
        num_examples = 0

        with utf8_file_open(self.result_file, 'w') as outfile:

            for batch, _ in self.next_batch(self.predict_file):
                log_iterations(log, num_examples, 10000)
                num_examples += len(batch)

                if self.perplexity:
                    batch = zip(*batch)
                    # Pass only the context, not the target word
                    predictions = self.predictor_method(batch[0])
                else:
                    # The result has to be bound; dropping it made every
                    # later access to `predictions` raise a NameError.
                    predictions = self.predictor_method(batch)

                needs_probabilities = (
                    self.store_softmax or self.store_rank or
                    self.store_argmax or self.information or self.perplexity)

                if needs_probabilities:
                    # NOTE(review): batch is only transposed in perplexity
                    # mode, so the meaning of batch[1] differs in the other
                    # modes -- confirm against the callee.
                    sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                            self._calc_probabilities_from_similarity(batch[1], predictions[1])
                    num_ppl_examples += cur_num_ppl_examples

                if self.store_rank or self.information:
                    # rankdata sorts ascending, i.e., distances, but we have
                    # similarities, hence, 1-sm
                    ranks = rankdata(1 - sm, method='min').astype(int)

                    if self.store_rank:
                        outfile.write(ndarray_to_string(ranks))

                    if self.information:
                        hard_idx = vocab[u'hard']
                        # The unique ranks column is printed empty; the
                        # unused computation of its content was removed.
                        sorted_unique_ranks = ''
                        top_ten_entries = ' '.join([vocab_entries[i] for i in np.argsort(1 - sm)[:10]])
                        # One parenthesised expression, so the output is the
                        # same for the print statement and print function.
                        print('#%d\t%s\t%s' % (ranks[hard_idx],
                                sorted_unique_ranks,
                                top_ten_entries))

                if self.store_argmax:
                    maximum = np.argmax(sm)
                    outfile.write(vocab_entries[maximum])

                if self.store_softmax:

                    if self.normalize_with_root:
                        sm = np.sqrt(sm)
                        sm = sm / np.linalg.norm(sm, 2, axis=-1)

                    outfile.write(ndarray_to_string(sm))

                if self.perplexity:

                    if self.save_word:
                        indices_in_predict_vocab = [self.vocab_mapping[target] for target in batch[1]]
                        indices_in_original_vocab = [self.vocab_mapping_list[i] for i in indices_in_predict_vocab]
                        # Reverse lookup of the word stored under each index.
                        words = [self.vocab.keys()[self.vocab.values().index(i)] for i in indices_in_original_vocab]

                        outfile.write(u'\n'.join("%s %s" % (x, y) for x, y in zip(map(unicode, probabs), words)))
                    else:
                        outfile.write(u'\n'.join(map(unicode, probabs)))

                    # NaN scores are left out of the running sum. np.isnan
                    # also catches NaNs that are not the np.nan object
                    # itself, which an `is` comparison does not.
                    if not np.isnan(cur_log_probabs):
                        log_probabs += cur_log_probabs

                if self.predictions:
                    outfile.write(ndarray_to_string(predictions[0][0]))

                outfile.write(u'\n')

        if self.perplexity:
            ppl = np.exp(-1. / (num_ppl_examples) * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
Beispiel #10
0
    def run(self):
        """Execute the training loop.

        Batches are pulled from the batch generator and handed to the
        model's trainer; costs and example counts are accumulated on the
        model and for the current reporting interval. After each batch the
        learning rate is updated, the batch generator is renewed at the end
        of an epoch, and the model is dumped when ``dump_ready`` says so.
        When ``validation_ready`` is true, the interval statistics are
        logged and reset and the model is validated. The loop ends as soon
        as ``early_exit`` returns a true value; ``exit_train`` is then called.
        """
        self.before_run_begins()

        self.start_time = time()
        interval_examples, interval_costs = 0, 0.0

        trainer_seconds = 0.0
        interval_begin = time()

        batch_generator = self._do_skip_examples()

        while True:
            log_iterations(log, self.train_total_batches, 10000)

            (batch, epoch_finished) = batch_generator.next()
            arguments = self.prepare_arguments(batch)

            # Time only the trainer call itself.
            before_call = time()
            output = self.model.trainer(*arguments)
            trainer_seconds += time() - before_call

            cost_value = float(output[0])
            example_count = len(batch)

            self.model.total_examples += example_count
            self.model.total_costs += cost_value
            self.train_total_batches += 1
            interval_costs += cost_value
            interval_examples += example_count

            self.model.update_learning_rate(self.remaining())

            if epoch_finished:
                batch_generator = self.next_batch(self.train_file)
                self.epoch_finished()

            if self.dump_ready(epoch_finished):
                self.dump(self.model)

            early_stopping = False

            if self.validation_ready():
                # report training error
                interval_seconds = float(time() - interval_begin)
                avg_cost = interval_costs / float(interval_examples)
                log.info(
                    'Average loss on %d example of the training set is %f',
                    interval_examples, avg_cost)
                log.info('Speed of training is %f example/s',
                         interval_examples / interval_seconds)
                log.info('Percentage of time spent by theano processing is %f',
                         trainer_seconds / interval_seconds)
                log.info('Processed %d so far.', self.model.total_examples)

                interval_examples, interval_costs = 0, 0.0

                early_stopping = self.validate()

                # Restart the interval measurements.
                interval_begin = time()
                trainer_seconds = 0.0

            if self.early_exit(early_stopping):
                break

        self.exit_train()