Example 1
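These snippets are excerpts from larger modules: sys, Counter (from collections), and numpy as np are imported at module level, and parser, log, and the helpers utf8_file_open, sort_dict_by_label, read_vocabulary_id_file, and load_object_from_file are defined elsewhere in the source files.
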
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    counter = Counter()

    with utf8_file_open(args.infile, 'r') as infile:

        for line in infile:
            line = line.strip()

            if args.lowercase:
                line = line.lower()

            counter.update(line.split())

    with utf8_file_open(args.outfile, 'w') as outfile:
        for (key, count) in sort_dict_by_label(counter, True):
            outfile.write(u'%s\t%i\n' % (key, count))

    log.info('finished')
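
All of the examples lean on two small helpers whose definitions this page does not show. The sketch below is a reconstruction inferred purely from the call sites, assuming a codecs-based UTF-8 opener and a sort of dict items by value; the project's actual implementations may differ.

import codecs
from operator import itemgetter

def utf8_file_open(filename, mode='r'):
    # Read and write unicode strings, encoded/decoded as UTF-8 on disk.
    return codecs.open(filename, mode, encoding='utf-8')

def sort_dict_by_label(d, reverse=False):
    # Return the dict's (key, value) pairs sorted by value ("label"):
    # by count for Counters, by token id for vocabulary dicts.
    return sorted(d.items(), key=itemgetter(1), reverse=reverse)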
Example 2
def write_vocabulary_file(output_file, vocab):
    """Write the given vocabulary to the given file.

    The vocabulary items are stored in order of the vocab values, i.e., in the
    same order as they have been read by read_vocabulary_id_file.

    Parameters
    ----------
    output_file : str
        filename of the output
    vocab : dict(str, int)
        vocabulary that has been read by read_vocabulary_id_file
    """

    with utf8_file_open(output_file, 'w') as vocab_file:
        vocab_file.write(u'\n'.join(
            k[0] for k in sort_dict_by_label(vocab)))
        vocab_file.write(u'\n')
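
The docstring refers to read_vocabulary_id_file, which is not shown on this page. Below is a minimal sketch of a compatible reader, assuming it maps each token to its 0-based line number, so that write_vocabulary_file reproduces the original file order; the actual implementation may differ.

def read_vocabulary_id_file(vocab_file):
    # Assumed counterpart of write_vocabulary_file: one token per line,
    # mapped to its line number as id.
    vocab = {}
    with utf8_file_open(vocab_file, 'r') as infile:
        for idx, line in enumerate(infile):
            vocab[line.strip()] = idx
    return vocab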
Example 3
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    model = load_object_from_file(args.model_file)

    # read vocabulary from file
    vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary))

    # get matrices from model
    r_matrix = model.R.get_value()
    q_matrix = model.Q.get_value()

    # get input embeddings
    if args.model_type == 'vlbl':
        in_we = r_matrix
    elif args.model_type == 'vlbl_dist':
        # does not work with older model versions, where D was stored sparsely
        d_matrix = model.D.get_value().todense()
        in_we = np.dot(d_matrix, r_matrix)
        # convert from numpy.matrix back to numpy.ndarray
        in_we = in_we.view(type=np.ndarray)
    else:
        raise ValueError('unknown model type: %s' % args.model_type)

    with utf8_file_open(args.result_file + ".in", 'w') as outfile:
        for (word, ind) in vocab:
            outfile.write(
                unicode(word) + u' ' + u' '.join(map(str, in_we[ind])) + u'\n')

    with utf8_file_open(args.result_file + ".out", 'w') as outfile:
        for (word, ind) in vocab:
            outfile.write(
                unicode(word) + u' ' + u' '.join(map(str, q_matrix[ind])) +
                u'\n')

    log.info('finished')
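
Both result files use a plain text format: one word per line followed by its space-separated embedding values. A short sketch of reading such a file back into a dict (the file name is only an example, and np is numpy as in the snippets above):

def load_embeddings(filename):
    # Parse 'word v1 v2 ... vn' lines into {word: numpy vector}.
    embeddings = {}
    with utf8_file_open(filename, 'r') as infile:
        for line in infile:
            parts = line.strip().split()
            embeddings[parts[0]] = np.array([float(x) for x in parts[1:]])
    return embeddings

word_embeddings = load_embeddings('model.embeddings.in')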
Example 4
    def run(self):
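        """Predict on self.predict_file and write the requested outputs
        (ranks, argmax, softmax values, perplexity) to self.result_file.
        """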
        vocab = dict(self.vocab)

        # Get a mapping from index to word
        vocab_entries = sort_dict_by_label(vocab)
        vocab_entries = zip(*vocab_entries)[0]
        log_probabs = 0.
        num_ppl_examples = 0
        num_examples = 0

        with utf8_file_open(self.result_file, 'w') as outfile:

            for batch, _ in self.next_batch(self.predict_file):
                # Handle each prediction

                log_iterations(log, num_examples, 10000)
                num_examples += len(batch)

                if self.perplexity:
                    batch = zip(*batch)
                    # Pass only the context, not the target word
                    predictions = self.predictor_method(batch[0])
                else:
                    predictions = self.predictor_method(batch)

                if self.store_softmax or self.store_rank or self.store_argmax \
                        or self.information or self.perplexity:
                    sm, probabs, cur_log_probabs, cur_num_ppl_examples = \
                            self._calc_probabilities_from_similarity(batch[1], predictions[1])
                    num_ppl_examples += cur_num_ppl_examples

                if self.store_rank or self.information:
                    # rankdata sorts ascending, i.e., distances, but we have
                    # similarities, hence, 1-sm
                    ranks = rankdata(1 - sm, method='min').astype(int)

                    if self.store_rank:
                        outfile.write(ndarray_to_string(ranks))

                    if self.information:
                        hard_idx = vocab[u'hard']
                        # Only the rank of 'hard' and the ten highest-scoring
                        # vocabulary entries are reported; the middle column
                        # stays empty.
                        sorted_unique_ranks = ''
                        top_ten_entries = ' '.join(
                            vocab_entries[i] for i in np.argsort(1 - sm)[:10])
                        print '#%d\t%s\t%s' % (ranks[hard_idx],
                                               sorted_unique_ranks,
                                               top_ten_entries)

                if self.store_argmax:
                    maximum = np.argmax(sm)
                    outfile.write(vocab_entries[maximum])

                if self.store_softmax:

                    if self.normalize_with_root:
                        sm = np.sqrt(sm)
                        sm = sm / np.linalg.norm(sm, 2, axis=-1)

                    outfile.write(ndarray_to_string(sm))

                if self.perplexity:

                    if self.save_word:
                        indices_in_predict_vocab = [
                            self.vocab_mapping[batch[1][i]]
                            for i in range(len(batch[1]))
                        ]
                        indices_in_original_vocab = [
                            self.vocab_mapping_list[i]
                            for i in indices_in_predict_vocab
                        ]
                        words = [
                            self.vocab.keys()[self.vocab.values().index(i)]
                            for i in indices_in_original_vocab
                        ]

                        outfile.write(u'\n'.join(
                            "%s %s" % (x, y)
                            for x, y in zip(map(unicode, probabs), words)))
                    else:
                        outfile.write(u'\n'.join(map(unicode, probabs)))

                    # identity comparison with np.nan misses computed NaNs,
                    # hence np.isnan
                    log_probabs += cur_log_probabs if not np.isnan(cur_log_probabs) else 0.

                if self.predictions:
                    outfile.write(ndarray_to_string(predictions[0][0]))

                outfile.write(u'\n')

        if self.perplexity:
            ppl = np.exp(-1. / num_ppl_examples * log_probabs)
            log.info('Perplexity on %d examples is %f', num_ppl_examples, ppl)
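
The perplexity at the end is the exponentiated negative mean of the accumulated log probabilities, i.e. the geometric mean of the inverse probabilities: ppl = exp(-(1/N) * sum(log p_i)). A toy check of the formula with invented values:

probabs = [0.25, 0.5, 0.125]    # made-up example probabilities
log_probabs = sum(np.log(p) for p in probabs)
ppl = np.exp(-1. / len(probabs) * log_probabs)
# the product of probabs is 1/64, so ppl is 64 ** (1 / 3.), about 4.0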