Example 1
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')

    if args.vocabulary is None:
        # No vocabulary file given; let CountVectorizer build one from the
        # data.
        vocab = None
    else:
        vocab = read_vocabulary_id_file(args.vocabulary)

    text = list(file_line_generator(args.infile))

    ngram_range = tuple(map(int, args.ngram.split(',')))
    vectorizer = CountVectorizer(token_pattern='[^ ]+',
                                 min_df=0.0,
                                 vocabulary=vocab,
                                 ngram_range=ngram_range,
                                 dtype=int)

    log.info('creating features')
    bow = vectorizer.fit_transform(text)

    log.info('storing result')
    np.savetxt(args.out_feature_file, bow.todense(), fmt='%d')

    with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file:
        vocab_file.write(u'\n'.join(vectorizer.get_feature_names()))

    log.info('finished')
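
All the examples on this page revolve around read_vocabulary_id_file. Its implementation is not shown here; judging from its uses, it reads one token per line and returns a dict mapping each token to an integer id. The following is only a minimal sketch of that assumed behavior, not the project's actual code:

import io

def read_vocabulary_id_file(filename):
    # Hypothetical sketch: map each token (one per line) to its zero-based
    # line index. The real helper may handle special tokens or accept a
    # second flag, as the two-argument calls in later examples suggest.
    vocab = {}
    with io.open(filename, encoding='utf-8') as infile:
        for idx, line in enumerate(infile):
            vocab[line.rstrip(u'\n')] = idx
    return vocab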
Example 2
 def configure(self, args):
     super(EmbeddingsMiniBatchTrainer, self).configure(args)
     self.vocab = read_vocabulary_id_file(args.vocabulary)
     self.vocab_size = len(self.vocab.keys())
     self.effective_vocab_size = len(self.vocab.keys())
     self.word_embedding_size = args.word_embedding_size
     self.do_dump_vocabulary = args.dump_vocabulary
     self.do_dump_embeddings = args.dump_embeddings
     log.debug('Effective size of the vocabulary %d',
             self.effective_vocab_size)
Example 3
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading embeddings')
    vocab = read_vocabulary_id_file(args.vocabulary)
    embs = np.loadtxt(args.embeddings)

    log.info('loading documents')
    features, labels = load_data(args.corpus_dir, vocab, embs)

    log.info('performing cross validation')
    single_predictions, classification_result, weight_vectors = \
            do_cross_validation(features, labels)

    log.info('storing results')
    np.savetxt(os.path.join(args.output_dir, 'svm-weights.csv'),
               weight_vectors, '%f', ';', '\n')

    with utf8_file_open(os.path.join(args.output_dir, 'predictions.csv'), 'w') \
            as pred_file:
        pred_file.write(u'fold_no;doc;true_label;pred_label\n')

        for sp in single_predictions:
            pred_file.write(u';'.join(map(unicode, sp)) + u'\n')

    all_true_labels = [sp[2] for sp in single_predictions]
    all_pred_labels = [sp[3] for sp in single_predictions]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(
        -1, all_true_labels, all_pred_labels)

    header = u'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result,
               '%f',
               u';',
               u'\n',
               header=header)

    log.info(classification_result)
    log.info('finished')
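
A side note on the np.savetxt calls above: the positional arguments after the array are fmt, delimiter, and newline. A self-contained keyword version of the same call shape (file name and data invented for illustration):

import numpy as np

# Same call shape as in the example, with keywords spelled out:
# rows of floats, ';'-separated, one row per line.
weights = np.array([[0.5, -1.25], [2.0, 0.75]])
np.savetxt('svm-weights.csv', weights, fmt='%f', delimiter=';', newline='\n')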
Example 4
    def configure(self, args):
        super(vLblNCEPredictor, self).configure(args)
        self.vocab = read_vocabulary_id_file(args.vocabulary)
        self.vocab_size = len(self.vocab.keys())
        self.effective_vocab_size = len(self.vocab.keys())
        self.perplexity = args.perplexity
        self.save_word = args.save_word
        self.result_file = args.result_file
        self.store_rank = args.store_rank
        self.store_argmax = args.store_argmax
        self.store_softmax = args.store_softmax
        self.normalize_with_root = args.normalize_with_root
        self.information = args.information
        self.predictions = args.predictions

        # This code is taken from SimpleVLblNceTrainer
        if args.pred_vocab:
            # Element i contains the index of the i'th prediction vocabulary
            # token in the original vocabulary.
            self.vocab_mapping_list = list()

            # Mapping from the model vocabulary to the prediction vocabulary
            # indices
            self.vocab_mapping = dict()

            for i, token in enumerate(file_line_generator(args.pred_vocab)):

                if token not in self.vocab:
                    raise ValueError('Token "%s" in prediction vocabulary '
                                     'does not exist in model vocabulary.' %
                                     token)

                self.vocab_mapping_list.append(self.vocab[token])
                self.vocab_mapping[self.vocab[token]] = i
        else:
            self.vocab_mapping_list = range(len(self.vocab))
            self.vocab_mapping = dict(
                zip(self.vocab_mapping_list, self.vocab_mapping_list))

        if self.perplexity:
            self.example_iterator_type = PaddedWindowExamplesGenerator
            self.example_processor = self._process_example_full_text
            # We need to set this, because otherwise
            # PaddedWindowExamplesGenerator will ignore end-of-sentence
            # tags (</S>).
            self.learn_eos = True
            self.disable_padding = False
            self.w_indices = debug_print(T.imatrix('w'), 'w')
            self.inputs.append(self.w_indices)
        else:
            self.example_processor = self._process_example_context_per_line
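
The pred_vocab branch above builds two structures: vocab_mapping_list (prediction index to model index) and vocab_mapping (model index to prediction index). A self-contained toy run of the same logic, with invented vocabularies:

# Toy illustration of the mapping built in the pred_vocab branch above.
model_vocab = {'the': 0, 'cat': 1, 'sat': 2, 'mat': 3}  # token -> model id
pred_tokens = ['cat', 'mat']                            # prediction vocabulary

vocab_mapping_list = []  # element i: model id of the i-th prediction token
vocab_mapping = {}       # model id -> prediction id

for i, token in enumerate(pred_tokens):
    if token not in model_vocab:
        raise ValueError('Token "%s" in prediction vocabulary '
                         'does not exist in model vocabulary.' % token)
    vocab_mapping_list.append(model_vocab[token])
    vocab_mapping[model_vocab[token]] = i

print(vocab_mapping_list)  # [1, 3]
print(vocab_mapping)       # {1: 0, 3: 1}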
Example 5
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = read_vocabulary_id_file(args.vocabulary, False)

    _, ext = os.path.splitext(args.feature_file)

    # os.path.splitext keeps the leading dot in the extension
    if ext == '.npy':
        features = np.load(args.feature_file)
    else:
        features = np.loadtxt(args.feature_file)

    log.info('creating features')

    with utf8_file_open(args.out_feature_file, 'w') as outfile:

        for line in file_line_generator(args.infile):
            toks = line.split()
            cur_features = np.zeros((len(toks), features.shape[1]))

            for (i, tok) in enumerate(toks):
                cur_features[i, :] = features[vocab.get(
                    tok, SpecialTokenID.UNKNOWN.value)]

            if args.avg:
                res = ndarray_to_string(np.mean(cur_features, axis=0))
            else:
                res = ndarray_to_string(
                    np.reshape(cur_features,
                               np.prod(cur_features.shape),
                               order='C'))

            outfile.write(res + u'\n')

    log.info('finished')
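
The args.avg flag above chooses between one averaged vector per input line and the row-major concatenation of all token vectors. A toy numpy illustration of the two output shapes (values invented):

import numpy as np

# Two tokens with 3-dimensional embeddings, as built in the loop above.
cur_features = np.array([[1.0, 2.0, 3.0],
                         [3.0, 4.0, 5.0]])

avg = np.mean(cur_features, axis=0)             # shape (3,): [2. 3. 4.]
flat = np.reshape(cur_features,
                  np.prod(cur_features.shape),  # 6 values, row-major ('C')
                  order='C')                    # [1. 2. 3. 3. 4. 5.]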
Example 6
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    model = load_object_from_file(args.model_file)

    # read vocabulary from file
    vocab = sort_dict_by_label(read_vocabulary_id_file(args.vocabulary))

    # get matrices from model
    r_matrix = model.R.get_value()
    q_matrix = model.Q.get_value()

    # get input embeddings
    if args.model_type == 'vlbl':
        in_we = r_matrix
    elif args.model_type == 'vlbl_dist':
        # This will not work with old versions of the models, because of
        # sparsity.
        d_matrix = model.D.get_value().todense()
        in_we = np.dot(d_matrix, r_matrix)
        # need to convert from numpy.matrix to numpy.ndarray
        in_we = in_we.view(type=np.ndarray)
    else:
        raise ValueError('unknown model type: %s' % args.model_type)

    with utf8_file_open(args.result_file + ".in", 'w') as outfile:
        for (word, ind) in vocab:
            outfile.write(
                unicode(word) + u' ' + u' '.join(map(str, in_we[ind])) + u'\n')

    with utf8_file_open(args.result_file + ".out", 'w') as outfile:
        for (word, ind) in vocab:
            outfile.write(
                unicode(word) + u' ' + u' '.join(map(str, q_matrix[ind])) +
                u'\n')

    log.info('finished')
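
In the vlbl_dist branch above, the input embeddings are the product of the (densified) distribution matrix D and the representation matrix R. A toy sketch of that composition with invented sizes:

import numpy as np

# Stand-ins for model.D (4 words x 2 latent rows) and model.R
# (2 latent rows x 3 embedding dimensions).
d_matrix = np.array([[1.0, 0.0],
                     [0.0, 1.0],
                     [0.5, 0.5],
                     [0.2, 0.8]])
r_matrix = np.array([[1.0, 2.0, 3.0],
                     [4.0, 5.0, 6.0]])

in_we = np.dot(d_matrix, r_matrix)  # shape (4, 3): one input embedding per word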
Example 7
 def configure(self, args):
     super(LblPredictor, self).configure(args)
     self.vocab = read_vocabulary_id_file(args.vocabulary)
     self.vocab_size = len(self.vocab.keys())
     self.effective_vocab_size = len(self.vocab.keys())
Example 8
 def configure(self, args):
     self.sent_vocab = set(read_vocabulary_id_file(args.sent_vocab, False))
     super(HingeSentimentMiniBatchTrainer, self).configure(args)
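
Since the configure methods above only read attributes from args, they can be exercised without parsing a real command line. A minimal sketch using argparse.Namespace (attribute names taken from Example 2; the file name is invented):

import argparse

# Minimal stand-in for parsed command-line arguments; configure(args)
# only reads these attributes.
args = argparse.Namespace(
    vocabulary='vocab.txt',
    word_embedding_size=100,
    dump_vocabulary=False,
    dump_embeddings=False)

# trainer.configure(args)  # trainer: an EmbeddingsMiniBatchTrainer instance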