Example 1
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(
        map(int, list(file_line_generator(args.train_labels))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(
        map(int, list(file_line_generator(args.test_labels))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))

    np.savetxt(os.path.join(args.output_dir, 'weights.csv'), weight_vectors,
               '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
               single_predictions,
               '%d',
               ';',
               '\n',
               header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
               confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
               classification_result,
               '%f',
               ';',
               '\n',
               header=header)

    log.info(classification_result)
    log.info('finished')
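Note: every example on this page relies on the project helpers file_line_generator and utf8_file_open, whose definitions are not shown here. The following is only a minimal sketch of what they might look like, inferred from how they are called in these examples; the real implementations may differ.

# Assumed sketch of the helpers used throughout these examples; not the
# project's actual code. Signatures are inferred from the call sites below.
import codecs


def utf8_file_open(filename, mode='r'):
    """Open a file as UTF-8 text (assumed helper)."""
    return codecs.open(filename, mode, encoding='utf-8')


def file_line_generator(filename, strip=True, comment=None):
    """Yield the lines of a file, optionally stripped, skipping lines that
    start with the given comment marker (assumed helper)."""
    with utf8_file_open(filename) as infile:
        for line in infile:
            if comment is not None and line.startswith(comment):
                continue
            yield line.strip() if strip else line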
Example 2
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    train_labels = np.asarray(list(map(int, list(file_line_generator(args.train_labels)))))
    train_features = np.loadtxt(args.train_data)

    if train_features.ndim == 1:
        train_features = train_features.reshape((train_features.shape[0], 1))

    test_labels = np.asarray(list(map(int, list(file_line_generator(args.test_labels)))))
    test_features = np.loadtxt(args.test_data)

    if test_features.ndim == 1:
        test_features = test_features.reshape((test_features.shape[0], 1))

    log.info('performing classification')
    single_predictions, classification_result, weight_vectors, model = \
            calc_results(train_features, train_labels, test_features,
            test_labels, args.normalize, args.mode == True)

    log.info('storing results')
    save_object_to_file(model, os.path.join(args.output_dir, 'svm'))

    np.savetxt(os.path.join(args.output_dir, 'weights.csv'),
            weight_vectors, '%f', ';', '\n')

    header = 'instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 1]
    all_pred_labels = single_predictions[:, 2]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    header = 'accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Example 3
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('converting file')

    with utf8_file_open(args.outfile, 'w') as outfile:

        for line in file_line_generator(args.prediction_file):

            if line.startswith(u'#'):
                continue

            (_, true_label, pred_label) = line.split(';')
            true_label = int(true_label)
            pred_label = int(pred_label)

            tp = 1 if true_label == 1 and pred_label == 1 else 0
            model_pos = 1 if pred_label == 1 else 0
            gold_pos = 1 if true_label == 1 else 0

            outfile.write(u'%d %d %d\n' % (tp, model_pos, gold_pos))
    log.info('finished')
Example 4
def getData(emb_file):
    """Load the data file.

    Parameters
    ----------
    emb_file : str
        name of the data file in which the first tab-separated column contains
        the title and the second column the values of an item

    Returns
    -------
    list(str)
        item titles
    ndarray
        item values, one row per item
    """
    titles = []
    data = []

    for l in file_line_generator(emb_file):
        token, emb = l.split('\t')
        titles.append(token)
        data.append(np.fromstring(emb, sep=' '))

    return titles, np.asarray(data)
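A possible way to call getData, assuming an embeddings file with one tab-separated title/vector pair per line (the file name below is only illustrative):

# Illustrative usage of getData; 'embeddings.tsv' is a hypothetical file.
titles, data = getData('embeddings.tsv')
print(len(titles), data.shape)   # number of items and (n_items, n_dims)
first_vector = data[0]           # vector belonging to titles[0]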
Example 5
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')

    if args.vocabulary is None:
        vocab = args.vocabulary
    else:
        vocab = read_vocabulary_id_file(args.vocabulary)

    text = list(file_line_generator(args.infile))

    ngram_range = list(map(int, tuple(args.ngram.split(','))))
    vectorizer = CountVectorizer(token_pattern='[^ ]+', min_df=0.0,
            vocabulary=vocab, ngram_range=ngram_range, dtype=int)

    log.info('creating features')
    bow = vectorizer.fit_transform(text)

    log.info('storing result')
    np.savetxt(args.out_feature_file, bow.todense(), fmt='%d')

    with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file:
        vocab_file.write('\n'.join(vectorizer.get_feature_names()))

    log.info('finished')
Example 6
def main(argv=None):
    log.info('started application')

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('reading index file')
    idx = get_indices(args.indices)
    max_idx = max(idx)
    log.info('filtering file')

    with utf8_file_open(args.outfile, 'w') as outfile:

        for (cur_idx, line) in enumerate(
                file_line_generator(args.infile, False)):

            if not args.inverse:

                if cur_idx in idx:
                    outfile.write(line)

                if cur_idx >= max_idx:
                    break
            else:

                if cur_idx not in idx:
                    outfile.write(line)


    log.info('finished')
Example 7
def getData(emb_file):
    """Load the data file.

    Parameters
    ----------
    emb_file : str
        name of the data file in which the first tab-separated column contains
        the title and the second column the values of an item

    Returns
    -------
    list(str)
        item titles
    ndarray
        item values, one row per item
    """
    titles = []
    data = []

    for l in file_line_generator(emb_file):
        token, emb = l.split(u'\t')
        titles.append(token)
        data.append(np.fromstring(emb, sep=u' '))

    return titles, np.asarray(data)
Example 8
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
                all_true_labels, all_pred_labels)

    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Example 9
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')

    if args.vocabulary is None:
        vocab = args.vocabulary
    else:
        vocab = read_vocabulary_id_file(args.vocabulary)

    text = list(file_line_generator(args.infile))

    ngram_range = map(int, tuple(args.ngram.split(',')))
    vectorizer = CountVectorizer(token_pattern='[^ ]+',
                                 min_df=0.0,
                                 vocabulary=vocab,
                                 ngram_range=ngram_range,
                                 dtype=int)

    log.info('creating features')
    bow = vectorizer.fit_transform(text)

    log.info('storing result')
    np.savetxt(args.out_feature_file, bow.todense(), fmt='%d')

    with utf8_file_open(args.out_feature_file + '.vocab', 'w') as vocab_file:
        vocab_file.write(u'\n'.join(vectorizer.get_feature_names()))

    log.info('finished')
Example 10
def main(argv=None):
    log.info('started application')

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('reading index file')
    idx = get_indices(args.indices)
    max_idx = max(idx)
    log.info('filtering file')

    with utf8_file_open(args.outfile, 'w') as outfile:

        for (cur_idx, line) in enumerate(
                file_line_generator(args.infile, False)):

            if not args.inverse:

                if cur_idx in idx:
                    outfile.write(line)

                if cur_idx >= max_idx:
                    break
            else:

                if cur_idx not in idx:
                    outfile.write(line)


    log.info('finished')
Example 11
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('converting file')

    with utf8_file_open(args.outfile, 'w') as outfile:

        for line in file_line_generator(args.prediction_file):

            if line.startswith('#'):
                continue

            (_, true_label, pred_label) = line.split(';')
            true_label = int(true_label)
            pred_label = int(pred_label)

            tp = 1 if true_label == 1 and pred_label == 1 else 0
            model_pos = 1 if pred_label == 1 else 0
            gold_pos = 1 if true_label == 1 else 0

            outfile.write('%d %d %d\n' % (tp, model_pos, gold_pos))
    log.info('finished')
Example 12
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading feature and label data')
    labels = np.asarray(map(int, list(file_line_generator(args.label_file))))

    log.info('performing cross validation')
    single_predictions, classification_result = do_cross_validation(labels)

    log.info('storing results')
    header = 'fold_no;instance_index;true_label;pred_label'
    np.savetxt(os.path.join(args.output_dir, 'predictions.csv'),
            single_predictions, '%d', ';', '\n', header=header)

    all_true_labels = single_predictions[:, 2]
    all_pred_labels = single_predictions[:, 3]
    confusion = confusion_matrix(all_true_labels, all_pred_labels)

    np.savetxt(os.path.join(args.output_dir, 'confusion_matrix.csv'),
            confusion, '%d', ';', '\n')

    classification_result[NO_OF_FOLDS, :] = get_classification_result(-1,
                all_true_labels, all_pred_labels)

    header = 'fold_no;accuracy;precision;recall;f1'
    np.savetxt(os.path.join(args.output_dir, 'metrics.csv'),
            classification_result, '%f', ';', '\n', header=header)

    log.info(classification_result)
    log.info('finished')
Example 13
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    items = []

    for line in file_line_generator(args.data_file):
        items.append(tuple(line.split()))

    log.info('compute majority labels')
    cluster_to_label_count = defaultdict(Counter)

    # Count labels per cluster
    for (label, cluster_id) in items:
        cluster_to_label_count[cluster_id][label] += 1

    majority_labels = dict()

    # Get majority label per cluster
    for cluster_id in cluster_to_label_count:
        majority_labels[cluster_id] = cluster_to_label_count[cluster_id].most_common(1)[0][0]

    log.info('assign labels to examples')

    with utf8_file_open(args.predicted_labels, 'w') as pred_file:

        for example_line in file_line_generator(args.data_file):
            pred_file.write(majority_labels[example_line.split()[1]] + '\n')


    if args.cluster_labels:

        with utf8_file_open(args.cluster_labels, 'w') as outfile:

            for (cluster_id, label) in sort_dict_by_key(majority_labels):
                outfile.write('%s %s\n' % (cluster_id, label))

    log.info('finished')
Example 14
def main(argv=None):
    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    items = []

    for line in file_line_generator(args.data_file):
        items.append(tuple(line.split()))

    log.info('compute majority labels')
    cluster_to_label_count = defaultdict(Counter)

    # Count labels per cluster
    for (label, cluster_id) in items:
        cluster_to_label_count[cluster_id][label] += 1

    majority_labels = dict()

    # Get majority label per cluster
    for cluster_id in cluster_to_label_count:
        majority_labels[cluster_id] = cluster_to_label_count[
            cluster_id].most_common(1)[0][0]

    log.info('assign labels to examples')

    with utf8_file_open(args.predicted_labels, 'w') as pred_file:

        for example_line in file_line_generator(args.data_file):
            pred_file.write(majority_labels[example_line.split()[1]] + u'\n')

    if args.cluster_labels:

        with utf8_file_open(args.cluster_labels, 'w') as outfile:

            for (cluster_id, label) in sort_dict_by_key(majority_labels):
                outfile.write(u'%s %s\n' % (cluster_id, label))

    log.info('finished')
Example 15
def read_unigram_frequencies(filename):
    """Read the unigram frequencies for all vocabulary items from the file.

    One frequency per line.
    Caution: don't forget to add the 4 special tokens, e.g., <UNK>. Apart from
    <UNK>, we don't want to draw them as noise, so they should have a count
    of 0.
    """
    unigram_dist = []

    for line in file_line_generator(filename):
        unigram_dist.append(int(line))

    return unigram_dist
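Since the docstring describes these counts as the basis for drawing noise samples, a typical next step is to normalize them into a probability distribution; a small sketch with a hypothetical file name:

import numpy as np

# Hypothetical usage: turn the raw unigram counts into a sampling distribution.
counts = np.asarray(read_unigram_frequencies('unigram_counts.txt'), dtype=float)
probs = counts / counts.sum()                   # normalize counts to probabilities
noise = np.random.choice(len(probs), size=10, p=probs)  # draw 10 noise token ids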
Example 16
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    file2_content = list(file_line_generator(args.file2))

    log.info('combining files')

    with utf8_file_open(args.out_file, 'w') as outfile:

        for c, line1 in enumerate(file_line_generator(args.file1)):
            log_iterations(log, c, 1000)

            for line2 in file2_content:
                outfile.write(line1 + args.separator + line2 + u'\n')

    log.info('finished')
Example 17
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    file2_content = list(file_line_generator(args.file2))

    log.info('combining files')

    with utf8_file_open(args.out_file, 'w') as outfile:

        for c, line1 in enumerate(file_line_generator(args.file1)):
            log_iterations(log, c, 1000)

            for line2 in file2_content:
                outfile.write(line1 + args.separator + line2 + '\n')

    log.info('finished')
Example 18
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    true = []
    pred = []

    for line in file_line_generator(args.true_labels):
        true.append(line)

    for line in file_line_generator(args.pred_labels):
        pred.append(line)

    acc = accuracy_score(true, pred)
    log.info('accuracy: %f' % acc)

    if args.precision or args.recall or args.f_measure:
        p, r, f, _ = precision_recall_fscore_support(
            true,
            pred,
            args.beta,
            pos_label=args.pos_label,
            average=None if not args.avg else args.avg)

        if args.precision:
            log.info('precision: %f' % p)
        if args.recall:
            log.info('recall: %f' % r)
        if args.f_measure:
            log.info('f-measure: %f' % f)

    log.info('finished')
Example 19
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = embeddings.read_vocabulary_file(args.vocabulary, False)
    contexts = list(file_line_generator(args.contexts))
    dists = np.loadtxt(args.distributions)

    log.info('computing results')
    # Add X in the n-grams' centers
    # Assume we have the same context size left and right.
    x_pos = len(contexts[0].split()) // 2
    contexts = [
        sp[:x_pos] + ['X'] + sp[x_pos:]
        for sp in [c.split() for c in contexts]
    ]

    # Sorts all words for each context descending.
    sort_words_per_context_value = np.sort(dists, 1)[:, ::-1]
    sort_words_per_context_idx = np.argsort(dists, 1)[:, ::-1]

    # Sorts all contexts according to their probability assigned to "similar".
    sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1]
    sort_context_for_similar_value = np.sort(dists[:, 465])[::-1]

    log.info('writing data')

    with utf8_file_open(args.out_file, 'w') as likelihood_file:

        # Write results to a file
        for (i, idx) in enumerate(sort_context_for_similar_idx):
            likelihood_file.write(u' '.join(contexts[idx]) + u'\t' +
                                  unicode(sort_context_for_similar_value[i]) +
                                  u'\n')

            # 10 most likely words for the current context
            for j in xrange(10):
                likelihood_file.write(
                    vocab[sort_words_per_context_idx[idx, j]] + u'\t' +
                    unicode(sort_words_per_context_value[idx, j]) + u'\n')

            likelihood_file.write(u'\n')

    log.info('finished')
Example 20
    def configure(self, args):
        super(vLblNCEPredictor, self).configure(args)
        self.vocab = read_vocabulary_id_file(args.vocabulary)
        self.vocab_size = len(self.vocab.keys())
        self.effective_vocab_size = len(self.vocab.keys())
        self.perplexity = args.perplexity
        self.save_word = args.save_word
        self.result_file = args.result_file
        self.store_rank = args.store_rank
        self.store_argmax = args.store_argmax
        self.store_softmax = args.store_softmax
        self.normalize_with_root = args.normalize_with_root
        self.information = args.information
        self.predictions = args.predictions

        # This code is taken from SimpleVLblNceTrainer
        if args.pred_vocab:
            # Element i contains the index of the i'th prediction vocabulary
            # token in the original vocabulary.
            self.vocab_mapping_list = list()

            # Mapping from the model vocabulary to the prediction vocabulary
            # indices
            self.vocab_mapping = dict()

            for i, token in enumerate(file_line_generator(args.pred_vocab)):

                if token not in self.vocab:
                    raise ValueError('Token "%s" in prediction vocabulary '
                                     'does not exist in model vocabulary.'
                                     % token)

                self.vocab_mapping_list.append(self.vocab[token])
                self.vocab_mapping[self.vocab[token]] = i
        else:
            self.vocab_mapping_list = range(len(self.vocab))
            self.vocab_mapping = dict(
                zip(self.vocab_mapping_list, self.vocab_mapping_list))

        if self.perplexity:
            self.example_iterator_type = PaddedWindowExamplesGenerator
            self.example_processor = self._process_example_full_text
            self.learn_eos = True  # We need to set that because otherwise PaddedWindowExampleGenerator will ignore end-of-sentence tags (</S>)
            self.disable_padding = False
            self.w_indices = debug_print(T.imatrix('w'), 'w')
            self.inputs.append(self.w_indices)
        else:
            self.example_processor = self._process_example_context_per_line
Example 21
    def configure(self, args):
        super(vLblNCEPredictor, self).configure(args)
        self.vocab = read_vocabulary_id_file(args.vocabulary)
        self.vocab_size = len(self.vocab.keys())
        self.effective_vocab_size = len(self.vocab.keys())
        self.perplexity = args.perplexity
        self.save_word = args.save_word
        self.result_file = args.result_file
        self.store_rank = args.store_rank
        self.store_argmax = args.store_argmax
        self.store_softmax = args.store_softmax
        self.normalize_with_root = args.normalize_with_root
        self.information = args.information
        self.predictions = args.predictions

        # This code is taken from SimpleVLblNceTrainer
        if args.pred_vocab:
            # Element i contains the index of the i'th prediction vocabulary
            # token in the original vocabulary.
            self.vocab_mapping_list = list()

            # Mapping from the model vocabulary to the prediction vocabulary
            # indices
            self.vocab_mapping = dict()

            for i, token in enumerate(file_line_generator(args.pred_vocab)):

                if token not in self.vocab:
                    raise ValueError('Token "%s" in prediction vocabulary '
                            'does not exist in model vocabulary.' % token)

                self.vocab_mapping_list.append(self.vocab[token])
                self.vocab_mapping[self.vocab[token]] = i
        else:
            self.vocab_mapping_list = range(len(self.vocab))
            self.vocab_mapping = dict(
                    zip(self.vocab_mapping_list, self.vocab_mapping_list))

        if self.perplexity:
            self.example_iterator_type = PaddedWindowExamplesGenerator
            self.example_processor = self._process_example_full_text
            self.learn_eos = True  # We need to set that because otherwise PaddedWindowExampleGenerator will ignore end-of-sentence tags (</S>)
            self.disable_padding = False
            self.w_indices = debug_print(T.imatrix('w'), 'w')
            self.inputs.append(self.w_indices)
        else:
            self.example_processor = self._process_example_context_per_line
Example 22
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = embeddings.read_vocabulary_file(args.vocabulary, False)
    contexts = list(file_line_generator(args.contexts))
    dists = np.loadtxt(args.distributions)

    log.info('computing results')
    # Add X in the n-grams' centers
    # Assume we have the same context size left and right.
    x_pos = len(contexts[0].split()) // 2
    contexts = [sp[:x_pos] + ['X'] + sp[x_pos:]
            for sp in [c.split() for c in contexts]]

    # Sorts all words for each context descending.
    sort_words_per_context_value = np.sort(dists, 1)[:, ::-1]
    sort_words_per_context_idx = np.argsort(dists, 1)[:, ::-1]

    # Sorts all contexts according to their probability assigned to "similar".
    sort_context_for_similar_idx = np.argsort(dists[:, 465])[::-1]
    sort_context_for_similar_value = np.sort(dists[:, 465])[::-1]

    log.info('writing data')

    with utf8_file_open(args.out_file, 'w') as likelihood_file:

        # Write results to a file
        for (i, idx) in enumerate(sort_context_for_similar_idx):
            likelihood_file.write(u' '.join(contexts[idx]) + u'\t' +
                    unicode(sort_context_for_similar_value[i]) + u'\n')

            # 10 most likely words for the current context
            for j in xrange(10):
                likelihood_file.write(vocab[sort_words_per_context_idx[idx, j]] +
                        u'\t' + unicode(sort_words_per_context_value[idx, j]) +
                        u'\n')

            likelihood_file.write(u'\n')

    log.info('finished')
Example 23
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('preprocessing data')

    if args.amazon is True:
        line_iterator = \
                AmazonProductReviewCorpusReader(args.infile).review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for (i, line) in enumerate(line_iterator):
            log_iterations(log, i, 100000)

            if args.replace_digits:
                line = re.sub(r'\d', args.replace_digits, line,
                        0, REGEX_FLAGS)

            if args.strip_html:
                line = nltk.clean_html(line)

            if args.sentence_splitter:
                line = sent_splitter.tokenize(line)
            else:
                line = [line]

            if args.tokenize:
                line = [tokenize(l) for l in line]

            if not args.tokenize:
                outfile.write('\n'.join(line))
            else:
                outfile.write('\n'.join([' '.join(l) for l in line]))

            outfile.write('\n')

    log.info('finished')
Example 24
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))
    log.info('preprocessing data')

    if args.amazon is True:
        line_iterator = \
                AmazonProductReviewCorpusReader(args.infile).review_generator()
    else:
        line_iterator = file_line_generator(args.infile)

    if args.sentence_splitter:
        sent_splitter = nltk.data.load(args.sentence_splitter)

    with utf8_file_open(args.outfile, 'w') as outfile:

        for (i, line) in enumerate(line_iterator):
            log_iterations(log, i, 100000)

            if args.replace_digits:
                line = re.sub(r'\d', args.replace_digits, line, 0, REGEX_FLAGS)

            if args.strip_html:
                line = nltk.clean_html(line)

            if args.sentence_splitter:
                line = sent_splitter.tokenize(line)
            else:
                line = [line]

            if args.tokenize:
                line = [tokenize(l) for l in line]

            if not args.tokenize:
                outfile.write(u'\n'.join(line))
            else:
                outfile.write(u'\n'.join([u' '.join(l) for l in line]))

            outfile.write(u'\n')

    log.info('finished')
Example 25
    def review_generator(self, remove_meta_cols=True):
        """Iterate over all reviews

        Parameters
        ----------
        remove_meta_cols : bool
            indicates whether or not to remove the first 7 meta data columns
        """

        for line in file_line_generator(self.infile):
            line = line.decode(errors='ignore')

            if remove_meta_cols is True:
                line = self._extract_body(line)

            yield line

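Illustrative use of the reader, mirroring how Example 23 constructs it (the input path is hypothetical):

# Hypothetical usage of the reader shown above.
reader = AmazonProductReviewCorpusReader('reviews.tsv')

for review in reader.review_generator():
    # With remove_meta_cols=True (default) only the review body is yielded.
    print(review)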
Example 26
    def review_generator(self, remove_meta_cols=True):
        """Iterate over all reviews

        Parameters
        ----------
        remove_meta_cols : bool
            indicates whether or not to remove the first 7 meta data columns
        """

        for line in file_line_generator(self.infile):
            line = line.decode(errors='ignore')

            if remove_meta_cols is True:
                line = self._extract_body(line)

            yield line

Example 27
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    vocab = pd.Series(file_line_generator(args.vocabulary, comment='##'))

    with open(args.infile, 'rb') as infile:
        integers = np.fromfile(infile, np.int32)

    with utf8_file_open(args.outfile, 'w') as outfile:
        outfile.write(u'\n'.join(vocab[integers]))
        outfile.write(u'\n')

    log.info('finished')
Example 28
def main(argv=None):

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('transforming data')

    with utf8_file_open(args.outfile, 'w') as outfile:
        for line in file_line_generator(args.infile):
            token, signature = line.split(u'\t')
            outfile.write(u'%s\t%s\n' %
                          (token,
                           prepare_brown_signature(signature, args.max_size,
                                                   args.right)))

    log.info('finished')
Example 29
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    vocab = pd.Series(file_line_generator(args.vocabulary, comment='##'))

    with open(args.infile, 'rb') as infile:
        integers = np.fromfile(infile, np.int32)

    with utf8_file_open(args.outfile, 'w') as outfile:
        outfile.write('\n'.join(vocab[integers]))
        outfile.write('\n')

    log.info('finished')
Example 30
def get_indices(indices):
    """Generates line indices to keep.

    Parameters
    ----------
    indices : str
        either name of a file containing indices one per line or a comma
        separated string

    Returns
    -------
    set(int)
        indices to keep
    """

    if os.path.exists(indices):
        return set(map(int, file_line_generator(indices, True)))

    return set((int(i.strip()) for i in indices.split(',')))
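get_indices accepts either the path of a file with one index per line or a comma-separated string; a short illustration (values and file name are made up):

# Comma-separated list of indices.
keep = get_indices('0,2,5')        # -> {0, 2, 5}

# Or the name of a file containing one index per line (hypothetical path).
keep = get_indices('indices.txt')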
Example 31
def get_indices(indices):
    """Generates line indices to keep.

    Parameters
    ----------
    indices : str
        either name of a file containing indices one per line or a comma
        separated string

    Returns
    -------
    set(int)
        indices to keep
    """

    if os.path.exists(indices):
        return set(map(int, file_line_generator(indices, True)))

    return set((int(i.strip()) for i in indices.split(u',')))
Example 32
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = read_vocabulary_id_file(args.vocabulary, False)

    _, ext = os.path.splitext(args.feature_file)

    if ext == '.npy':
        features = np.load(args.feature_file)
    else:
        features = np.loadtxt(args.feature_file)

    log.info('creating features')

    with utf8_file_open(args.out_feature_file, 'w') as outfile:

        for line in file_line_generator(args.infile):
            toks = line.split()
            cur_features = np.zeros((len(toks), features.shape[1]))

            for (i, tok) in enumerate(toks):
                cur_features[i, :] = features[vocab.get(
                    tok, SpecialTokenID.UNKNOWN.value)]

            if args.avg:
                res = ndarray_to_string(np.mean(cur_features, axis=0))
            else:
                res = ndarray_to_string(
                    np.reshape(cur_features,
                               np.prod(cur_features.shape),
                               order='C'))

            outfile.write(res + u'\n')

    log.info('finished')
Example 33
def main(argv=None):
    """See argument parser description."""

    if argv is None:
        argv = sys.argv[1:]

    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    vocab = read_vocabulary_id_file(args.vocabulary, False)

    _, ext = os.path.splitext(args.feature_file)

    if ext == '.npy':
        features = np.load(args.feature_file)
    else:
        features = np.loadtxt(args.feature_file)

    log.info('creating features')

    with utf8_file_open(args.out_feature_file, 'w') as outfile:

        for line in file_line_generator(args.infile):
            toks = line.split()
            cur_features = np.zeros((len(toks), features.shape[1]))

            for (i, tok) in enumerate(toks):
                cur_features[i, :] = features[
                        vocab.get(tok, SpecialTokenID.UNKNOWN.value)]

            if args.avg:
                res = ndarray_to_string(np.mean(cur_features, axis=0))
            else:
                res = ndarray_to_string(np.reshape(cur_features,
                        np.prod(cur_features.shape), order='C'))

            outfile.write(res + u'\n')

    log.info('finished')
Example 34
def read_vocabulary_file(input_file, add_special_tokens=True):
    """Read the textual vocabulary into a list. Items that are empty after
    calling str.strip on them will be mapped to u'<EMPTY>'.

    Parameters
    ----------
    input_file : str
        location of the vocabulary
    add_special_tokens : bool
        indicates whether or not to add special tokens to the front of the
        vocabulary, like <UNK> for unknown tokens, etc.

    Returns
    -------
    list(str)
        vocabulary from token to unique id
    """
    vocab = list(file_line_generator(input_file))

    if add_special_tokens:
        _add_special_tokens(vocab)

    return [v.strip() if v.strip() else '<EMPTY>' for v in vocab]
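A minimal illustration of loading a vocabulary file (the file name is hypothetical); with the default add_special_tokens=True, the internal _add_special_tokens helper prepends the special tokens first:

# Hypothetical usage: read a vocabulary with one token per line.
vocab = read_vocabulary_file('vocab.txt')
print(vocab[:5])    # special tokens (if added) followed by the file's tokens

# Read only the raw file content, without special tokens.
raw_vocab = read_vocabulary_file('vocab.txt', add_special_tokens=False)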
Example 35
def read_vocabulary_file(input_file, add_special_tokens=True):
    """Read the textual vocabulary into a list. Items that are empty after
    calling str.strip on them will be mapped to u'<EMPTY>'.

    Parameters
    ----------
    input_file : str
        location of the vocabulary
    add_special_tokens : bool
        indicates whether or not to add special tokens to the front of the
        vocabulary, like <UNK> for unknown tokens, etc.

    Returns
    -------
    list(str)
        vocabulary from token to unique id
    """
    vocab = list(file_line_generator(input_file))

    if add_special_tokens:
        _add_special_tokens(vocab)

    return [v.strip() if v.strip() else u'<EMPTY>' for v in vocab]
Example 36
def calc_matrix_statistics(matrix_file):
    """Calculates some basic statistics for huge matrix files.

    If a matrix is too big to be imported into a program, use this method to
    calculate the mean, maximum, minimum, and standard deviation of every line
    in the file. It returns a generator.

    Parameters
    ----------
    matrix_file : str
        filename of the matrix file; the file must be a csv file with spaces as
        separator

    Returns
    -------
    generator : (float, float, float, float)
        mean, max, min, std_dev of current line in the matrix file
    """

    for line in file_line_generator(matrix_file):
        a = np.fromstring(line, sep=u' ')
        yield (np.mean(a), np.max(a), np.min(a), np.std(a))

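Because calc_matrix_statistics is a generator, the statistics can be streamed to an output file without ever loading the whole matrix; a sketch with illustrative file names:

# Hypothetical usage: stream per-line statistics of a huge matrix into a CSV.
with utf8_file_open('matrix_stats.csv', 'w') as out:
    out.write(u'mean;max;min;std\n')
    for mean, mx, mn, std in calc_matrix_statistics('big_matrix.txt'):
        out.write(u'%f;%f;%f;%f\n' % (mean, mx, mn, std))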
Example 37
def calc_matrix_statistics(matrix_file):
    """Calculates some basic statistics for huge matrix files.

    If a matrix is too big to be imported into a program, use this method to
    calculate the mean, maximum, minimum, and standard deviation of every line
    in the file. It returns a generator.

    Parameters
    ----------
    matrix_file : str
        filename of the matrix file; the file must be a csv file with spaces as
        separator

    Returns
    -------
    generator : (float, float, float, float)
        mean, max, min, std_dev of current line in the matrix file
    """

    for line in file_line_generator(matrix_file):
        a = np.fromstring(line, sep=' ')
        yield (np.mean(a), np.max(a), np.min(a), np.std(a))

Example 38
    def example_iter(self, filename):

        for example in file_line_generator(filename):
            yield example

Example 39
    def example_iter(self, filename):

        for example in file_line_generator(filename):
            yield example

Example 40
def extract_results_from_logfile(logfile,
                                 result='train_error',
                                 fmt='new',
                                 no_of_val_files=1):
    """Extract results from a given logfile and returns them as ndarray.

    Parameters
    ----------
    logfile : str
        path of the logfile
    result : str
        type of the result to be extracted; one of 'train_error',
        'val_error', 'val_ppl'
    fmt : str
        'new' or 'old'; the new format allows several validation files, the
        old format only allowed one validation file
    no_of_val_files : int
        number of validation files used in the logfile; only matters if
        result is 'val_error' or 'val_ppl'

    Returns
    -------
    list
        all extracted values; one sequence per validation file if several
        validation files were used
    """

    if fmt == 'old':
        val_method_name = 'validate'
    else:
        val_method_name = '_validate_single_file'

    if result == 'train_error':
        pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)',
                             re.UNICODE)
    elif result == 'val_error':
        pattern = re.compile(
            r'%s\tAverage loss on .*? validation set is (.*)' %
            val_method_name, re.UNICODE)
    elif result == 'val_ppl':
        pattern = re.compile(
            r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name,
            re.UNICODE)
    else:
        raise ValueError(
            'Unknown result type to be extracted from logfile: %s' % result)

    values = list()

    for line in file_line_generator(logfile):
        match = re.search(pattern, line)

        if not match:
            continue

        values.append(float(match.group(1)))

    # Converts the 1d list of results into one list per validation file.
    if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1:
        values = list(grouper_recipes(values, no_of_val_files))
        values = zip(*values)

    return values
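An illustrative call that pulls the training error curve out of a log file (the path is made up; the log format is whatever the project's trainer writes):

# Hypothetical usage: extract logged error values.
train_errors = extract_results_from_logfile('train.log', result='train_error')
print(train_errors)   # one float per logged training error

# With several validation files, one sequence per file is returned.
val_errors = extract_results_from_logfile('train.log', result='val_error',
                                           no_of_val_files=2)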
Example 41
def extract_results_from_logfile(logfile, result='train_error', fmt='new',
        no_of_val_files=1):
    """Extract results from a given logfile and returns them as ndarray.

    Parameters
    ----------
    logfile : str
        path of the logfile
    result : str
        type of the result to be extracted; one of 'train_error',
        'val_error', 'val_ppl'
    fmt : str
        'new' or 'old'; the new format allows several validation files, the
        old format only allowed one validation file
    no_of_val_files : int
        number of validation files used in the logfile; only matters if
        result is 'val_error' or 'val_ppl'

    Returns
    -------
    list
        all extracted values; one sequence per validation file if several
        validation files were used
    """

    if fmt == 'old':
        val_method_name = 'validate'
    else:
        val_method_name = '_validate_single_file'


    if result == 'train_error':
        pattern = re.compile(r'run\tAverage loss on .*? training set is (.*)',
                re.UNICODE)
    elif result == 'val_error':
        pattern = re.compile(
                r'%s\tAverage loss on .*? validation set is (.*)' % val_method_name,
                re.UNICODE)
    elif result == 'val_ppl':
        pattern = re.compile(
                r'%s\tPerplexity on .*? validation set is (.*)' % val_method_name,
                re.UNICODE)
    else:
        raise ValueError('Unknown result type to be extracted from logfile: %s'
                % result)

    values = list()

    for line in file_line_generator(logfile):
        match = re.search(pattern, line)

        if not match:
            continue

        values.append(float(match.group(1)))

    # Converts the 1d list of results into one list per validation file.
    if (result == 'val_error' or result == 'val_ppl') and no_of_val_files != 1:
        values = list(grouper_recipes(values, no_of_val_files))
        values = zip(*values)

    return values