Beispiel #1
0
    def test_train(self):
        """Train a small recurrent language model on the lm_test resources.

        The vocabulary is loaded from the training file, a single 50-unit
        ReLU recurrent layer is trained with ``SGDTrainer``, the last item
        yielded by the trainer is printed, the parameters are saved to
        ``/tmp/lmparam.gz`` and the elapsed time in minutes is reported.
        """
        train_path = internal_resource("lm_test/train")
        valid_path = internal_resource("lm_test/valid")
        vocab = Vocab()
        vocab.load(train_path)

        # Both generators use the same options; only the input file differs.
        generator_options = dict(target_vector=False, history_len=-1,
                                 _just_test=False, fixed_length=False,
                                 progress=True)
        train_data = RNNDataGenerator(vocab, train_path, **generator_options)
        valid_data = RNNDataGenerator(vocab, valid_path, **generator_options)

        net_conf = NetworkConfig(input_size=vocab.size)
        net_conf.layers = [RecurrentLayer(size=50, activation='relu')]

        trainer_conf = TrainerConfig()
        trainer_conf.learning_rate = 0.1
        trainer_conf.weight_l2 = 0.0001
        trainer_conf.hidden_l2 = 0.0001
        # Monitor, validate and test at a frequency of 1.
        trainer_conf.monitor_frequency = 1
        trainer_conf.validation_frequency = 1
        trainer_conf.test_frequency = 1

        network = RecurrentNetwork(net_conf)
        trainer = SGDTrainer(network, config=trainer_conf)

        start_time = time.time()
        # Only the most recent item is needed, so the trainer is consumed
        # lazily instead of being materialised into a list.  Starting from
        # None avoids a NameError when the trainer yields nothing.
        last_result = None
        for last_result in trainer.train(train_data, valid_data):
            pass
        print(last_result)
        end_time = time.time()
        network.save_params("/tmp/lmparam.gz")

        # Single-argument print() produces the same text on Python 2 and 3.
        print("elapsed time: %s mins" % ((end_time - start_time) / 60))
Beispiel #2
0
    def _test_predict(self):
        """Print model predictions for every item of the lm_test test set.

        The vocabulary is loaded from the training file and the network
        parameters from ``/tmp/lmparam.gz``.  For each ``(data, target)``
        pair three word lists are printed: the arg-max of the data rows,
        the targets, and the output of ``network.classify``; a ``-`` line
        separates consecutive items.
        """
        train_path = internal_resource("lm_test/train")
        test_path = internal_resource("lm_test/test")
        vocab = Vocab()
        vocab.load(train_path)

        generator_options = dict(target_vector=False, history_len=-1,
                                 _just_test=True, fixed_length=False)
        test_data = RNNDataGenerator(vocab, test_path, **generator_options)

        net_conf = NetworkConfig(input_size=vocab.size)
        net_conf.layers = [RecurrentLayer(size=50, activation='relu')]

        network = RecurrentNetwork(net_conf)
        network.load_params("/tmp/lmparam.gz")

        for inputs, targets in test_data:
            # Each list is built and printed before the next one is computed.
            print([vocab.word(idx) for idx in np.argmax(inputs, axis=1)])
            print([vocab.word(idx) for idx in targets])
            print([vocab.word(idx) for idx in network.classify(inputs)])
            print("-")
Beispiel #3
0
 def test_generator(self):
     """Print the first items produced by ``RNNDataGenerator``.

     The vocabulary and the generator both read ``lm_test/valid``;
     iteration stops after at most 101 items have been printed.
     """
     data_path = internal_resource("lm_test/valid")
     vocab = Vocab()
     vocab.load(data_path)
     generator = RNNDataGenerator(vocab, data_path, history_len=0)
     # Counting from 1 mirrors "increment, then compare against 100".
     for seen, item in enumerate(generator, 1):
         print(item)
         if seen > 100:
             break
Beispiel #4
0
    def test_train(self):
        """Train a small recurrent language model on the lm_test resources.

        The vocabulary is loaded from the training file, a single 50-unit
        ReLU recurrent layer is trained with ``SGDTrainer``, the last item
        yielded by the trainer is printed, the parameters are saved to
        ``/tmp/lmparam.gz`` and the elapsed time in minutes is reported.
        """
        train_path = internal_resource("lm_test/train")
        valid_path = internal_resource("lm_test/valid")
        vocab = Vocab()
        vocab.load(train_path)

        # Both generators use the same options; only the input file differs.
        generator_options = dict(target_vector=False, history_len=-1,
                                 _just_test=False, fixed_length=False,
                                 progress=True)
        train_data = RNNDataGenerator(vocab, train_path, **generator_options)
        valid_data = RNNDataGenerator(vocab, valid_path, **generator_options)

        net_conf = NetworkConfig(input_size=vocab.size)
        net_conf.layers = [RecurrentLayer(size=50, activation='relu')]

        trainer_conf = TrainerConfig()
        trainer_conf.learning_rate = 0.1
        trainer_conf.weight_l2 = 0.0001
        trainer_conf.hidden_l2 = 0.0001
        # Monitor, validate and test at a frequency of 1.
        trainer_conf.monitor_frequency = 1
        trainer_conf.validation_frequency = 1
        trainer_conf.test_frequency = 1

        network = RecurrentNetwork(net_conf)
        trainer = SGDTrainer(network, config=trainer_conf)

        start_time = time.time()
        # Only the most recent item is needed, so the trainer is consumed
        # lazily instead of being materialised into a list.  Starting from
        # None avoids a NameError when the trainer yields nothing.
        last_result = None
        for last_result in trainer.train(train_data, valid_data):
            pass
        print(last_result)
        end_time = time.time()
        network.save_params("/tmp/lmparam.gz")

        # Single-argument print() produces the same text on Python 2 and 3.
        print("elapsed time: %s mins" % ((end_time - start_time) / 60))
Beispiel #5
0
    def _test_predict(self):
        """Print model predictions for every item of the lm_test test set.

        The vocabulary is loaded from the training file and the network
        parameters from ``/tmp/lmparam.gz``.  For each ``(data, target)``
        pair three word lists are printed: the arg-max of the data rows,
        the targets, and the output of ``network.classify``; a ``-`` line
        separates consecutive items.
        """
        train_path = internal_resource("lm_test/train")
        test_path = internal_resource("lm_test/test")
        vocab = Vocab()
        vocab.load(train_path)

        generator_options = dict(target_vector=False, history_len=-1,
                                 _just_test=True, fixed_length=False)
        test_data = RNNDataGenerator(vocab, test_path, **generator_options)

        net_conf = NetworkConfig(input_size=vocab.size)
        net_conf.layers = [RecurrentLayer(size=50, activation='relu')]

        network = RecurrentNetwork(net_conf)
        network.load_params("/tmp/lmparam.gz")

        for inputs, targets in test_data:
            # Each list is built and printed before the next one is computed.
            print([vocab.word(idx) for idx in np.argmax(inputs, axis=1)])
            print([vocab.word(idx) for idx in targets])
            print([vocab.word(idx) for idx in network.classify(inputs)])
            print("-")
Beispiel #6
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from nlpy.util import internal_resource, LineIterator
from nltk_tokenizers import NLTKEnglishTokenizer
from collections import Counter

# Resource paths resolved once at import time via internal_resource().
# NOTE(review): neither constant is read in the code visible here; judging by
# the names they are a word-frequency table and a large word list -- confirm.
_FREQ_DATA_PATH = internal_resource("general/en_us_with_coca_1m_bigram_words.txt")
_MASSIVE_WORD_LIST = internal_resource("general/ms_top_100k_words.txt")

class ContentfullnessEstimator(object):
    """Estimator whose word statistics are chosen by the ``source`` argument.

    ``source`` is either the keyword ``'frequency'`` or ``'ranking'``, or any
    other value, which is then passed to ``LineIterator`` and tokenized to
    build a token-count table.
    """

    def __init__(self, source='frequency'):
        """Store ``source`` and run the loader that matches it.

        :param source: ``'frequency'`` runs ``_load_frequency()``,
            ``'ranking'`` runs ``_load_ranking()``; every other value runs
            ``_load_source()``.
        """
        self.source = source
        if source == 'frequency':
            self._load_frequency()
        elif source == 'ranking':
            self._load_ranking()
        else:
            self._load_source()

    def _load_source(self):
        """Count the lower-cased tokens of every line of ``self.source``.

        The counts are stored in ``self._freqmap`` as a plain ``dict``
        mapping token to number of occurrences.
        """
        tokenizer = NLTKEnglishTokenizer()
        counter = Counter()
        for line in LineIterator(self.source):
            # Call lower() on each token instead of mapping the unbound
            # str.lower: the latter raises TypeError for unicode tokens
            # under Python 2.
            counter.update(token.lower() for token in tokenizer.tokenize(line))

        self._freqmap = dict(counter)
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

import os
from nlpy.util import internal_resource

# Data file read by FrequencyKeywordExtractor._build_freqmap(); each line is
# parsed there as a tab-separated (frequency, word) pair.
_FREQUENCY_DATA_PATH = internal_resource(
    "general/en_us_with_coca_1m_bigram_words.txt")


class FrequencyKeywordExtractor:
    def __init__(self):
        """Build the frequency map, then set the default threshold of 600."""
        self._build_freqmap()
        self._threshold = 600

    def _build_freqmap(self):
        """Load the word-frequency table into ``self._freqmap``.

        Every line of ``_FREQUENCY_DATA_PATH`` is stripped and split on a tab
        into a ``(frequency, word)`` pair; the frequency is stored as a
        ``float`` keyed by the word.

        :raises ValueError: if a line does not split into exactly two fields
            or the frequency field is not a number.
        """
        self._freqmap = {}
        # The with-block closes the file handle, which the original left
        # open; iterating the file object replaces the deprecated
        # xreadlines() call.
        with open(_FREQUENCY_DATA_PATH) as freq_file:
            for line in freq_file:
                freq, word = line.strip().split("\t")
                self._freqmap[word] = float(freq)

    def set_threshold(self, threshold):
        """
        Replace the currently stored threshold.

        :param threshold: new threshold value
        :type threshold: int
        """
        self._threshold = threshold

    def threshold(self):
Beispiel #8
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from nlpy.util import internal_resource, LineIterator

# The frequency data must be sorted in reverse (descending) order of
# frequency: FreqRecaser keeps only the first casing it encounters for each
# lower-cased word.
_FREQ_DATA_PATH = internal_resource("general/en_us_with_coca_1m_bigram_words.txt")

class FreqRecaser(object):

    def __init__(self):
        """
        Build the lookup from lower-cased word to its stored casing.

        Each line of the frequency data is split on a tab into two fields,
        of which the second is the word.  Only the first casing seen for a
        given lower-cased word is kept.
        """
        self._recase_map = {}
        lookup = self._recase_map
        for entry in LineIterator(_FREQ_DATA_PATH):
            _, cased_word = entry.split("\t")
            # setdefault leaves an already-registered casing untouched.
            lookup.setdefault(cased_word.lower(), cased_word)

    def recase(self, word):
        """
        :param word: word
        :return: recased word
        """
        low_word = word.lower()
        if low_word not in self._recase_map:
Beispiel #9
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# Copyright (C) 2015 NLPY.ORG
# Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html

from nlpy.util import internal_resource, LineIterator
from nltk_tokenizers import NLTKEnglishTokenizer
from collections import Counter

# Resource paths resolved once at import time via internal_resource().
# NOTE(review): neither constant is read in the code visible here; judging by
# the names they are a word-frequency table and a large word list -- confirm.
_FREQ_DATA_PATH = internal_resource(
    "general/en_us_with_coca_1m_bigram_words.txt")
_MASSIVE_WORD_LIST = internal_resource("general/ms_top_100k_words.txt")


class ContentfullnessEstimator(object):
    """Estimator whose word statistics are chosen by the ``source`` argument.

    ``source`` is either the keyword ``'frequency'`` or ``'ranking'``, or any
    other value, which is then passed to ``LineIterator`` and tokenized.
    """

    def __init__(self, source='frequency'):
        """Store ``source`` and run the loader that matches it.

        :param source: ``'frequency'`` runs ``_load_frequency()``,
            ``'ranking'`` runs ``_load_ranking()``; every other value runs
            ``_load_source()``.
        """
        self.source = source
        if source == 'frequency':
            self._load_frequency()
        elif source == 'ranking':
            self._load_ranking()
        else:
            self._load_source()

    def _load_source(self):
        """Count the lower-cased tokens of every line of ``self.source``.

        NOTE(review): the tally is not stored anywhere in the code visible
        here -- confirm whether the method continues beyond this point.
        """
        tokenizer = NLTKEnglishTokenizer()
        counter = Counter()
        for line in LineIterator(self.source):
            # Call lower() on each token instead of mapping the unbound
            # str.lower: the latter raises TypeError for unicode tokens
            # under Python 2.
            counter.update(token.lower() for token in tokenizer.tokenize(line))
Beispiel #10
0
 def _test_vocab(self):
     """Print the vocabulary size and the binvector of a gibberish word.

     The vocabulary is loaded from ``lm_test/valid``.
     """
     vocab = Vocab()
     vocab.load(internal_resource("lm_test/valid"))
     print(vocab.size)
     # Presumably an out-of-vocabulary probe -- confirm against Vocab.
     print(vocab.binvector("ergerrghwegr"))
Beispiel #11
0
 def test(self):
     """Load the heart_scale data set and print its data and target counts."""
     dataset_path = internal_resource("dataset/heart_scale.txt")
     container = FeatureContainer(dataset_path)
     print(len(container.data))
     print(len(container.targets))
Beispiel #12
0
 def test(self):
     """Load the heart_scale data set and print its data and target counts."""
     dataset_path = internal_resource("dataset/heart_scale.txt")
     container = FeatureContainer(dataset_path)
     print(len(container.data))
     print(len(container.targets))
Beispiel #13
0
 def __init__(self, target_format=None):
     """Initialise the data set from the bundled heart_scale file.

     :param target_format: passed unchanged to the parent initializer.
     """
     super(HeartScaleDataset, self).__init__(target_format)
     dataset_path = internal_resource("dataset/heart_scale.txt")
     container = FeatureContainer(dataset_path)
     self.data = container.data
     self.targets = container.targets
     # Presumably the number of distinct target classes -- confirm.
     self._target_size = 2