def __init__(self, vocab, data_path, history_len, batch_size=1, overlap=False,
             progress=False, fixed_length=True, target_vector=False,
             _just_test=False, shuffle=True, max_words=999, min_words=0):
    """
    Generate data for training with RNN.
    :type vocab: nlpy.lm.Vocab
    :type data_path: str
    :param history_len: if this value is -1, then one trunk is a sentence
    :type history_len: int
    :type target_vector: bool
    """
    self._vocab = vocab
    self._target_vector = target_vector
    self._just_test = _just_test
    self.history_len = history_len
    self.batch_size = batch_size
    self.minibatch_mode = not (batch_size == 1)
    self.fixed_length = fixed_length
    self.progress = progress
    self.overlap = overlap
    self.shuffle = shuffle
    self.sentences = []
    # Treat each sentence as a trunk, wrapped in sentence-boundary tokens
    for line in LineIterator(data_path):
        sequence = [vocab.sent_index]
        wc = line.count(" ") + 1
        # Skip sentences outside the configured length range
        if wc < min_words or wc > max_words:
            continue
        for w in line.split(" "):
            sequence.append(vocab.index(w))
        sequence.append(vocab.sent_index)
        self.sentences.append(sequence)
    logging.info("%d sentences loaded from %s" % (len(self.sentences), data_path))
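# A usage sketch, not from the source: the enclosing class name is not
# visible in this snippet, so RNNDataGenerator below is a hypothetical
# stand-in, and train.txt is an illustrative path.
#
#   vocab = Vocab()
#   vocab.load("train.txt")
#   generator = RNNDataGenerator(vocab, "train.txt", history_len=-1,
#                                batch_size=1, shuffle=True)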
def _load_source(self):
    from collections import Counter
    tokenizer = NLTKEnglishTokenizer()
    counter = Counter()
    for l in LineIterator(self.source):
        counter.update(map(str.lower, tokenizer.tokenize(l)))
    self._freqmap = dict(counter.items())
    # Cap the frequency at twice the mean token frequency
    self._maxfreq = sum(self._freqmap.values()) * 2 / len(self._freqmap)
def _build_data(self, path):
    data = []
    for l in LineIterator(path):
        l = l.lower()
        word_data = self.vocab.convert(l)
        if len(word_data) == 0:
            continue
        data.append([word_data])
    return data
def _load_frequency(self):
    self._maxfreq = 3000
    self._freqmap = {}
    for line in LineIterator(_FREQ_DATA_PATH):
        freq, word = line.split("\t")
        freq = int(freq)
        if freq > self._maxfreq:
            continue
        self._freqmap[word] = freq
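# The parsing above implies a tab-separated "<frequency>\t<word>" layout
# for _FREQ_DATA_PATH, one entry per line; the sample values below are
# illustrative, not taken from the actual file:
#
#   2213\tthe
#   187\tblue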
def load(self, path, fixed_size=-1):
    logging.info("load data from %s" % path)
    if fixed_size > 0:
        self._load_fixed_size(path, fixed_size)
        return
    for line in LineIterator(path):
        words = line.split(" ")
        # A plain loop instead of map(), which is lazy on Python 3
        # and would silently add nothing there
        for w in words:
            self.add(w)
    logging.info("vocab size: %d" % self.size)
def _load_fixed_size(self, path, fixed_size):
    from collections import Counter
    logging.info("fixed size: %d" % fixed_size)
    counter = Counter()
    for line in LineIterator(path):
        words = line.split(" ")
        counter.update(words)
    for w, _ in counter.most_common(fixed_size):
        self.add(w)
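# A usage sketch under the assumption that load() and _load_fixed_size()
# above are methods of the Vocab class; the corpus path and the size of
# 10000 are illustrative:
#
#   vocab = Vocab()
#   vocab.load("corpus.txt")                    # add every token
#   vocab.load("corpus.txt", fixed_size=10000)  # only the 10000 most frequent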
def __init__(self):
    """
    Initialize recase map.
    """
    self._recase_map = {}
    for line in LineIterator(_FREQ_DATA_PATH):
        _, word = line.split("\t")
        low_word = word.lower()
        # Keep the casing of the first occurrence for each lowercased form
        if low_word not in self._recase_map:
            self._recase_map[low_word] = word
def _build_data(self, path):
    data = []
    for l in LineIterator(path):
        l = l.lower()
        # Keep only characters in the known character inventory;
        # a list comprehension so the emptiness check below also
        # works on Python 3, where filter() returns a lazy iterator
        chars = [c for c in l if c in self.chat_set]
        if not chars:
            continue
        # Shift indices by one so that 0 is reserved as the boundary marker
        char_ids = [self.chat_set.index(c) + 1 for c in chars]
        char_ids = [0] + char_ids + [0]
        # One-hot encode each character id
        word_data = [
            np.eye(1, M=self.input_size, k=c)[0]
            for c in char_ids
        ]
        data.append([word_data])
    return data
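# A minimal sketch of the one-hot construction above: np.eye(1, M, k=c)
# builds a single row with a one in column c, so indexing [0] yields a
# one-hot vector of length M. input_size=5 is an illustrative value.
#
#   >>> import numpy as np
#   >>> np.eye(1, M=5, k=2)[0]
#   array([0., 0., 1., 0., 0.])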
def serve(param):
    from nlpy.util import external_resource
    from nlpy.util import LineIterator
    import urllib2
    global semantic_searcher
    # Lazily build the searcher once and reuse it across requests
    if "semantic_searcher" not in globals():
        print "Loading searcher ..."
        data = LineIterator(
            external_resource("general/elementary_questions.txt"))
        semantic_searcher = SemanticSearcher()
        semantic_searcher.load_data(data)
    caches = set()
    if "caches" in param:
        caches = set(urllib2.unquote(param["caches"]).split(" ||| "))
        print caches
    output = ""
    # Return the best-ranked result that was not served before
    for _, result in semantic_searcher.searchMany(
            param['input'].decode('utf-8')):
        if result not in caches:
            output = result
            break
    return {"output": output}
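# An invocation sketch, assuming the request dict layout used above: a
# UTF-8 "input" question plus an optional URL-quoted "caches" field of
# previously returned answers joined by " ||| ". The question is made up.
#
#   response = serve({"input": "Why is the sky blue?"})
#   print response["output"]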
def _load_ranking(self):
    self._rank_list = []
    for l in LineIterator(_MASSIVE_WORD_LIST):
        self._rank_list.append(l)