def __texts__(self):
    """Lazily build one :class:`Text` per input path, accumulating the shared vocabulary.

    For each path the method decides (from cache files on disk) whether the
    text will be read from cache or created from scratch, constructs the
    ``Text``, merges its words into ``self._vocab``, optionally autosaves
    freshly created texts, and yields the ``Text`` object.

    Yields:
        Text: one prepared text per entry in ``self._inputs``.
    """
    width = 16  # column width for the verbose log labels
    for path in self._inputs:
        self._paths.append(path)
        # Cache files a previous run of the pipeline may have produced.
        txt = datapath(path, datadir=self.datadir, ext=".tagged.txt").full
        pick = datapath(path, datadir=self.datadir, ext=".sents.pickle").full
        trie = datapath(path, datadir=self.datadir, ext=".trie.dawg").full
        process = 'reading:'.ljust(width)
        # No cache at all (or a forced rewrite) means building from scratch.
        if not (os.path.exists(txt) or os.path.exists(pick) or os.path.exists(trie)) or self.rewrite:
            process = 'creating:'.ljust(width)
        if self.verbose:
            print(process, txt.replace(nlptk.MODULEDIR, '..'))
        start = time.time()
        # Fastest to read sents and vocab of a text from pickle,
        # rather than from tagged.txt.
        text = Text(path, self._prep, self._clean, self._filters,
                    datadir=self.datadir, verbose=self.verbose,
                    loadas=self.loadas, saveas=self.saveas,
                    rewrite=self.rewrite, inplace=True)
        if self.verbose:
            print('time:'.ljust(width), time.time() - start)
        # Both operations take about 1 second on average.
        words = text.words(filtrate=self.filtrate)  # aren't we filtering? -- TODO confirm intent
        words_time = time.time() - start
        self._vocab += Vocabulary(words)
        vocab_time = time.time() - start
        if self.verbose:
            wraptext = shorten(str(words[:10]), width=80, placeholder="...]")
            print('words_time:'.ljust(width), words_time)
            print('vocab:'.ljust(width), wraptext)
            print('TOTAL TIME:'.ljust(width), vocab_time)
        # Persist only texts that were created this run, not cache reads.
        if self.autosave and 'creating' in process:
            text.save(as_=self.saveas)
            if self.verbose:
                print('TOTAL TIME:'.ljust(width), time.time() - start)
        #self._texts.append(text)
        yield text
def __sents__(self):
    """Lazily tag sentences from the input, then build the word-position trie.

    Streams the input (preferring a cached ``.tagged.txt`` file when
    available, which skips sentencizing and cleaning), yields one
    :class:`TaggedSentence` per sentence while updating ``self._vocab``,
    ``self._nwords`` and ``self._sents``; after the stream is exhausted,
    indexes every token occurrence into ``self._trie``.

    Yields:
        TaggedSentence: one tagged sentence per input sentence.
    """
    encoding = self._encoding
    sentencizer = self._prep.sentencizer
    clean = self._clean
    path = self._input
    # Prefer the cached tagged text: it is already utf-8 encoded, split
    # into sentences and cleaned, so those steps can be disabled.
    if self.loadas == 'txt' and self._path:
        path = datapath(self._path, datadir=self.datadir, ext=".tagged.txt").full
        if os.path.exists(path):
            encoding = 'utf-8'
            sentencizer = None
            clean = None
    stream = Stream(path, encoding=encoding)
    self.encoding = stream._encoding
    for num, sent in enumerate(stream(sentencizer, clean)):
        tagged_sent = TaggedSentence(sent.strip(), num, self._prep, self._filters)
        lemmas = tagged_sent.lemmas()
        # Every lemma ends up in this dictionary,
        # since nothing is filtered at this point.
        self._vocab += FreqDist(lemmas)
        self._nwords += tagged_sent.nwords
        self._sents.append(tagged_sent)
        #self._words.extend(tagged_sent.words())
        yield tagged_sent
    # After the generator is exhausted, map every token to its position
    # as word -> (sentence number, token index) in a record DAWG.
    data = ((token.word, (token.nsent, token.idx))
            for sent in self.sents()
            for token in sent.tokens(lower=True))
    self._trie = dawg.RecordDAWG(">IH", data)
def savedawg(self, name, path=None):
    """Persist the prefix-tree (DAWG) index to ``<base>.<name>.dawg``.

    Args:
        name: middle component of the output filename.
        path: base path; defaults to the datapath derived from ``self._path``.
    """
    base = path or datapath(self._path, datadir=self.datadir).short
    # Save the dictionary for the prefix tree.
    target = '{}.{}.dawg'.format(base, name)
    if self.verbose:
        shown = target.replace(nlptk.MODULEDIR, '..')
        print('saving dawg:'.ljust(16), shown)
    self._trie.save(target)
def loaddawg(self, name, path=None):
    """Load a RecordDAWG from ``<base>.<name>.dawg``.

    Returns:
        The loaded DAWG, or ``None`` when the file is missing/invalid.
    """
    base = path or datapath(self._path, datadir=self.datadir).short
    target = '{}.{}.dawg'.format(base, name)
    if not self._validpath(target):
        return None
    if self.verbose:
        shown = target.replace(nlptk.MODULEDIR, '..')
        print('loading dawg:'.ljust(16), shown)
    return dawg.RecordDAWG(">IH").load(target)
def loadpickle(self, name, path=None):
    """Unpickle ``<base>.<name>.pickle``.

    Returns:
        The unpickled object, or ``None`` when the file is missing/invalid.
    """
    base = path or datapath(self._path, datadir=self.datadir).short
    target = '{}.{}.pickle'.format(base, name)
    if not self._validpath(target):
        return None
    if self.verbose:
        shown = target.replace(nlptk.MODULEDIR, '..')
        print('loading pickle:'.ljust(16), shown)
    with open(target, 'rb') as fh:
        return pickle.load(fh)
def __call__(self, sents: List[List[str]], **kwargs):
    """Train a Word2Vec model on *sents*, or load a previously saved one.

    Keyword args ``datadir`` and ``name`` are consumed here; all remaining
    kwargs update ``self.params`` and are forwarded to ``Word2Vec``.
    When ``name`` is given, the model is loaded from disk instead of trained.

    Returns:
        The trained or loaded ``Word2Vec`` model (also stored on ``self.model``).

    Raises:
        FileNotFoundError: when *name* is given but no saved model exists.
    """
    # BUG FIX: keys must be string literals; the bare names ``datadir`` and
    # ``name`` were unbound here and raised NameError at runtime.
    datadir = kwargs.pop('datadir', '')
    name = kwargs.pop('name', None)
    self.params.update(kwargs)
    if name is not None:
        loadpath = datapath(name, datadir=datadir, ext=".model").full
        if not os.path.exists(loadpath):
            raise FileNotFoundError(loadpath)
        self.model = Word2Vec.load(loadpath)
    else:
        self.model = Word2Vec(sents, **self.params)
    return self.model
def save(self, path=None, as_=("txt", "pickle")):
    """Serialize the text in one or more formats, plus vocab and trie.

    Args:
        path: base output path; defaults to the datapath for ``self._path``.
        as_: fallback format(s) when ``self.saveas`` is unset; each of
            "txt" (tagged sentences) and "pickle" (sents + vocab) is honored.
    """
    # makedirs + exist_ok is race-free and creates missing parents,
    # unlike the exists()-then-mkdir pattern.
    os.makedirs(self.datadir, exist_ok=True)
    path_ = path or datapath(self._path, datadir=self.datadir).short
    saveas = self.saveas or as_
    if not isinstance(saveas, (tuple, list)):
        saveas = (saveas,)
    for fmt in saveas:
        if fmt == "txt":
            path = '{}.tagged.txt'.format(path_)
            if self.verbose:
                print('saving txt:'.ljust(16), path.replace(nlptk.MODULEDIR, '..'))
            with open(path, 'w', encoding='utf8') as f:
                # write(), not writelines(): the join already yields one string.
                f.write('\n'.join(map(str, self._sents)))
        elif fmt == 'pickle':
            path = '{}.sents.pickle'.format(path_)
            if self.verbose:
                print('saving pickle:'.ljust(16), path.replace(nlptk.MODULEDIR, '..'))
            with open(path, 'wb') as f:
                pickle.dump(self._sents, f)
            path = '{}.vocab.pickle'.format(path_)
            with open(path, 'wb') as f:
                pickle.dump(self._vocab, f)
    # The prefix-tree index is always written alongside the chosen formats.
    self.savedawg('trie', path_)
def save(self, name, datadir='data'):
    """Write the trained model to ``<datadir>/<name>.model``.

    Raises:
        ValueError: when no model has been trained or loaded yet.
    """
    target = datapath(name, datadir=datadir, ext=".model").full
    if not self.model:
        raise ValueError('The model is not created')
    self.model.save(target)
def load(self, name, datadir='data'):
    """Read a previously saved Word2Vec model into ``self.model``."""
    target = datapath(name, datadir=datadir, ext=".model").full
    self.model = Word2Vec.load(target)