Example #1
0
    def __texts__(self):
        """Yield a ``Text`` object for every path in ``self._inputs``.

        For each path the generator records it in ``self._paths``, builds
        the ``Text``, adds the text's words to ``self._vocab`` and, when
        ``self.autosave`` is set and the text is being created rather than
        read, saves it. Progress and timings are printed when
        ``self.verbose`` is true.
        """
        col = 16  # width of the label column in verbose output
        extensions = (".tagged.txt", ".sents.pickle", ".trie.dawg")

        for source in self._inputs:
            self._paths.append(source)
            tagged_file, pickle_file, trie_file = [
                datapath(source, datadir=self.datadir, ext=ext).full
                for ext in extensions
            ]

            # The text is "created" when none of its stored artefacts exist
            # yet, or when a rewrite was requested; otherwise it is "read".
            nothing_stored = not any(
                os.path.exists(candidate)
                for candidate in (tagged_file, pickle_file, trie_file))
            creating = nothing_stored or self.rewrite
            label = ('creating:' if creating else 'reading:').ljust(col)

            if self.verbose:
                print(label, tagged_file.replace(nlptk.MODULEDIR, '..'))

            started = time.time()
            # Reading a text's sents and vocab from pickle is faster
            # than reading them from tagged.txt.
            text = Text(
                source,
                self._prep,
                self._clean,
                self._filters,
                datadir=self.datadir,
                verbose=self.verbose,
                loadas=self.loadas,
                saveas=self.saveas,
                rewrite=self.rewrite,
                inplace=True,
            )

            if self.verbose:
                print('time:'.ljust(col), time.time() - started)

            # Both operations below take about 1 second on average.
            words = text.words(filtrate=self.filtrate)  # don't filter?
            after_words = time.time() - started
            self._vocab += Vocabulary(words)
            after_vocab = time.time() - started

            if self.verbose:
                preview = shorten(str(words[:10]),
                                  width=80,
                                  placeholder="...]")
                print('words_time:'.ljust(col), after_words)
                print('vocab:'.ljust(col), preview)
                print('TOTAL TIME:'.ljust(col), after_vocab)

            if self.autosave and creating:
                text.save(as_=self.saveas)
                if self.verbose:
                    print('TOTAL TIME:'.ljust(col), time.time() - started)

            # The text is only yielded; it is not kept in self._texts.
            yield text
Example #2
0
    def __sents__(self):
        """Yield a ``TaggedSentence`` for every sentence of the input.

        As a side effect each yielded sentence is appended to
        ``self._sents``, its lemmas are added to ``self._vocab`` and its
        word count to ``self._nwords``. ``self.encoding`` is set from the
        opened stream. Once the generator has been fully consumed,
        ``self._trie`` is built as a ``RecordDAWG`` mapping each token's
        word (tokens requested with ``lower=True``) to its
        ``(nsent, idx)`` pair.

        When ``self.loadas == 'txt'`` and a previously saved
        ``.tagged.txt`` file exists, sentences are read from that file as
        UTF-8 with no sentencizer and no cleaning step; otherwise the
        original input is processed.
        """
        encoding = self._encoding
        sentencizer = self._prep.sentencizer
        clean = self._clean
        path = self._input

        if self.loadas == 'txt' and self._path:
            tagged_path = datapath(self._path,
                                   datadir=self.datadir,
                                   ext=".tagged.txt").full
            # Switch to the tagged file only if it really exists. Previously
            # `path` was overwritten unconditionally, so a missing tagged
            # file was handed to Stream together with the raw-input
            # encoding, sentencizer and cleaner.
            # NOTE(review): assumes the intended fallback is self._input.
            if os.path.exists(tagged_path):
                path = tagged_path
                encoding = 'utf-8'
                sentencizer = None
                clean = None

        stream = Stream(path, encoding=encoding)
        self.encoding = stream._encoding
        for num, sent in enumerate(stream(sentencizer, clean)):
            tagged_sent = TaggedSentence(sent.strip(), num, self._prep,
                                         self._filters)
            lemmas = tagged_sent.lemmas()
            # Every lemma ends up in this vocabulary,
            # because nothing is filtered here.
            self._vocab += FreqDist(lemmas)
            self._nwords += tagged_sent.nwords
            self._sents.append(tagged_sent)

            yield tagged_sent

        # Reached only after the caller has exhausted the generator.
        data = ((token.word, (token.nsent, token.idx))
                for sent in self.sents() for token in sent.tokens(lower=True))
        self._trie = dawg.RecordDAWG(">IH", data)
Example #3
0
 def savedawg(self, name, path=None):
     """Save ``self._trie`` to ``<base>.<name>.dawg``.

     ``base`` is ``path`` when given (and truthy); otherwise it is the
     ``short`` form of the data path derived from ``self._path``.
     """
     base = path if path else datapath(self._path,
                                       datadir=self.datadir).short
     # File that stores the dictionary for the prefix tree.
     target = '{}.{}.dawg'.format(base, name)
     if self.verbose:
         shown = target.replace(nlptk.MODULEDIR, '..')
         print('saving dawg:'.ljust(16), shown)
     self._trie.save(target)
Example #4
0
    def loaddawg(self, name, path=None):
        """Load a ``RecordDAWG`` from ``<base>.<name>.dawg``.

        ``base`` is ``path`` when given (and truthy); otherwise it is the
        ``short`` form of the data path derived from ``self._path``.

        Returns whatever ``RecordDAWG.load`` returns, or ``None`` when
        ``self._validpath`` rejects the file path.
        """
        base = path if path else datapath(self._path,
                                          datadir=self.datadir).short
        dawg_file = '{}.{}.dawg'.format(base, name)

        if not self._validpath(dawg_file):
            return None

        if self.verbose:
            shown = dawg_file.replace(nlptk.MODULEDIR, '..')
            print('loading dawg:'.ljust(16), shown)

        # Same record format (">IH") that is used when the trie is built.
        return dawg.RecordDAWG(">IH").load(dawg_file)
Example #5
0
    def loadpickle(self, name, path=None):
        """Unpickle and return the object stored in ``<base>.<name>.pickle``.

        ``base`` is ``path`` when given (and truthy); otherwise it is the
        ``short`` form of the data path derived from ``self._path``.

        Returns ``None`` when ``self._validpath`` rejects the file path.
        """
        base = path if path else datapath(self._path,
                                          datadir=self.datadir).short
        pickle_file = '{}.{}.pickle'.format(base, name)

        if not self._validpath(pickle_file):
            return None

        if self.verbose:
            shown = pickle_file.replace(nlptk.MODULEDIR, '..')
            print('loading pickle:'.ljust(16), shown)

        # NOTE: unpickling executes arbitrary code; only load trusted files.
        with open(pickle_file, 'rb') as handle:
            return pickle.load(handle)
Example #6
0
    def __call__(self, sents: List[List[str]], **kwargs):
        """Load a saved Word2Vec model or train a new one on ``sents``.

        Keyword arguments:
            datadir: directory passed to ``datapath`` when loading
                (default ``''``).
            name: when not ``None``, the model is loaded from the
                ``.model`` file that ``datapath`` resolves for this name
                instead of being trained.
            Any other keyword arguments are merged into ``self.params``
            (this happens on both the load and the train path).

        Returns:
            The model, which is also stored in ``self.model``.

        Raises:
            FileNotFoundError: if ``name`` is given but the model file
                does not exist.
        """
        # The keys must be string literals: the bare names `datadir` and
        # `name` are undefined here and raised NameError on every call.
        datadir = kwargs.pop('datadir', '')
        name = kwargs.pop('name', None)
        self.params.update(kwargs)

        if name is not None:
            loadpath = datapath(name, datadir=datadir, ext=".model").full
            if os.path.exists(loadpath):
                self.model = Word2Vec.load(loadpath)
            else:
                raise FileNotFoundError(loadpath)

        else:
            self.model = Word2Vec(sents, **self.params)
        return self.model
Example #7
0
    def save(self, path=None, as_=("txt", "pickle")):
        """Write the text's sentences, vocabulary and trie to disk.

        Args:
            path: base path (without extension) for the output files. When
                omitted or falsy, the ``short`` form of the data path
                derived from ``self._path`` is used.
            as_: format or sequence of formats to write, used only when
                ``self.saveas`` is falsy. ``"txt"`` writes
                ``<base>.tagged.txt`` (one ``str(sentence)`` per line);
                ``"pickle"`` writes ``<base>.sents.pickle`` and
                ``<base>.vocab.pickle``. Other values are ignored.

        The trie is always saved through ``self.savedawg('trie', base)``.
        """
        # makedirs also creates missing parent directories, and exist_ok
        # tolerates the directory appearing between the check and the call.
        if not os.path.exists(self.datadir):
            os.makedirs(self.datadir, exist_ok=True)

        path_ = path or datapath(self._path, datadir=self.datadir).short

        # The instance-level setting takes precedence over the argument.
        saveas = self.saveas or as_
        if not isinstance(saveas, (tuple, list)):
            saveas = (saveas, )

        for fmt in saveas:
            if fmt == "txt":
                path = '{}.tagged.txt'.format(path_)

                if self.verbose:
                    print('saving txt:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))

                with open(path, 'w', encoding='utf8') as f:
                    # A single write: writelines() on a str would iterate
                    # it one character at a time.
                    f.write('\n'.join(map(str, self._sents)))

            elif fmt == 'pickle':
                path = '{}.sents.pickle'.format(path_)

                if self.verbose:
                    print('saving pickle:'.ljust(16),
                          path.replace(nlptk.MODULEDIR, '..'))

                with open(path, 'wb') as f:
                    pickle.dump(self._sents, f)
                path = '{}.vocab.pickle'.format(path_)

                with open(path, 'wb') as f:
                    pickle.dump(self._vocab, f)

        self.savedawg('trie', path_)
Example #8
0
 def save(self, name, datadir='data'):
     """Save ``self.model`` to the ``.model`` file resolved for ``name``.

     Raises:
         ValueError: if ``self.model`` is falsy (no model created yet).
     """
     target = datapath(name, datadir=datadir, ext=".model").full
     if not self.model:
         raise ValueError('The model is not created')
     self.model.save(target)
Example #9
0
 def load(self, name, datadir='data'):
     """Load a Word2Vec model from the ``.model`` file resolved for
     ``name`` and store it in ``self.model``."""
     location = datapath(name, datadir=datadir, ext=".model").full
     loaded = Word2Vec.load(location)
     self.model = loaded