def __init__(self, max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None): self.max_document_length = max_document_length self.min_frequency = min_frequency if vocabulary: self.vocabulary_ = vocabulary else: self.vocabulary_ = CategoricalVocabulary() if tokenizer_fn: self._tokenizer = tokenizer_fn else: self._tokenizer = tokenizer
class VocabularyProcessor(object): """Maps documents to sequences of word ids. Parameters: max_document_length: Maximum length of documents. if documents are longer, they will be trimmed, if shorter - padded. min_frequency: Minimum frequency of words in the vocabulary. vocabulary: CategoricalVocabulary object. Attributes: vocabulary_: CategoricalVocabulary object. """ def __init__(self, max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None): self.max_document_length = max_document_length self.min_frequency = min_frequency if vocabulary: self.vocabulary_ = vocabulary else: self.vocabulary_ = CategoricalVocabulary() if tokenizer_fn: self._tokenizer = tokenizer_fn else: self._tokenizer = tokenizer def fit(self, raw_documents, unused_y=None): """Learn a vocabulary dictionary of all tokens in the raw documents. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit format signature of estimators. Returns: self """ for tokens in self._tokenizer(raw_documents): for token in tokens: self.vocabulary_.add(token) if self.min_frequency > 0: self.vocabulary_.trim(self.min_frequency) self.vocabulary_.freeze() return self def fit_transform(self, raw_documents, unused_y=None): """Learn the vocabulary dictionary and return indexies of words. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit_transform signature of estimators. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ self.fit(raw_documents) return self.transform(raw_documents) def transform(self, raw_documents): """Transform documents to word-id matrix. Convert words to ids with vocabulary fitted with fit or the one provided in the constructor. Args: raw_documents: iterable. An iterable which yield either str or unicode. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ for tokens in self._tokenizer(raw_documents): word_ids = np.zeros(self.max_document_length, np.int64) for idx, token in enumerate(tokens): if idx >= self.max_document_length: break word_ids[idx] = self.vocabulary_.get(token) yield word_ids def reverse(self, documents): """Reverses output of vocabulary mapping to words. Args: documents: iterable, list of class ids. Returns: Iterator over mapped in words documents. """ for item in documents: output = [] for class_id in item: output.append(self.vocabulary_.reverse(class_id)) yield ' '.join(output)
class VocabularyProcessor(object): """Maps documents to sequences of word ids. Parameters: max_document_length: Maximum length of documents. if documents are longer, they will be trimmed, if shorter - padded. min_frequency: Minimum frequency of words in the vocabulary. vocabulary: CategoricalVocabulary object. Attributes: vocabulary_: CategoricalVocabulary object. """ def __init__(self, max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None): self.max_document_length = max_document_length self.min_frequency = min_frequency if vocabulary: self.vocabulary_ = vocabulary else: self.vocabulary_ = CategoricalVocabulary() if tokenizer_fn: self._tokenizer = tokenizer_fn else: self._tokenizer = tokenizer def fit(self, raw_documents, unused_y=None): """Learn a vocabulary dictionary of all tokens in the raw documents. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit format signature of estimators. Returns: self """ for tokens in self._tokenizer(raw_documents): for token in tokens: self.vocabulary_.add(token) if self.min_frequency > 0: self.vocabulary_.trim(self.min_frequency) self.vocabulary_.freeze() return self def fit_transform(self, raw_documents, unused_y=None): """Learn the vocabulary dictionary and return indexies of words. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit_transform signature of estimators. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ self.fit(raw_documents) return self.transform(raw_documents) def transform(self, raw_documents): """Transform documents to word-id matrix. Convert words to ids with vocabulary fitted with fit or the one provided in the constructor. Args: raw_documents: iterable. An iterable which yield either str or unicode. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ for tokens in self._tokenizer(raw_documents): word_ids = np.zeros(self.max_document_length, np.int64) for idx, token in enumerate(tokens): if idx >= self.max_document_length: break word_ids[idx] = self.vocabulary_.get(token) yield word_ids
class VocabularyProcessor(object): """Maps documents to sequences of word ids. Parameters: max_document_length: Maximum length of documents. if documents are longer, they will be trimmed, if shorter - padded. min_frequency: Minimum frequency of words in the vocabulary. vocabulary: CategoricalVocabulary object. Attributes: vocabulary_: CategoricalVocabulary object. """ def __init__(self, max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None): self.max_document_length = max_document_length self.min_frequency = min_frequency if vocabulary: self.vocabulary_ = vocabulary else: self.vocabulary_ = CategoricalVocabulary() if tokenizer_fn: self._tokenizer = tokenizer_fn else: self._tokenizer = tokenizer def fit(self, raw_documents, unused_y=None): """Learn a vocabulary dictionary of all tokens in the raw documents. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit format signature of estimators. Returns: self """ for tokens in self._tokenizer(raw_documents): for token in tokens: self.vocabulary_.add(token) if self.min_frequency > 0: self.vocabulary_.trim(self.min_frequency) self.vocabulary_.freeze() return self def fit_transform(self, raw_documents, unused_y=None): """Learn the vocabulary dictionary and return indexies of words. Args: raw_documents: iterable An iterable which yield either str or unicode. unused_y: to match fit_transform signature of estimators. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ self.fit(raw_documents) return self.transform(raw_documents) def transform(self, raw_documents): """Transform documents to word-id matrix. Convert words to ids with vocabulary fitted with fit or the one provided in the constructor. Args: raw_documents: iterable. An iterable which yield either str or unicode. Returns: X: iterable, [n_samples, max_document_length] Word-id matrix. """ for tokens in self._tokenizer(raw_documents): word_ids = np.zeros(self.max_document_length, np.int64) for idx, token in enumerate(tokens): if idx >= self.max_document_length: break word_ids[idx] = self.vocabulary_.get(token) yield word_ids def reverse(self, documents): """Reverses output of vocabulary mapping to words. Args: documents: iterable, list of class ids. Returns: Iterator over mapped in words documents. """ for item in documents: output = [] for class_id in item: output.append(self.vocabulary_.reverse(class_id)) yield ' '.join(output) def save(self, filename): """Saves vocabulary processor into given file. Args: filename: Path to output file. """ open(filename, 'w').write(cPickle.dumps(self)) @classmethod def restore(cls, filename): """Restores vocabulary processor from given file. Args: filename: Path to file to load from. Returns: VocabularyProcessor object. """ return cPickle.loads(open(filename).read())