def load_token2id(folder: str) -> pc.Token2Id:
    """Loads a token-to-id mapping from the dictionary stored in `folder`."""
    dictionary: pd.DataFrame = smart_load(jj(folder, 'dictionary.zip'), feather_pipe=pu.set_index, columns='token_id')
    token2id: pc.Token2Id = pc.Token2Id(data={t: i for (t, i) in zip(dictionary.token, dictionary.index)})
    return token2id
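# Usage sketch (illustrative only; the folder path below is hypothetical and assumes
# 'dictionary.zip' was written by the corresponding store step):
#
#   token2id: pc.Token2Id = load_token2id('./data/model-folder')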
def __post_init__(self):
    if isinstance(self.corpus, (pc.VectorizedCorpus, pc.TokenizedCorpus)):
        # Derive token2id and document_index from the corpus itself
        if not isinstance(self.token2id, pc.Token2Id):
            self.token2id = pc.Token2Id(data=self.corpus.token2id)
        self.document_index = self.corpus.document_index
    if isinstance(self.token2id, dict):
        # Wrap a plain dict in Token2Id
        self.token2id = pc.Token2Id(data=self.token2id)
    self.update_token_counts()
    self.vectorizer_args = {**DEFAULT_VECTORIZE_PARAMS, **(self.vectorizer_args or {})}
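# What this __post_init__ guarantees, sketched (assumption: the enclosing dataclass is
# the TrainingCorpus used elsewhere in this module; values below are placeholders):
#
#   tc = TrainingCorpus(corpus=tokenized_corpus, token2id={'a': 0, 'b': 1})
#   isinstance(tc.token2id, pc.Token2Id)  # True: the plain dict has been wrapped
#   # tc.vectorizer_args always includes DEFAULT_VECTORIZE_PARAMS as its defaults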
def create_train_corpus() -> tm.TrainingCorpus:
    corpus: TranströmerCorpus = TranströmerCorpus()
    sparse_corpus, vocabulary = convert.TranslateCorpus().translate(corpus, id2token=None)
    tc: tm.TrainingCorpus = tm.TrainingCorpus(
        corpus=sparse_corpus,
        document_index=corpus.document_index,
        token2id=pc.Token2Id(vocabulary.token2id),
    )
    return tc
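# Usage sketch for this test fixture (the assertions are illustrative, not part of the suite):
#
#   tc: tm.TrainingCorpus = create_train_corpus()
#   isinstance(tc.token2id, pc.Token2Id)  # True
#   len(tc.document_index) > 0            # True: one row per Tranströmer document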
def test_id2token2token2id():
    assert pc.id2token2token2id({1: 'a', 2: 'b'}) == {'a': 1, 'b': 2}
    assert pc.id2token2token2id(pc.Token2Id({1: 'a', 2: 'b'})) == {'a': 1, 'b': 2}
def load(folder: str) -> TrainingCorpus | None:
    """Loads a training corpus from a pickled file."""
    # Load from dumped vectorized corpus if one exists
    if pc.VectorizedCorpus.dump_exists(tag='train', folder=folder):
        corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(tag='train', folder=folder)
        return TrainingCorpus(
            corpus=corpus,
            document_index=corpus.document_index,
            token2id=pc.Token2Id(data=corpus.token2id),
            corpus_options=utility.read_json(jj(folder, CORPUS_OPTIONS_FILENAME), default={}),
            vectorizer_args=utility.read_json(jj(folder, VECTORIZER_ARGS_FILENAME), default={}),
        )
    return None
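# Usage sketch: load returns None when no dumped 'train' corpus exists in the folder,
# so callers should guard for that (the folder path is hypothetical):
#
#   training_corpus = load('./data/model-folder')
#   if training_corpus is None:
#       ...  # fall back to building the corpus from the input stream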
def token2id(self) -> pc.Token2Id:
    return pc.Token2Id(data={t: i for t, i in zip(self.vocabulary.token, self.vocabulary.token_id)})
def instream_to_corpus(self, id2token: Mapping[int, str] | None) -> tm.TrainingCorpus:
    content_type: ContentType = self.resolved_prior_out_content_type()

    if self.train_corpus_folder:
        if tm.TrainingCorpus.exists(self.train_corpus_folder):
            logger.info(
                f"using existing corpus in folder {self.train_corpus_folder} for target mode {self.target_mode}"
            )
            corpus: tm.TrainingCorpus = tm.TrainingCorpus.load(self.train_corpus_folder)
            return corpus
        tags: List[str] = pc.VectorizedCorpus.find_tags(self.train_corpus_folder)
        if len(tags) == 0:
            raise ValueError(f"no train or predict input corpus found in {self.train_corpus_folder}")
        if len(tags) > 1:
            raise ValueError(f"multiple corpora found in folder {self.train_corpus_folder}")
        logger.info(
            f"using corpus tagged {tags[0]} in folder {self.train_corpus_folder} for target mode {self.target_mode}"
        )
        vectorized_corpus: pc.VectorizedCorpus = pc.VectorizedCorpus.load(
            folder=self.train_corpus_folder, tag=tags[0]
        )
        corpus: tm.TrainingCorpus = tm.TrainingCorpus(corpus=vectorized_corpus)
        return corpus

    if content_type == ContentType.VECTORIZED_CORPUS:
        logger.info("creating sparse corpus out of input stream...")
        payload: DocumentPayload = next(self.prior.outstream())
        vectorized_corpus: pc.VectorizedCorpus = payload.content
        vectorize_opts: pc.VectorizeOpts = payload.recall('vectorize_opts')
        if id2token is not None:
            # We must consolidate the vocabularies
            logger.info("translating vocabulary to training model's vocabulary...")
            vectorized_corpus.translate_to_vocab(id2token, inplace=True)
        corpus: tm.TrainingCorpus = tm.TrainingCorpus(
            corpus=vectorized_corpus,
            corpus_options={},
            vectorizer_args={} if vectorize_opts is None else vectorize_opts.props,
        )
        logger.info("training corpus created!")
        return corpus

    if content_type == ContentType.TOKENS:
        token2id: pc.Token2Id = (
            pc.Token2Id(pc.id2token2token2id(id2token)) if id2token is not None else self.pipeline.payload.token2id
        )
        corpus: tm.TrainingCorpus = tm.TrainingCorpus(
            corpus=self.prior.filename_content_stream(),
            document_index=self.document_index,
            token2id=token2id,
            corpus_options={},
        )
        return corpus

    raise ValueError("unable to resolve input corpus")
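# Resolution order implemented above, summarized:
#   1. train_corpus_folder set: reuse a stored TrainingCorpus, else a single tagged
#      VectorizedCorpus dump in that folder (zero or multiple tags raises ValueError).
#   2. prior emits VECTORIZED_CORPUS: wrap the payload, translating its vocabulary
#      to id2token when one is given.
#   3. prior emits TOKENS: build from the token stream, deriving token2id from
#      id2token when given, else from the pipeline payload.
#
# Usage sketch (assumption: `task` is an instance of the class owning this method,
# and `trained_model.id2token` is a placeholder for an existing model's vocabulary):
#
#   corpus: tm.TrainingCorpus = task.instream_to_corpus(id2token=None)
#   # or force an existing model's vocabulary onto the stream:
#   corpus = task.instream_to_corpus(id2token=trained_model.id2token)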
def token2id(self) -> pc.Token2Id:
    return pc.Token2Id(data=self.term2id)