def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[HypStemDataset, Tuple[Tokenizer, Embedding]]:
    """Encode a raw dataset into (hyp, relevance, goal, tactic) training samples.

    Pipeline: preprocess the raw data, embed the tactic stems, tokenize the
    goals, then (in parallel) pick the single most relevant hypothesis for
    each sample and n-gram-bag encode both that hypothesis and the goal.

    Returns the encoded dataset together with the (tokenizer, embedding)
    pair needed to encode terms at prediction time.

    NOTE(review): assumes `data` is non-empty — `zip(*...)` below raises
    ValueError on an empty dataset, same as the original code.
    """
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    print("Encoding hyps...")
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        # Fix: the old zip(*list(pool.imap(...))) built a throwaway list;
        # the * unpacking already materializes the imap iterator.
        relevant_hyps, relevances = \
            zip(*pool.imap(most_relevant_hyp, preprocessed_data))
    encoded_relevant_hyps = [
        getNGramTokenbagVector(arg_values.num_grams, tokenizer.numTokens(),
                               tokenizer.toTokenList(hyp_term))
        for hyp_term in relevant_hyps
    ]
    print("Encoding goals...")
    encoded_goals = [
        getNGramTokenbagVector(arg_values.num_grams, tokenizer.numTokens(),
                               term)
        for term in tokenized_goals
    ]
    print("Done")
    return HypStemDataset([
        HypStemSample(hyp, relevance, goal, inter.tactic)
        for hyp, relevance, goal, inter in zip(
            encoded_relevant_hyps, relevances, encoded_goals, embedded_data)
    ]), (tokenizer, embedding)
def _encode_term(self, term: str) -> List[float]:
    """N-gram-bag encode a single term string using the trained tokenizer."""
    assert self.training_args
    token_list = self._tokenizer.toTokenList(term)
    bag_vector = getNGramTokenbagVector(self.training_args.num_grams,
                                        self._tokenizer.numTokens(),
                                        token_list)
    # getNGramTokenbagVector's static type doesn't match the declared
    # return type, so narrow it for the type checker.
    return cast(List[float], bag_vector)
def _encode_tokenized_data(self, data : TokenizedDataset, arg_values : Namespace,
                           tokenizer : Tokenizer, embedding : Embedding) \
    -> NGramDataset:
    """Turn tokenized (goal, tactic) pairs into an n-gram-bag NGramDataset.

    Only the goal and tactic fields of each sample are used; the previous
    tactic and hypotheses are ignored.
    """
    samples = []
    for _prev_tactic, _hyps, goal, tactic in data:
        encoded_goal = getNGramTokenbagVector(arg_values.num_grams,
                                              tokenizer.numTokens(),
                                              goal)
        samples.append(NGramSample(encoded_goal, tactic))
    return NGramDataset(samples)
def encodeHypList(num_grams : int, num_tokens : int,
                  hyps_list : List[List[int]]) -> List[List[int]]:
    """N-gram-bag encode each tokenized hypothesis in a list."""
    encoded = []
    for hyp_tokens in hyps_list:
        encoded.append(getNGramTokenbagVector(num_grams, num_tokens,
                                              hyp_tokens))
    return encoded