Ejemplo n.º 1
0
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[HypStemDataset, Tuple[Tokenizer, Embedding]]:
     """Build a HypStemDataset from raw data, plus the tokenizer and embedding.

     The raw data is preprocessed, embedded and its goals tokenized. For each
     preprocessed sample, ``most_relevant_hyp`` picks a hypothesis term and a
     relevance value; the hypothesis term and the tokenized goal are each
     turned into an n-gram token-bag vector.

     Args:
         data: The raw dataset to encode.
         arg_values: Parsed arguments; ``num_threads`` sizes the worker pool
             and ``num_grams`` is forwarded to ``getNGramTokenbagVector``.
             ``arg_values`` is also passed through to ``_preprocess_data``
             and ``tokenize_goals``.

     Returns:
         A pair ``(dataset, (tokenizer, embedding))`` where ``dataset`` holds
         one ``HypStemSample(hyp_vector, relevance, goal_vector, tactic)`` per
         zipped sample.

     Raises:
         ValueError: If the pool yields no results, because the two-way
             unpacking of ``zip(*results)`` then has nothing to unpack.
     """
     samples = list(self._preprocess_data(data, arg_values))
     embedding, embedded_samples = embed_data(RawDataset(samples))
     tokenizer, goal_token_lists = tokenize_goals(embedded_samples, arg_values)

     print("Encoding hyps...")
     # Results must be drained while the pool is still alive; the star
     # unpacking consumes the whole imap iterator inside the with-block.
     with multiprocessing.Pool(arg_values.num_threads) as workers:
         hyp_terms, relevances = zip(*workers.imap(most_relevant_hyp, samples))
     hyp_vectors = []
     for hyp_term in hyp_terms:
         hyp_vectors.append(
             getNGramTokenbagVector(arg_values.num_grams,
                                    tokenizer.numTokens(),
                                    tokenizer.toTokenList(hyp_term)))

     print("Encoding goals...")
     goal_vectors = []
     for goal_tokens in goal_token_lists:
         goal_vectors.append(
             getNGramTokenbagVector(arg_values.num_grams,
                                    tokenizer.numTokens(),
                                    goal_tokens))
     print("Done")

     # zip stops at the shortest input, so the sample count is bounded by
     # the shortest of the four sequences.
     encoded_samples = []
     for hyp_vec, relevance, goal_vec, interaction in zip(
             hyp_vectors, relevances, goal_vectors, embedded_samples):
         encoded_samples.append(
             HypStemSample(hyp_vec, relevance, goal_vec, interaction.tactic))
     return HypStemDataset(encoded_samples), (tokenizer, embedding)
Ejemplo n.º 2
0
 def _encode_term(self, term: str) -> List[float]:
     """Encode a single term string as an n-gram token-bag vector.

     The term is converted to a token list by ``self._tokenizer`` and passed
     to ``getNGramTokenbagVector`` together with ``training_args.num_grams``
     and the tokenizer's token count.

     Args:
         term: The term text to encode.

     Returns:
         The vector from ``getNGramTokenbagVector``, typed as ``List[float]``
         (``cast`` only affects static typing; no conversion happens).
     """
     # Fails fast when training_args is unset/falsy; also narrows the type
     # for static checkers.
     assert self.training_args
     gram_count = self.training_args.num_grams
     token_count = self._tokenizer.numTokens()
     term_tokens = self._tokenizer.toTokenList(term)
     bag_vector = getNGramTokenbagVector(gram_count, token_count, term_tokens)
     return cast(List[float], bag_vector)
 def _encode_tokenized_data(self, data : TokenizedDataset, arg_values : Namespace,
                            tokenizer : Tokenizer, embedding : Embedding) \
     -> NGramDataset:
     """Convert tokenized samples into an NGramDataset of goal vectors.

     Each element of ``data`` is unpacked as
     ``(prev_tactic, hyps, goal, tactic)``; only ``goal`` and ``tactic`` are
     used. The goal is turned into an n-gram token-bag vector and paired with
     its tactic.

     Args:
         data: Iterable of 4-tuples as described above.
         arg_values: Parsed arguments; ``num_grams`` is read from it.
         tokenizer: Supplies ``numTokens()`` for the vector encoding.
         embedding: Not used by this method.

     Returns:
         An ``NGramDataset`` with one ``NGramSample(goal_vector, tactic)``
         per input element, in input order.
     """
     ngram_samples = []
     for _prev_tactic, _hyps, goal, tactic in data:
         goal_vector = getNGramTokenbagVector(arg_values.num_grams,
                                              tokenizer.numTokens(),
                                              goal)
         ngram_samples.append(NGramSample(goal_vector, tactic))
     return NGramDataset(ngram_samples)
Ejemplo n.º 4
0
def encodeHypList(num_grams : int, num_tokens : int, hyps_list : List[List[int]]) -> \
    List[List[int]]:
    """Encode every hypothesis in ``hyps_list`` as an n-gram token-bag vector.

    Args:
        num_grams: Forwarded as the first argument of
            ``getNGramTokenbagVector``.
        num_tokens: Forwarded as the second argument of
            ``getNGramTokenbagVector``.
        hyps_list: Hypotheses, each given as a list of ints.

    Returns:
        A new list with one encoded vector per hypothesis, in input order;
        an empty list when ``hyps_list`` is empty.
    """
    encoded_hyps: List[List[int]] = []
    for hyp_tokens in hyps_list:
        encoded_hyps.append(
            getNGramTokenbagVector(num_grams, num_tokens, hyp_tokens))
    return encoded_hyps