def most_relevant_hyp(inter: ScrapedTactic) -> Tuple[str, float]:
    """Pick the focused hypothesis that is most relevant to the focused goal.

    Each hypothesis in ``inter.context.focused_hyps`` is scored by passing
    the focused goal and the hypothesis's type (as extracted by
    ``serapi_instance.get_hyp_type``) to ``term_relevance``.

    Returns:
        A ``(hypothesis, relevance)`` pair for the highest-scoring
        hypothesis; on ties the earliest hypothesis wins.  When there are
        no focused hypotheses, returns ``("", 0)``.
    """
    context = inter.context
    goal = context.focused_goal
    hyp_list = context.focused_hyps
    if len(hyp_list) == 0:
        return "", 0

    def with_relevance(hyp_term):
        # Pair the hypothesis with its relevance to the goal.
        hyp_type = serapi_instance.get_hyp_type(hyp_term)
        return hyp_term, term_relevance(goal, hyp_type)

    return max(map(with_relevance, hyp_list), key=lambda pair: pair[1])
def _predictDistribution(self, in_data : TacticContext) -> \
        Tuple[torch.FloatTensor, str]:
    """Run the model on the goal and its most relevant hypothesis.

    The most relevant hypothesis is the one whose type scores highest
    under ``term_relevance`` against the goal.  When the context has no
    hypotheses, the placeholder hypothesis ``":"`` with relevance 0 is
    used instead.

    Returns:
        A pair of (the model output wrapped in a ``FloatTensor``, the
        result of ``serapi_instance.get_first_var_in_hyp`` on the chosen
        hypothesis).
    """
    hypotheses = in_data.hypotheses
    if len(hypotheses) == 0:
        # No hypotheses to choose from: fall back to the placeholder.
        relevant_hyp, relevance = ":", 0
    else:
        scored_hyps = [
            (hyp, term_relevance(in_data.goal,
                                 serapi_instance.get_hyp_type(hyp)))
            for hyp in hypotheses
        ]
        relevant_hyp, relevance = max(scored_hyps,
                                      key=lambda scored: scored[1])

    relevant_hyp_type = serapi_instance.get_hyp_type(relevant_hyp)
    encoded_hyp = self._encode_term(relevant_hyp_type)
    encoded_goal = self._encode_term(in_data.goal)
    # The relevance score is passed to the model as a one-element list.
    stem_distribution = self._run_model(encoded_hyp, [relevance],
                                        encoded_goal)
    first_var = serapi_instance.get_first_var_in_hyp(relevant_hyp)
    return FloatTensor(stem_distribution), first_var
def get_closest_hyp(hyps : List[str], goal : str, max_length : int) -> str:
    """Return the hypothesis whose type scores highest against the goal.

    Both the goal and each hypothesis type (from
    ``serapi_instance.get_hyp_type``) are passed through
    ``limitNumTokens`` with ``max_length`` before being compared with
    ``score_hyp_type``.

    Args:
        hyps: Candidate hypotheses.
        goal: The goal to compare each hypothesis type against.
        max_length: Token limit passed to ``limitNumTokens`` and
            ``score_hyp_type``.

    Returns:
        The highest-scoring hypothesis (the earliest one on ties), or the
        placeholder ``":"`` when ``hyps`` is empty.
    """
    if len(hyps) == 0:
        return ":"

    # The truncated goal does not depend on the hypothesis, so compute it
    # once instead of once per candidate inside the key function.
    truncated_goal = limitNumTokens(goal, max_length)

    def score(hyp: str):
        truncated_hyp_type = limitNumTokens(
            serapi_instance.get_hyp_type(hyp), max_length)
        return score_hyp_type(truncated_goal, truncated_hyp_type, max_length)

    return max(hyps, key=score)
def get_closest_hyps(hyps : List[str], goal : str, num_hyps : int,
                     max_length : int) \
        -> List[Tuple[str, float]]:
    """Return every hypothesis paired with its score, best score first.

    Both the goal and each hypothesis type (from
    ``serapi_instance.get_hyp_type``) are passed through
    ``limitNumTokens`` with ``max_length`` before being compared with
    ``score_hyp_type``.

    Args:
        hyps: Candidate hypotheses.
        goal: The goal to compare each hypothesis type against.
        num_hyps: Number of placeholder entries to return when ``hyps``
            is empty.
        max_length: Token limit passed to ``limitNumTokens`` and
            ``score_hyp_type``.

    Returns:
        ``(hypothesis, score)`` pairs sorted by descending score; equal
        scores keep their input order.  When ``hyps`` is empty, returns
        ``num_hyps`` copies of ``Prediction(":", 0)``.
    """
    if len(hyps) == 0:
        return [Prediction(":", 0)] * num_hyps

    # NOTE(review): num_hyps only sizes the empty fallback above; the
    # non-empty result contains all hypotheses, as before.  Confirm whether
    # callers expect it to be truncated to num_hyps.

    # The truncated goal is the same for every hypothesis; compute it once.
    truncated_goal = limitNumTokens(goal, max_length)
    scored_hyps = [
        (hyp,
         score_hyp_type(
             truncated_goal,
             limitNumTokens(serapi_instance.get_hyp_type(hyp), max_length),
             max_length))
        for hyp in hyps
    ]
    # Rank by the score (element 1).  The previous key used element 0, which
    # ordered the result by hypothesis text rather than by closeness.
    return sorted(scored_hyps, reverse=True,
                  key=lambda hyp_and_score: hyp_and_score[1])
def __call__(self, context: TacticContext) -> List[float]:
    """Compute a single feature: the best goal/hypothesis similarity score.

    Each hypothesis type is cut to its first 100 characters and scored as
    ``SequenceMatcher(None, goal, hyp_type).ratio() * len(hyp_type)``.

    Returns:
        A one-element list holding the best score divided by 100, or
        ``[0.]`` when the context has no hypotheses.
    """
    hypotheses = context.hypotheses
    if len(hypotheses) == 0:
        return [0.]

    hyp_types = [serapi_instance.get_hyp_type(hyp)[:100]
                 for hyp in hypotheses]

    def weighted_similarity(hyp_type):
        # Similarity ratio weighted by the (truncated) hypothesis length.
        matcher = SequenceMatcher(None, context.goal, hyp_type)
        return matcher.ratio() * len(hyp_type)

    best_hyp_score = max(map(weighted_similarity, hyp_types))
    return [best_hyp_score / 100]
def _predictDistribution(self, in_data: TacticContext) -> torch.FloatTensor:
    """Score every hypothesis in the context against the goal.

    Each hypothesis type and the goal are encoded with
    ``self._encode_term``.  The goal encoding is repeated once per
    hypothesis and concatenated to the hypothesis encodings along
    dimension 1 before being fed to ``self._model``.

    Returns:
        Column 1 of the model output, one entry per hypothesis.
        NOTE(review): presumably the "relevant" class score; confirm
        against the model definition.
    """
    hypotheses = in_data.hypotheses
    hyp_terms = list(map(serapi_instance.get_hyp_type, hypotheses))
    encoded_hyps = FloatTensor(list(map(self._encode_term, hyp_terms)))

    # One row of goal encoding, expanded to line up with each hypothesis.
    goal_row = FloatTensor(self._encode_term(in_data.goal)).view(1, -1)
    encoded_goals = goal_row.expand(len(hypotheses), -1)

    model_input = torch.cat((encoded_hyps, encoded_goals), dim=1)
    relevance_predictions = self._model(model_input)
    return relevance_predictions[:, 1]
def predictKTactics(self, in_data : TacticContext, k : int) \
        -> List[Prediction]:
    """Predict ``apply`` tactics for the hypotheses most similar to the goal.

    Hypotheses are ranked by the ``SequenceMatcher`` ratio between their
    type and the goal, most similar first.

    Returns:
        Up to ``k`` predictions of the form ``apply <var>.``, where
        ``<var>`` comes from ``serapi_instance.get_first_var_in_hyp``; the
        prediction at rank ``idx`` is given certainty ``.5 ** idx``.  When
        the context has no hypotheses, returns a single ``eauto``
        prediction with certainty 0.
    """
    hypotheses = in_data.hypotheses
    if len(hypotheses) == 0:
        return [Prediction("eauto", 0)]

    def goal_similarity(hyp):
        hyp_type = serapi_instance.get_hyp_type(hyp)
        return SequenceMatcher(None, hyp_type, in_data.goal).ratio()

    num_predictions = min(k, len(hypotheses))
    ranked_hyps = sorted(hypotheses, key=goal_similarity, reverse=True)
    best_hyps = ranked_hyps[:num_predictions]

    predictions = []
    for idx, hyp in enumerate(best_hyps):
        tactic = "apply " + serapi_instance.get_first_var_in_hyp(hyp) + "."
        predictions.append(Prediction(tactic, .5 ** idx))
    return predictions
def _encode_action(self, context: TacticContext, action: str) \
        -> Tuple[List[int], torch.FloatTensor]:
    """Encode a tactic as ``[stem index, argument kind]`` plus an argument vector.

    The tactic is split into stem and argument.  The argument index from
    ``encode_fpa_arg`` selects one of three encodings:

    * ``0`` - no argument: kind 0, an all-zero vector.
    * ``1..max_length`` - goal-token argument: kind 1, the goal token
      encoder's output at that index followed by zeros in place of the
      premise features.
    * anything larger - premise argument: kind 2, the hypothesis
      encoder's output followed by the premise features of the chosen
      hypothesis or lemma.

    Returns:
        ``([stem_idx, arg_type_idx], encoded_arg)``.
    """
    loader_args = self.dataloader_args
    metadata = self.fpa_metadata

    stem, argument = serapi_instance.split_tactic(action)
    stem_idx = encode_fpa_stem(loader_args, metadata, stem)
    # Premise arguments may refer to either a hypothesis or a relevant lemma.
    all_prems = context.hypotheses + context.relevant_lemmas
    arg_idx = encode_fpa_arg(loader_args, metadata, all_prems,
                             context.goal, argument.strip())
    tokenized_goal = tokenize(loader_args, metadata, context.goal)
    premise_features_size = get_premise_features_size(loader_args, metadata)

    if arg_idx == 0:
        # No arg.
        # NOTE(review): 128 is presumably the width of the encoder outputs
        # used in the other two branches; confirm.
        return [stem_idx, 0], torch.zeros(128 + premise_features_size)

    if arg_idx <= loader_args.max_length:
        # Goal token arg
        goal_token_encodings = self.predictor.goal_token_encoder(
            torch.LongTensor([stem_idx]),
            torch.LongTensor([tokenized_goal])).squeeze(0)
        token_encoding = goal_token_encodings[arg_idx].to(
            device=torch.device("cpu"))
        # Zeros stand in for the premise features.
        encoded_arg = torch.cat(
            (token_encoding, torch.zeros(premise_features_size)), dim=0)
        return [stem_idx, 1], encoded_arg

    # Hyp arg: indices past max_length address entries of all_prems.
    arg_hyp = all_prems[arg_idx - (loader_args.max_length + 1)]
    entire_encoded_goal = self.predictor.entire_goal_encoder(
        torch.LongTensor([tokenized_goal]))
    tokenized_arg_hyp = tokenize(loader_args, metadata,
                                 serapi_instance.get_hyp_type(arg_hyp))
    hyp_encoding = self.predictor.hyp_encoder(
        torch.LongTensor([stem_idx]),
        entire_encoded_goal,
        torch.LongTensor([tokenized_arg_hyp])).to(device=torch.device("cpu"))
    premise_features = torch.FloatTensor(
        get_premise_features(loader_args, metadata, context.goal, arg_hyp))
    encoded_arg = torch.cat((hyp_encoding, premise_features), dim=0)
    return [stem_idx, 2], encoded_arg
def __call__(self, context: TacticContext) -> int:
    """Return the keyword index of the head token of the closest hypothesis.

    The goal and every hypothesis type are first passed through
    ``limitNumTokens`` with ``self.max_length``.  The closest hypothesis
    type maximizes
    ``SequenceMatcher(None, goal, x).ratio() * len(get_symbols(x))``.

    Returns:
        ``self.headKeywords.index(head) + 1`` where ``head`` is the first
        symbol of the closest hypothesis type, or 0 when that symbol is
        not a known keyword or the context has no hypotheses.
    """
    if len(context.hypotheses) == 0:
        return 0

    hyp_types = [
        limitNumTokens(serapi_instance.get_hyp_type(hyp), self.max_length)
        for hyp in context.hypotheses
    ]
    goal = limitNumTokens(context.goal, self.max_length)

    def weighted_similarity(hyp_type):
        # Similarity ratio weighted by the number of symbols in the type.
        similarity = SequenceMatcher(None, goal, hyp_type).ratio()
        return similarity * len(get_symbols(hyp_type))

    closest_hyp_type = max(hyp_types, key=weighted_similarity)
    headToken = get_symbols(closest_hyp_type)[0]
    if headToken not in self.headKeywords:
        return 0
    # Index 0 is reserved for "not a keyword", so known keywords start at 1.
    return self.headKeywords.index(headToken) + 1
def from_data(init_dataset: List[TacticContext],
              args: argparse.Namespace) -> 'TopLevelTokenInBestHyp':
    """Build a ``TopLevelTokenInBestHyp`` from saved or freshly counted keywords.

    If ``args.load_head_keywords`` is set and names an existing path, the
    keyword list is loaded from it with ``torch.load``.  Otherwise the
    keywords are the ``args.num_head_keywords`` most common head tokens
    (first symbol of each hypothesis type) across ``init_dataset``.

    Args:
        init_dataset: Contexts, each unpacked as
            ``(relevant_lemmas, prev_tactics, hyps, goal)``.
        args: Reads ``load_head_keywords``, ``num_head_keywords`` and
            ``print_keywords``.

    Returns:
        The constructed ``TopLevelTokenInBestHyp``.
    """
    if args.load_head_keywords and Path2(args.load_head_keywords).exists():
        # NOTE: torch.load unpickles the file, so this path must only ever
        # point at trusted keyword files.
        head_keywords = torch.load(args.load_head_keywords)
    else:
        # Count head tokens only when they are needed; when keywords are
        # loaded from disk this pass over the dataset would be thrown away.
        headTokenCounts: typing.Counter[str] = Counter()
        for _relevant_lemmas, _prev_tactics, hyps, _goal in init_dataset:
            for hyp in hyps:
                headToken = get_symbols(serapi_instance.get_hyp_type(hyp))[0]
                headTokenCounts[headToken] += 1
        head_keywords = [
            word for word, _count in headTokenCounts.most_common(
                args.num_head_keywords)
        ]

    result = TopLevelTokenInBestHyp(args, head_keywords)
    eprint("Hypothesis head keywords are {}".format(result.headKeywords),
           guard=args.print_keywords)
    return result
def _predictDistributions(self, in_datas : List[TacticContext]) \
        -> torch.FloatTensor:
    """Run the model on a batch of contexts.

    For each context the goal and the type of its closest hypothesis (as
    chosen by ``get_closest_hyp``) are tokenized and passed through
    ``normalizeSentenceLength`` with ``self.training_args.max_length``.
    Word and vector features come from ``self._get_word_features`` and
    ``self._get_vec_features``.

    Returns:
        The output of ``self._model`` for the whole batch.
    """
    assert self._tokenizer
    assert self._embedding
    assert self.training_args
    tokenizer = self._tokenizer
    max_length = self.training_args.max_length

    def normalized_tokens(term):
        return normalizeSentenceLength(tokenizer.toTokenList(term),
                                       max_length)

    goals_batch = [normalized_tokens(goal) for _, _, _, goal in in_datas]

    closest_hyps = [get_closest_hyp(hyps, goal, max_length)
                    for _, _, hyps, goal in in_datas]
    hyp_types = [serapi_instance.get_hyp_type(hyp) for hyp in closest_hyps]
    hyps_batch = [normalized_tokens(hyp_type) for hyp_type in hyp_types]

    word_features_batch = [self._get_word_features(in_data)
                           for in_data in in_datas]
    vec_features_batch = [self._get_vec_features(in_data)
                          for in_data in in_datas]

    return self._model(LongTensor(goals_batch),
                       LongTensor(hyps_batch),
                       FloatTensor(vec_features_batch),
                       LongTensor(word_features_batch))
def get_closest_hyp_type(tokenizer : Tokenizer, max_length : int,
                         context : TacticContext):
    """Tokenize the type of the hypothesis closest to the context's goal.

    The closest hypothesis is chosen by ``get_closest_hyp`` from
    ``context.hypotheses`` and ``context.goal``.  Its type, from
    ``serapi_instance.get_hyp_type``, is converted with
    ``tokenizer.toTokenList``.

    Returns:
        The token list produced by ``tokenizer.toTokenList``.
    """
    closest_hyp = get_closest_hyp(context.hypotheses, context.goal,
                                  max_length)
    closest_hyp_type = serapi_instance.get_hyp_type(closest_hyp)
    return tokenizer.toTokenList(closest_hyp_type)