def filter_data(data: Iterable[ScrapedTactic], pair_filter: ContextFilter,
                arg_values: Namespace) -> Iterable[ScrapedTactic]:
    return (scraped for (scraped, next_scraped) in
            zip(data,
                itertools.chain(itertools.islice(data, 1, None),
                                [ScrapedTactic([], [], ProofContext.empty(), "")]))
            if pair_filter(strip_scraped_output(scraped), scraped.tactic,
                           strip_scraped_output(next_scraped), arg_values))
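# filter_data pairs every data point with its successor by zipping the data
# against an offset copy of itself, padding the end with an empty
# ScrapedTactic so the last point still has a "next" context.  Note that
# `data` is iterated twice, so it must be a re-iterable sequence (e.g. a
# list), not a one-shot generator.  A minimal sketch of the same pairing
# pattern on plain strings (the helper name below is illustrative only):
def pairs_with_next(items, sentinel):
    """Yield (item, next_item) pairs, using `sentinel` after the last item."""
    return zip(items, itertools.chain(itertools.islice(items, 1, None),
                                      [sentinel]))

# Example: list(pairs_with_next(["intro.", "auto."], "")) ==
#          [("intro.", "auto."), ("auto.", "")]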
def filter_eval_data(data: Iterable[StateScore], pair_filter: ContextFilter,
                     arg_values: Namespace) -> Iterable[StateScore]:
    return (point for (point, next_point) in
            zip(data,
                itertools.chain(
                    itertools.islice(data, 1, None),
                    [StateScore(ScrapedTactic([], [], ProofContext.empty(), ""), 0)]))
            if pair_filter(strip_scraped_output(point.state), point.state.tactic,
                           strip_scraped_output(next_point.state), arg_values))
def _encode_data(self, data: RawDataset, arg_values: Namespace) \
        -> Tuple[FeaturesDataset, Tuple[Embedding, List[VecFeature]]]:
    stripped_data = [strip_scraped_output(dat) for dat in data]
    self._feature_functions = [feature_constructor(stripped_data, arg_values)
                               for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(data)
    return (FeaturesDataset([
        FeaturesSample(self._get_features(strip_scraped_output(scraped)),
                       scraped.tactic)
        for scraped in embedded_data
    ]), (embedding, self._feature_functions))
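# `self._get_features` is not shown in this excerpt.  Judging from the
# tacvector branch of get_data further down, it presumably applies each
# constructed vec-feature function to a tactic context and flattens the
# resulting float lists into one vector.  A hypothetical sketch, not the
# project's actual definition:
def _get_features_sketch(feature_functions, context):
    """Concatenate the outputs of all vec-feature functions on one context."""
    return [feature_val
            for feature in feature_functions
            for feature_val in feature(context)]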
def make_predictions(num_predictions: int,
                     tactic_interactions: List[ScrapedTactic]) -> \
        Tuple[Iterable[Tuple[ScrapedTactic, List[Prediction]]], float]:
    if len(tactic_interactions) == 0:
        return [], 0
    chunk_size = args.chunk_size
    total_loss = 0.
    inputs = [strip_scraped_output(tactic_interaction)
              for tactic_interaction in tactic_interactions]
    corrects = [tactic_interaction.tactic
                for tactic_interaction in tactic_interactions]
    predictions: List[List[Prediction]] = []
    # Predict in fixed-size chunks; note that the number of predictions per
    # context is taken from args.num_predictions, not the num_predictions
    # parameter.
    for inputs_chunk, corrects_chunk in zip(chunks(inputs, chunk_size),
                                            chunks(corrects, chunk_size)):
        predictions_chunk, loss = predictor.predictKTacticsWithLoss_batch(
            inputs_chunk, args.num_predictions, corrects_chunk)
        predictions += predictions_chunk
        total_loss += loss
    del inputs
    del corrects
    return list(zip(tactic_interactions, predictions)), \
        total_loss / math.ceil(len(tactic_interactions) / chunk_size)
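# `chunks` is assumed to split a list into consecutive fixed-size pieces so
# that predictKTacticsWithLoss_batch sees at most chunk_size contexts at a
# time; the loss is then averaged over ceil(len / chunk_size) batches.  A
# minimal sketch of such a helper (the project's real helper may differ,
# e.g. it could be a generator):
def chunks_sketch(items, chunk_size):
    """Split `items` into consecutive slices of at most `chunk_size` elements."""
    return [items[i:i + chunk_size] for i in range(0, len(items), chunk_size)]

# Example: chunks_sketch([1, 2, 3, 4, 5], 2) == [[1, 2], [3, 4], [5]]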
def _encode_data(self, data: RawDataset, arg_values: Namespace) \
        -> Tuple[CopyArgDataset, Tuple[Tokenizer, Embedding,
                                       List[WordFeature], List[VecFeature]]]:
    # Sanity check: no tactic should still use a bare numeric induction argument.
    for datum in data:
        assert not re.match(r"induction\s+\d+\.", datum.tactic)
    stripped_data = [strip_scraped_output(dat) for dat in data]
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(data)
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        arg_idxs = pool.imap(
            functools.partial(get_arg_idx, arg_values.max_length), data)
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = CopyArgDataset(list(pool.imap(
            functools.partial(mkCopySample, arg_values.max_length,
                              self._word_feature_functions,
                              self._vec_feature_functions),
            zip(embedded_data, tokenized_goals, arg_idxs))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)
def _encode_data(self, data: RawDataset, arg_values: Namespace) \
        -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                           List[VecFeature], List[WordFeature]]]:
    stripped_data = [strip_scraped_output(dat) for dat in data]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    embedding, embedded_data = embed_data(data)
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    result_data = EncFeaturesDataset([
        EncFeaturesSample(
            self._get_vec_features(
                TacticContext([], prev_tactics, hypotheses, goal)),
            self._get_word_features(
                TacticContext([], prev_tactics, hypotheses, goal)),
            normalizeSentenceLength(tokenized_goal, arg_values.max_length),
            tactic)
        for (relevant_lemmas, prev_tactics, hypotheses, goal, tactic),
            tokenized_goal in zip(embedded_data, tokenized_goals)])
    return result_data, (tokenizer, embedding,
                         self._vec_feature_functions,
                         self._word_feature_functions)
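# `normalizeSentenceLength` is used above to force every tokenized goal to
# exactly arg_values.max_length tokens.  A hypothetical sketch of such a
# pad-or-truncate helper, assuming padding with a 0 token (the real
# implementation may use a dedicated pad or EOS token):
def normalize_sentence_length_sketch(sentence, target_length, pad_token=0):
    """Truncate or right-pad `sentence` to exactly `target_length` tokens."""
    if len(sentence) > target_length:
        return sentence[:target_length]
    return sentence + [pad_token] * (target_length - len(sentence))

# Example: normalize_sentence_length_sketch([5, 3, 7], 5) == [5, 3, 7, 0, 0]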
def get_should_filter(data: MixedDataset) \
        -> Iterable[Tuple[ScrapedCommand, bool]]:
    list_data: List[ScrapedCommand] = list(data)
    extended_list: List[Optional[ScrapedCommand]] = \
        cast(List[Optional[ScrapedCommand]], list_data[1:]) + [None]
    for point, nextpoint in zip(list_data, extended_list):
        if isinstance(point, ScrapedTactic) \
                and not re.match(r"\s*[{}]\s*", point.tactic) \
                and point.context.focused_goal.strip() != "":
            if isinstance(nextpoint, ScrapedTactic):
                context_after = strip_scraped_output(nextpoint)
            else:
                context_after = TacticContext([], [], [], "")
            should_filter = not context_filter(strip_scraped_output(point),
                                               point.tactic,
                                               context_after,
                                               training_args)
            yield (point, should_filter)
        else:
            yield (point, True)
def _encode_data(self, data: RawDataset, arg_values: Namespace) \
        -> Tuple[HypFeaturesDataset, Tuple[Tokenizer, Embedding,
                                           List[WordFeature], List[VecFeature]]]:
    start = time.time()
    print("Stripping...", end="")
    sys.stdout.flush()
    stripped_data = [strip_scraped_output(dat) for dat in data]
    print("{:.2f}s".format(time.time() - start))
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(data)
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        start = time.time()
        print("Getting closest hyps...", end="")
        sys.stdout.flush()
        tokenized_hyps = list(pool.imap(
            functools.partial(get_closest_hyp_type, tokenizer,
                              arg_values.max_length),
            data))
        print("{:.2f}s".format(time.time() - start))
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = HypFeaturesDataset(list(pool.imap(
            functools.partial(mkHFSample, arg_values.max_length,
                              self._word_feature_functions,
                              self._vec_feature_functions),
            zip(embedded_data, tokenized_goals, tokenized_hyps))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=["terms", "goals", "hyps+goal",
                                 "hyps+goal+tactic", "tacvector",
                                 "scrapefile-rd", "scrapefile"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer", choices=list(tokenizers.keys()),
                        type=str, default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons", dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length", dest="max_length",
                        default=30, type=int)
    parser.add_argument("--lineend", dest="lineend",
                        default=False, const=True, action='store_const')
    # Register -j/--num-threads only once; a second registration would make
    # argparse raise an error for the duplicate option strings.
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions", action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args", action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(itertools.islice(
                    data.read_text_data(arg_values.scrape_file),
                    arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token, term))),
                end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                       for datum in dataset]
            stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data, arg_values)  # type: ignore
                for word_feature_constructor in features.word_feature_constructors]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values)
                for vec_feature_constructor in features.vec_feature_constructors]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[feature(c) for feature in word_feature_functions]
                             for c in stripped_data]
            vec_features = [[feature_val for feature in vec_features_functions
                             for feature_val in feature(c)]
                            for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(list(map(str, word_feat)) +
                               list(map(str, vec_feat)) +
                               [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(json.dumps({"relevant_lemmas": point.relevant_lemmas,
                                  "prev_tactics": point.prev_tactics,
                                  "context": {
                                      "fg_goals": [{"hypotheses": point.hypotheses,
                                                    "goal": point.goal}],
                                      "bg_goals": [],
                                      "shelved_goals": [],
                                      "given_up_goals": []},
                                  "tactic": point.tactic}))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(json.dumps({"relevant_lemmas": point.relevant_lemmas,
                                  "prev_tactics": point.prev_tactics,
                                  "prev_hyps": point.hypotheses,
                                  "prev_goal": point.goal,
                                  "tactic": point.tactic}))