def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[FeaturesDataset, Tuple[Embedding, List[VecFeature]]]:
    # Preprocess, then strip the scraped outputs down to bare contexts.
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]
    # Fit one feature extractor per constructor on the stripped data.
    self._feature_functions = [feature_constructor(stripped_data, arg_values)
                               for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    return (FeaturesDataset([FeaturesSample(
                self._get_features(strip_scraped_output(scraped)),
                scraped.tactic)
            for scraped in embedded_data]),
            (embedding, self._feature_functions))
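# A minimal sketch of the protocol the code above assumes for
# vec_feature_constructors: each constructor takes the stripped dataset plus
# the argparse Namespace, and returns a callable mapping one context to a
# fixed-length vector of floats. `ToyContext` and `NumHypsFeature` are
# illustrative assumptions, not the project's actual classes.
from argparse import Namespace
from typing import List, NamedTuple, Sequence

class ToyContext(NamedTuple):
    prev_tactics: List[str]
    hypotheses: List[str]
    goal: str

class NumHypsFeature:
    """Hypothetical VecFeature: a one-element vector holding the hypothesis count."""
    def __init__(self, dataset: Sequence[ToyContext], arg_values: Namespace) -> None:
        # Fit step: real features may scan `dataset` to build vocabularies or
        # normalization constants; this toy feature needs no state.
        pass

    def __call__(self, context: ToyContext) -> List[float]:
        return [float(len(context.hypotheses))]

ctx = ToyContext([], ["H : even n", "IH : P n"], "P (S n)")
print(NumHypsFeature([], Namespace())(ctx))  # [2.0]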
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[EncFeaturesDataset,
             Tuple[Tokenizer, Embedding, List[VecFeature], List[WordFeature]]]:
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]
    # Fit the vector-valued and word-valued feature extractors on the data.
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    # Pair each embedded datapoint with its tokenized goal, padded or
    # truncated to a fixed length.
    result_data = EncFeaturesDataset([
        EncFeaturesSample(
            self._get_vec_features(TacticContext(prev_tactics, hypotheses, goal)),
            self._get_word_features(TacticContext(prev_tactics, hypotheses, goal)),
            normalizeSentenceLength(tokenized_goal, arg_values.max_length),
            tactic)
        for (prev_tactics, hypotheses, goal, tactic), tokenized_goal
        in zip(embedded_data, tokenized_goals)])
    return result_data, (tokenizer, embedding,
                         self._vec_feature_functions,
                         self._word_feature_functions)
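# `normalizeSentenceLength` is used above but not defined in this excerpt.
# A minimal sketch of the usual pad-or-truncate behavior, assuming sentences
# are lists of int token ids and that a dedicated padding id exists (the
# PAD_ID constant below is an assumption, not the project's actual value):
from typing import List

PAD_ID = 0  # assumed padding token id

def normalize_sentence_length(sentence: List[int], max_length: int) -> List[int]:
    if len(sentence) > max_length:
        return sentence[:max_length]  # truncate long goals
    return sentence + [PAD_ID] * (max_length - len(sentence))  # pad short ones

assert normalize_sentence_length([3, 1, 4], 5) == [3, 1, 4, 0, 0]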
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[CopyArgDataset,
             Tuple[Tokenizer, Embedding, List[WordFeature], List[VecFeature]]]:
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    # Numbered-goal induction tactics should have been filtered out upstream.
    for datum in preprocessed_data:
        assert not re.match(r"induction\s+\d+\.", datum.tactic)
    stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        arg_idxs = pool.imap(functools.partial(get_arg_idx,
                                               arg_values.max_length),
                             preprocessed_data)
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = CopyArgDataset(list(pool.imap(
            functools.partial(mkCopySample, arg_values.max_length,
                              self._word_feature_functions,
                              self._vec_feature_functions),
            zip(embedded_data, tokenized_goals, arg_idxs))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)
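# The pool.imap/functools.partial pattern above requires the worker to be a
# picklable module-level function whose leading arguments are pre-bound and
# whose final argument is one zipped work item. A self-contained sketch
# (`scale_pair` is illustrative, not a project function):
import functools
import multiprocessing
from typing import Tuple

def scale_pair(factor: int, item: Tuple[int, int]) -> int:
    # `factor` is pre-bound via functools.partial; `item` is one work item.
    left, right = item
    return factor * (left + right)

if __name__ == "__main__":
    with multiprocessing.Pool(2) as pool:
        results = list(pool.imap(functools.partial(scale_pair, 10),
                                 zip([1, 2, 3], [4, 5, 6])))
    print(results)  # [50, 70, 90]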
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
    -> Tuple[HypFeaturesDataset,
             Tuple[Tokenizer, Embedding, List[WordFeature], List[VecFeature]]]:
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    start = time.time()
    print("Stripping...", end="")
    sys.stdout.flush()
    stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]
    print("{:.2f}s".format(time.time() - start))
    self._word_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in word_feature_constructors]
    self._vec_feature_functions = [
        feature_constructor(stripped_data, arg_values)  # type: ignore
        for feature_constructor in vec_feature_constructors]
    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
    with multiprocessing.Pool(arg_values.num_threads) as pool:
        # For each datapoint, tokenize the hypothesis closest to the goal,
        # in parallel.
        start = time.time()
        print("Getting closest hyps...", end="")
        sys.stdout.flush()
        tokenized_hyps = list(pool.imap(
            functools.partial(get_closest_hyp_type, tokenizer,
                              arg_values.max_length),
            preprocessed_data))
        print("{:.2f}s".format(time.time() - start))
        start = time.time()
        print("Creating dataset...", end="")
        sys.stdout.flush()
        result_data = HypFeaturesDataset(list(pool.imap(
            functools.partial(mkHFSample, arg_values.max_length,
                              self._word_feature_functions,
                              self._vec_feature_functions),
            zip(embedded_data, tokenized_goals, tokenized_hyps))))
        print("{:.2f}s".format(time.time() - start))
    return result_data, (tokenizer, embedding,
                         self._word_feature_functions,
                         self._vec_feature_functions)
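# `get_closest_hyp_type` is not shown in this excerpt. A sketch of one
# plausible "closest hypothesis" heuristic, scoring each hypothesis by token
# overlap with the goal; the real project may use a different distance, so
# treat every detail below as an assumption:
from typing import List

def closest_hyp(hypotheses: List[str], goal: str) -> str:
    # Pick the hypothesis sharing the most whitespace-separated tokens
    # with the goal.
    goal_tokens = set(goal.split())
    return max(hypotheses,
               key=lambda hyp: len(set(hyp.split()) & goal_tokens))

print(closest_hyp(["H1 : n = m", "H2 : even n"], "even n"))  # H2 : even n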
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format", choices=["terms", "goals", "hyps+goal",
                                           "hyps+goal+tactic", "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer", choices=list(tokenizers.keys()),
                        type=str, default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length",
                        default=None, type=int)
    parser.add_argument("--lineend", dest="lineend", default=False,
                        const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # Tokenize raw terms, optionally truncating/padding to max_length,
        # and print them one per line.
        terms, tokenizer = data.term_data(
            data.RawDataset(list(itertools.islice(
                data.read_text_data(arg_values.scrape_file),
                arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term, arg_values.max_length)
                     for term in terms]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token, term))),
                end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        # Emit one CSV row per datapoint: word features, vec features, then
        # the encoded tactic stem.
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                   for datum in dataset]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor in features.word_feature_constructors]
        vec_feature_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[feature_val for feature in vec_feature_functions
                         for feature_val in feature(c)]
                        for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(list(map(str, word_feat)) +
                           list(map(str, vec_feat)) +
                           [str(tactic)]))
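# A minimal wrapper showing how get_data can be driven from the shell; this
# entry point is an assumption (the real project may dispatch through a
# different script), and the file path and flags below are illustrative:
#
#     python -m dump_data goals data/scrape.txt --context-filter default
#     python -m dump_data tacvector data/scrape.txt --verbose
if __name__ == "__main__":
    import sys
    get_data(sys.argv[1:])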