def tokenize_goals(data : StrictEmbeddedDataset, args : Namespace) \
    -> Tuple[Tokenizer, List[Sentence]]:
    if args.load_tokens and Path2(args.load_tokens).exists():
        # Reuse a previously pickled tokenizer instead of picking keywords.
        print("Loading tokens from {}".format(args.load_tokens))
        with open(args.load_tokens, 'rb') as f:
            tokenizer = pickle.load(f)
            assert isinstance(tokenizer, Tokenizer)
    else:
        # Pick keywords by relevance on a random subset of the data.
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        subset : Sequence[EmbeddedSample]
        if args.num_relevance_samples > len(data):
            subset = data
        else:
            subset = random.sample(data, args.num_relevance_samples)
        tokenizer = make_keyword_tokenizer_relevance(
            [(goal, next_tactic) for prev_tactics, hypotheses, goal, next_tactic
             in subset],
            tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
            args.num_threads)
        print("{}s".format(time.time() - start))
    if args.save_tokens:
        print("Saving tokens to {}".format(args.save_tokens))
        with open(args.save_tokens, 'wb') as f:
            pickle.dump(tokenizer, f)
    if args.print_keywords:
        print("Keywords are {}".format(tokenizer.listTokens()))
    start = time.time()
    print("Tokenizing...", end="")
    sys.stdout.flush()
    tokenized_data = tokenize_data(tokenizer, data, args.num_threads)
    print("{:.2f}s".format(time.time() - start))
    return tokenizer, [goal for prev_tactics, hypotheses, goal, tactic
                       in tokenized_data]
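# Illustrative usage sketch, not part of the original module: drives
# tokenize_goals from a hand-built argparse Namespace carrying the fields the
# function reads. The helper name and the field values are placeholders; in
# particular, the tokenizer value must be one of the keys of the module's
# tokenizers dict.
def _example_tokenize_goals(dataset : StrictEmbeddedDataset) \
    -> Tuple[Tokenizer, List[Sentence]]:
    example_args = Namespace(load_tokens=None, save_tokens=None,
                             print_keywords=True, num_relevance_samples=1000,
                             tokenizer="chars-fallback", num_keywords=100,
                             num_threads=4)
    return tokenize_goals(dataset, example_args)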
def _encode_data(self, data : RawDataset, args : Namespace) \
    -> Tuple[DatasetType, TokenizerEmbeddingState]:
    preprocessed_data = self._preprocess_data(data, args)
    embedding = SimpleEmbedding()
    embedded_data: EmbeddedDataset
    with multiprocessing.Pool(args.num_threads) as pool:
        stemmed_data = pool.imap(stemmify_data, preprocessed_data,
                                 chunksize=10240)
        # Encode tactic stems lazily so the whole dataset isn't held in memory
        # unless keyword picking below forces it.
        lazy_embedded_data = LazyEmbeddedDataset(
            (EmbeddedSample(prev_tactics, hypotheses, goal,
                            embedding.encode_token(tactic))
             for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
        if args.load_tokens:
            print("Loading tokens from {}".format(args.load_tokens))
            with open(args.load_tokens, 'rb') as f:
                tokenizer = pickle.load(f)
                assert isinstance(tokenizer, Tokenizer)
            embedded_data = lazy_embedded_data
        else:
            # Force the embedded data for picking keywords
            forced_embedded_data = StrictEmbeddedDataset(
                list(lazy_embedded_data.data))
            subset = StrictEmbeddedDataset(
                random.sample(forced_embedded_data,
                              args.num_relevance_samples))
            embedded_data = forced_embedded_data
            start = time.time()
            print("Picking tokens...", end="")
            sys.stdout.flush()
            tokenizer = make_keyword_tokenizer_relevance(
                [(goal, next_tactic)
                 for prev_tactics, hypotheses, goal, next_tactic in subset],
                tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
                args.num_threads)
            del subset
            print("{}s".format(time.time() - start))
        if args.save_tokens:
            print("Saving tokens to {}".format(args.save_tokens))
            assert isinstance(tokenizer, Tokenizer)
            with open(args.save_tokens, 'wb') as f:
                pickle.dump(tokenizer, f)
        if args.print_keywords:
            print("Keywords are {}".format(tokenizer.listTokens()))
        print("Tokenizing...")
        tokenized_data = tokenize_data(tokenizer, embedded_data,
                                       args.num_threads)
        gc.collect()
        return self._encode_tokenized_data(tokenized_data, args, tokenizer,
                                           embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
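# Illustrative sketch, not part of the original module: one way a caller might
# persist the TokenizerEmbeddingState returned by _encode_data so the same
# tokenizer and tactic embedding can be restored at prediction time. The helper
# name and default path are hypothetical.
def _save_token_embedding_state(state : TokenizerEmbeddingState,
                                path : str = "token_embedding_state.pickle") \
    -> None:
    with open(path, 'wb') as f:
        pickle.dump(state, f)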