def _encode_data(self, data : RawDataset, args : Namespace) \
    -> Tuple[DatasetType, TokenizerEmbeddingState]:
    """Preprocess, stem, embed, and tokenize a raw dataset.

    Returns the model-specific encoded dataset (via
    ``self._encode_tokenized_data``) together with the
    ``TokenizerEmbeddingState`` (tokenizer + tactic embedding) needed to
    decode/encode further data the same way.

    The tokenizer is either unpickled from ``args.load_tokens`` or trained
    by relevance-based keyword selection on a random sample of the data;
    it can be pickled to ``args.save_tokens`` afterwards.

    NOTE(review): the original file's indentation was lost; the nesting
    below (keeping everything that may consume the lazy ``imap`` stream
    inside the pool's ``with`` block) is reconstructed — confirm against
    upstream history.
    """
    preprocessed_data = self._preprocess_data(data, args)
    embedding = SimpleEmbedding()
    embedded_data: EmbeddedDataset
    with multiprocessing.Pool(args.num_threads) as pool:
        # Stem tactics in parallel; imap is lazy, so the stream must be
        # consumed before the pool is torn down.
        stemmed_data = pool.imap(stemmify_data, preprocessed_data,
                                 chunksize=10240)
        # Lazily wrap each stemmed sample, encoding its tactic stem to an
        # integer token as it is pulled through.
        lazy_embedded_data = LazyEmbeddedDataset(
            (EmbeddedSample(prev_tactics, hypotheses, goal,
                            embedding.encode_token(tactic))
             for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
        if args.load_tokens:
            # Reuse a previously trained tokenizer from disk.
            print("Loading tokens from {}".format(args.load_tokens))
            with open(args.load_tokens, 'rb') as f:
                tokenizer = pickle.load(f)
                assert isinstance(tokenizer, Tokenizer)
            # No keyword picking needed, so the data can stay lazy.
            embedded_data = lazy_embedded_data
        else:
            # Force the embedded data for picking keywords
            forced_embedded_data = StrictEmbeddedDataset(
                list(lazy_embedded_data.data))
            # Train the tokenizer on a random subsample only, to keep
            # keyword relevance scoring tractable.
            subset = StrictEmbeddedDataset(
                random.sample(forced_embedded_data,
                              args.num_relevance_samples))
            embedded_data = forced_embedded_data
            start = time.time()
            print("Picking tokens...", end="")
            sys.stdout.flush()
            tokenizer = make_keyword_tokenizer_relevance(
                [(goal, next_tactic)
                 for prev_tactics, hypotheses, goal, next_tactic in subset],
                tokenizers[args.tokenizer], args.num_keywords,
                TOKEN_START, args.num_threads)
            # Free the sample before the (potentially large) tokenize pass.
            del subset
            print("{}s".format(time.time() - start))
        if args.save_tokens:
            # Persist the tokenizer so later runs can pass --load-tokens.
            print("Saving tokens to {}".format(args.save_tokens))
            assert isinstance(tokenizer, Tokenizer)
            with open(args.save_tokens, 'wb') as f:
                pickle.dump(tokenizer, f)
        if args.print_keywords:
            print("Keywords are {}".format(tokenizer.listTokens()))
        print("Tokenizing...")
        # Consumes embedded_data; must happen while the pool is alive in
        # the lazy (--load-tokens) case.
        tokenized_data = tokenize_data(tokenizer, embedded_data,
                                       args.num_threads)
        # Drop the large intermediate datasets before the final encode.
        gc.collect()
        return self._encode_tokenized_data(tokenized_data, args, tokenizer,
                                           embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
def embed_data(data : RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    """Eagerly embed a raw dataset, encoding each tactic's stem.

    Builds a fresh ``SimpleEmbedding`` and maps every sample's tactic stem
    to an integer token, printing the elapsed wall-clock time. Returns the
    embedding alongside the fully materialized dataset.
    """
    embedding = SimpleEmbedding()
    begin = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    samples = []
    for prev_tactics, hypotheses, goal, tactic in data:
        token = embedding.encode_token(get_stem(tactic))
        samples.append(EmbeddedSample(prev_tactics, hypotheses, goal, token))
    dataset = StrictEmbeddedDataset(samples)
    print("{:.2f}s".format(time.time() - begin))
    return embedding, dataset