Example #1
import gc
import multiprocessing
import pickle
import random
import sys
import time
from argparse import Namespace
from typing import Tuple

# Project-specific names used below (RawDataset, DatasetType, Embedding,
# TokenizerEmbeddingState, SimpleEmbedding, EmbeddedDataset, EmbeddedSample,
# LazyEmbeddedDataset, StrictEmbeddedDataset, Tokenizer, tokenizers,
# TOKEN_START, stemmify_data, tokenize_data, make_keyword_tokenizer_relevance,
# get_stem) are assumed to be importable from the surrounding codebase.

    # Method of the class this example is excerpted from.
    def _encode_data(self, data: RawDataset, args: Namespace) \
            -> Tuple[DatasetType, TokenizerEmbeddingState]:
        preprocessed_data = self._preprocess_data(data, args)
        embedding = SimpleEmbedding()
        embedded_data: EmbeddedDataset
        with multiprocessing.Pool(args.num_threads) as pool:
            # Stem each tactic in parallel; imap keeps the stream lazy.
            stemmed_data = pool.imap(stemmify_data,
                                     preprocessed_data,
                                     chunksize=10240)
            # Encode each tactic stem through the embedding as samples
            # stream by.
            lazy_embedded_data = LazyEmbeddedDataset(
                (EmbeddedSample(prev_tactics, hypotheses, goal,
                                embedding.encode_token(tactic))
                 for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
            if args.load_tokens:
                # Reuse a previously pickled tokenizer instead of building one.
                print("Loading tokens from {}".format(args.load_tokens))
                with open(args.load_tokens, 'rb') as f:
                    tokenizer = pickle.load(f)
                    assert isinstance(tokenizer, Tokenizer)
                embedded_data = lazy_embedded_data
            else:
                # Force the lazy embedded data so we can sample it for
                # keyword picking.
                forced_embedded_data = StrictEmbeddedDataset(
                    list(lazy_embedded_data.data))
                # Cap the sample size at the dataset size so random.sample
                # can't raise a ValueError on small datasets.
                subset = StrictEmbeddedDataset(
                    random.sample(forced_embedded_data,
                                  min(args.num_relevance_samples,
                                      len(forced_embedded_data))))
                embedded_data = forced_embedded_data
                start = time.time()
                print("Picking tokens...", end="")
                sys.stdout.flush()
                tokenizer = make_keyword_tokenizer_relevance(
                    [(goal, next_tactic)
                     for prev_tactics, hypotheses, goal, next_tactic in subset],
                    tokenizers[args.tokenizer], args.num_keywords,
                    TOKEN_START, args.num_threads)
                del subset
                print("{:.2f}s".format(time.time() - start))
            if args.save_tokens:
                print("Saving tokens to {}".format(args.save_tokens))
                assert isinstance(tokenizer, Tokenizer)
                with open(args.save_tokens, 'wb') as f:
                    pickle.dump(tokenizer, f)
            if args.print_keywords:
                print("Keywords are {}".format(tokenizer.listTokens()))

            print("Tokenizing...")
            tokenized_data = tokenize_data(tokenizer, embedded_data,
                                           args.num_threads)
            gc.collect()

        return self._encode_tokenized_data(tokenized_data, args, tokenizer, embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
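
For reference, _encode_data drives everything off attributes of args. The sketch below shows an argparse setup that would supply them; the attribute names are taken from the reads above, while the flag spellings, defaults, and the helper name add_encoding_args are illustrative assumptions.

import argparse

def add_encoding_args(parser: argparse.ArgumentParser) -> None:
    # Hypothetical helper; defaults here are placeholders, not the
    # project's real defaults.
    parser.add_argument("--num-threads", type=int, default=4)
    parser.add_argument("--load-tokens", default=None)
    parser.add_argument("--save-tokens", default=None)
    parser.add_argument("--print-keywords", action="store_true")
    parser.add_argument("--num-relevance-samples", type=int, default=1000)
    parser.add_argument("--tokenizer", default="chars")  # key into the tokenizers dict
    parser.add_argument("--num-keywords", type=int, default=100)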

def embed_data(data: RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    dataset = StrictEmbeddedDataset(
        [EmbeddedSample(prev_tactics, hypotheses, goal,
                        embedding.encode_token(get_stem(tactic)))
         for prev_tactics, hypotheses, goal, tactic in data])
    print("{:.2f}s".format(time.time() - start))
    return embedding, dataset
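
A minimal usage sketch for embed_data, assuming RawDataset can be built from a list of (prev_tactics, hypotheses, goal, tactic) tuples as the unpacking above implies; the sample data is invented for illustration.

if __name__ == "__main__":
    # Toy data; the tuple shape mirrors the comprehension in embed_data.
    raw = RawDataset([
        (["intro."], ["H : P"], "P", "exact H."),
        ([], [], "forall x : nat, x = x", "intro x."),
    ])
    embedding, dataset = embed_data(raw)
    # StrictEmbeddedDataset is iterated and unpacked this way in
    # _encode_data above, so this relies only on behavior shown there.
    for prev_tactics, hypotheses, goal, encoded_tactic in dataset:
        print(goal, "->", encoded_tactic)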