Code example #1
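This helper builds (or reloads) a keyword tokenizer and then tokenizes every goal in the dataset. The standard-library imports below are assumed by the excerpt; project-specific names (Tokenizer, Sentence, StrictEmbeddedDataset, EmbeddedSample, tokenizers, make_keyword_tokenizer_relevance, TOKEN_START, tokenize_data, Path2) are defined elsewhere in the codebase.

import pickle
import random
import sys
import time
from argparse import Namespace
from typing import List, Sequence, Tuple
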
def tokenize_goals(data : StrictEmbeddedDataset, args : Namespace) \
    -> Tuple[Tokenizer, List[Sentence]]:
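    # Load a pickled Tokenizer if one was supplied on the command line;
    # otherwise pick keywords from a sample of (goal, tactic) pairs.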
    if args.load_tokens and Path2(args.load_tokens).exists():
        print("Loading tokens from {}".format(args.load_tokens))
        with open(args.load_tokens, 'rb') as f:
            tokenizer = pickle.load(f)
            assert isinstance(tokenizer, Tokenizer)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
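        # Pick keywords from at most num_relevance_samples (goal, tactic) pairs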
        subset : Sequence[EmbeddedSample]
        if args.num_relevance_samples > len(data):
            subset = data
        else:
            subset = random.sample(data, args.num_relevance_samples)
        tokenizer = make_keyword_tokenizer_relevance(
            [(goal, next_tactic) for
             prev_tactics, hypotheses, goal, next_tactic in subset],
            tokenizers[args.tokenizer], args.num_keywords, TOKEN_START, args.num_threads)
        print("{}s".format(time.time() - start))
    if args.save_tokens:
        print("Saving tokens to {}".format(args.save_tokens))
        with open(args.save_tokens, 'wb') as f:
            pickle.dump(tokenizer, f)
    if args.print_keywords:
        print("Keywords are {}".format(tokenizer.listTokens()))
    start = time.time()
    print("Tokenizing...", end="")
    sys.stdout.flush()
    tokenized_data = tokenize_data(tokenizer, data, args.num_threads)
    print("{:.2f}s".format(time.time() - start))
    return tokenizer, [goal for prev_tactics, hypotheses, goal, tactic in tokenized_data]
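
A minimal usage sketch, assuming `embedded_dataset` is a StrictEmbeddedDataset built elsewhere; the flag values and the "chars-fallback" tokenizer name are illustrative assumptions, not the project's defaults:

args = Namespace(load_tokens=None,           # or a path to a previously pickled Tokenizer
                 save_tokens="tokens.pickle",
                 print_keywords=False,
                 tokenizer="chars-fallback",  # assumed key into the `tokenizers` table
                 num_keywords=100,
                 num_relevance_samples=1000,
                 num_threads=4)
tokenizer, tokenized_goals = tokenize_goals(embedded_dataset, args)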
Code example #2
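This method (note the `self` parameter; the enclosing class is not shown in the excerpt) stems and embeds the raw samples, builds or loads a Tokenizer, and tokenizes the goals. The standard-library imports below are assumed by the excerpt; project-specific names (RawDataset, DatasetType, TokenizerEmbeddingState, SimpleEmbedding, EmbeddedDataset, LazyEmbeddedDataset, StrictEmbeddedDataset, EmbeddedSample, stemmify_data, Tokenizer, tokenizers, make_keyword_tokenizer_relevance, TOKEN_START, tokenize_data) are defined elsewhere in the codebase.

import gc
import multiprocessing
import pickle
import random
import sys
import time
from argparse import Namespace
from typing import Tuple
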
    def _encode_data(self, data : RawDataset, args : Namespace) \
        -> Tuple[DatasetType, TokenizerEmbeddingState]:
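        # Preprocess, stem, and embed the raw samples, then build or load a
        # tokenizer before tokenizing the goals.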
        preprocessed_data = self._preprocess_data(data, args)
        embedding = SimpleEmbedding()
        embedded_data: EmbeddedDataset
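        # Stem tactics in worker processes; results stream back lazily via imap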
        with multiprocessing.Pool(args.num_threads) as pool:
            stemmed_data = pool.imap(stemmify_data,
                                     preprocessed_data,
                                     chunksize=10240)
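            # Embed tactics on the fly as the stemmed samples stream out of the pool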
            lazy_embedded_data = LazyEmbeddedDataset(
                (EmbeddedSample(prev_tactics, hypotheses, goal,
                                embedding.encode_token(tactic))
                 for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
            if args.load_tokens:
                print("Loading tokens from {}".format(args.load_tokens))
                with open(args.load_tokens, 'rb') as f:
                    tokenizer = pickle.load(f)
                    assert isinstance(tokenizer, Tokenizer)
                embedded_data = lazy_embedded_data
            else:
                # Force the embedded data for picking keywords
                forced_embedded_data = StrictEmbeddedDataset(
                    list(lazy_embedded_data.data))
                # Cap the sample size so random.sample cannot fail on small datasets
                subset = StrictEmbeddedDataset(
                    random.sample(forced_embedded_data,
                                  min(args.num_relevance_samples,
                                      len(forced_embedded_data))))
                embedded_data = forced_embedded_data
                start = time.time()
                print("Picking tokens...", end="")
                sys.stdout.flush()
                tokenizer = make_keyword_tokenizer_relevance(
                    [(goal, next_tactic)
                     for prev_tactics, hypotheses, goal, next_tactic in subset],
                    tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
                    args.num_threads)
                del subset
                print("{}s".format(time.time() - start))
            if args.save_tokens:
                print("Saving tokens to {}".format(args.save_tokens))
                assert isinstance(tokenizer, Tokenizer)
                with open(args.save_tokens, 'wb') as f:
                    pickle.dump(tokenizer, f)
            if args.print_keywords:
                print("Keywords are {}".format(tokenizer.listTokens()))

            print("Tokenizing...")
            tokenized_data = tokenize_data(tokenizer, embedded_data,
                                           args.num_threads)
            gc.collect()

        return self._encode_tokenized_data(tokenized_data, args, tokenizer, embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
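
Note the two paths in _encode_data: when a saved tokenizer is loaded, the embedded samples stay lazy and are only walked once, inside tokenize_data; when keywords must be picked, the generator is first forced into a StrictEmbeddedDataset so it can be randomly sampled, and the sample is deleted (followed by gc.collect()) before the bulk tokenization, presumably to keep peak memory down.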