def encode_seq_structural_data(data : RawDataset,
                               context_tokenizer_type : \
                               Callable[[List[str], int], Tokenizer],
                               num_keywords : int,
                               num_reserved_tokens: int) -> \
                               Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()

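    # Pair every hypothesis and the goal of each sample with that sample's
    # embedded tactic index; these (term, tactic-id) pairs feed the
    # relevance-based keyword selection below.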
    hyps_and_goals = [
        hyp_or_goal for hyp_and_goal in [
            zip(hyps +
                [goal], itertools.repeat(embedding.encode_token(tactic)))
            for prev_tactics, hyps, goal, tactic in data
        ] for hyp_or_goal in hyp_and_goal
    ]
    context_tokenizer = make_keyword_tokenizer_relevance(
        hyps_and_goals, context_tokenizer_type, num_keywords,
        num_reserved_tokens)
    encodedData = []
    for prev_tactics, hyps, goal, tactic in data:
        stem, rest = serapi_instance.split_tactic(tactic)
        encodedData.append(
            ([context_tokenizer.toTokenList(hyp)
              for hyp in hyps], context_tokenizer.toTokenList(goal),
             (embedding.encode_token(stem),
              [hyp_index(hyps, arg) for arg in get_symbols(rest)])))

    return encodedData, context_tokenizer, embedding
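
# Hedged usage sketch (not part of the original source): `raw_data` and
# `my_tokenizer_type` are placeholders for a RawDataset of
# (prev_tactics, hyps, goal, tactic) samples and any constructor matching
# Callable[[List[str], int], Tokenizer]; the keyword counts are illustrative.
def _example_encode_seq_structural(raw_data, my_tokenizer_type):
    dataset, tok, emb = encode_seq_structural_data(
        raw_data, my_tokenizer_type, num_keywords=100, num_reserved_tokens=2)
    # Each entry pairs the tokenized hypotheses and goal with the encoded
    # tactic stem and the hypothesis indices used as its arguments.
    hyp_lists, goal_tokens, (stem_id, arg_hyp_indices) = dataset[0]
    return hyp_lists, goal_tokens, stem_id, arg_hyp_indices
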
def encode_hyparg_data(data : RawDataset,
                       tokenizer_type : Callable[[List[str], int], Tokenizer],
                       num_keywords : int,
                       num_reserved_tokens : int,
                       max_args : int,
                       max_hyps : int,
                       encoded_length : int,
                       entropy_data_size : int,
                       num_threads : Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context, stem_embedding.encode_token(serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
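    # Presumably a unigram token-bag encoder: each term becomes a count vector
    # over the tokenizer's vocabulary.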
    termEncoder = functools.partial(getNGramTokenbagVector, 1, tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # Samples carry five fields (see the tokenizer construction above);
        # the first two are unused by this encoder.
        _relevant_lemmas, _prev_tactics, hyps, contexts, tactics = zip(*data_list)
        encoded_contexts = pool.imap(functools.partial(
            _encode, tokenizer, termEncoder), contexts)
        # Encode each sample's hypothesis list (up to max_hyps), not its goal.
        encoded_hyps = pool.imap(functools.partial(
            _encode_hyps, tokenizer, termEncoder, max_hyps, encoded_length), hyps)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding, max_args),
            zip(hyps, tactics))
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
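
# Hedged usage sketch (not part of the original source): `raw_data` and
# `my_tokenizer_type` are placeholders, and every numeric argument below is
# illustrative rather than taken from the original code.
def _example_encode_hyparg(raw_data, my_tokenizer_type):
    dataset, tok, stem_emb = encode_hyparg_data(
        raw_data, my_tokenizer_type,
        num_keywords=100, num_reserved_tokens=2,
        max_args=2, max_hyps=10, encoded_length=30,
        entropy_data_size=1000, num_threads=4)
    encoded_hyps, encoded_context, encoded_tactic = dataset[0]
    return encoded_hyps, encoded_context, encoded_tactic
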
def tokenize_goals(data : StrictEmbeddedDataset, args : Namespace) \
    -> Tuple[Tokenizer, List[Sentence]]:
    if args.load_tokens and Path2(args.load_tokens).exists():
        print("Loading tokens from {}".format(args.load_tokens))
        with open(args.load_tokens, 'rb') as f:
            tokenizer = pickle.load(f)
            assert isinstance(tokenizer, Tokenizer)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        subset : Sequence[EmbeddedSample]
        if args.num_relevance_samples > len(data):
            subset = data
        else:
            subset = random.sample(data, args.num_relevance_samples)
        tokenizer = make_keyword_tokenizer_relevance(
            [(goal, next_tactic) for
             prev_tactics, hypotheses, goal, next_tactic in subset],
            tokenizers[args.tokenizer], args.num_keywords, TOKEN_START, args.num_threads)
        print("{}s".format(time.time() - start))
    if args.save_tokens:
        print("Saving tokens to {}".format(args.save_tokens))
        with open(args.save_tokens, 'wb') as f:
            pickle.dump(tokenizer, f)
    if args.print_keywords:
        print("Keywords are {}".format(tokenizer.listTokens()))
    start = time.time()
    print("Tokenizing...", end="")
    sys.stdout.flush()
    tokenized_data = tokenize_data(tokenizer, data, args.num_threads)
    print("{:.2f}s".format(time.time() - start))
    return tokenizer, [goal for prev_tactics, hypotheses, goal, tactic in tokenized_data]
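
# Hedged usage sketch (not part of the original source): `embedded_data` is a
# placeholder StrictEmbeddedDataset, and the Namespace fields simply mirror the
# ones read above; "chars" is a placeholder for whatever keys the module-level
# `tokenizers` dict actually provides.
def _example_tokenize_goals(embedded_data):
    args = Namespace(load_tokens=None, save_tokens=None, print_keywords=False,
                     num_relevance_samples=1000,
                     tokenizer="chars",  # placeholder key into `tokenizers`
                     num_keywords=100, num_threads=4)
    tok, tokenized_goals = tokenize_goals(embedded_data, args)
    return tok, tokenized_goals
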
def encode_seq_classify_data(data : RawDataset,
                             tokenizer_type : Callable[[List[str], int], Tokenizer],
                             num_keywords : int,
                             num_reserved_tokens : int,
                             save_tokens : Optional[str] = None,
                             load_tokens : Optional[str] = None,
                             num_relevance_samples : int = 1000) \
    -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()
    subset = RawDataset(random.sample(
        data, min(num_relevance_samples, len(data))))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        tokenizer = make_keyword_tokenizer_relevance(
            [(context, embedding.encode_token(get_stem(tactic)))
             for prev_tactics, hyps, context, tactic in subset],
            tokenizer_type, num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(
                      pool.imap(
                          functools.partial(encode_seq_classify_data_worker__,
                                            tokenizer), chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
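
# Hedged usage sketch (not part of the original source): `raw_data` and
# `my_tokenizer_type` are placeholders and the keyword counts are illustrative.
def _example_encode_seq_classify(raw_data, my_tokenizer_type):
    dataset, tok, emb = encode_seq_classify_data(
        raw_data, my_tokenizer_type, num_keywords=100, num_reserved_tokens=2,
        num_relevance_samples=1000)
    goal_tokens, encoded_tactic = dataset[0]
    return goal_tokens, encoded_tactic
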
    def _encode_data(self, data : RawDataset, args : Namespace) \
        -> Tuple[DatasetType, TokenizerEmbeddingState]:
        preprocessed_data = self._preprocess_data(data, args)
        embedding = SimpleEmbedding()
        embedded_data: EmbeddedDataset
        with multiprocessing.Pool(args.num_threads) as pool:
            stemmed_data = pool.imap(stemmify_data,
                                     preprocessed_data,
                                     chunksize=10240)
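            # Embed each stemmed tactic lazily; the dataset is only
            # materialized in full below if keywords have to be picked.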
            lazy_embedded_data = LazyEmbeddedDataset(
                (EmbeddedSample(prev_tactics, hypotheses, goal,
                                embedding.encode_token(tactic))
                 for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
            if args.load_tokens:
                print("Loading tokens from {}".format(args.load_tokens))
                with open(args.load_tokens, 'rb') as f:
                    tokenizer = pickle.load(f)
                    assert isinstance(tokenizer, Tokenizer)
                embedded_data = lazy_embedded_data
            else:
                # Force the embedded data for picking keywords
                forced_embedded_data = StrictEmbeddedDataset(
                    list(lazy_embedded_data.data))
                subset = StrictEmbeddedDataset(
                    random.sample(forced_embedded_data,
                                  min(args.num_relevance_samples,
                                      len(forced_embedded_data))))
                embedded_data = forced_embedded_data
                start = time.time()
                print("Picking tokens...", end="")
                sys.stdout.flush()
                tokenizer = make_keyword_tokenizer_relevance([
                    (goal, next_tactic)
                    for prev_tactics, hypotheses, goal, next_tactic in subset
                ], tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
                                                             args.num_threads)
                del subset
                print("{}s".format(time.time() - start))
            if args.save_tokens:
                print("Saving tokens to {}".format(args.save_tokens))
                assert isinstance(tokenizer, Tokenizer)
                with open(args.save_tokens, 'wb') as f:
                    pickle.dump(tokenizer, f)
            if args.print_keywords:
                print("Keywords are {}".format(tokenizer.listTokens()))

            print("Tokenizing...")
            tokenized_data = tokenize_data(tokenizer, embedded_data,
                                           args.num_threads)
            gc.collect()

        return self._encode_tokenized_data(tokenized_data, args, tokenizer, embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
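
# Hedged usage sketch (not part of the original source): _encode_data is a
# method of a predictor class not shown in this excerpt, so the call below
# assumes a hypothetical instance `predictor`, a RawDataset `raw_data`, and an
# argparse Namespace `args` carrying the fields read above (load_tokens,
# save_tokens, print_keywords, num_relevance_samples, tokenizer, num_keywords,
# num_threads):
#
#     encoded_dataset, tok_emb_state = predictor._encode_data(raw_data, args)
#
# where `tok_emb_state` bundles the fitted Tokenizer and SimpleEmbedding.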