def encode_seq_structural_data(data : RawDataset,
                               context_tokenizer_type : \
                               Callable[[List[str], int], Tokenizer],
                               num_keywords : int,
                               num_reserved_tokens: int) -> \
                               Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    """Encode a raw dataset into structural form.

    First builds a relevance-based keyword tokenizer from every hypothesis
    and goal text (each paired with the embedded token of its sample's
    tactic), then encodes each sample as
    (tokenized hypotheses, tokenized goal, (stem index, hyp arg indices)).

    Returns the encoded dataset, the context tokenizer, and the stem
    embedding.
    """
    embedding = SimpleEmbedding()

    # Pair every hypothesis and the goal of each sample with the embedded
    # token of that sample's full tactic; these pairs drive the keyword
    # relevance computation below.
    relevance_pairs = []
    for prev_tactics, hyps, goal, tactic in data:
        encoded_tactic = embedding.encode_token(tactic)
        for term in hyps + [goal]:
            relevance_pairs.append((term, encoded_tactic))
    context_tokenizer = make_keyword_tokenizer_relevance(
        relevance_pairs, context_tokenizer_type, num_keywords,
        num_reserved_tokens)

    encodedData = []
    for prev_tactics, hyps, goal, tactic in data:
        stem, rest = serapi_instance.split_tactic(tactic)
        # Evaluation order (hyps, goal, stem, args) kept as in the original
        # tuple construction, since tokenizer/embedding may be stateful.
        tokenized_hyps = [context_tokenizer.toTokenList(hyp) for hyp in hyps]
        tokenized_goal = context_tokenizer.toTokenList(goal)
        stem_idx = embedding.encode_token(stem)
        arg_indices = [hyp_index(hyps, arg) for arg in get_symbols(rest)]
        encodedData.append((tokenized_hyps, tokenized_goal,
                            (stem_idx, arg_indices)))

    return encodedData, context_tokenizer, embedding
Beispiel #2
0
def get_tokens(args: List[str]):
    """Pick a set of relevance keywords from a scraped-data file.

    Samples `--num-samples` data points, pairs each focused goal with the
    embedded stem of its tactic, scores keywords by relevance, and writes
    them one per line to `dest` ("-" means stdout).
    """
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    opts = parser.parse_args(args)

    with print_time("Reading scraped data", guard=opts.verbose):
        scraped = list(data.read_text_data(opts.scrapefile))
    embedding = SimpleEmbedding()
    sampled = data.RawDataset(random.sample(scraped, opts.num_samples))
    # Each relevance pair is (goal text, embedded tactic stem).
    relevance_pairs = []
    for relevant_lemmas, prev_tactics, context, tactic in sampled:
        relevance_pairs.append(
            (context.focused_goal,
             embedding.encode_token(serapi_instance.get_stem(tactic))))
    with print_time("Calculating keywords", guard=opts.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            opts.num_keywords,
                                            opts.num_threads)

    # "-" streams to stdout without closing it; a real path is opened
    # for writing and closed on exit.
    with (contextlib.nullcontext(sys.stdout) if opts.dest == "-"
          else open(opts.dest, mode='w')) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
def encode_hyparg_data(data : RawDataset,
                       tokenizer_type : Callable[[List[str], int], Tokenizer],
                       num_keywords : int,
                       num_reserved_tokens : int,
                       max_args : int,
                       max_hyps : int,
                       encoded_length : int,
                       entropy_data_size : int,
                       num_threads : Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    """Encode a raw dataset for hypothesis-argument prediction.

    Builds a relevance keyword tokenizer from (goal, embedded tactic stem)
    pairs over a bounded random subset, then encodes contexts, hypothesis
    lists, and tactic structures in parallel.

    Returns (encoded samples, tokenizer, stem embedding).
    """
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    # Bound the relevance computation to entropy_data_size samples.
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context, stem_embedding.encode_token(serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1, tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # BUG FIX: rows are 5-tuples (see the subset comprehension above),
        # but the original unpacked only 3 columns and then fed `contexts`
        # to _encode_hyps. Unpack all five columns and encode the actual
        # hypotheses column.
        relevant_lemmas, prev_tactics, hyps, contexts, tactics = \
            zip(*data_list)
        encoded_contexts = pool.imap(functools.partial(
            _encode, tokenizer, termEncoder), contexts)
        encoded_hyps = pool.imap(functools.partial(
            _encode_hyps, tokenizer, termEncoder, max_hyps, encoded_length), hyps)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding, max_args),
            zip(hyps, tactics))
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
Beispiel #4
0
def encode_seq_classify_data(data : RawDataset,
                             tokenizer_type : Callable[[List[str], int], Tokenizer],
                             num_keywords : int,
                             num_reserved_tokens : int,
                             save_tokens : Optional[str] = None,
                             load_tokens : Optional[str] = None,
                             num_relevance_samples : int = 1000) \
    -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    """Encode a raw dataset for sequence classification.

    Either loads a pickled tokenizer or picks keywords by relevance over a
    random sample; optionally saves the tokenizer; then encodes the whole
    dataset in parallel as (goal, embedded tactic) pairs.
    """
    embedding = SimpleEmbedding()
    subset = RawDataset(random.sample(data, num_relevance_samples))
    if load_tokens:
        # Reuse a previously saved tokenizer instead of picking keywords.
        print("Loading tokens from {}".format(load_tokens))
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        relevance_pairs = [(context, embedding.encode_token(get_stem(tactic)))
                           for prev_tactics, hyps, context, tactic in subset]
        tokenizer = make_keyword_tokenizer_relevance(
            relevance_pairs, tokenizer_type, num_keywords,
            num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        encoded_chunks = pool.imap(
            functools.partial(encode_seq_classify_data_worker__, tokenizer),
            chunks(data, 1024))
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(encoded_chunks)]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
def embed_data(data : RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    """Embed each sample's tactic stem.

    Returns the embedding together with a strict (fully materialized)
    embedded dataset; prints the elapsed time.
    """
    embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    samples = []
    for prev_tactics, hypotheses, goal, tactic in data:
        samples.append(EmbeddedSample(
            prev_tactics, hypotheses, goal,
            embedding.encode_token(get_stem(tactic))))
    dataset = StrictEmbeddedDataset(samples)
    print("{:.2f}s".format(time.time() - start))
    return embedding, dataset
    def _encode_data(self, data : RawDataset, args : Namespace) \
        -> Tuple[DatasetType, TokenizerEmbeddingState]:
        """Preprocess, embed, tokenize, and encode a raw dataset.

        Returns the encoded dataset plus a TokenizerEmbeddingState capturing
        the tokenizer and stem embedding used, so they can be reapplied at
        prediction time.
        """
        preprocessed_data = self._preprocess_data(data, args)
        embedding = SimpleEmbedding()
        embedded_data: EmbeddedDataset
        with multiprocessing.Pool(args.num_threads) as pool:
            # Stemming runs in worker processes; large chunksize amortizes IPC.
            stemmed_data = pool.imap(stemmify_data,
                                     preprocessed_data,
                                     chunksize=10240)
            # Lazy wrapper: tactic stems are embedded on demand as the
            # generator is consumed.
            lazy_embedded_data = LazyEmbeddedDataset(
                (EmbeddedSample(prev_tactics, hypotheses, goal,
                                embedding.encode_token(tactic))
                 for (prev_tactics, hypotheses, goal, tactic) in stemmed_data))
            if args.load_tokens:
                # Reuse a previously pickled tokenizer; keywords need not be
                # re-picked, so the lazy dataset can be used directly.
                print("Loading tokens from {}".format(args.load_tokens))
                with open(args.load_tokens, 'rb') as f:
                    tokenizer = pickle.load(f)
                    assert isinstance(tokenizer, Tokenizer)
                embedded_data = lazy_embedded_data
            else:
                # Force the embedded data for picking keywords
                forced_embedded_data = StrictEmbeddedDataset(
                    list(lazy_embedded_data.data))
                subset = StrictEmbeddedDataset(
                    random.sample(forced_embedded_data,
                                  args.num_relevance_samples))
                embedded_data = forced_embedded_data
                start = time.time()
                print("Picking tokens...", end="")
                sys.stdout.flush()
                tokenizer = make_keyword_tokenizer_relevance([
                    (goal, next_tactic)
                    for prev_tactics, hypotheses, goal, next_tactic in subset
                ], tokenizers[args.tokenizer], args.num_keywords, TOKEN_START,
                                                             args.num_threads)
                # Free the sample before the (potentially large) tokenization.
                del subset
                print("{}s".format(time.time() - start))
            if args.save_tokens:
                print("Saving tokens to {}".format(args.save_tokens))
                assert isinstance(tokenizer, Tokenizer)
                with open(args.save_tokens, 'wb') as f:
                    pickle.dump(tokenizer, f)
            if args.print_keywords:
                print("Keywords are {}".format(tokenizer.listTokens()))

            print("Tokenizing...")
            tokenized_data = tokenize_data(tokenizer, embedded_data,
                                           args.num_threads)
            gc.collect()

        return self._encode_tokenized_data(tokenized_data, args, tokenizer, embedding), \
            TokenizerEmbeddingState(tokenizer, embedding)
Beispiel #7
0
def decode_tactic_structure(stem_embedding: SimpleEmbedding,
                            struct: TacticStructure, hyps: List[str]) -> str:
    """Render a TacticStructure back into tactic text: the decoded stem
    followed by the first variable of each referenced hypothesis."""
    stem_idx, arg_hyp_idxs = struct
    words = [stem_embedding.decode_token(stem_idx)]
    for hyp_idx in arg_hyp_idxs:
        # Hypothesis argument indices are offset by TOKEN_START.
        words.append(
            serapi_instance.get_first_var_in_hyp(hyps[hyp_idx - TOKEN_START]))
    return " ".join(words)
def decode_tactic_structure(term_tokenizer: Tokenizer,
                            stem_embedding: SimpleEmbedding,
                            struct: TacticStructure, hyps: List[str]) -> str:
    """Render a TacticStructure back into tactic text.

    Argument indices are 1-based hypothesis references; the list is
    EOS-padded with zeros, so decoding stops at the first non-positive
    index.
    """
    def get_var(idx: int) -> str:
        # 0 marks "no hypothesis"; unreachable below since iteration stops
        # before the first non-positive index.
        if idx == 0:
            return "UNKNOWN"
        else:
            return serapi_instance.get_first_var_in_hyp(hyps[idx - 1])

    stem_idx, arg_hyp_idxs = struct
    words = [stem_embedding.decode_token(stem_idx)]
    for hyp_idx in arg_hyp_idxs:
        if hyp_idx <= 0:
            break
        words.append(get_var(hyp_idx))
    return " ".join(words)
Beispiel #9
0
def encode_tactic_structure(stem_embedding : SimpleEmbedding,
                            max_args : int,
                            hyps_and_tactic : Tuple[List[str], str]) \
    -> TacticStructure:
    """Encode a tactic as (stem index, hypothesis-argument indices).

    The argument index list is truncated to max_args and padded with
    EOS_token; if any argument is not a hypothesis (index 0), the whole
    argument list is blanked to EOS_token.
    """
    hyps, tactic = hyps_and_tactic
    tactic_stem, args_str = serapi_instance.split_tactic(tactic)
    arg_strs = args_str.split()[:max_args]
    stem_idx = stem_embedding.encode_token(tactic_stem)
    # BUG FIX: the original re-split args_str here instead of using the
    # truncated arg_strs, so tactics with more than max_args arguments
    # produced an over-long index list.
    arg_idxs = [get_arg_idx(hyps, arg.strip()) for arg in arg_strs]
    if len(arg_idxs) < max_args:
        arg_idxs += [EOS_token] * (max_args - len(arg_idxs))
    # If any arguments aren't hypotheses, ignore the arguments
    if not all(arg_idxs):
        arg_idxs = [EOS_token] * max_args

    return TacticStructure(stem_idx=stem_idx, hyp_idxs=arg_idxs)
Beispiel #10
0
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print it in one of several textual formats.

    BUG FIX: the original registered the "--num-threads"/"-j" option twice
    (once as `-j, --num-threads` and again as `--num-threads, -j`), which
    makes argparse raise ArgumentError while constructing the parser, so the
    function could never run. The duplicate registration is removed.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector", "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # Tokenize raw terms and print them back as (possibly length-
        # normalized) token strings.
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # --lineend appends a literal "\n" marker before the real newline.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            # Longest hypothesis lists first.
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
            pass
        elif arg_values.format == "tacvector":
            # Emit one CSV row per sample: word features, vector features,
            # then the embedded tactic-stem index.
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [
                strip_scraped_output(scraped) for scraped in dataset
            ]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor in
                features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values) for
                vec_feature_constructor in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[
                feature(c) for feature in word_feature_functions
            ] for c in stripped_data]
            vec_features = [[
                feature_val for feature in vec_features_functions
                for feature_val in feature(c)
            ] for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(
                    list(map(str, word_feat)) + list(map(str, vec_feat)) +
                    [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            # Re-emit each point in the "rich data" scrape-file JSON schema.
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "context": {
                            "fg_goals": [{
                                "hypotheses": point.hypotheses,
                                "goal": point.goal
                            }],
                            "bg_goals": [],
                            "shelved_goals": [],
                            "given_up_goals": []
                        },
                        "tactic": point.tactic
                    }))
        elif arg_values.format == "scrapefile":
            # Re-emit each point in the flat scrape-file JSON schema.
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "prev_hyps": point.hypotheses,
                        "prev_goal": point.goal,
                        "tactic": point.tactic
                    }))
Beispiel #11
0
def main(arg_list: List[str]) -> None:
    """Train a tactic-stem classifier on top of a pretrained autoencoder.

    Loads the autoencoder checkpoint, encodes the scraped data with its
    tokenizer, trains (optionally fine-tuning the autoencoder), and saves a
    checkpoint per epoch to `save_file`.
    """
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    parser.add_argument("scrape_file")
    parser.add_argument("autoencoder_weights")
    parser.add_argument("save_file")
    parser.add_argument("--num-epochs",
                        dest="num_epochs",
                        default=15,
                        type=int)
    parser.add_argument("--batch-size",
                        dest="batch_size",
                        default=256,
                        type=int)
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--print-every",
                        dest="print_every",
                        default=10,
                        type=int)
    parser.add_argument("--learning-rate",
                        dest="learning_rate",
                        default=.7,
                        type=float)
    parser.add_argument("--gamma", default=.9, type=float)
    parser.add_argument("--epoch-step", dest="epoch_step", default=5, type=int)
    parser.add_argument("--optimizer",
                        choices=list(stdargs.optimizers.keys()),
                        type=str,
                        default=list(stdargs.optimizers.keys())[0])
    parser.add_argument("--num-classifier-layers",
                        dest="num_classifier_layers",
                        default=3,
                        type=int)
    parser.add_argument("--classifier-hidden-size",
                        dest="classifier_hidden_size",
                        default=128,
                        type=int)
    parser.add_argument("--train-autoencoder",
                        dest="train_autoencoder",
                        default=False,
                        const=True,
                        action='store_const')
    args = parser.parse_args(arg_list)
    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    # Reuse the context filter the autoencoder was trained with.
    cfilter = autoenc_state['context-filter']

    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    # NOTE(review): unpacks 4-tuples here, while other functions in this file
    # unpack 5-tuples (with relevant_lemmas) -- confirm the dataset version
    # get_text_data yields for these args.
    dataset = [(tokenizer.toTokenList(goal),
                embedding.encode_token(get_stem(tactic)))
               for prev_tactics, hyps, goal, tactic in text_data]
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}".format(timeTaken))

    # Rebuild the encoder with the checkpoint's architecture and weights.
    loadedAutoencoder = maybe_cuda(
        EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                   autoenc_state['num-encoder-layers'], args.batch_size))
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(
        dataset, loadedAutoencoder, args.train_autoencoder,
        autoenc_state['max-length'],
        autoenc_state['hidden-size'], args.classifier_hidden_size,
        embedding.num_tokens(), args.num_classifier_layers, args.batch_size,
        args.learning_rate, args.gamma, args.epoch_step, args.num_epochs,
        args.print_every, stdargs.optimizers[args.optimizer])

    # Save one checkpoint per epoch; each overwrite leaves the latest epoch
    # in save_file.
    for epoch, (decoder_state, autoencoder_state,
                training_loss) in enumerate(checkpoints):
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'autoenc-training-loss': autoenc_state['training-loss'],
            'autoenc-epoch': autoenc_state['epoch'],
            'tokenizer': tokenizer,
            'tokenizer-name': autoenc_state['tokenizer-name'],
            'optimizer': args.optimizer,
            'autoenc-optimizer': autoenc_state['optimizer'],
            'learning-rate': args.learning_rate,
            'autoenc-learning-rate': autoenc_state['learning-rate'],
            'encoder': autoencoder_state,
            'decoder': decoder_state,
            'num-decoder-layers': args.num_classifier_layers,
            'num-encoder-layers': autoenc_state['num-encoder-layers'],
            'context-filter': cfilter,
            'max-length': autoenc_state['max-length'],
            'encoded-size': autoenc_state['hidden-size'],
            'hidden-size': args.classifier_hidden_size,
            'num-keywords': autoenc_state['num-keywords'],
            'stem-embedding': embedding,
        }
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print it in one of several textual formats.

    NOTE(review): redefines the earlier get_data in this file (scrape
    artifact); this variant supports fewer formats and unpacks 4-tuple
    samples.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=None,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # Tokenize raw terms and print them back as (possibly length-
        # normalized) token strings.
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # --lineend appends a literal "\n" marker before the real newline.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
        pass
    elif arg_values.format == "tacvector":
        # Emit one CSV row per sample: word features, vector features, then
        # the embedded tactic-stem index.
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor in features.word_feature_constructors
        ]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[
            feature_val for feature in vec_features_functions
            for feature_val in feature(c)
        ] for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(
                list(map(str, word_feat)) + list(map(str, vec_feat)) +
                [str(tactic)]))