Example #1
0
def add_tokenizer_args(parser: argparse.ArgumentParser,
                       default_values: Dict[str, Any] = {}) -> None:
    """Register tokenizer-related CLI options on *parser*.

    Defaults may be overridden through *default_values*, keyed by the
    hyphenated option name (e.g. "num-keywords"). The shared mutable
    default dict is tolerable here because it is only read, never
    mutated.
    """
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        type=int,
                        # Fixed lookup key: was misspelled "num-keywordes",
                        # so a caller-supplied override was silently ignored.
                        default=default_values.get("num-keywords", 60))
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=default_values.get("tokenizer",
                                                   list(tokenizers.keys())[0]))
    parser.add_argument("--num-relevance-samples",
                        dest="num_relevance_samples",
                        type=int,
                        # NOTE(review): underscore key is inconsistent with the
                        # hyphenated keys used elsewhere; kept as-is so existing
                        # callers' default_values dicts keep working.
                        default=default_values.get("num_relevance_samples",
                                                   1000))
    parser.add_argument("--save-tokens",
                        dest="save_tokens",
                        default=default_values.get("save-tokens", None))
    parser.add_argument("--load-tokens",
                        dest="load_tokens",
                        default=default_values.get("load-tokens", None))
    # Equivalent to action='store_true'; kept in the original form.
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        default=False,
                        action='store_const',
                        const=True)
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print it in one of several formats.

    *args* is a raw argv-style list; the positional "format" argument
    selects the output: raw tokenized terms, goals, hypotheses+goal
    (optionally with the tactic), or a comma-separated feature vector
    per datapoint ("tacvector"). Output goes to stdout.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=None,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # islice honors --max-tuples; None means "take everything".
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # Drop padding after the EOS token; with --lineend, emit a
            # literal "\n" marker before the real newline.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        # One integer label per datapoint: the encoded tactic stem.
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        # Feature constructors are fitted on the whole dataset first,
        # then applied pointwise below.
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor in features.word_feature_constructors
        ]
        vec_features_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        # Vector features are flattened: each function yields several values.
        vec_features = [[
            feature_val for feature in vec_features_functions
            for feature_val in feature(c)
        ] for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        # CSV row: word features, then vector features, then the label.
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(
                list(map(str, word_feat)) + list(map(str, vec_feat)) +
                [str(tactic)]))
Example #3
0
def get_data(args: List[str]) -> None:
    """Parse a scrape file and print it in one of several formats.

    *args* is a raw argv-style list; the positional "format" argument
    selects the output: raw tokenized terms, goals, hypotheses+goal
    (optionally with the tactic), a feature vector per datapoint
    ("tacvector"), or re-serialized scrape-file JSON. Output goes to
    stdout.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector", "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    # Registered exactly once: a second add_argument with the same
    # option strings ("--num-threads", "-j") raised
    # argparse.ArgumentError at runtime, making this function unusable.
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        # islice honors --max-tuples; None means "take everything".
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            # Drop padding after the EOS token; with --lineend, emit a
            # literal "\n" marker before the real newline.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            # Longest hypothesis lists first.
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            # One integer label per datapoint: the encoded tactic stem.
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [
                strip_scraped_output(scraped) for scraped in dataset
            ]
            eprint("Constructing features...", guard=arg_values.verbose)
            # Feature constructors are fitted on the whole dataset first,
            # then applied pointwise below.
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor in
                features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values) for
                vec_feature_constructor in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[
                feature(c) for feature in word_feature_functions
            ] for c in stripped_data]
            # Vector features are flattened: each function yields several
            # values.
            vec_features = [[
                feature_val for feature in vec_features_functions
                for feature_val in feature(c)
            ] for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            # CSV row: word features, then vector features, then the label.
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(
                    list(map(str, word_feat)) + list(map(str, vec_feat)) +
                    [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            # Re-emit each point in the richer "read" scrape-file schema.
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "context": {
                            "fg_goals": [{
                                "hypotheses": point.hypotheses,
                                "goal": point.goal
                            }],
                            "bg_goals": [],
                            "shelved_goals": [],
                            "given_up_goals": []
                        },
                        "tactic": point.tactic
                    }))
        elif arg_values.format == "scrapefile":
            # Re-emit each point in the flat scrape-file schema.
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "prev_hyps": point.hypotheses,
                        "prev_goal": point.goal,
                        "tactic": point.tactic
                    }))
Example #4
0
def add_std_args(parser: argparse.ArgumentParser,
                 default_values: Dict[str, Any] = {}) -> None:
    """Register the standard training CLI options on *parser*.

    Defaults may be overridden through *default_values*, keyed by the
    hyphenated option name (e.g. "batch-size"). The shared mutable
    default dict is tolerable here because it is only read, never
    mutated.
    """
    parser.add_argument("scrape_file")
    parser.add_argument("save_file")
    parser.add_argument("--num-threads",
                        "-j",
                        dest="num_threads",
                        type=int,
                        default=default_values.get("num-threads", None))
    parser.add_argument("--num-epochs",
                        dest="num_epochs",
                        type=int,
                        default=default_values.get("num-epochs", 20))
    parser.add_argument("--batch-size",
                        dest="batch_size",
                        type=int,
                        default=default_values.get("batch-size", 256))
    parser.add_argument("--max-length",
                        dest="max_length",
                        type=int,
                        default=default_values.get("max-length", 100))
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        type=int,
                        default=default_values.get("max-tuples", None))
    parser.add_argument("--start-from",
                        dest="start_from",
                        type=str,
                        default=default_values.get("start-from", None))
    parser.add_argument("--print-every",
                        dest="print_every",
                        type=int,
                        default=default_values.get("print-every", 5))
    parser.add_argument("--hidden-size",
                        dest="hidden_size",
                        type=int,
                        default=default_values.get("hidden-size", 128))
    parser.add_argument("--learning-rate",
                        dest="learning_rate",
                        type=float,
                        default=default_values.get("learning-rate", .7))
    parser.add_argument("--epoch-step",
                        dest="epoch_step",
                        type=int,
                        default=default_values.get("epoch-step", 10))
    parser.add_argument("--gamma",
                        dest="gamma",
                        type=float,
                        default=default_values.get("gamma", 0.8))
    parser.add_argument("--num-encoder-layers",
                        dest="num_encoder_layers",
                        type=int,
                        default=default_values.get("num-encoder-layers", 3))
    parser.add_argument("--num-decoder-layers",
                        dest="num_decoder_layers",
                        type=int,
                        default=default_values.get("num-decoder-layers", 3))
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        type=int,
                        # Fixed lookup key: was misspelled "num-keywordes",
                        # so a caller-supplied override was silently ignored.
                        default=default_values.get("num-keywords", 60))
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=default_values.get("tokenizer",
                                                   list(tokenizers.keys())[0]))
    parser.add_argument("--save-tokens",
                        dest="save_tokens",
                        default=default_values.get("save-tokens", None))
    parser.add_argument("--load-tokens",
                        dest="load_tokens",
                        default=default_values.get("load-tokens", None))
    parser.add_argument("--optimizer",
                        choices=list(optimizers.keys()),
                        type=str,
                        default=default_values.get("optimizer",
                                                   list(optimizers.keys())[0]))
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        type=str,
                        default=default_values.get("context-filter",
                                                   "goal-changes%no-args"))