Example no. 1
def filter_data(data: Iterable[ScrapedTactic], pair_filter: ContextFilter,
                arg_values: Namespace) -> Iterable[ScrapedTactic]:
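    """Pair each datum with its successor (the last one with an empty
    ScrapedTactic) and keep only the pairs accepted by pair_filter.  Note
    that data is iterated twice here, so it should be a list or another
    re-iterable collection rather than a one-shot iterator.
    """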
    return (scraped for (scraped, next_scraped) in zip(
        data,
        itertools.chain(itertools.islice(data, 1, None),
                        [ScrapedTactic([], [], ProofContext.empty(), "")]))
            if pair_filter(strip_scraped_output(scraped), scraped.tactic,
                           strip_scraped_output(next_scraped), arg_values))
Example no. 2
def filter_eval_data(data: Iterable[StateScore], pair_filter: ContextFilter,
                     arg_values: Namespace) -> Iterable[StateScore]:
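    """Apply the same successor-pairing filter as filter_data, but to scored
    states: a StateScore is kept only if pair_filter accepts its state, its
    tactic, and the following state.
    """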
    return (
        point for (point, next_point) in zip(
            data,
            itertools.chain(itertools.islice(data, 1, None), [
                StateScore(ScrapedTactic([], [], ProofContext.empty(), ""), 0)
            ]))
        if pair_filter(strip_scraped_output(point.state), point.state.tactic,
                       strip_scraped_output(next_point.state), arg_values))
Example no. 3
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[FeaturesDataset, Tuple[Embedding, List[VecFeature]]]:
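     """Construct the vector feature functions from the stripped data, embed
     the tactics, and package each datum's feature values and embedded tactic
     into a FeaturesDataset.
     """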
     stripped_data = [strip_scraped_output(dat) for dat in data]
     self._feature_functions = [
         feature_constructor(stripped_data, arg_values)
         for feature_constructor in vec_feature_constructors
     ]
     embedding, embedded_data = embed_data(data)
     return (FeaturesDataset([
         FeaturesSample(self._get_features(strip_scraped_output(scraped)),
                        scraped.tactic) for scraped in embedded_data
     ]), (embedding, self._feature_functions))
Example no. 4
 def make_predictions(num_predictions: int,
                      tactic_interactions: List[ScrapedTactic]) -> \
         Tuple[Iterable[Tuple[ScrapedTactic, List[Prediction]]], float]:
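     """Run the predictor over the tactic interactions in chunks of
     args.chunk_size, returning each interaction paired with its list of
     predictions, together with the loss averaged over chunks.
     """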
     if len(tactic_interactions) == 0:
         return [], 0
     chunk_size = args.chunk_size
     total_loss = 0.
     inputs = [
         strip_scraped_output(tactic_interaction)
         for tactic_interaction in tactic_interactions
     ]
     corrects = [
         tactic_interaction.tactic
         for tactic_interaction in tactic_interactions
     ]
     predictions: List[List[Prediction]] = []
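     # Predict chunk by chunk, accumulating the loss reported for each chunk.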
     for inputs_chunk, corrects_chunk in zip(chunks(inputs, chunk_size),
                                             chunks(corrects, chunk_size)):
         predictions_chunk, loss = predictor.predictKTacticsWithLoss_batch(
             inputs_chunk, num_predictions, corrects_chunk)
         predictions += predictions_chunk
         total_loss += loss
     del inputs
     del corrects
     return list(zip(tactic_interactions, predictions)), \
         total_loss / math.ceil(len(tactic_interactions) / chunk_size)
Example no. 5
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[CopyArgDataset, Tuple[Tokenizer, Embedding,
                                       List[WordFeature], List[VecFeature]]]:
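        """Encode the raw data into a CopyArgDataset: build the word and
        vector feature functions, embed the tactics, tokenize the goals, and
        compute each datum's argument index in parallel before assembling
        the samples.
        """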
        for datum in data:
            assert not re.match(r"induction\s+\d+\.", datum.tactic)
        stripped_data = [strip_scraped_output(dat) for dat in data]
        self._word_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in word_feature_constructors
        ]
        self._vec_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in vec_feature_constructors
        ]
        embedding, embedded_data = embed_data(data)
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        with multiprocessing.Pool(arg_values.num_threads) as pool:
            arg_idxs = pool.imap(
                functools.partial(get_arg_idx, arg_values.max_length), data)

            start = time.time()
            print("Creating dataset...", end="")
            sys.stdout.flush()
            result_data = CopyArgDataset(
                list(
                    pool.imap(
                        functools.partial(mkCopySample, arg_values.max_length,
                                          self._word_feature_functions,
                                          self._vec_feature_functions),
                        zip(embedded_data, tokenized_goals, arg_idxs))))
            print("{:.2f}s".format(time.time() - start))
        return result_data, (tokenizer, embedding,
                             self._word_feature_functions,
                             self._vec_feature_functions)
Example no. 6
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                        List[VecFeature], List[WordFeature]]]:
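     """Encode the raw data into an EncFeaturesDataset in which every sample
     carries its context's vector and word features, its length-normalized
     tokenized goal, and its embedded tactic.
     """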
     stripped_data = [strip_scraped_output(dat) for dat in data]
     self._vec_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in vec_feature_constructors
     ]
     self._word_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in word_feature_constructors
     ]
     embedding, embedded_data = embed_data(data)
     tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
     result_data = EncFeaturesDataset([
         EncFeaturesSample(
             self._get_vec_features(
                 TacticContext([], prev_tactics, hypotheses, goal)),
             self._get_word_features(
                 TacticContext([], prev_tactics, hypotheses, goal)),
             normalizeSentenceLength(tokenized_goal, arg_values.max_length),
             tactic)
         for (relevant_lemmas, prev_tactics, hypotheses, goal,
              tactic), tokenized_goal in zip(embedded_data, tokenized_goals)
     ])
     return result_data, (tokenizer, embedding, self._vec_feature_functions,
                          self._word_feature_functions)
Example no. 7
 def get_should_filter(data: MixedDataset) \
         -> Iterable[Tuple[ScrapedCommand, bool]]:
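     """Walk the commands in order and report, for each one, whether the
     configured context_filter would discard it; non-tactic commands, brace
     tactics, and tactics without a focused goal are always marked as
     filtered.
     """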
     list_data: List[ScrapedCommand] = list(data)
     extended_list: List[Optional[ScrapedCommand]] = \
         cast(List[Optional[ScrapedCommand]], list_data[1:]) + [None]
     for point, nextpoint in zip(list_data, extended_list):
         if isinstance(point, ScrapedTactic) \
            and not re.match(r"\s*[{}]\s*", point.tactic) and \
            point.context.focused_goal.strip() != "":
             if isinstance(nextpoint, ScrapedTactic):
                 context_after = strip_scraped_output(nextpoint)
             else:
                 context_after = TacticContext([], [], [], "")
             should_filter = not context_filter(strip_scraped_output(point),
                                                point.tactic, context_after,
                                                training_args)
             yield (point, should_filter)
         else:
             yield (point, True)
Example no. 8
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[HypFeaturesDataset, Tuple[Tokenizer, Embedding,
                                        List[WordFeature], List[VecFeature]]]:
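     """Encode the raw data into a HypFeaturesDataset: build the feature
     functions, embed the tactics, tokenize the goals, and, in parallel,
     tokenize the closest hypothesis type for each datum before assembling
     the samples.
     """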
     start = time.time()
     print("Stripping...", end="")
     sys.stdout.flush()
     stripped_data = [strip_scraped_output(dat) for dat in data]
     print("{:.2f}s".format(time.time() - start))
     self._word_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in word_feature_constructors
     ]
     self._vec_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in vec_feature_constructors
     ]
     embedding, embedded_data = embed_data(data)
     tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
     with multiprocessing.Pool(arg_values.num_threads) as pool:
         start = time.time()
         print("Getting closest hyps...", end="")
         sys.stdout.flush()
         tokenized_hyps = list(
             pool.imap(
                 functools.partial(get_closest_hyp_type, tokenizer,
                                   arg_values.max_length), data))
         print("{:.2f}s".format(time.time() - start))
         start = time.time()
         print("Creating dataset...", end="")
         sys.stdout.flush()
         result_data = HypFeaturesDataset(
             list(
                 pool.imap(
                     functools.partial(mkHFSample, arg_values.max_length,
                                       self._word_feature_functions,
                                       self._vec_feature_functions),
                     zip(embedded_data, tokenized_goals, tokenized_hyps))))
         print("{:.2f}s".format(time.time() - start))
     return result_data, (tokenizer, embedding,
                          self._word_feature_functions,
                          self._vec_feature_functions)
Example no. 9
def get_data(args: List[str]) -> None:
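    """Command-line entry point: parse a scrape file and print it in the
    requested format (raw terms, goals, hypotheses and goals, feature
    vectors, or re-serialized scrape records).
    """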
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector", "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
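            # Encode each context as word and vector features, with the
            # encoded tactic stem as the label, printing one comma-separated
            # row per datum.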
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [
                strip_scraped_output(scraped) for scraped in dataset
            ]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor in
                features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values) for
                vec_feature_constructor in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[
                feature(c) for feature in word_feature_functions
            ] for c in stripped_data]
            vec_features = [[
                feature_val for feature in vec_features_functions
                for feature_val in feature(c)
            ] for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(
                    list(map(str, word_feat)) + list(map(str, vec_feat)) +
                    [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "context": {
                            "fg_goals": [{
                                "hypotheses": point.hypotheses,
                                "goal": point.goal
                            }],
                            "bg_goals": [],
                            "shelved_goals": [],
                            "given_up_goals": []
                        },
                        "tactic": point.tactic
                    }))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "prev_hyps": point.hypotheses,
                        "prev_goal": point.goal,
                        "tactic": point.tactic
                    }))