Example #1
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[FeaturesDataset, Tuple[Embedding, List[VecFeature]]]:
     preprocessed_data = list(self._preprocess_data(data, arg_values))
     stripped_data = [
         strip_scraped_output(dat) for dat in preprocessed_data
     ]
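     # Instantiate each vector-feature extractor on the stripped data.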
     self._feature_functions = [
         feature_constructor(stripped_data, arg_values)
         for feature_constructor in vec_feature_constructors
     ]
     embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
     return (FeaturesDataset([
         FeaturesSample(self._get_features(strip_scraped_output(scraped)),
                        scraped.tactic) for scraped in embedded_data
     ]), (embedding, self._feature_functions))
Example #2
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                        List[VecFeature], List[WordFeature]]]:
     preprocessed_data = list(self._preprocess_data(data, arg_values))
     stripped_data = [
         strip_scraped_output(dat) for dat in preprocessed_data
     ]
     self._vec_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in vec_feature_constructors
     ]
     self._word_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in word_feature_constructors
     ]
     embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
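     # Build a tokenizer over the goals and tokenize each one.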
     tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
     result_data = EncFeaturesDataset([
         EncFeaturesSample(
             self._get_vec_features(
                 TacticContext(prev_tactics, hypotheses, goal)),
             self._get_word_features(
                 TacticContext(prev_tactics, hypotheses, goal)),
             normalizeSentenceLength(tokenized_goal, arg_values.max_length),
             tactic)
         for (prev_tactics, hypotheses, goal,
              tactic), tokenized_goal in zip(embedded_data, tokenized_goals)
     ])
     return result_data, (tokenizer, embedding, self._vec_feature_functions,
                          self._word_feature_functions)
Example #3
    def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[CopyArgDataset, Tuple[Tokenizer, Embedding,
                                       List[WordFeature], List[VecFeature]]]:
        preprocessed_data = list(self._preprocess_data(data, arg_values))
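        # Sanity check: no preprocessed tactic should match "induction <digits>."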
        for datum in preprocessed_data:
            assert not re.match(r"induction\s+\d+\.", datum.tactic)
        stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]
        self._word_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in word_feature_constructors
        ]
        self._vec_feature_functions = [
            feature_constructor(stripped_data, arg_values) for  # type: ignore
            feature_constructor in vec_feature_constructors
        ]
        embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
        tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
        with multiprocessing.Pool(arg_values.num_threads) as pool:
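            # Compute argument indices for each datum in parallel (lazy imap).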
            arg_idxs = pool.imap(functools.partial(get_arg_idx, arg_values.max_length),
                                 preprocessed_data)

            start = time.time()
            print("Creating dataset...", end="")
            sys.stdout.flush()
            result_data = CopyArgDataset(list(pool.imap(
                functools.partial(mkCopySample, arg_values.max_length,
                                  self._word_feature_functions,
                                  self._vec_feature_functions),
                zip(embedded_data, tokenized_goals, arg_idxs))))
            print("{:.2f}s".format(time.time() - start))
        return result_data, (tokenizer, embedding,
                             self._word_feature_functions,
                             self._vec_feature_functions)
 def _encode_data(self, data : RawDataset, arg_values : Namespace) \
     -> Tuple[HypFeaturesDataset, Tuple[Tokenizer, Embedding,
                                        List[WordFeature], List[VecFeature]]]:
     preprocessed_data = list(self._preprocess_data(data, arg_values))
     start = time.time()
     print("Stripping...", end="")
     sys.stdout.flush()
     stripped_data = [
         strip_scraped_output(dat) for dat in preprocessed_data
     ]
     print("{:.2f}s".format(time.time() - start))
     self._word_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in word_feature_constructors
     ]
     self._vec_feature_functions = [
         feature_constructor(stripped_data, arg_values) for  # type: ignore
         feature_constructor in vec_feature_constructors
     ]
     embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
     tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)
     with multiprocessing.Pool(arg_values.num_threads) as pool:
         start = time.time()
         print("Getting closest hyps...", end="")
         sys.stdout.flush()
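         # Tokenize the type of the hypothesis closest to each goal, in parallel.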
         tokenized_hyps = list(
             pool.imap(
                 functools.partial(get_closest_hyp_type, tokenizer,
                                   arg_values.max_length),
                 preprocessed_data))
         print("{:.2f}s".format(time.time() - start))
         start = time.time()
         print("Creating dataset...", end="")
         sys.stdout.flush()
         result_data = HypFeaturesDataset(
             list(
                 pool.imap(
                     functools.partial(mkHFSample, arg_values.max_length,
                                       self._word_feature_functions,
                                       self._vec_feature_functions),
                     zip(embedded_data, tokenized_goals, tokenized_hyps))))
         print("{:.2f}s".format(time.time() - start))
     return result_data, (tokenizer, embedding,
                          self._word_feature_functions,
                          self._vec_feature_functions)
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=None,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    elif arg_values.format == "goals":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            print(goal)
    elif arg_values.format == "hyps+goal":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif arg_values.format == "hyps+goal+tactic":
        dataset = data.get_text_data(arg_values)
        for prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif arg_values.format == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            word_feature_constructor(stripped_data, arg_values)
            for word_feature_constructor in features.word_feature_constructors
        ]
        vec_feature_functions = [
            vec_feature_constructor(stripped_data, arg_values)
            for vec_feature_constructor in features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        vec_features = [[
            feature_val for feature in vec_feature_functions
            for feature_val in feature(c)
        ] for c in stripped_data]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            print(",".join(
                list(map(str, word_feat)) + list(map(str, vec_feat)) +
                [str(tactic)]))
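For reference, a minimal sketch of a command-line entry point for get_data; the script name and scrape-file path in the comment are placeholders, not real project files:
if __name__ == "__main__":
    import sys
    # Example invocation (placeholder names):
    #   python <this script> goals <path/to/scrape-file> --max-tuples 1000
    get_data(sys.argv[1:])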