Esempio n. 1
0
def tactic_substitutions(substitutions: Dict[str, str],
                         sample: ScrapedTactic) \
        -> ScrapedTactic:
    """Rebuild `sample`, replacing its tactic when its stem has a substitute.

    The stem of the sample's tactic (as computed by `get_stem`) is looked up
    in `substitutions`. On a hit the mapped string replaces the whole tactic;
    otherwise the tactic is kept. The other three fields are copied through
    unchanged.
    """
    relevant_lemmas, prev_tactics, context, tactic = sample
    stem = get_stem(tactic)
    if stem in substitutions:
        new_tactic = substitutions[stem]
    else:
        new_tactic = tactic
    return ScrapedTactic(relevant_lemmas, prev_tactics, context, new_tactic)
Esempio n. 2
0
    def add_tactic(self, predictions : List[PredictionResult], correct : str) -> None:
        """Fold one graded list of predictions into the running counters.

        The first prediction's grade drives `num_correct`, `num_partial` and
        `num_failed`. `num_topN` is bumped once if any prediction is graded
        "goodcommand" or "mostlygoodcommand"; `num_topNPartial` is bumped
        once if any prediction has one of those grades or "okaycommand".
        The frequency tables are keyed by tactic stem.
        """
        good_grades = ("goodcommand", "mostlygoodcommand")
        partial_grades = good_grades + ("okaycommand",)

        self.num_tactics += 1

        top_grade = predictions[0].grade
        if top_grade in good_grades:
            self.num_correct += 1
            self.num_partial += 1
            self.correctly_predicted_frequency[get_stem(correct)] += 1
        elif top_grade == "okaycommand":
            self.num_partial += 1
        else:
            self.num_failed += 1

        # any() stops at the first match, like the explicit loop-and-break.
        if any(grade in good_grades
               for _prediction, grade, _certainty in predictions):
            self.num_topN += 1
        if any(grade in partial_grades
               for _prediction, grade, _certainty in predictions):
            self.num_topNPartial += 1

        self.actual_tactic_frequency[get_stem(correct)] += 1
        self.predicted_tactic_frequency[get_stem(predictions[0].prediction)] += 1
Esempio n. 3
0
 def _get_prev(self, in_data: TacticContext) -> int:
     """Encode the stem of the most recent previous tactic.

     When `in_data.prev_tactics` has at most one entry the stem "Proof" is
     used. A stem the embedding does not know is encoded as "eauto".
     """
     history = in_data.prev_tactics
     if len(history) > 1:
         stem = get_stem(history[-1])
     else:
         stem = "Proof"
     # Unknown stems fall back to the token for "eauto".
     token = stem if self._embedding.has_token(stem) else "eauto"
     return self._embedding.encode_token(token)
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

        The loss is the criterion applied to the predicted distribution and
        the encoded stem of `correct`; it is 0 when the embedding has no
        token for that stem. `k` is capped at the number of embedding
        tokens. Each prediction is a decoded stem followed by "." paired
        with `exp` of its score. All work happens while holding `self._lock`.
        """
        with self._lock:
            distribution = self.predictDistribution(in_data)

            correct_stem = get_stem(correct)
            loss = 0
            if self._embedding.has_token(correct_stem):
                target = maybe_cuda(Variable(torch.LongTensor(
                    [self._embedding.encode_token(correct_stem)])))
                loss = self._criterion(distribution, target).item()

            k = min(k, self._embedding.num_tokens())
            certainties, indices = distribution.squeeze().topk(k)
            predictions = []
            for certainty, idx in zip(certainties, indices):
                tactic = self._embedding.decode_token(idx.item()) + "."
                predictions.append(
                    Prediction(tactic, math.exp(certainty.item())))
        return predictions, loss
    def predictKTacticsWithLoss_batch(self, in_data: List[TacticContext],
                                      k: int, corrects: List[str]):
        """Predict the top-`k` tactics for a batch and compute one batch loss.

        Each goal is encoded with `encode_ngram_classify_input`, run through
        `self._model` and `self._lsoftmax`, and scored against the encoded
        stems of `corrects`. Stems the embedding does not know are scored
        against index 0. `k` is capped at the number of embedding tokens.

        Returns a pair `(results, loss)` where `results` holds one list of
        predictions per input.
        """
        assert self.training_args
        with self._lock:
            encoded_goals = [
                encode_ngram_classify_input(context.goal,
                                            self.training_args.num_grams,
                                            self._tokenizer)
                for context in in_data
            ]
            distributions = self._lsoftmax(
                self._model(Variable(FloatTensor(encoded_goals))))

            correct_stems = list(map(get_stem, corrects))
            target_indices = []
            for stem in correct_stems:
                if self._embedding.has_token(stem):
                    target_indices.append(self._embedding.encode_token(stem))
                else:
                    # Unknown stems are scored against index 0.
                    target_indices.append(0)
            targets = maybe_cuda(Variable(torch.LongTensor(target_indices)))
            loss = self._criterion(distributions, targets).item()

            k = min(k, self._embedding.num_tokens())

            results = []
            for single_distribution in distributions:
                certainties, stem_idxs = single_distribution.view(-1).topk(k)
                results.append([
                    Prediction(
                        self._embedding.decode_token(stem_idx.item()) + ".",
                        math.exp(certainty.item()))
                    for certainty, stem_idx in zip(certainties, stem_idxs)
                ])
        return results, loss
Esempio n. 6
0
 def __call__(self, context: TacticContext) -> int:
     """Return the 1-based keyword position of the previous tactic's stem.

     The stem is "Proof" when `context.prev_tactics` has at most one entry.
     0 is returned when the stem is not in `self.tacticKeywords`.
     """
     if len(context.prev_tactics) > 1:
         prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
     else:
         prev_tactic = "Proof"
     if prev_tactic not in self.tacticKeywords:
         return 0
     # Shift by one so that 0 stays reserved for "not a keyword".
     return self.tacticKeywords.index(prev_tactic) + 1
Esempio n. 7
0
 def __call__(self, context: TacticContext) -> List[float]:
     """One-hot encode the previous tactic's stem over `self.tacticKeywords`.

     The stem is "Proof" when `context.prev_tactics` has at most one entry.
     The result is all zeros when the stem is not a keyword.
     """
     if len(context.prev_tactics) > 1:
         prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
     else:
         prev_tactic = "Proof"
     one_hot = [0. for _ in range(len(self.tacticKeywords))]
     if prev_tactic in self.tacticKeywords:
         one_hot[self.tacticKeywords.index(prev_tactic)] = 1.
     return one_hot
Esempio n. 8
0
 def predictKTacticsWithLoss(self, in_data : TacticContext, k : int, correct : str) -> \
     Tuple[List[Prediction], float]:
     """Return the top-`k` predictions (with arguments) and the loss on `correct`.

     Only the distribution computation runs under `self._lock`. `k` is
     capped at the number of embedding tokens. The loss is 0 when the
     embedding has no token for the stem of `correct`. When the context has
     no hypotheses, candidates are picked with `topk_with_filter`, rejecting
     stems for which `tacticTakesHypArgs` is true. Each chosen stem is
     completed by `self.add_arg` and paired with `exp` of its score.
     """
     assert self.training_args
     assert self._embedding
     with self._lock:
         prediction_distribution = self._predictDistributions([in_data])[0]
     k = min(k, self._embedding.num_tokens())

     correct_stem = serapi_instance.get_stem(correct)
     loss = 0
     if self._embedding.has_token(correct_stem):
         target = maybe_cuda(Variable(LongTensor(
             [self._embedding.encode_token(correct_stem)])))
         loss = self._criterion(prediction_distribution.view(1, -1),
                                target).item()

     flat_distribution = prediction_distribution.view(-1)
     if len(in_data.hypotheses) == 0:
         def takes_no_hyp_args(certainty, idx):
             stem = cast(Embedding, self._embedding).decode_token(idx)
             return not serapi_instance.tacticTakesHypArgs(stem)

         certainties, idxs = topk_with_filter(flat_distribution, k,
                                              takes_no_hyp_args)
     else:
         certainties, idxs = flat_distribution.topk(k)

     results = []
     for certainty, stem_idx in zip(certainties, idxs):
         tactic = self.add_arg(self._embedding.decode_token(stem_idx.item()),
                               in_data.goal, in_data.hypotheses,
                               self.training_args.max_length)
         results.append(Prediction(tactic, math.exp(certainty.item())))
     return results, loss
Esempio n. 9
0
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

        The loss is the criterion applied to the predicted distribution and
        the encoded stem of `correct`; it is 0 when the embedding has no
        token for that stem. Each prediction is a decoded stem followed by
        "." paired with `exp` of its score.

        `k` is capped at the number of entries in the distribution, so an
        oversized `k` yields every entry instead of making `topk` raise.
        """
        # `with` releases the lock even when prediction, scoring or decoding
        # raises; a bare acquire()/release() pair would leave it held.
        with self.lock:
            prediction_distribution = self.predictDistribution(in_data)
            correct_stem = get_stem(correct)
            if self.embedding.has_token(correct_stem):
                output_var = maybe_cuda(
                    Variable(
                        torch.LongTensor(
                            [self.embedding.encode_token(correct_stem)])))
                loss = self.criterion(prediction_distribution,
                                      output_var).item()
            else:
                loss = 0

            flat_distribution = prediction_distribution.view(-1)
            k = min(k, flat_distribution.numel())
            certainties, stem_idxs = flat_distribution.topk(k)
            results = [
                Prediction(
                    self.embedding.decode_token(stem_idx.item()) + ".",
                    math.exp(certainty.item()))
                for certainty, stem_idx in zip(certainties, stem_idxs)
            ]
        return results, loss
Esempio n. 10
0
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

        The distribution is the first result of `_predictDistributions` on a
        one-element batch. The loss is 0 when the embedding has no token for
        the stem of `correct`. `k` is capped at the number of embedding
        tokens. All work happens while holding `self._lock`.
        """
        with self._lock:
            prediction_distribution = self._predictDistributions([in_data])[0]

            correct_stem = get_stem(correct)
            loss = 0
            if self._embedding.has_token(correct_stem):
                target = maybe_cuda(Variable(torch.LongTensor(
                    [self._embedding.encode_token(correct_stem)])))
                loss = self._criterion(prediction_distribution.view(1, -1),
                                       target).item()

            k = min(k, self._embedding.num_tokens())
            certainties, stem_idxs = prediction_distribution.view(-1).topk(k)
            results = []
            for certainty, stem_idx in zip(certainties, stem_idxs):
                tactic = self._embedding.decode_token(stem_idx.item()) + "."
                results.append(Prediction(tactic, math.exp(certainty.item())))

        return results, loss
Esempio n. 11
0
def encode_seq_classify_data(data : RawDataset,
                             tokenizer_type : Callable[[List[str], int], Tokenizer],
                             num_keywords : int,
                             num_reserved_tokens : int,
                             save_tokens : Optional[str] = None,
                             load_tokens : Optional[str] = None,
                             num_relevance_samples : int = 1000) \
    -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    """Encode `data` as (goal, stem index) pairs for sequence classification.

    A tokenizer is either loaded from `load_tokens` or built with
    `make_keyword_tokenizer_relevance` from a random sample of at most
    `num_relevance_samples` items. If `save_tokens` is given the tokenizer
    is written there. The data is then encoded in 1024-item chunks by
    `encode_seq_classify_data_worker__` in a process pool, and the token
    list is frozen.

    Returns the encoded samples, the tokenizer, and the stem embedding.
    """
    embedding = SimpleEmbedding()
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the dataset size.
    sample_size = min(num_relevance_samples, len(data))
    subset = RawDataset(random.sample(data, sample_size))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        # NOTE(review): torch.load unpickles the file; only load token files
        # from a trusted source.
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        tokenizer = make_keyword_tokenizer_relevance(
            [(context, embedding.encode_token(get_stem(tactic)))
             for prev_tactics, hyps, context, tactic in subset],
            tokenizer_type, num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        result = [(goal, embedding.encode_token(tactic))
                  for goal, tactic in chain.from_iterable(
                      pool.imap(
                          functools.partial(encode_seq_classify_data_worker__,
                                            tokenizer), chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
Esempio n. 12
0
def get_tokens(args: List[str]):
    """Pick relevant keywords from a scrape file and write them out.

    Parses `args` as command-line arguments, reads the scraped data, takes
    a random sample of at most `--num-samples` items, and passes
    (goal, stem index) pairs to `get_relevant_k_keywords2`. The resulting
    keywords are written one per line to `dest`, or to stdout when `dest`
    is "-".
    """
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))
    embedding = SimpleEmbedding()
    # random.sample raises ValueError when asked for more items than the
    # population holds, so cap the request at the amount of data read.
    sample_size = min(arg_values.num_samples, len(raw_data))
    subset = data.RawDataset(random.sample(raw_data, sample_size))
    relevance_pairs = [
        (goal, embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, hyps, goal, tactic in subset
    ]
    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    # nullcontext hands back sys.stdout without closing it on exit.
    with (open(arg_values.dest, mode='w') if arg_values.dest != "-" else
          contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
Esempio n. 13
0
 def predictKTacticsWithLoss(
         self, in_data: TacticContext, k: int,
         correct: str) -> Tuple[List[Prediction], float]:
     """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

     `_predictDistribution` yields the stem distribution and a hypothesis
     name. The loss is +inf when the embedding has no token for the stem of
     `correct`. Stems for which `tacticTakesHypArgs` is true get the
     hypothesis name appended as their argument; every prediction ends
     with ".". Only the distribution and loss are computed under the lock.
     """
     with self._lock:
         distribution, hyp_var = self._predictDistribution(in_data)
         correct_stem = serapi_instance.get_stem(correct)
         if not self._embedding.has_token(correct_stem):
             loss = float("+inf")
         else:
             target = Variable(LongTensor(
                 [self._embedding.encode_token(correct_stem)]))
             loss = self._criterion(distribution.view(1, -1),
                                    target).item()
     indices, probabilities = list_topk(list(distribution), k)
     predictions: List[Prediction] = []
     for certainty, idx in zip(probabilities, indices):
         stem = self._embedding.decode_token(idx)
         if serapi_instance.tacticTakesHypArgs(stem):
             tactic = stem + " " + hyp_var + "."
         else:
             tactic = stem + "."
         predictions.append(Prediction(tactic, math.exp(certainty)))
     return predictions, loss
Esempio n. 14
0
def grade_prediction(correct_inter : ScrapedTactic, prediction : str):
    """Grade `prediction` against the tactic recorded in `correct_inter`.

    Returns "goodcommand" when the two tactics match after stripping,
    either verbatim or after `normalizeNumericArgs`; "okaycommand" when
    only their stems match; "mostlygoodcommand" when `proper_subs` maps the
    correct tactic to the prediction; otherwise "badcommand".
    """
    correct_tactic = correct_inter.tactic
    normalized_correct = \
        serapi_instance.normalizeNumericArgs(correct_inter).tactic
    # Normalize the prediction in the same proof context as the correct one.
    predicted_inter = ScrapedTactic(
        correct_inter.prev_tactics, correct_inter.hypotheses,
        correct_inter.goal, prediction)
    normalized_prediction = \
        serapi_instance.normalizeNumericArgs(predicted_inter).tactic

    correct_stripped = correct_tactic.strip()
    prediction_stripped = prediction.strip()
    if correct_stripped == prediction_stripped:
        return "goodcommand"
    if normalized_correct.strip() == normalized_prediction.strip():
        return "goodcommand"
    if get_stem(correct_tactic).strip() == get_stem(prediction).strip():
        return "okaycommand"
    if correct_stripped in proper_subs and \
       proper_subs[correct_stripped] == prediction_stripped:
        return "mostlygoodcommand"
    return "badcommand"
Esempio n. 15
0
 def grade_command_result(self, initial_context: str, predicted: str,
                          predicted_context: str, actual: str,
                          actual_context: str,
                          exception: Optional[Exception]) -> str:
     """Grade a predicted command from its text, its effect and any error.

     The checks run in this order, returning on the first that holds:
       - "goodcommand": predicted and actual match after stripping.
       - "okaycommand": their stems match.
       - "superfailedcommand": `exception` is a ParseError or LexError.
       - "failedcommand": any other exception was passed.
       - "mostlygoodcommand": the predicted and actual contexts are equal.
       - "uselesscommand": the predicted context equals the initial one.
       - "badcommand": none of the above.
     """
     if actual.strip() == predicted.strip():
         return "goodcommand"
     if get_stem(actual) == get_stem(predicted):
         return "okaycommand"
     # isinstance also recognizes subclasses of the two error types.
     if isinstance(exception, (ParseError, LexError)):
         return "superfailedcommand"
     # Identity test: "no exception" is specifically None.
     if exception is not None:
         return "failedcommand"
     if predicted_context == actual_context:
         return "mostlygoodcommand"
     if predicted_context == initial_context:
         return "uselesscommand"
     return "badcommand"
Esempio n. 16
0
 def _encode_tokenized_data(self, data : TokenizedDataset, arg_values : Namespace,
                            tokenizer : Tokenizer, embedding : Embedding) \
     -> PECDataset:
     """Turn (prev_tactics, goal, tactic) triples into a `PECDataset`.

     Each sample carries the embedding index of the last previous tactic's
     stem, or of "Proof" when there is at most one previous tactic. The
     goal and tactic are passed through unchanged. `arg_values` and
     `tokenizer` are not used here.
     """
     samples = []
     for prev_tactics, goal, tactic in data:
         if len(prev_tactics) > 1:
             prev_stem = get_stem(prev_tactics[-1])
         else:
             prev_stem = "Proof"
         samples.append(
             PECSample(embedding.encode_token(prev_stem), goal, tactic))
     return PECDataset(samples)
def embed_data(data : RawDataset) -> Tuple[Embedding, StrictEmbeddedDataset]:
    """Replace each sample's tactic with the embedding index of its stem.

    A fresh `SimpleEmbedding` is built while encoding. Progress and the
    elapsed time are printed to stdout. Returns the embedding together with
    the resulting `StrictEmbeddedDataset`.
    """
    embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    samples = []
    for prev_tactics, hypotheses, goal, tactic in data:
        stem_index = embedding.encode_token(get_stem(tactic))
        samples.append(
            EmbeddedSample(prev_tactics, hypotheses, goal, stem_index))
    dataset = StrictEmbeddedDataset(samples)
    elapsed = time.time() - start
    print("{:.2f}s".format(elapsed))
    return embedding, dataset
Esempio n. 18
0
 def __init__(self, init_dataset: List[TacticContext],
              args: argparse.Namespace) -> None:
     """Choose the tactic keywords from the most common previous-tactic stems.

     Only contexts with more than two previous tactics are counted. The
     keyword list is "Proof" followed by the `args.num_tactic_keywords`
     most common stems. It is reported through `eprint`, guarded by
     `args.print_keywords`.
     """
     prevTacticsCounts: typing.Counter[str] = Counter(
         serapi_instance.get_stem(prev_tactics[-1])
         for prev_tactics, hyps, goal in init_dataset
         if len(prev_tactics) > 2)
     common_stems = [
         stem for stem, _count in
         prevTacticsCounts.most_common(args.num_tactic_keywords)
     ]
     self.tacticKeywords = ["Proof"] + common_stems
     eprint("Tactic keywords are {}".format(self.tacticKeywords),
            guard=args.print_keywords)
 def predictKTacticsWithLoss(self, in_data : TacticContext, k : int,
                             correct : str) -> Tuple[List[Prediction], float]:
     """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

     The loss is +inf when the embedding has no token for the stem of
     `correct`. Candidates come from `list_topk` over the distribution;
     each is a decoded stem followed by "." paired with `exp` of its score.
     """
     distribution = self.predictDistribution(in_data)
     correct_stem = get_stem(correct)
     if not self.embedding.has_token(correct_stem):
         loss = float("+inf")
     else:
         scores = torch.FloatTensor(distribution).view(1, -1)
         target = Variable(torch.LongTensor(
             [self.embedding.encode_token(correct_stem)]))
         loss = self.criterion(scores, target).item()
     indices, probabilities = list_topk(list(distribution), k)
     predictions = []
     for certainty, idx in zip(probabilities, indices):
         tactic = self.embedding.decode_token(idx) + "."
         predictions.append(Prediction(tactic, math.exp(certainty)))
     return predictions, loss
Esempio n. 20
0
    def add_command_result(self, predictions: List[str], grades: List[str],
                           actual: str, loss: float) -> None:
        """Fold one command's predictions and grades into the running stats.

        The first grade drives `num_correct`, `num_partial` and
        `num_failed`. The first grade that is good, mostly good or okay
        drives `num_topN` and `num_topNPartial`. `num_searched` is bumped
        when the first grade that is not failed, superfailed or useless is
        a good or mostly good one. The frequency tables are keyed by stem.
        """
        good_grades = ("goodcommand", "mostlygoodcommand")
        failure_grades = ("failedcommand", "superfailedcommand")

        add_to_freq_table(self.actual_tactic_frequency, get_stem(actual))
        add_to_freq_table(self.predicted_tactic_frequency,
                          get_stem(predictions[0]))

        self.total_loss += loss
        self.num_tactics += 1

        top_grade = grades[0]
        if top_grade in good_grades:
            add_to_freq_table(self.correctly_predicted_frequency,
                              get_stem(predictions[0]))
            self.num_correct += 1
            self.num_partial += 1
        elif top_grade == "okaycommand":
            self.num_partial += 1
        elif top_grade in failure_grades:
            self.num_failed += 1

        # The earliest at-least-okay grade decides both top-N counters.
        first_usable = next(
            (grade for grade in grades
             if grade in good_grades or grade == "okaycommand"), None)
        if first_usable is not None:
            self.num_topNPartial += 1
            if first_usable in good_grades:
                self.num_topN += 1

        # Failed and useless commands are skipped; the first other grade
        # decides whether this counts as found by search.
        skipped_grades = failure_grades + ("uselesscommand",)
        first_effective = next(
            (grade for grade in grades if grade not in skipped_grades), None)
        if first_effective in good_grades:
            self.num_searched += 1
 def _features(self, context: TacticContext) \
         -> Tuple[List[int], List[float]]:
     """Compute two index features and two scalar features for `context`.

     Index features: the `emap_lookup` of the previous tactic's stem (0
     when there is at most one previous tactic) and of the goal's first
     word (0 when the goal is the empty string). Scalar features: the goal
     word count capped at 100 and the hypothesis count capped at 30, each
     divided by its cap.
     """
     prev_tactic_index = 0
     if len(context.prev_tactics) > 1:
         prev_tactic_index = emap_lookup(
             self.tactic_map, 32,
             serapi_instance.get_stem(context.prev_tactics[-1]))

     goal_head_index = 0
     if context.goal != "":
         goal_head = tokenizer.get_words(context.goal)[0]
         goal_head_index = emap_lookup(self.token_map, 128, goal_head)

     num_goal_words = len(tokenizer.get_words(context.goal))
     goal_length_feature = min(num_goal_words, 100) / 100
     num_hyps_feature = min(len(context.hypotheses), 30) / 30

     index_features = [prev_tactic_index, goal_head_index]
     scalar_features = [goal_length_feature, num_hyps_feature]
     return index_features, scalar_features
Esempio n. 22
0
 def from_data(init_dataset: List[TacticContext],
               args: argparse.Namespace) -> 'PrevTactic':
     """Build a `PrevTactic` from saved keywords or from `init_dataset`.

     When `args.load_tactic_keywords` is set and that path exists, the
     keywords are read with `torch.load`. Otherwise they are "Proof"
     followed by the `args.num_tactic_keywords` most common stems of the
     last previous tactic, counted over contexts with more than two
     previous tactics. The chosen keywords are reported through `eprint`.
     """
     prevTacticsCounts: typing.Counter[str] = Counter(
         serapi_instance.get_stem(prev_tactics[-1])
         for relevant_lemmas, prev_tactics, hyps, goal in init_dataset
         if len(prev_tactics) > 2)
     keyword_path = args.load_tactic_keywords
     if keyword_path and Path2(keyword_path).exists():
         keywords = torch.load(keyword_path)
     else:
         common_stems = [
             stem for stem, _count in
             prevTacticsCounts.most_common(args.num_tactic_keywords)
         ]
         keywords = ["Proof"] + common_stems
     result = PrevTactic(keywords)
     eprint("Tactic keywords are {}".format(result.tacticKeywords),
            guard=args.print_keywords)
     return result
Esempio n. 23
0
    def predictKTacticsWithLoss_batch(self,
                                      in_data : List[TacticContext],
                                      k : int, corrects : List[str]) -> \
                                      Tuple[List[List[Prediction]], float]:
        """Predict the top-`k` tactics for a batch and compute one batch loss.

        An empty batch returns `([], 0)` immediately. Otherwise each goal is
        tokenized, converted with `inputFromSentence`, and run through
        `self._model.run`. Stems of `corrects` that the embedding does not
        know are scored against index 0. `k` is capped at the number of
        embedding tokens.
        """
        assert self.training_args
        if len(in_data) == 0:
            return [], 0
        with self._lock:
            tokenized_goals = []
            for prev_tactics, hypotheses, goal in in_data:
                tokenized_goals.append(self._tokenizer.toTokenList(goal))
            max_length = self.training_args.max_length
            input_tensor = LongTensor([
                inputFromSentence(tokenized_goal, max_length)
                for tokenized_goal in tokenized_goals
            ])
            distributions = self._model.run(input_tensor,
                                            batch_size=len(in_data))

            correct_stems = list(map(get_stem, corrects))
            target_indices = []
            for stem in correct_stems:
                if self._embedding.has_token(stem):
                    target_indices.append(self._embedding.encode_token(stem))
                else:
                    # Unknown stems are scored against index 0.
                    target_indices.append(0)
            targets = maybe_cuda(Variable(torch.LongTensor(target_indices)))
            loss = self._criterion(distributions, targets).item()

            k = min(k, self._embedding.num_tokens())

            results = []
            for single_distribution in distributions:
                certainties, stem_idxs = single_distribution.view(-1).topk(k)
                results.append([
                    Prediction(
                        self._embedding.decode_token(stem_idx.item()) + ".",
                        math.exp(certainty.item()))
                    for certainty, stem_idx in zip(certainties, stem_idxs)
                ])
        return results, loss
def predictKTacticsWithLoss(prediction_distribution : torch.FloatTensor,
                            embedding : Embedding,
                            k : int,
                            correct : str,
                            criterion : nn.Module) -> Tuple[List[Prediction], float]:
    """Return the top-`k` predictions of a distribution and the loss on `correct`.

    `k` is capped at `embedding.num_tokens()`. The loss is `criterion`
    applied to the distribution (viewed as one row) and the encoded stem of
    `correct`; it is 0 when the embedding has no token for that stem. Each
    prediction is a decoded stem followed by "." paired with `exp` of its
    score.
    """
    k = min(k, embedding.num_tokens())

    correct_stem = get_stem(correct)
    loss = 0
    if embedding.has_token(correct_stem):
        target = maybe_cuda(Variable(
            torch.LongTensor([embedding.encode_token(correct_stem)])))
        loss = criterion(prediction_distribution.view(1, -1), target).item()

    certainties, stem_idxs = prediction_distribution.view(-1).topk(k)
    results = []
    for certainty, stem_idx in zip(certainties, stem_idxs):
        tactic = embedding.decode_token(stem_idx.item()) + "."
        results.append(Prediction(tactic, math.exp(certainty.item())))

    return results, loss
Esempio n. 25
0
 def predictKTacticsWithLoss_batch(self,
                                   in_data : List[TacticContext],
                                   k : int, corrects : List[str]) -> \
                                   Tuple[List[List[Prediction]], float]:
     """Predict the top-`k` tactics (with arguments) for a batch, plus a loss.

     Only the distribution computation runs under `self._lock`. Stems of
     `corrects` that the embedding does not know are scored against index
     0. `k` is capped at the number of embedding tokens. For a context
     without hypotheses, candidates are picked with `topk_with_filter`,
     rejecting stems for which `tacticTakesHypArgs` is true. Each chosen
     stem is completed by `self.add_arg`.
     """
     assert self._embedding
     assert self.training_args
     with self._lock:
         prediction_distributions = self._predictDistributions(in_data)

     correct_stems = list(map(serapi_instance.get_stem, corrects))
     target_indices = []
     for correct_stem in correct_stems:
         if self._embedding.has_token(correct_stem):
             target_indices.append(
                 self._embedding.encode_token(correct_stem))
         else:
             # Unknown stems are scored against index 0.
             target_indices.append(0)
     output_var = maybe_cuda(Variable(LongTensor(target_indices)))
     loss = self._criterion(prediction_distributions, output_var).item()

     k = min(k, self._embedding.num_tokens())

     def takes_no_hyp_args(certainty, idx):
         stem = cast(Embedding, self._embedding).decode_token(idx)
         return not serapi_instance.tacticTakesHypArgs(stem)

     # First pick the candidates for every context, then render them.
     picks = []
     for single_distribution, context in zip(prediction_distributions,
                                             in_data):
         flat_distribution = single_distribution.view(-1)
         if len(context.hypotheses) > 0:
             picks.append(flat_distribution.topk(k))
         else:
             picks.append(topk_with_filter(flat_distribution, k,
                                           takes_no_hyp_args))

     results = []
     for (certainties, stem_idxs), in_datum in zip(picks, in_data):
         row = []
         for certainty, stem_idx in zip(certainties, stem_idxs):
             tactic = self.add_arg(
                 self._embedding.decode_token(stem_idx.item()),
                 in_datum.goal, in_datum.hypotheses,
                 self.training_args.max_length)
             row.append(Prediction(tactic, math.exp(certainty.item())))
         results.append(row)
     return results, loss
Esempio n. 26
0
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        """Return the top-`k` predictions for `in_data` plus the loss on `correct`.

        The loss is the criterion applied to the distribution (viewed as one
        row) and the encoded stem of `correct`; it is 0 when the embedding
        has no token for that stem. Each prediction is a decoded stem
        followed by "." paired with `exp` of its score.

        `k` is capped at the number of entries in the distribution, so an
        oversized `k` yields every entry instead of making `topk` raise.
        """
        # `with` releases the lock even when prediction, scoring or decoding
        # raises; a bare acquire()/release() pair would leave it held.
        with self.lock:
            distribution = self.predictDistribution(in_data)
            stem = get_stem(correct)
            if self.embedding.has_token(stem):
                output_var = maybe_cuda(
                    Variable(
                        torch.LongTensor([self.embedding.encode_token(stem)])))
                loss = self.criterion(distribution.view(1, -1),
                                      output_var).item()
            else:
                loss = 0

            k = min(k, distribution.numel())
            certainties, idxs = distribution.squeeze().topk(k)
            predictions_and_certainties = [
                Prediction(self.embedding.decode_token(idx.item()) + ".",
                           math.exp(certainty.item()))
                for certainty, idx in zip(list(certainties), list(idxs))
            ]

        return predictions_and_certainties, loss
Esempio n. 27
0
def encode_hyparg_data(data : RawDataset,
                       tokenizer_type : Callable[[List[str], int], Tokenizer],
                       num_keywords : int,
                       num_reserved_tokens : int,
                       max_args : int,
                       max_hyps : int,
                       encoded_length : int,
                       entropy_data_size : int,
                       num_threads : Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    """Encode scraped samples as (hyps, context, tactic-structure) triples.

    A keyword tokenizer is built with `make_keyword_tokenizer_relevance`
    from at most `entropy_data_size` randomly chosen samples. Contexts,
    hypotheses and tactic structures are then encoded in a process pool of
    `num_threads` workers, and the token list is frozen.

    Returns the encoded samples, the tokenizer, and the stem embedding.
    """
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context, stem_embedding.encode_token(
            serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1,
                                    tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # Samples are 5-tuples, as the unpacking above shows. Transposing
        # them gives five columns, so the first two (relevant lemmas and
        # previous tactics) are dropped here; unpacking into three names
        # would raise ValueError.
        _relevant_lemmas, _prev_tactics, hyps, contexts, tactics = \
            zip(*data_list)
        encoded_contexts = pool.imap(
            functools.partial(_encode, tokenizer, termEncoder), contexts)
        # NOTE(review): the hypothesis encoder is mapped over `contexts`,
        # not `hyps`; left unchanged, confirm this is intended.
        encoded_hyps = pool.imap(
            functools.partial(_encode_hyps, tokenizer, termEncoder, max_hyps,
                              encoded_length), contexts)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding,
                              max_args), zip(hyps, tactics))
        # Consume the lazy imap results while the pool is still open.
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
Esempio n. 28
0
def main(arg_list: List[str]) -> None:
    """Train a tactic-stem classifier on top of a saved autoencoder.

    Parses `arg_list`, loads the autoencoder state with `torch.load`,
    encodes the text data as (token list, stem index) pairs, and runs
    `train`. After every checkpoint that `train` yields, the full state is
    saved to `save_file`, replacing the previous contents.
    """
    parser = argparse.ArgumentParser(description="Autoencoder for coq terms")
    for positional in ("scrape_file", "autoencoder_weights", "save_file"):
        parser.add_argument(positional)

    # Optional flags, in the order they should appear in the help text.
    option_table = [
        ("--num-epochs", dict(dest="num_epochs", default=15, type=int)),
        ("--batch-size", dict(dest="batch_size", default=256, type=int)),
        ("--max-tuples", dict(dest="max_tuples", default=None, type=int)),
        ("--print-every", dict(dest="print_every", default=10, type=int)),
        ("--learning-rate",
         dict(dest="learning_rate", default=.7, type=float)),
        ("--gamma", dict(default=.9, type=float)),
        ("--epoch-step", dict(dest="epoch_step", default=5, type=int)),
        ("--optimizer",
         dict(choices=list(stdargs.optimizers.keys()),
              type=str,
              default=list(stdargs.optimizers.keys())[0])),
        ("--num-classifier-layers",
         dict(dest="num_classifier_layers", default=3, type=int)),
        ("--classifier-hidden-size",
         dict(dest="classifier_hidden_size", default=128, type=int)),
        ("--train-autoencoder",
         dict(dest="train_autoencoder", default=False, const=True,
              action='store_const')),
    ]
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    args = parser.parse_args(arg_list)

    print("Loading autoencoder state...")
    autoenc_state = torch.load(args.autoencoder_weights)
    cfilter = autoenc_state['context-filter']

    text_data = get_text_data(args)
    print("Encoding data...")
    start = time.time()
    tokenizer = autoenc_state['tokenizer']
    embedding = SimpleEmbedding()
    dataset = []
    for prev_tactics, hyps, goal, tactic in text_data:
        dataset.append((tokenizer.toTokenList(goal),
                        embedding.encode_token(get_stem(tactic))))
    timeTaken = time.time() - start
    print("Encoded data in {:.2f}".format(timeTaken))

    encoder = EncoderRNN(tokenizer.numTokens(), autoenc_state['hidden-size'],
                         autoenc_state['num-encoder-layers'], args.batch_size)
    loadedAutoencoder = maybe_cuda(encoder)
    loadedAutoencoder.load_state_dict(autoenc_state['encoder'])
    checkpoints = train(
        dataset, loadedAutoencoder, args.train_autoencoder,
        autoenc_state['max-length'],
        autoenc_state['hidden-size'], args.classifier_hidden_size,
        embedding.num_tokens(), args.num_classifier_layers, args.batch_size,
        args.learning_rate, args.gamma, args.epoch_step, args.num_epochs,
        args.print_every, stdargs.optimizers[args.optimizer])

    for epoch, checkpoint in enumerate(checkpoints):
        decoder_state, autoencoder_state, training_loss = checkpoint
        print("Autoenc training loss is {:.4f}".format(
            autoenc_state['training-loss']))
        state = {
            'epoch': epoch,
            'training-loss': training_loss,
            'autoenc-training-loss': autoenc_state['training-loss'],
            'autoenc-epoch': autoenc_state['epoch'],
            'tokenizer': tokenizer,
            'tokenizer-name': autoenc_state['tokenizer-name'],
            'optimizer': args.optimizer,
            'autoenc-optimizer': autoenc_state['optimizer'],
            'learning-rate': args.learning_rate,
            'autoenc-learning-rate': autoenc_state['learning-rate'],
            'encoder': autoencoder_state,
            'decoder': decoder_state,
            'num-decoder-layers': args.num_classifier_layers,
            'num-encoder-layers': autoenc_state['num-encoder-layers'],
            'context-filter': cfilter,
            'max-length': autoenc_state['max-length'],
            'encoded-size': autoenc_state['hidden-size'],
            'hidden-size': args.classifier_hidden_size,
            'num-keywords': autoenc_state['num-keywords'],
            'stem-embedding': embedding,
        }
        # Each checkpoint replaces the previous contents of the save file.
        with open(args.save_file, 'wb') as f:
            print("=> Saving checkpoint at epoch {}".format(epoch))
            torch.save(state, f)
Esempio n. 29
0
def get_data(args: List[str]) -> None:
    """Dump the contents of a scrape file to stdout in one of several formats.

    ``args`` is an argv-style list of strings handed to argparse. The
    positional ``format`` argument picks the output:

    * ``terms``: tokenized terms, one per line, truncated at the EOS token.
    * ``goals``: the goal of every datum.
    * ``hyps+goal``: hypotheses, a separator line, then the goal.
    * ``hyps+goal+tactic``: as above, followed by the tactic.
    * ``tacvector``: one CSV row per datum holding the word features, the
      vector features, and the encoded tactic stem.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument(
        "format",
        choices=["terms", "goals", "hyps+goal", "hyps+goal+tactic",
                 "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    tokenizer_names = list(tokenizers.keys())
    parser.add_argument("--tokenizer", choices=tokenizer_names, type=str,
                        default=tokenizer_names[0])
    # The destination names below are the ones argparse derives from the
    # option strings (dashes become underscores).
    parser.add_argument("--max-tuples", default=None, type=int)
    parser.add_argument("--num-keywords", default=100, type=int)
    parser.add_argument("--num-head-keywords", type=int, default=100)
    parser.add_argument("--num-tactic-keywords", type=int, default=50)
    parser.add_argument("--print-keywords", action='store_true')
    parser.add_argument("--max-length", default=None, type=int)
    parser.add_argument("--lineend", action='store_true')
    parser.add_argument("--context-filter", default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    fmt = arg_values.format

    if fmt == "terms":
        raw_tuples = itertools.islice(
            data.read_text_data(arg_values.scrape_file),
            arg_values.max_tuples)
        terms, tokenizer = data.term_data(
            data.RawDataset(list(raw_tuples)),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        # With --lineend every term is followed by a literal backslash-n
        # and then a real newline.
        terminator = "\\n\n" if arg_values.lineend else "\n"
        for term in terms:
            before_eos = itertools.takewhile(
                lambda tok: tok != data.EOS_token, term)
            print(tokenizer.toString(list(before_eos)), end=terminator)
        return

    if fmt in ("goals", "hyps+goal", "hyps+goal+tactic"):
        # The three textual formats differ only in which parts of each
        # datum get printed around the goal.
        with_hyps = fmt != "goals"
        with_tactic = fmt == "hyps+goal+tactic"
        for _prev_tactics, hyps, goal, tactic in \
                data.get_text_data(arg_values):
            if with_hyps:
                for hyp in hyps:
                    print(hyp)
                print("================================")
            print(goal)
            if with_tactic:
                print("====> {}".format(tactic))
        return

    if fmt == "tacvector":
        dataset = data.get_text_data(arg_values)
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped) for scraped in dataset]
        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            make_feature(stripped_data, arg_values)
            for make_feature in features.word_feature_constructors
        ]
        vec_features_functions = [
            make_feature(stripped_data, arg_values)
            for make_feature in features.vec_feature_constructors
        ]
        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [[feature(c) for feature in word_feature_functions]
                         for c in stripped_data]
        # Each vector feature yields several values; flatten them per datum.
        vec_features = [
            list(itertools.chain.from_iterable(
                feature(c) for feature in vec_features_functions))
            for c in stripped_data
        ]
        eprint("Done", guard=arg_values.verbose)
        for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                               answers):
            row = itertools.chain(word_feat, vec_feat, [tactic])
            print(",".join(map(str, row)))
Esempio n. 30
0
def write_html(output_dir : Path2, filename : Path2, command_results : List[CommandResult],
               stats : 'ResultStats') -> None:
    """Render the command results for one file as a detailed HTML report.

    The command results are split into regions. A region whose second
    entry is a plain (one-element) command is printed as plain code.
    Every other region gets a collapsible button showing its first
    command and its unfiltered-tactic count, followed by a div holding
    the remaining commands. Tactic results become spans carrying the
    hypotheses, goal, predictions, grades, certainties and frequency
    statistics as ``data-*`` attributes.

    The page is written to ``output_dir`` under the escaped ``filename``
    with its suffix replaced by ``.html``.
    """
    doc, tag, text, _line = Doc().ttl()

    def plain_command(result):
        # A command without prediction data: just echo its text.
        assert isinstance(result[0], str)
        with tag('code', klass='plaincommand'):
            text("\n" + result[0].strip('\n'))

    def tactic_span(region_idx, cmd_idx, tactic_result):
        # A command with predictions: one span holding all report data.
        command, hyps, goal, prediction_results = \
            cast(TacticResult, tactic_result)
        predictions : List[str]
        grades : List[str]
        certainties : List[float]
        if len(prediction_results) > 0:
            predictions, grades, certainties = zip(*prediction_results) # type: ignore
        else:
            predictions, grades, certainties = [], [], []
        span_attrs = [
            ('data-hyps', "\n".join(hyps)),
            ('data-goal', format_goal(goal)),
            ('data-num-total', str(stats.num_tactics)),
            ('data-predictions',
             to_list_string(cast(List[str], predictions))),
            ('data-num-predicteds',
             to_list_string([
                 stats.predicted_tactic_frequency.get(get_stem(prediction), 0)
                 for prediction in cast(List[str], predictions)])),
            ('data-num-corrects',
             to_list_string([
                 stats.correctly_predicted_frequency.get(get_stem(prediction), 0)
                 for prediction in cast(List[str], predictions)])),
            ('data-certainties',
             to_list_string(cast(List[float], certainties))),
            ('data-num-actual-corrects',
             stats.correctly_predicted_frequency.get(get_stem(command), 0)),
            ('data-num-actual-in-file',
             stats.actual_tactic_frequency.get(get_stem(command), 0)),
            ('data-actual-tactic', strip_comments(command)),
            ('data-grades', to_list_string(cast(List[str], grades))),
            ('data-search-idx', 0),
        ]
        with tag('span', *span_attrs,
                 id='command-{}-{}'.format(region_idx, cmd_idx),
                 onmouseover='hoverTactic("{}-{}")'.format(region_idx, cmd_idx),
                 onmouseout='unhoverTactic()',
                 onclick='selectTactic("{}-{}"); event.stopPropagation();'
                 .format(region_idx, cmd_idx)):
            doc.stag("br")
            # The command is styled by the grade of the first prediction,
            # or as a plain command when there are no predictions.
            command_class = grades[0] if grades else "plaincommand"
            with tag('code', klass=command_class):
                text(command.strip("\n"))
            # One marker per remaining prediction, styled by its grade.
            for grade in grades[1:]:
                with tag('span', klass=grade):
                    doc.asis(" &#11044;")

    with tag('html'):
        header(tag, doc, text, details_css, details_javascript,
               "Proverbot Detailed Report for {}".format(filename))
        # Empty overlay panes, emitted with no content.
        with tag('div', id='overlay', onclick='event.stopPropagation();'):
            for pane_id in ('predicted', 'context', 'stats'):
                with tag('div', id=pane_id):
                    pass
        with tag('body', onclick='deselectTactic()',
                 onload='init()'), tag('pre'):
            for region_idx, region in enumerate(split_into_regions(command_results)):
                if len(region) > 1 and len(region[1]) == 1:
                    for entry in region:
                        plain_command(entry)
                    continue

                doc.stag("br")
                with tag('button', klass='collapsible',
                         id='collapsible-{}'.format(region_idx)):
                    with tag('code', klass='buttontext'):
                        assert isinstance(region[0][0], str), region
                        text(region[0][0].strip("\n"))
                    num_unfiltered = count_region_unfiltered(region)
                    fill_class = 'nonempty' if num_unfiltered > 3 else 'empty'
                    with tag('code', klass='numtacs ' + fill_class):
                        text(num_unfiltered)
                with tag('div', klass='region'):
                    # Command indices count from the entry after the
                    # region's first command.
                    for cmd_idx, entry in enumerate(region[1:]):
                        if len(entry) == 1:
                            plain_command(entry)
                        else:
                            tactic_span(region_idx, cmd_idx, entry)

    report_path = (output_dir / escape_filename(str(filename)))\
        .with_suffix(".html")
    with report_path.open(mode='w') as fout:
        fout.write(doc.getvalue())