Code example #1
File: data.py Project: UCSD-PL/proverbot9001
def tactic_substitutions(substitutions: Dict[str, str],
                         sample: ScrapedTactic) \
        -> ScrapedTactic:
    relevant_lemmas, prev_tactics, context, tactic = sample
    return ScrapedTactic(relevant_lemmas, prev_tactics, context,
                         tactic if get_stem(tactic) not in substitutions
                         else substitutions[get_stem(tactic)])
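Every example on this page hinges on get_stem, which evidently extracts the leading keyword of a tactic string. A standalone sketch of that behaviour (a stand-in, not the project's implementation):

def get_stem_sketch(tactic: str) -> str:
    # Stand-in for serapi_instance.get_stem: take the leading keyword.
    return tactic.strip().rstrip(".").split()[0]

assert get_stem_sketch("apply H.") == "apply"
assert get_stem_sketch("intros x y.") == "intros"

# Under that reading, tactic_substitutions above replaces the *whole*
# tactic whenever its stem appears in the table; a hypothetical
# {"auto": "eauto."} would turn any "auto ..." sample into "eauto.".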
Code example #2
    def add_tactic(self, predictions: List[PredictionResult],
                   correct: str) -> None:
        self.num_tactics += 1

        if predictions[0].grade == "goodcommand" or \
           predictions[0].grade == "mostlygoodcommand":
            self.num_correct += 1
            self.num_partial += 1
            self.correctly_predicted_frequency[get_stem(correct)] += 1
        elif predictions[0].grade == "okaycommand":
            self.num_partial += 1
        else:
            self.num_failed += 1

        for prediction, grade, certainty in predictions:
            if grade == "goodcommand" or \
               grade == "mostlygoodcommand":
                self.num_topN += 1
                break
        for prediction, grade, certainty in predictions:
            if grade == "goodcommand" or \
               grade == "mostlygoodcommand":
                self.num_topNPartial += 1
                break
            if grade == "okaycommand":
                self.num_topNPartial += 1
                break

        self.actual_tactic_frequency[get_stem(correct)] += 1
        self.predicted_tactic_frequency[get_stem(
            predictions[0].prediction)] += 1
Code example #3
def get_tokens(args: List[str]):
    parser = argparse.ArgumentParser(description="Pick a set of tokens")
    parser.add_argument("--type", choices=["mixed"], default="mixed")
    parser.add_argument("-v", "--verbose", action='count', default=0)
    parser.add_argument("-n", "--num-keywords", type=int, default=120)
    parser.add_argument("-s", "--num-samples", type=int, default=2000)
    parser.add_argument("-j", "--num-threads", type=int, default=None)
    parser.add_argument("scrapefile", type=Path2)
    parser.add_argument("dest")
    arg_values = parser.parse_args(args)

    with print_time("Reading scraped data", guard=arg_values.verbose):
        raw_data = list(data.read_text_data(arg_values.scrapefile))
    embedding = SimpleEmbedding()
    subset = data.RawDataset(random.sample(raw_data, arg_values.num_samples))
    relevance_pairs = [
        (context.focused_goal,
         embedding.encode_token(serapi_instance.get_stem(tactic)))
        for relevant_lemmas, prev_tactics, context, tactic in subset
    ]
    with print_time("Calculating keywords", guard=arg_values.verbose):
        keywords = get_relevant_k_keywords2(relevance_pairs,
                                            arg_values.num_keywords,
                                            arg_values.num_threads)

    with (open(arg_values.dest, mode='w') if arg_values.dest != "-" else
          contextlib.nullcontext(sys.stdout)) as f:
        for keyword in keywords:
            f.write(keyword + "\n")
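A hypothetical invocation of this entry point (the scrape-file path is illustrative); passing "-" as dest routes the keyword list to stdout through the nullcontext branch above:

get_tokens(["--num-keywords", "50", "--num-samples", "2000",
            "data/scrape.txt", "-"])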
Code example #4
 def predictKTacticsWithLoss_batch(self,
                                   in_data : List[TacticContext],
                                   k : int, corrects : List[str]) -> \
                                   Tuple[List[List[Prediction]], float]:
     assert self._embedding
     assert self.training_args
     with self._lock:
         prediction_distributions = self._predictDistributions(in_data)
     correct_stems = [serapi_instance.get_stem(correct) for correct in corrects]
     output_var = maybe_cuda(Variable(
         LongTensor([self._embedding.encode_token(correct_stem)
                     if self._embedding.has_token(correct_stem)
                     else 0
                     for correct_stem in correct_stems])))
     loss = self._criterion(prediction_distributions, output_var).item()
     if k > self._embedding.num_tokens():
         k = self._embedding.num_tokens()
     certainties_and_idxs_list = \
         [single_distribution.view(-1).topk(k) if len(context.hypotheses) > 0 else
          topk_with_filter(single_distribution.view(-1), k,
                           lambda certainty, idx:
                           not serapi_instance.tacticTakesHypArgs(
                               cast(Embedding, self._embedding).decode_token(idx)))
          for single_distribution, context in
          zip(prediction_distributions, in_data)]
     results = [[Prediction(self.add_arg(self._embedding.decode_token(stem_idx.item()),
                                         in_datum.goal, in_datum.hypotheses,
                                         self.training_args.max_length),
                            math.exp(certainty.item()))
                 for certainty, stem_idx in zip(*certainties_and_idxs)]
                for certainties_and_idxs, in_datum in
                zip(certainties_and_idxs_list, in_data)]
     return results, loss
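When a context has no hypotheses, this example (and example #7 below) filters the top-k so that stems requiring a hypothesis argument are never proposed. A standalone sketch of what a topk_with_filter must do under that reading (a stand-in, not the project's implementation):

import torch

def topk_with_filter_sketch(t: torch.Tensor, k: int, pred):
    # Keep the k best entries whose (value, index) pair passes pred.
    values, indices = t.sort(descending=True)
    kept = [(v, i) for v, i in zip(values.tolist(), indices.tolist())
            if pred(v, i)][:k]
    vs, idxs = zip(*kept)
    return torch.tensor(vs), torch.tensor(idxs)

scores = torch.tensor([0.1, 0.9, 0.5, 0.7])
vals, idxs = topk_with_filter_sketch(scores, 2, lambda v, i: i != 1)
assert idxs.tolist() == [3, 2]  # index 1 filtered out despite scoring highest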
Code example #5
File: features.py Project: UCSD-PL/proverbot9001
 def __call__(self, context: TacticContext) -> int:
     prev_tactic = (serapi_instance.get_stem(context.prev_tactics[-1])
                    if len(context.prev_tactics) > 1 else "Proof")
     if prev_tactic in self.tacticKeywords:
         return self.tacticKeywords.index(prev_tactic) + 1
     else:
         return 0
Code example #6
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        self.lock.acquire()
        prediction_distribution = self.predictDistribution(in_data)
        correct_stem = get_stem(correct)
        if self.embedding.has_token(correct_stem):
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor(
                        [self.embedding.encode_token(correct_stem)])))
            loss = self.criterion(prediction_distribution, output_var).item()
        else:
            loss = 0

        certainties_and_idxs = prediction_distribution.view(-1).topk(k)
        results = [
            Prediction(
                self.embedding.decode_token(stem_idx.item()) + ".",
                math.exp(certainty.item()))
            for certainty, stem_idx in zip(*certainties_and_idxs)
        ]

        self.lock.release()
        return results, loss
Code example #7
 def predictKTacticsWithLoss(self, in_data : TacticContext, k : int, correct : str) -> \
     Tuple[List[Prediction], float]:
     assert self.training_args
     assert self._embedding
     with self._lock:
         prediction_distribution = self._predictDistributions([in_data])[0]
     if k > self._embedding.num_tokens():
         k = self._embedding.num_tokens()
     correct_stem = serapi_instance.get_stem(correct)
     if self._embedding.has_token(correct_stem):
         output_var = maybe_cuda(Variable(
             LongTensor([self._embedding.encode_token(correct_stem)])))
         loss = self._criterion(prediction_distribution.view(1, -1), output_var).item()
     else:
         loss = 0
     if len(in_data.hypotheses) == 0:
         certainties, idxs = topk_with_filter(
             prediction_distribution.view(-1), k,
             lambda certainty, idx:
             not serapi_instance.tacticTakesHypArgs(
                 cast(Embedding, self._embedding).decode_token(idx)))
     else:
         certainties, idxs = prediction_distribution.view(-1).topk(k)
     results = [Prediction(self.add_arg(self._embedding.decode_token(stem_idx.item()),
                                        in_data.goal, in_data.hypotheses,
                                        self.training_args.max_length),
                           math.exp(certainty.item()))
                for certainty, stem_idx in zip(certainties, idxs)]
     return results, loss
Code example #8
File: features.py Project: UCSD-PL/proverbot9001
 def __call__(self, context: TacticContext) -> List[float]:
     prev_tactic = (serapi_instance.get_stem(context.prev_tactics[-1])
                    if len(context.prev_tactics) > 1 else "Proof")
     oneHotPrevs = [0.] * len(self.tacticKeywords)
     if prev_tactic in self.tacticKeywords:
         oneHotPrevs[self.tacticKeywords.index(prev_tactic)] = 1.
     return oneHotPrevs
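Examples #5 and #8 encode the same previous-tactic keyword list in two ways: #5 as a single 1-based index (0 for unknown stems), #8 as a one-hot vector. A minimal self-contained sketch of that correspondence, using a hypothetical keyword list:

from typing import List

tactic_keywords = ["intros", "apply", "auto"]  # hypothetical keyword list

def index_feature(prev_stem: str) -> int:
    # As in example #5: 1-based index, 0 for stems outside the list.
    if prev_stem in tactic_keywords:
        return tactic_keywords.index(prev_stem) + 1
    return 0

def one_hot_feature(prev_stem: str) -> List[float]:
    # As in example #8: one-hot over the same list.
    vec = [0.] * len(tactic_keywords)
    if prev_stem in tactic_keywords:
        vec[tactic_keywords.index(prev_stem)] = 1.
    return vec

assert index_feature("apply") == 2
assert one_hot_feature("apply") == [0., 1., 0.]
assert index_feature("omega") == 0  # unknown stems fall through to 0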
Code example #9
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        with self._lock:
            distribution = self.predictDistribution(in_data)
            stem = get_stem(correct)
            if self._embedding.has_token(stem):
                output_var = maybe_cuda(
                    Variable(
                        torch.LongTensor([self._embedding.encode_token(stem)
                                          ])))
                loss = self._criterion(distribution, output_var).item()
            else:
                loss = 0

            if k > self._embedding.num_tokens():
                k = self._embedding.num_tokens()
            probs_and_indices = distribution.squeeze().topk(k)
            predictions = [
                Prediction(
                    self._embedding.decode_token(idx.item()) + ".",
                    math.exp(certainty.item()))
                for certainty, idx in zip(*probs_and_indices)
            ]
        return predictions, loss
Code example #10
    def predictKTacticsWithLoss_batch(self, in_data: List[TacticContext],
                                      k: int, corrects: List[str]):
        assert self.training_args
        with self._lock:
            input_tensor = Variable(
                FloatTensor([
                    encode_ngram_classify_input(in_data_point.goal,
                                                self.training_args.num_grams,
                                                self._tokenizer)
                    for in_data_point in in_data
                ]))
            prediction_distributions = self._lsoftmax(
                self._model(input_tensor))
            correct_stems = [get_stem(correct) for correct in corrects]
            output_var = maybe_cuda(
                Variable(
                    torch.LongTensor([
                        self._embedding.encode_token(correct_stem)
                        if self._embedding.has_token(correct_stem) else 0
                        for correct_stem in correct_stems
                    ])))
            loss = self._criterion(prediction_distributions, output_var).item()
            if k > self._embedding.num_tokens():
                k = self._embedding.num_tokens()

            certainties_and_idxs_list = \
                [single_distribution.view(-1).topk(k)
                 for single_distribution in list(prediction_distributions)]
            results = [[
                Prediction(
                    self._embedding.decode_token(stem_idx.item()) + ".",
                    math.exp(certainty.item()))
                for certainty, stem_idx in zip(*certainties_and_idxs)
            ] for certainties_and_idxs in certainties_and_idxs_list]
        return results, loss
Code example #11
    def predictKTacticsWithLoss_batch(self,
                                      in_data : List[TacticContext],
                                      k : int, corrects : List[str]) -> \
                                      Tuple[List[List[Prediction]], float]:
        assert self.training_args
        if len(in_data) == 0:
            return [], 0
        with self._lock:
            tokenized_goals = [self._tokenizer.toTokenList(goal)
                               for relevant_lemmas, prev_tactics, hypotheses, goal
                               in in_data]
            input_tensor = LongTensor([inputFromSentence(tokenized_goal,
                                                         self.training_args.max_length)
                                      for tokenized_goal in tokenized_goals])
            prediction_distributions = self._model.run(input_tensor,
                                                       batch_size=len(in_data))
            correct_stems = [get_stem(correct) for correct in corrects]
            output_var = maybe_cuda(Variable(
                torch.LongTensor([self._embedding.encode_token(correct_stem)
                                  if self._embedding.has_token(correct_stem)
                                  else 0
                                  for correct_stem in correct_stems])))
            loss = self._criterion(prediction_distributions, output_var).item()

            if k > self._embedding.num_tokens():
                k = self._embedding.num_tokens()

            certainties_and_idxs_list = [single_distribution.view(-1).topk(k)
                                         for single_distribution in
                                         list(prediction_distributions)]
            results = [[Prediction(self._embedding.decode_token(stem_idx.item()) + ".",
                                   math.exp(certainty.item()))
                        for certainty, stem_idx in zip(*certainties_and_idxs)]
                       for certainties_and_idxs in certainties_and_idxs_list]
        return results, loss
Code example #12
def encode_hyparg_data(data : RawDataset,
                       tokenizer_type : Callable[[List[str], int], Tokenizer],
                       num_keywords : int,
                       num_reserved_tokens : int,
                       max_args : int,
                       max_hyps : int,
                       encoded_length : int,
                       entropy_data_size : int,
                       num_threads : Optional[int] = None) -> \
                       Tuple[StructDataset, Tokenizer, SimpleEmbedding]:
    stem_embedding = SimpleEmbedding()
    data_list = list(data)
    if len(data_list) <= entropy_data_size:
        subset = data_list
    else:
        subset = random.sample(data_list, entropy_data_size)
    tokenizer = make_keyword_tokenizer_relevance(
        [(context, stem_embedding.encode_token(serapi_instance.get_stem(tactic)))
         for relevant_lemmas, prev_tactics, hyps, context, tactic in subset],
        tokenizer_type, num_keywords, num_reserved_tokens)
    termEncoder = functools.partial(getNGramTokenbagVector, 1, tokenizer.numTokens())
    with multiprocessing.Pool(num_threads) as pool:
        # data_list items carry five fields, as unpacked above.
        _lemmas, _prev_tactics, hyps, contexts, tactics = zip(*data_list)
        encoded_contexts = pool.imap(functools.partial(
            _encode, tokenizer, termEncoder), contexts)
        encoded_hyps = pool.imap(functools.partial(
            _encode_hyps, tokenizer, termEncoder, max_hyps, encoded_length), contexts)
        encoded_tactics = pool.imap(
            functools.partial(encode_tactic_structure, stem_embedding, max_args),
            zip(hyps, tactics))
        result = list(zip(encoded_hyps, encoded_contexts, encoded_tactics))
    tokenizer.freezeTokenList()
    return result, tokenizer, stem_embedding
Code example #13
 def _get_prev(self, in_data: TacticContext) -> int:
     stem = get_stem(in_data.prev_tactics[-1]) \
         if len(in_data.prev_tactics) > 1 else "Proof"
     if self._embedding.has_token(stem):
         return self._embedding.encode_token(stem)
     else:
         return self._embedding.encode_token("eauto")
Code example #14
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        with self._lock:
            prediction_distribution = self._predictDistributions([in_data])[0]
            correct_stem = get_stem(correct)
            if self._embedding.has_token(correct_stem):
                output_var = maybe_cuda(
                    Variable(
                        torch.LongTensor(
                            [self._embedding.encode_token(correct_stem)])))
                loss = self._criterion(prediction_distribution.view(1, -1),
                                       output_var).item()
            else:
                loss = 0

            if k > self._embedding.num_tokens():
                k = self._embedding.num_tokens()
            certainties_and_idxs = prediction_distribution.view(-1).topk(k)
            results = [
                Prediction(
                    self._embedding.decode_token(stem_idx.item()) + ".",
                    math.exp(certainty.item()))
                for certainty, stem_idx in zip(*certainties_and_idxs)
            ]

        return results, loss
Code example #15
 def grade_command_result(self, initial_context: str, predicted: str,
                          predicted_context: str, actual: str,
                          actual_context: str,
                          exception: Optional[Exception]) -> str:
     if actual.strip() == predicted.strip():
         return "goodcommand"
     elif (get_stem(actual) == get_stem(predicted)):
         return "okaycommand"
     elif type(exception) == ParseError or type(exception) == LexError:
         return "superfailedcommand"
     elif exception != None:
         return "failedcommand"
     elif predicted_context == actual_context:
         return "mostlygoodcommand"
     elif predicted_context == initial_context:
         return "uselesscommand"
     else:
         return "badcommand"
Code example #16
 def _encode_tokenized_data(self, data : TokenizedDataset, arg_values : Namespace,
                            tokenizer : Tokenizer, embedding : Embedding) \
     -> PECDataset:
     return PECDataset([
         PECSample(
             embedding.encode_token(get_stem(prev_tactics[-1])
                                    if len(prev_tactics) > 1 else "Proof"),
             goal, tactic)
         for prev_tactics, goal, tactic in data
     ])
Code example #17
def grade_prediction(correct_inter: ScrapedTactic, prediction: str):
    correct_tactic = correct_inter.tactic
    correct_tactic_normalized = \
        serapi_instance.normalizeNumericArgs(correct_inter).tactic
    prediction_normalized = \
        serapi_instance.normalizeNumericArgs(ScrapedTactic(
            correct_inter.relevant_lemmas, correct_inter.prev_tactics,
            correct_inter.context,
            prediction)).tactic
    if correct_tactic.strip() == prediction.strip() or\
       correct_tactic_normalized.strip() == prediction_normalized.strip():
        return "goodcommand"
    elif get_stem(correct_tactic).strip() == get_stem(prediction).strip():
        return "okaycommand"
    elif (correct_tactic.strip() in proper_subs
          and proper_subs[correct_tactic.strip()] == prediction.strip()):
        return "mostlygoodcommand"
    else:
        return "badcommand"
Code example #18
def embed_data(data : RawDataset, embedding : Optional[Embedding] = None) \
    -> Tuple[Embedding, StrictEmbeddedDataset]:
    if not embedding:
        embedding = SimpleEmbedding()
    start = time.time()
    print("Embedding data...", end="")
    sys.stdout.flush()
    dataset = StrictEmbeddedDataset([
        EmbeddedSample(relevant_lemmas, prev_tactics, hypotheses, goal,
                       embedding.encode_token(get_stem(tactic)))
        for relevant_lemmas, prev_tactics, hypotheses, goal, tactic in data
    ])
    print("{:.2f}s".format(time.time() - start))
    return embedding, dataset
Code example #19
    def add_command_result(self, predictions: List[str], grades: List[str],
                           actual: str, loss: float) -> None:
        add_to_freq_table(self.actual_tactic_frequency, get_stem(actual))
        add_to_freq_table(self.predicted_tactic_frequency,
                          get_stem(predictions[0]))

        self.total_loss += loss

        self.num_tactics += 1
        if (grades[0] == "goodcommand" or grades[0] == "mostlygoodcommand"):
            add_to_freq_table(self.correctly_predicted_frequency,
                              get_stem(predictions[0]))
            self.num_correct += 1
            self.num_partial += 1
        elif (grades[0] == "okaycommand"):
            self.num_partial += 1
        elif (grades[0] == "failedcommand"
              or grades[0] == "superfailedcommand"):
            self.num_failed += 1

        for grade in grades:
            if (grade == "goodcommand" or grade == "mostlygoodcommand"):
                self.num_topN += 1
                self.num_topNPartial += 1
                break
            if (grade == "okaycommand"):
                self.num_topNPartial += 1
                break
        for grade in grades:
            if (grade == "goodcommand" or grade == "mostlygoodcommand"):
                self.num_searched += 1
                break
            if (grade != "failedcommand" and grade != "superfailedcommand"
                    and grade != "uselesscommand"):
                break
        pass
Code example #20
File: features.py Project: UCSD-PL/proverbot9001
 def __init__(self, init_dataset: List[TacticContext],
              args: argparse.Namespace) -> None:
     prevTacticsCounts: typing.Counter[str] = Counter()
     for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
         if len(prev_tactics) > 2:
             prevTacticsCounts[serapi_instance.get_stem(
                 prev_tactics[-1])] += 1
     if args.load_tactic_keywords and Path2(
             args.load_tactic_keywords).exists():
         self.tacticKeywords = torch.load(args.load_tactic_keywords)
     else:
         self.tacticKeywords = ["Proof"] + \
             [word for word, count in
              prevTacticsCounts.most_common(args.num_tactic_keywords)]
     eprint("Tactic keywords are {}".format(self.tacticKeywords),
            guard=args.print_keywords)
Code example #21
 def _features(self, context: TacticContext, certainty: float) \
         -> Tuple[List[int], List[float]]:
     if len(context.prev_tactics) > 1:
         prev_tactic = serapi_instance.get_stem(context.prev_tactics[-1])
         prev_tactic_index = emap_lookup(self.tactic_map, 32, prev_tactic)
     else:
         prev_tactic_index = 0
     if context.goal != "":
         goal_head_index = emap_lookup(self.token_map, 128,
                                       tokenizer.get_words(context.goal)[0])
     else:
         goal_head_index = 0
     goal_length_feature = min(len(tokenizer.get_words(context.goal)),
                               100) / 100
     num_hyps_feature = min(len(context.hypotheses), 30) / 30
     return [prev_tactic_index, goal_head_index], \
         [goal_length_feature, num_hyps_feature, certainty]
Code example #22
File: features.py Project: UCSD-PL/proverbot9001
 def from_data(init_dataset: List[TacticContext],
               args: argparse.Namespace) -> 'PrevTactic':
     prevTacticsCounts: typing.Counter[str] = Counter()
     for relevant_lemmas, prev_tactics, hyps, goal in init_dataset:
         if len(prev_tactics) > 2:
             prevTacticsCounts[serapi_instance.get_stem(
                 prev_tactics[-1])] += 1
     if args.load_tactic_keywords and \
        Path2(args.load_tactic_keywords).exists():
         result = PrevTactic(torch.load(args.load_tactic_keywords))
     else:
         result = PrevTactic(["Proof"] + [
             word for word, count in prevTacticsCounts.most_common(
                 args.num_tactic_keywords)
         ])
     eprint("Tactic keywords are {}".format(result.tacticKeywords),
            guard=args.print_keywords)
     return result
Code example #23
 def predictKTacticsWithLoss(
         self, in_data: TacticContext, k: int,
         correct: str) -> Tuple[List[Prediction], float]:
     distribution = self.predictDistribution(in_data)
     correct_stem = get_stem(correct)
     if self.embedding.has_token(correct_stem):
         loss = self.criterion(
             torch.FloatTensor(distribution).view(1, -1),
             Variable(
                 torch.LongTensor(
                     [self.embedding.encode_token(correct_stem)]))).item()
     else:
         loss = float("+inf")
     indices, probabilities = list_topk(list(distribution), k)
     predictions = [
         Prediction(
             self.embedding.decode_token(idx) + ".", math.exp(certainty))
         for certainty, idx in zip(probabilities, indices)
     ]
     return predictions, loss
Code example #24
 def predictKTacticsWithLoss(self, in_data : TacticContext, k : int,
                             correct : str) -> Tuple[List[Prediction], float]:
     with self._lock:
         distribution, hyp_var = self._predictDistribution(in_data)
         correct_stem = serapi_instance.get_stem(correct)
         if self._embedding.has_token(correct_stem):
              loss = self._criterion(
                  distribution.view(1, -1),
                  Variable(LongTensor(
                      [self._embedding.encode_token(correct_stem)]))).item()
         else:
             loss = float("+inf")
     indices, probabilities = list_topk(list(distribution), k)
     predictions : List[Prediction] = []
     for certainty, idx in zip(probabilities, indices):
         stem = self._embedding.decode_token(idx)
         if serapi_instance.tacticTakesHypArgs(stem):
             predictions.append(Prediction(stem + " " + hyp_var + ".",
                                           math.exp(certainty)))
         else:
             predictions.append(Prediction(stem + ".", math.exp(certainty)))
     return predictions, loss
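This variant differs from the others in appending a hypothesis argument when the predicted stem needs one. A sketch of that gating with a stand-in predicate (the real check is serapi_instance.tacticTakesHypArgs):

def takes_hyp_args_sketch(stem: str) -> bool:
    # Stand-in predicate; a hypothetical subset of hypothesis-taking stems.
    return stem in {"apply", "destruct", "rewrite"}

stem, hyp_var = "apply", "H0"
tactic = stem + " " + hyp_var + "." if takes_hyp_args_sketch(stem) \
    else stem + "."
assert tactic == "apply H0."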
Code example #25
    def predictKTacticsWithLoss(
            self, in_data: TacticContext, k: int,
            correct: str) -> Tuple[List[Prediction], float]:
        self.lock.acquire()
        distribution = self.predictDistribution(in_data)
        stem = get_stem(correct)
        if self.embedding.has_token(stem):
            output_var = maybe_cuda(
                Variable(torch.LongTensor([self.embedding.encode_token(stem)
                                           ])))
            loss = self.criterion(distribution.view(1, -1), output_var).item()
        else:
            loss = 0

        certainties, idxs = distribution.squeeze().topk(k)
        predictions_and_certainties = \
            [Prediction(self.embedding.decode_token(idx.item()) + ".",
                        math.exp(certainty.item()))
             for certainty, idx in zip(list(certainties), list(idxs))]
        self.lock.release()

        return predictions_and_certainties, loss
Code example #26
File: data.py Project: UCSD-PL/proverbot9001
def encode_seq_classify_data(data : RawDataset,
                             tokenizer_type : Callable[[List[str], int], Tokenizer],
                             num_keywords : int,
                             num_reserved_tokens : int,
                             save_tokens : Optional[str] = None,
                             load_tokens : Optional[str] = None,
                             num_relevance_samples : int = 1000) \
    -> Tuple[ClassifySequenceDataset, Tokenizer, SimpleEmbedding]:
    embedding = SimpleEmbedding()
    subset = RawDataset(random.sample(data, num_relevance_samples))
    if load_tokens:
        print("Loading tokens from {}".format(load_tokens))
        tokenizer = torch.load(load_tokens)
    else:
        start = time.time()
        print("Picking tokens...", end="")
        sys.stdout.flush()
        tokenizer = make_keyword_tokenizer_relevance([(context,
                                                       embedding.encode_token(
                                                           get_stem(tactic)))
                                                      for prev_tactics, hyps,
                                                      context, tactic
                                                      in subset],
                                                     tokenizer_type,
                                                     num_keywords, num_reserved_tokens)
        print("{}s".format(time.time() - start))
    if save_tokens:
        print("Saving tokens to {}".format(save_tokens))
        torch.save(tokenizer, save_tokens)
    with multiprocessing.Pool(None) as pool:
        result = [(goal, embedding.encode_token(tactic)) for goal, tactic in
                  chain.from_iterable(pool.imap(functools.partial(
                      encode_seq_classify_data_worker__, tokenizer),
                                                          chunks(data, 1024)))]
    tokenizer.freezeTokenList()
    return result, tokenizer, embedding
Code example #27
def predictKTacticsWithLoss(
        prediction_distribution: torch.FloatTensor, embedding: Embedding,
        k: int, correct: str,
        criterion: nn.Module) -> Tuple[List[Prediction], float]:
    if k > embedding.num_tokens():
        k = embedding.num_tokens()
    correct_stem = get_stem(correct)
    if embedding.has_token(correct_stem):
        output_var = maybe_cuda(
            Variable(torch.LongTensor([embedding.encode_token(correct_stem)])))
        loss = criterion(prediction_distribution.view(1, -1),
                         output_var).item()
    else:
        loss = 0

    certainties_and_idxs = prediction_distribution.view(-1).topk(k)
    results = [
        Prediction(
            embedding.decode_token(stem_idx.item()) + ".",
            math.exp(certainty.item()))
        for certainty, stem_idx in zip(*certainties_and_idxs)
    ]

    return results, loss
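Across these variants the certainties coming out of topk are passed through math.exp. That matches the distributions being log-probabilities: example #10 applies self._lsoftmax to the model output explicitly. A minimal runnable sketch of the conversion, assuming PyTorch:

import math
import torch

logits = torch.randn(1, 10)                    # stand-in model output
log_probs = torch.log_softmax(logits, dim=-1)  # as with _lsoftmax in example #10
certainties, idxs = log_probs.view(-1).topk(3)
probs = [math.exp(c.item()) for c in certainties]
assert all(0.0 < p <= 1.0 for p in probs)      # exp recovers probabilities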
Code example #28
def get_data(args: List[str]) -> None:
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=[
                            "terms", "goals", "hyps+goal", "hyps+goal+tactic",
                            "tacvector", "scrapefile-rd", "scrapefile"
                        ])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()),
                        type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples",
                        dest="max_tuples",
                        default=None,
                        type=int)
    parser.add_argument("--num-keywords",
                        dest="num_keywords",
                        default=100,
                        type=int)
    parser.add_argument("--num-head-keywords",
                        dest="num_head_keywords",
                        type=int,
                        default=100)
    parser.add_argument("--num-tactic-keywords",
                        dest="num_tactic_keywords",
                        type=int,
                        default=50)
    parser.add_argument("--print-keywords",
                        dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons",
                        action='store_false')
    parser.add_argument("--max-length",
                        dest="max_length",
                        default=30,
                        type=int)
    parser.add_argument("--lineend",
                        dest="lineend",
                        default=False,
                        const=True,
                        action='store_const')
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter",
                        dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--num-threads", "-j", type=int, default=None)
    parser.add_argument("--no-use-substitutions",
                        action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args",
                        action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)
    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(
                    itertools.islice(
                        data.read_text_data(arg_values.scrape_file),
                        arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer], arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [
                data.normalizeSentenceLength(term, arg_values.max_length)
                for term in terms
            ]
        for term in terms:
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset, key=lambda d: len(d.hypotheses), reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
            pass
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [
                strip_scraped_output(scraped) for scraped in dataset
            ]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data,
                                         arg_values)  # type: ignore
                for word_feature_constructor in
                features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values) for
                vec_feature_constructor in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[
                feature(c) for feature in word_feature_functions
            ] for c in stripped_data]
            vec_features = [[
                feature_val for feature in vec_features_functions
                for feature_val in feature(c)
            ] for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features, vec_features,
                                                   answers):
                print(",".join(
                    list(map(str, word_feat)) + list(map(str, vec_feat)) +
                    [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "context": {
                            "fg_goals": [{
                                "hypotheses": point.hypotheses,
                                "goal": point.goal
                            }],
                            "bg_goals": [],
                            "shelved_goals": [],
                            "given_up_goals": []
                        },
                        "tactic": point.tactic
                    }))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(
                    json.dumps({
                        "relevant_lemmas": point.relevant_lemmas,
                        "prev_tactics": point.prev_tactics,
                        "prev_hyps": point.hypotheses,
                        "prev_goal": point.goal,
                        "tactic": point.tactic
                    }))
Code example #29
    def process_file(self, args : argparse.Namespace, file_idx : int, filename : str) \
        -> None:
        global gresult
        fresult = FileResult(filename)

        if self.debug:
            print("Preprocessing...")
        commands = self.get_commands(args, file_idx, filename)

        command_results: List[CommandResult] = []

        with serapi_instance.SerapiContext(self.coqargs, self.includes,
                                           self.prelude) as coq:
            coq.debug = self.debug
            nb_commands = len(commands)
            for i in range(nb_commands):
                command = commands[i]
                # print("Processing command {}/{}".format(str(i+1), str(nb_commands)))
                in_proof = (coq.proof_context
                            and not re.match(".*Proof.*", command.strip()))
                if re.match("[{}]", command):
                    coq.run_stmt(command)
                    continue
                if in_proof:
                    prev_tactics = coq.prev_tactics
                    initial_context = coq.proof_context
                    assert initial_context
                    hyps = coq.hypotheses
                    goals = coq.goals
                    relevant_lemmas = coq.local_lemmas
                    if self.baseline:
                        predictions_and_certainties = \
                            [(baseline_tactic + ".", 1.0)] * num_predictions
                        loss = 0.0  # no model loss in the baseline branch
                    else:
                        predictions_and_certainties, loss = net.predictKTacticsWithLoss(
                            TacticContext(relevant_lemmas, prev_tactics, hyps,
                                          goals), num_predictions, command)

                    prediction_runs = [
                        run_prediction(coq, prediction) for prediction,
                        certainty in predictions_and_certainties
                    ]

                    try:
                        coq.run_stmt(command)
                        actual_result_context = coq.proof_context
                        actual_result_goal = coq.goals
                        actual_result_hypotheses = coq.hypotheses
                        actual_result_lemmas = coq.local_lemmas
                        assert isinstance(actual_result_context, str)
                    except (AckError, CompletedError, CoqExn, BadResponse,
                            ParseError, LexError, TimeoutError):
                        print("In file {}:".format(filename))
                        raise

                    prediction_results = [
                        (prediction,
                         evaluate_prediction(fresult, initial_context, command,
                                             actual_result_context,
                                             prediction_run), certainty)
                        for prediction_run, (prediction, certainty) in zip(
                            prediction_runs, predictions_and_certainties)
                    ]
                    assert net.training_args
                    if self.cfilter(
                            TacticContext(relevant_lemmas, prev_tactics, hyps,
                                          goals), command,
                            TacticContext(actual_result_lemmas,
                                          prev_tactics + [command],
                                          actual_result_hypotheses,
                                          actual_result_goal),
                            net.training_args):
                        fresult.add_command_result([
                            pred for pred, ctxt, ex in prediction_runs
                        ], [
                            grade
                            for pred, grade, certainty in prediction_results
                        ], command, loss)

                        command_results.append(
                            (command, hyps, goals, prediction_results))
                    else:
                        command_results.append((command, ))
                else:
                    try:
                        coq.run_stmt(command)
                    except (AckError, CompletedError, CoqExn, BadResponse,
                            ParseError, LexError, TimeoutError):
                        print("In file {}:".format(filename))
                        raise
                    command_results.append((command, ))

        write_csv(fresult.details_filename(), self.output_dir, gresult.options,
                  command_results)

        doc, tag, text, line = Doc().ttl()

        with tag('html'):
            details_header(tag, doc, text, filename)
            with tag('div', id='overlay', onclick='event.stopPropagation();'):
                with tag('div', id='predicted'):
                    pass
                with tag('div', id='context'):
                    pass
                with tag('div', id='stats'):
                    pass
                pass
            with tag('body',
                     onclick='deselectTactic()',
                     onload='setSelectedIdx()'), tag('pre'):
                for idx, command_result in enumerate(command_results):
                    if len(command_result) == 1:
                        with tag('code', klass='plaincommand'):
                            text(command_result[0])
                    else:
                        command, hyps, goal, prediction_results = \
                            cast(TacticResult, command_result)
                        predictions, grades, certainties = zip(
                            *prediction_results)
                        search_index = 0
                        for pidx, prediction_result in enumerate(
                                prediction_results):
                            prediction, grade, certainty = prediction_result
                            if (grade != "failedcommand"
                                    and grade != "superfailedcommand"):
                                search_index = pidx
                                break
                        with tag(
                                'span', ('data-hyps', "\n".join(hyps)),
                            ('data-goal', shorten_whitespace(goal)),
                            ('data-num-total', str(fresult.num_tactics)),
                            ('data-predictions',
                             to_list_string(cast(List[str], predictions))),
                            ('data-num-predicteds',
                             to_list_string([
                                 fresult.predicted_tactic_frequency.get(
                                     get_stem(prediction), 0)
                                 for prediction in cast(
                                     List[str], predictions)
                             ])),
                            ('data-num-corrects',
                             to_list_string([
                                 fresult.correctly_predicted_frequency.get(
                                     get_stem(prediction), 0)
                                 for prediction in cast(
                                     List[str], predictions)
                             ])),
                            ('data-certainties',
                             to_list_string(cast(List[float], certainties))),
                            ('data-num-actual-corrects',
                             fresult.correctly_predicted_frequency.get(
                                 get_stem(command), 0)),
                            ('data-num-actual-in-file',
                             fresult.actual_tactic_frequency.get(
                                 get_stem(command))),
                            ('data-actual-tactic', strip_comments(command)),
                            ('data-grades',
                             to_list_string(cast(List[str], grades))),
                            ('data-search-idx', search_index),
                                id='command-' + str(idx),
                                onmouseover='hoverTactic({})'.format(idx),
                                onmouseout='unhoverTactic()',
                                onclick=
                                'selectTactic({}); event.stopPropagation();'.
                                format(idx)):
                            doc.stag("br")
                            for idx, prediction_result in enumerate(
                                    prediction_results):
                                prediction, grade, certainty = prediction_result
                                if search_index == idx:
                                    with tag('code', klass=grade):
                                        text(" " + command.strip())
                                else:
                                    with tag('span', klass=grade):
                                        doc.asis(" &#11044;")

        with open(
                "{}/{}.html".format(self.output_dir,
                                    fresult.details_filename()), "w") as fout:
            fout.write(doc.getvalue())

        gresult.add_file_result(fresult)
        rows.put(fresult)
Code example #30
def write_html(output_dir: Path2, filename: Path2,
               command_results: List[CommandResult],
               stats: 'ResultStats') -> None:
    def details_header(tag: Any, doc: Doc, text: Text,
                       filename: Path2) -> None:
        header(tag, doc, text, details_css, details_javascript,
               "Proverbot Detailed Report for {}".format(filename))

    doc, tag, text, line = Doc().ttl()

    def write_highlighted(vernac: str) -> None:
        nonlocal text
        nonlocal tag
        substrings = syntax_highlight(vernac)

        for substring in substrings:
            if isinstance(substring, ColoredString):
                with tag('span', style=f'color:{substring.color}'):
                    text(substring.contents)
            else:
                text(substring)

    with tag('html'):
        details_header(tag, doc, text, filename)
        with tag('div', id='overlay', onclick='event.stopPropagation();'):
            with tag('div', id='predicted'):
                pass
            with tag('div', id='context'):
                pass
            with tag('div', id='stats'):
                pass
            pass
        with tag('body', onclick='deselectTactic()',
                 onload='init()'), tag('pre'):
            for region_idx, region in enumerate(
                    split_into_regions(command_results)):
                if len(region) > 1 and len(region[1]) == 1:
                    for cmd_idx, command_result in enumerate(region):
                        assert isinstance(command_result[0], str)
                        with tag('code', klass='plaincommand'):
                            write_highlighted(command_result[0])
                else:
                    doc.stag("br")
                    with tag('button',
                             klass='collapsible',
                             id='collapsible-{}'.format(region_idx)):
                        with tag('code', klass='buttontext'):
                            assert isinstance(region[0][0], str), region
                            write_highlighted(region[0][0].strip("\n"))
                        num_unfiltered = count_region_unfiltered(region)
                        with tag(
                                'code',
                                klass='numtacs ' +
                            ('nonempty' if num_unfiltered > 3 else 'empty')):
                            text(num_unfiltered)
                    with tag('div', klass='region'):
                        for cmd_idx, command_result in enumerate(region[1:]):
                            command, hyps, goal, prediction_results = \
                                cast(TacticResult, command_result)
                            predictions: List[str]
                            grades: List[str]
                            certainties: List[float]
                            if len(prediction_results) > 0:
                                predictions, grades, certainties = zip(
                                    *prediction_results)  # type: ignore
                            else:
                                predictions, grades, certainties = [], [], []
                            with tag('span',
                                     ('data-hyps',"\n".join(hyps)),
                                     ('data-goal',goal),
                                     ('data-num-total', str(stats.num_tactics)),
                                     ('data-predictions',
                                      to_list_string(cast(List[str], predictions))),
                                     ('data-num-predicteds',
                                      to_list_string([stats.predicted_tactic_frequency
                                                      .get(get_stem(prediction), 0)
                                                      for prediction in cast(List[str],
                                                                             predictions)])),
                                     ('data-num-corrects',
                                      to_list_string([stats.correctly_predicted_frequency
                                                      .get(get_stem(prediction), 0)
                                                      for prediction in
                                                      cast(List[str], predictions)])),
                                     ('data-certainties',
                                      to_list_string(cast(List[float], certainties))),
                                     ('data-num-actual-corrects',
                                      stats.correctly_predicted_frequency
                                      .get(get_stem(command), 0)),
                                     ('data-num-actual-in-file',
                                      stats.actual_tactic_frequency
                                      .get(get_stem(command), 0)),
                                     ('data-actual-tactic',
                                      strip_comments(command)),
                                     ('data-grades',
                                      to_list_string(cast(List[str], grades))),
                                     ('data-search-idx', 0),
                                     id='command-{}-{}'.format(region_idx, cmd_idx),
                                     onmouseover='hoverTactic("{}-{}")'\
                                     .format(region_idx, cmd_idx),
                                     onmouseout='unhoverTactic()',
                                     onclick='selectTactic("{}-{}"); event.stopPropagation();'
                                     .format(region_idx, cmd_idx)):
                                doc.stag("br")
                                if len(grades) == 0:
                                    with tag('code', klass="plaincommand"):
                                        write_highlighted(command.strip("\n"))
                                else:
                                    with tag('code', klass=grades[0]):
                                        text(command.strip("\n"))
                                    for grade in grades[1:]:
                                        with tag('span', klass=grade):
                                            doc.asis(" &#11044;")
    with (output_dir / escape_filename(str(filename))).with_suffix(".html")\
                                                      .open(mode='w') as fout:
        fout.write(doc.getvalue())

    pass