def _predictDistributions(self, in_datas: List[TacticContext]) \
        -> torch.FloatTensor:
    """Run the model on a batch of contexts and return its raw output.

    For every context the goal is tokenized and length-normalized, one
    hypothesis is selected with ``get_closest_hyp``, and that hypothesis'
    type (from ``serapi_instance.get_hyp_type``) is tokenized and
    length-normalized the same way.  Word and vector features come from
    ``self._get_word_features`` / ``self._get_vec_features``.  The result
    of calling ``self._model`` on the four batches is returned unchanged.
    """
    assert self._tokenizer
    assert self._embedding
    assert self.training_args
    max_length = self.training_args.max_length

    def encode(text):
        # Tokenize, then normalize the token list to max_length.
        return normalizeSentenceLength(self._tokenizer.toTokenList(text),
                                       max_length)

    goals_batch = [encode(goal) for _, _, _, goal in in_datas]
    closest_hyps = [get_closest_hyp(hyps, goal, max_length)
                    for _, _, hyps, goal in in_datas]
    hyps_batch = [encode(serapi_instance.get_hyp_type(hyp))
                  for hyp in closest_hyps]
    word_features_batch = [self._get_word_features(in_data)
                           for in_data in in_datas]
    vec_features_batch = [self._get_vec_features(in_data)
                          for in_data in in_datas]
    # Argument order matters: goals, hyps, vector features, word features.
    return self._model(LongTensor(goals_batch),
                       LongTensor(hyps_batch),
                       FloatTensor(vec_features_batch),
                       LongTensor(word_features_batch))
def mkHFSample(max_length : int,
               word_feature_functions : List[WordFeature],
               vec_feature_functions : List[VecFeature],
               zipped : Tuple[EmbeddedSample, List[int], List[int]]) \
        -> HypFeaturesSample:
    """Build one ``HypFeaturesSample`` from a (context, goal, best_hyp) triple.

    ``zipped`` holds the embedded sample (prev tactics, hypotheses, goal
    string, tactic) together with the tokenized goal and tokenized best
    hypothesis.  The sample carries, in order: one value per word feature
    function, the concatenated outputs of all vector feature functions,
    the length-normalized goal, the length-normalized hypothesis, and the
    tactic.
    """
    context, goal, best_hyp = zipped
    prev_tactic_list, hypotheses, goal_str, tactic = context
    tac_context = TacticContext(prev_tactic_list, hypotheses, goal_str)

    word_features = [word_feature(tac_context)
                     for word_feature in word_feature_functions]
    # Each vector feature function yields several values; flatten them.
    vec_features = []
    for vec_feature in vec_feature_functions:
        vec_features.extend(vec_feature(tac_context))

    padded_goal = normalizeSentenceLength(goal, max_length)
    padded_hyp = normalizeSentenceLength(best_hyp, max_length)
    return HypFeaturesSample(word_features, vec_features,
                             padded_goal, padded_hyp, tactic)
def _encode_data(self, data : RawDataset, arg_values : Namespace) \
        -> Tuple[EncFeaturesDataset, Tuple[Tokenizer, Embedding,
                                           List[VecFeature],
                                           List[WordFeature]]]:
    """Turn raw scraped data into an ``EncFeaturesDataset`` plus its metadata.

    The data is preprocessed and stripped, the vector and word feature
    functions are built from the stripped data and stored on ``self``,
    and then the preprocessed data is embedded and its goals tokenized.
    Each resulting sample holds the vector features, the word features,
    the length-normalized tokenized goal and the tactic.

    Returns the dataset together with the tuple
    ``(tokenizer, embedding, vec feature functions, word feature functions)``.
    """
    preprocessed_data = list(self._preprocess_data(data, arg_values))
    stripped_data = [strip_scraped_output(dat) for dat in preprocessed_data]

    # The feature functions must be in place before the _get_*_features
    # calls below.
    self._vec_feature_functions = [
        make_feature(stripped_data, arg_values)  # type: ignore
        for make_feature in vec_feature_constructors
    ]
    self._word_feature_functions = [
        make_feature(stripped_data, arg_values)  # type: ignore
        for make_feature in word_feature_constructors
    ]

    embedding, embedded_data = embed_data(RawDataset(preprocessed_data))
    tokenizer, tokenized_goals = tokenize_goals(embedded_data, arg_values)

    samples = []
    for (prev_tactics, hypotheses, goal, tactic), tokenized_goal \
            in zip(embedded_data, tokenized_goals):
        context = TacticContext(prev_tactics, hypotheses, goal)
        samples.append(EncFeaturesSample(
            self._get_vec_features(context),
            self._get_word_features(context),
            normalizeSentenceLength(tokenized_goal, arg_values.max_length),
            tactic))
    result_data = EncFeaturesDataset(samples)

    metadata = (tokenizer, embedding,
                self._vec_feature_functions, self._word_feature_functions)
    return result_data, metadata
def _predictCompositeDistributionFromStemDistribution(
        self,
        beam_width : int,
        stem_distribution : torch.FloatTensor,
        in_datas : List[TacticContext]) \
        -> Tuple[torch.FloatTensor, torch.LongTensor]:
    """Combine the top stem scores with per-stem argument scores.

    The ``min(beam_width, number of stems)`` best entries of each row of
    ``stem_distribution`` are taken with ``topk``.  Every goal is
    repeated once per selected stem and passed, with the stem indices, to
    ``self._model.find_arg_rnn``; the first output column is dropped.
    Each stem's score is then added to every entry of its row.
    NOTE(review): the addition suggests log-space scores -- confirm.

    Returns ``(scores, stem_indices)`` where ``scores`` has one row per
    input context and ``stem_width * num_probs`` columns, and
    ``stem_indices`` is the index tensor returned by ``topk``.
    """
    assert self.training_args
    assert self._tokenizer
    max_length = self.training_args.max_length

    goals_batch = torch.LongTensor([
        normalizeSentenceLength(self._tokenizer.toTokenList(goal),
                                max_length)
        for _, _, _, goal in in_datas
    ])

    distribution_shape = stem_distribution.size()
    batch_size = distribution_shape[0]
    num_stem_poss = distribution_shape[1]
    stem_width = min(beam_width, num_stem_poss)
    flat_size = batch_size * stem_width

    probs, indices = stem_distribution.topk(stem_width)
    stems_batch = indices.view(flat_size)
    probs_batch = probs.view(flat_size)

    # Repeat each goal stem_width times so row i pairs with stems_batch[i].
    repeated_goals = goals_batch.view(batch_size, 1, max_length)
    repeated_goals = repeated_goals.expand(-1, stem_width, -1).contiguous()
    goals_batch = repeated_goals.view(flat_size, max_length)

    conditional_distributions = \
        self._model.find_arg_rnn(goals_batch, stems_batch)[:, 1:]
    num_probs = conditional_distributions.size()[1]

    # Transpose so the per-row stem score broadcasts, then transpose back.
    shifted = conditional_distributions.t() + probs_batch.view(-1)
    all_batch_probs = shifted.t()
    all_prob_batches = all_batch_probs.contiguous()\
        .view(batch_size, stem_width * num_probs)
    return all_prob_batches, indices
def predictDistribution(self, in_data : TacticContext) \
        -> torch.FloatTensor:
    """Encode the goal of ``in_data`` and return the decoder's output for it.

    The goal is tokenized, normalized to ``self.max_length``, reshaped
    into a single-row tensor, run through ``self.encoder`` and the
    encoder result is fed to ``self.decoder``.
    """
    goal_tokens = self.tokenizer.toTokenList(in_data.goal)
    padded_goal = normalizeSentenceLength(goal_tokens, self.max_length)
    goal_tensor = LongTensor(padded_goal).view(1, -1)
    encoded_goal = self.encoder.run(goal_tensor)
    return self.decoder.run(encoded_goal)
def _data_tensors(self, encoded_data : PECDataset,
                  arg_values : Namespace) \
        -> List[torch.Tensor]:
    """Split ``encoded_data`` into goal, previous-tactic and output tensors.

    Each element of ``encoded_data`` is unpacked as (prev, goal, next).
    Goals are normalized to ``arg_values.max_length`` first.  The returned
    list is ordered ``[goals, prevs, nexts]``, each a ``torch.LongTensor``.
    """
    prevs, goals, nexts = zip(*encoded_data)
    padded_goals = [normalizeSentenceLength(goal, arg_values.max_length)
                    for goal in goals]
    return [torch.LongTensor(padded_goals),
            torch.LongTensor(prevs),
            torch.LongTensor(nexts)]
def _predictDistributions(self, in_datas : List[TacticContext]) \
        -> torch.FloatTensor:
    """Run the model on the goals and previous-tactic encodings of a batch.

    Goals are tokenized and length-normalized; previous-tactic values
    come from ``self._get_prev``.  Both tensors are reshaped to have one
    row per input context before being passed to ``self._model.run``.
    """
    assert self.training_args
    batch_size = len(in_datas)
    max_length = self.training_args.max_length

    tokenized_goals = [self._tokenizer.toTokenList(in_data.goal)
                       for in_data in in_datas]
    goal_rows = [normalizeSentenceLength(tokens, max_length)
                 for tokens in tokenized_goals]
    goal_tensor = LongTensor(goal_rows).view(batch_size, -1)

    prev_rows = [self._get_prev(in_data) for in_data in in_datas]
    prev_tensor = LongTensor(prev_rows).view(batch_size, -1)

    return self._model.run(goal_tensor, prev_tensor)
def run(self, hidden: torch.FloatTensor, max_length: int) -> Sentence:
    """Decode a sentence from ``hidden``, one token at a time.

    Only valid for ``batch_size == 1``.  Starting from
    ``self.initInput()``, each step feeds the previous top-scoring token
    back into the decoder, for at most ``max_length`` steps.  When
    ``EOS_token`` is produced the prediction is passed through
    ``normalizeSentenceLength`` and returned immediately; if it is never
    produced the raw token list is returned as-is.
    """
    assert self.batch_size == 1
    state = hidden
    next_input = self.initInput()
    prediction: Sentence = []
    for _ in range(max_length):
        output, state = self(next_input, state)
        # topk(1) returns (values, indices); the index is the next input.
        _, next_input = output.view(1, -1).topk(1)
        token = next_input.item()
        prediction.append(token)
        if token == EOS_token:
            return normalizeSentenceLength(prediction, max_length)
    return prediction
def mkCopySample(max_length : int,
                 word_feature_functions : List[WordFeature],
                 vec_feature_functions : List[VecFeature],
                 zipped : Tuple[EmbeddedSample, List[int], int]) \
        -> CopyArgSample:
    """Build one ``CopyArgSample`` from a (context, goal, arg_idx) triple.

    ``zipped`` holds the embedded sample (prev tactics, hypotheses, goal
    string, tactic index), the tokenized goal and the argument index.
    Exactly three word features are expected; anything else trips the
    assertion.  The sample carries the length-normalized goal, the word
    features, the flattened vector features, the tactic index and the
    argument index.
    """
    context, goal, arg_idx = zipped
    prev_tactic_list, hypotheses, goal_str, tactic_idx = context
    tac_context = TacticContext(prev_tactic_list, hypotheses, goal_str)

    word_features = [word_feature(tac_context)
                     for word_feature in word_feature_functions]
    assert len(word_features) == 3

    padded_goal = normalizeSentenceLength(goal, max_length)
    # Each vector feature function yields several values; flatten them.
    vec_features = []
    for vec_feature in vec_feature_functions:
        vec_features.extend(vec_feature(tac_context))

    return CopyArgSample(padded_goal, word_features, vec_features,
                         tactic_idx, arg_idx)
def _predictDistributions(self, in_datas: List[TacticContext]) \
        -> torch.FloatTensor:
    """Run the model on the features and goals of a batch of contexts.

    Vector and word features come from ``self._get_vec_features`` and
    ``self._get_word_features``; goals are tokenized and
    length-normalized.  Each context is unpacked as a 3-tuple whose last
    element is the goal.  The model is called with (vector features,
    word features, goals), in that order.
    """
    assert self.training_args
    vec_features_batch = [self._get_vec_features(in_data)
                          for in_data in in_datas]
    word_features_batch = [self._get_word_features(in_data)
                           for in_data in in_datas]

    max_length = self.training_args.max_length
    goals_batch = []
    for _, _, goal in in_datas:
        goal_tokens = self._tokenizer.toTokenList(goal)
        goals_batch.append(normalizeSentenceLength(goal_tokens, max_length))

    vec_tensor = torch.FloatTensor(vec_features_batch)
    word_tensor = torch.LongTensor(word_features_batch)
    goal_tensor = torch.LongTensor(goals_batch)
    return self._model(vec_tensor, word_tensor, goal_tensor)
def run_test(args_list: List[str]):
    """Load a saved encoder/decoder checkpoint and decode terms read from stdin.

    ``args_list`` is parsed for the positional ``save_file`` and the
    optional ``--print-inputs`` flag.  Every required checkpoint entry
    must be present and truthy.  For each line on standard input the
    term is tokenized, length-normalized, run through the encoder and
    decoder, and the decoded tokens up to (not including) the first
    ``EOS_token`` are printed as a string.  With ``--print-inputs`` the
    stripped input term and its tensor are printed first on the same line.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("save_file", type=str)
    parser.add_argument("--print-inputs", dest="print_inputs",
                        default=False, action='store_const', const=True)
    arg_values = parser.parse_args(args_list)

    checkpoint = torch.load(arg_values.save_file)
    required_keys = ('max-length', 'tokenizer', 'tokenizer-name',
                     'encoder', 'num-encoder-layers',
                     'decoder', 'num-decoder-layers',
                     'hidden-size', 'context-filter')
    for required_key in required_keys:
        assert checkpoint[required_key]

    tokenizer = checkpoint['tokenizer']
    hidden_size = checkpoint['hidden-size']
    max_length = checkpoint['max-length']

    encoder = maybe_cuda(EncoderRNN(tokenizer.numTokens(), hidden_size,
                                    checkpoint['num-encoder-layers']))
    encoder.load_state_dict(checkpoint['encoder'])
    decoder = maybe_cuda(DecoderRNN(hidden_size, tokenizer.numTokens(),
                                    checkpoint['num-decoder-layers']))
    decoder.load_state_dict(checkpoint['decoder'])

    for term in sys.stdin:
        padded_term = normalizeSentenceLength(tokenizer.toTokenList(term),
                                              max_length)
        data_in = torch.LongTensor(padded_term).view(1, -1)
        if arg_values.print_inputs:
            print("{} ({}) -> ".format(term.strip(), data_in), end="")
        data_out = decoder.run(encoder.run(data_in), max_length)
        # Drop the EOS token and anything after it before rendering.
        visible_tokens = list(
            itertools.takewhile(lambda x: x != EOS_token, data_out))
        print(tokenizer.toString(visible_tokens))
def predictKTacticsWithLoss_batch(self,
                                  in_data : List[TacticContext],
                                  k : int,
                                  corrects : List[str]) -> \
        Tuple[List[List[Prediction]], float]:
    """Predict the top ``k`` tactics for each context and compute the batch loss.

    Returns ``([], 0)`` for an empty batch.  Otherwise the model is run
    on the tokenized goals and the ``self._get_prev`` values while
    holding ``self._lock``.  The loss is ``self._criterion`` applied to
    the model output and the encoded stems of ``corrects``; stems the
    embedding does not know are encoded as index 0.  ``k`` is capped at
    the number of tokens in the embedding.  Each prediction is the
    decoded stem followed by "." with certainty ``exp(score)``.
    """
    assert self.training_args
    if not in_data:
        return [], 0

    with self._lock:
        max_length = self.training_args.max_length
        goals_tensor = LongTensor([
            normalizeSentenceLength(self._tokenizer.toTokenList(goal),
                                    max_length)
            for _, _, _, goal in in_data
        ])
        prevs_tensor = LongTensor([self._get_prev(in_datum)
                                   for in_datum in in_data])
        correct_stems = [get_stem(correct) for correct in corrects]
        prediction_distributions = self._model.run(goals_tensor,
                                                   prevs_tensor)

        target_idxs = []
        for correct_stem in correct_stems:
            if self._embedding.has_token(correct_stem):
                target_idxs.append(self._embedding.encode_token(correct_stem))
            else:
                # Unknown stems fall back to index 0.
                target_idxs.append(0)
        output_var = maybe_cuda(Variable(torch.LongTensor(target_idxs)))
        loss = self._criterion(prediction_distributions, output_var).item()

        # topk cannot return more entries than the embedding has tokens.
        k = min(k, self._embedding.num_tokens())

        results = []
        for single_distribution in prediction_distributions:
            certainties, stem_idxs = single_distribution.view(-1).topk(k)
            results.append([
                Prediction(
                    self._embedding.decode_token(stem_idx.item()) + ".",
                    math.exp(certainty.item()))
                for certainty, stem_idx in zip(certainties, stem_idxs)
            ])
    return results, loss
def get_data(args: List[str]) -> None:
    """Parse ``args`` and print the scrape file's data in the requested format.

    Supported formats:

    * ``terms`` -- terms from ``data.term_data``, optionally
      length-normalized, rendered through the tokenizer up to the first
      ``data.EOS_token``.
    * ``goals`` / ``hyps+goal`` / ``hyps+goal+tactic`` -- plain-text dumps.
    * ``tacvector`` -- one comma-separated row per datum: word features,
      vector features, then the encoded tactic stem.
    * ``scrapefile-rd`` / ``scrapefile`` -- one JSON object per datum.

    For every format except ``terms`` the dataset comes from
    ``data.get_text_data`` and, with ``--sort``, is ordered by descending
    number of hypotheses.

    The ``-j/--num-threads`` option used to be registered twice, which
    makes argparse raise a "conflicting option strings" error while the
    parser is being built; it is now registered exactly once.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=["terms", "goals", "hyps+goal",
                                 "hyps+goal+tactic", "tacvector",
                                 "scrapefile-rd", "scrapefile"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()), type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--no-truncate-semicolons",
                        dest="truncate_semicolons", action='store_false')
    parser.add_argument("--max-length", dest="max_length",
                        default=30, type=int)
    parser.add_argument("--lineend", dest="lineend",
                        default=False, const=True, action='store_const')
    # Registered once only: a second add_argument with the same option
    # strings raises argparse.ArgumentError.
    parser.add_argument("-j", "--num-threads", default=None, type=int)
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument('-v', "--verbose", action="count")
    parser.add_argument("--no-use-substitutions", action='store_false',
                        dest='use_substitutions')
    parser.add_argument("--no-normalize-numeric-args", action='store_false',
                        dest='normalize_numeric_args')
    parser.add_argument("--sort", action='store_true')
    arg_values = parser.parse_args(args)

    if arg_values.format == "terms":
        terms, tokenizer = data.term_data(
            data.RawDataset(
                list(itertools.islice(
                    data.read_text_data(arg_values.scrape_file),
                    arg_values.max_tuples))),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term,
                                                  arg_values.max_length)
                     for term in terms]
        for term in terms:
            # Render only the tokens before the first EOS token.
            print(tokenizer.toString(
                list(itertools.takewhile(lambda x: x != data.EOS_token,
                                         term))),
                  end="\\n\n" if arg_values.lineend else "\n")
    else:
        dataset = data.get_text_data(arg_values)
        if arg_values.sort:
            dataset = data.RawDataset(
                sorted(dataset,
                       key=lambda d: len(d.hypotheses),
                       reverse=True))
        if arg_values.format == "goals":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                print(goal)
        elif arg_values.format == "hyps+goal":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
        elif arg_values.format == "hyps+goal+tactic":
            for relevant_lemmas, prev_tactics, hyps, goal, tactic in dataset:
                for hyp in hyps:
                    print(hyp)
                print("================================")
                print(goal)
                print("====> {}".format(tactic))
        elif arg_values.format == "tacvector":
            embedding = SimpleEmbedding()
            eprint("Encoding tactics...", guard=arg_values.verbose)
            answers = [
                embedding.encode_token(serapi_instance.get_stem(datum.tactic))
                for datum in dataset
            ]
            stripped_data = [strip_scraped_output(scraped)
                             for scraped in dataset]
            eprint("Constructing features...", guard=arg_values.verbose)
            word_feature_functions = [
                word_feature_constructor(stripped_data, arg_values)  # type: ignore
                for word_feature_constructor
                in features.word_feature_constructors
            ]
            vec_features_functions = [
                vec_feature_constructor(stripped_data, arg_values)
                for vec_feature_constructor
                in features.vec_feature_constructors
            ]
            eprint("Extracting features...", guard=arg_values.verbose)
            word_features = [[feature(c)
                              for feature in word_feature_functions]
                             for c in stripped_data]
            # Each vector feature function yields several values per datum.
            vec_features = [[feature_val
                             for feature in vec_features_functions
                             for feature_val in feature(c)]
                            for c in stripped_data]
            eprint("Done", guard=arg_values.verbose)
            for word_feat, vec_feat, tactic in zip(word_features,
                                                   vec_features,
                                                   answers):
                print(",".join(list(map(str, word_feat)) +
                               list(map(str, vec_feat)) +
                               [str(tactic)]))
        elif arg_values.format == "scrapefile-rd":
            for point in dataset:
                print(json.dumps({
                    "relevant_lemmas": point.relevant_lemmas,
                    "prev_tactics": point.prev_tactics,
                    "context": {
                        "fg_goals": [{
                            "hypotheses": point.hypotheses,
                            "goal": point.goal
                        }],
                        "bg_goals": [],
                        "shelved_goals": [],
                        "given_up_goals": []
                    },
                    "tactic": point.tactic
                }))
        elif arg_values.format == "scrapefile":
            for point in dataset:
                print(json.dumps({
                    "relevant_lemmas": point.relevant_lemmas,
                    "prev_tactics": point.prev_tactics,
                    "prev_hyps": point.hypotheses,
                    "prev_goal": point.goal,
                    "tactic": point.tactic
                }))
def train(dataset : ClassifySequenceDataset,
          autoencoder : EncoderRNN,
          train_autoencoder: bool,
          max_length : int,
          encoder_hidden_size : int,
          classifier_hidden_size : int,
          output_vocab_size : int,
          num_layers : int,
          batch_size : int,
          learning_rate : float,
          gamma : float,
          epoch_step : int,
          num_epochs : int,
          print_every : int,
          optimizer_f : Callable[..., Optimizer]) \
        -> Iterable[Checkpoint]:
    """Train a ``ClassifierDNN`` on autoencoder outputs, yielding a checkpoint per epoch.

    ``dataset`` is iterated as (goal, tactic) pairs; goals are normalized
    to ``max_length``.  Batches are shuffled and incomplete final batches
    are dropped.  The classifier always gets an optimizer; the
    autoencoder gets one only when ``train_autoencoder`` is true.  Each
    optimizer has a ``StepLR`` scheduler that is stepped at the start of
    every epoch.  Progress is printed every ``print_every`` batches.

    Yields a ``Checkpoint`` with the classifier state, the autoencoder
    state and the running average training loss.

    NOTE(review): if the loader produces no batches in an epoch,
    ``items_processed`` is never bound and the ``yield`` raises.
    """
    print("Initializing PyTorch...")
    goal_rows = [normalizeSentenceLength(goal, max_length)
                 for goal, _ in dataset]
    tactic_rows = [tactic for _, tactic in dataset]
    tensor_dataset = torchdata.TensorDataset(torch.LongTensor(goal_rows),
                                             torch.LongTensor(tactic_rows))
    dataloader = torchdata.DataLoader(tensor_dataset,
                                      batch_size=batch_size, num_workers=0,
                                      shuffle=True, pin_memory=True,
                                      drop_last=True)

    classifier = maybe_cuda(
        ClassifierDNN(encoder_hidden_size, classifier_hidden_size,
                      output_vocab_size, num_layers, batch_size))
    optimizers = [optimizer_f(classifier.parameters(), lr=learning_rate)]
    if train_autoencoder:
        optimizers.append(
            optimizer_f(autoencoder.parameters(), lr=learning_rate))
    criterion = maybe_cuda(nn.NLLLoss())
    adjusters = [scheduler.StepLR(opt, epoch_step, gamma)
                 for opt in optimizers]

    start = time.time()
    dataset_size = len(dataset)
    num_items = dataset_size * num_epochs
    total_loss = 0

    print("Training...")
    for epoch in range(num_epochs):
        print("Epoch {}".format(epoch))
        for adjuster in adjusters:
            adjuster.step()

        for batch_num, (input_batch, output_batch) in enumerate(dataloader):
            batches_done = batch_num + 1

            # Clear gradients left over from the previous batch
            for opt in optimizers:
                opt.zero_grad()

            # Encode the goals, then classify the encoded vectors
            encoded_input_batch = autoencoder.run(
                cast(torch.LongTensor, input_batch))
            prediction_distribution = classifier.run(encoded_input_batch)

            # Compare against the expected tactics
            output_var = maybe_cuda(Variable(output_batch))
            loss = criterion(prediction_distribution, output_var)

            # Backpropagate and apply the update
            loss.backward()
            for opt in optimizers:
                opt.step()

            # Bookkeeping for the running average and progress line
            items_processed = batches_done * batch_size + epoch * dataset_size
            total_loss += loss.item() * batch_size
            assert isinstance(total_loss, float)

            if batches_done % print_every == 0:
                progress = items_processed / num_items
                print("{} ({:7} {:5.2f}%) {:.4f}".format(
                    timeSince(start, progress),
                    items_processed, progress * 100,
                    total_loss / items_processed))

        yield Checkpoint(classifier_state=classifier.state_dict(),
                         autoencoder_state=autoencoder.state_dict(),
                         training_loss=total_loss / items_processed)
def use_tokenizer(tokenizer: tk.Tokenizer, max_length: int,
                  term_strings: List[str]):
    """Tokenize every string in ``term_strings`` and normalize its length.

    Each string is passed to ``tokenizer.toTokenList`` and the resulting
    token list is normalized to ``max_length`` with
    ``normalizeSentenceLength``.  Returns one entry per input string, in
    input order.

    ``term_strings`` was previously annotated as ``str``; the body
    iterates it as a collection of strings, so it is annotated as
    ``List[str]``.
    """
    return [
        normalizeSentenceLength(tokenizer.toTokenList(term_string),
                                max_length)
        for term_string in term_strings
    ]
def get_data(args: List[str]) -> None:
    """Parse ``args`` and print the scrape file's data in the requested format.

    ``terms`` prints the terms from ``data.term_data`` (length-normalized
    when ``--max-length`` is given), rendered through the tokenizer up to
    the first ``data.EOS_token``.  The remaining formats read the dataset
    with ``data.get_text_data``: ``goals``, ``hyps+goal`` and
    ``hyps+goal+tactic`` are plain-text dumps, and ``tacvector`` prints
    one comma-separated row per datum holding the word features, the
    vector features and the encoded tactic stem.
    """
    parser = argparse.ArgumentParser(
        description="Parse datafiles into multiple formats")
    parser.add_argument("format",
                        choices=["terms", "goals", "hyps+goal",
                                 "hyps+goal+tactic", "tacvector"])
    parser.add_argument("scrape_file", type=Path2)
    parser.add_argument("--tokenizer",
                        choices=list(tokenizers.keys()), type=str,
                        default=list(tokenizers.keys())[0])
    parser.add_argument("--max-tuples", dest="max_tuples",
                        default=None, type=int)
    parser.add_argument("--num-keywords", dest="num_keywords",
                        default=100, type=int)
    parser.add_argument("--num-head-keywords", dest="num_head_keywords",
                        type=int, default=100)
    parser.add_argument("--num-tactic-keywords", dest="num_tactic_keywords",
                        type=int, default=50)
    parser.add_argument("--print-keywords", dest="print_keywords",
                        action='store_true')
    parser.add_argument("--max-length", dest="max_length",
                        default=None, type=int)
    parser.add_argument("--lineend", dest="lineend",
                        default=False, const=True, action='store_const')
    parser.add_argument("--context-filter", dest="context_filter",
                        default="default")
    parser.add_argument("--verbose", action="store_true")
    arg_values = parser.parse_args(args)
    output_format = arg_values.format

    if output_format == "terms":
        raw_tuples = itertools.islice(
            data.read_text_data(arg_values.scrape_file),
            arg_values.max_tuples)
        terms, tokenizer = data.term_data(
            data.RawDataset(list(raw_tuples)),
            tokenizers[arg_values.tokenizer],
            arg_values.num_keywords, 2)
        if arg_values.max_length:
            terms = [data.normalizeSentenceLength(term,
                                                  arg_values.max_length)
                     for term in terms]
        line_end = "\\n\n" if arg_values.lineend else "\n"
        for term in terms:
            # Render only the tokens before the first EOS token.
            visible_tokens = list(
                itertools.takewhile(lambda x: x != data.EOS_token, term))
            print(tokenizer.toString(visible_tokens), end=line_end)
        return

    # The parser's `choices` guarantee one of the dataset formats from here.
    dataset = data.get_text_data(arg_values)

    if output_format == "goals":
        for _prev_tactics, _hyps, goal, _tactic in dataset:
            print(goal)
    elif output_format == "hyps+goal":
        for _prev_tactics, hyps, goal, _tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
    elif output_format == "hyps+goal+tactic":
        for _prev_tactics, hyps, goal, tactic in dataset:
            for hyp in hyps:
                print(hyp)
            print("================================")
            print(goal)
            print("====> {}".format(tactic))
    elif output_format == "tacvector":
        embedding = SimpleEmbedding()
        eprint("Encoding tactics...", guard=arg_values.verbose)
        answers = [
            embedding.encode_token(serapi_instance.get_stem(datum.tactic))
            for datum in dataset
        ]
        stripped_data = [strip_scraped_output(scraped)
                         for scraped in dataset]

        eprint("Constructing features...", guard=arg_values.verbose)
        word_feature_functions = [
            make_word_feature(stripped_data, arg_values)
            for make_word_feature in features.word_feature_constructors
        ]
        vec_features_functions = [
            make_vec_feature(stripped_data, arg_values)
            for make_vec_feature in features.vec_feature_constructors
        ]

        eprint("Extracting features...", guard=arg_values.verbose)
        word_features = [
            [word_feature(context)
             for word_feature in word_feature_functions]
            for context in stripped_data
        ]
        # Each vector feature function yields several values per datum.
        vec_features = [
            [feature_val
             for vec_feature in vec_features_functions
             for feature_val in vec_feature(context)]
            for context in stripped_data
        ]
        eprint("Done", guard=arg_values.verbose)

        for word_feat, vec_feat, tactic in zip(word_features,
                                               vec_features,
                                               answers):
            fields = [str(value) for value in word_feat]
            fields += [str(value) for value in vec_feat]
            fields.append(str(tactic))
            print(",".join(fields))