def guess(self, questions: List[str], n_guesses: Optional[int]) -> List[List[Tuple[str, float]]]:
    """Generate scored guesses for every question with a fresh AuxDanModel.

    Each question is tokenized and mapped to embedding indices, the rows are
    run through ``tf_format`` and handed to a newly constructed
    ``AuxDanModel`` (stored on ``self.model``), whose ``guess`` method
    supplies label and score rows.

    Args:
        questions: Question texts to guess on.
        n_guesses: Passed through unchanged to ``self.model.guess``.

    Returns:
        One list per question of ``(self.i_to_class[label], score)`` pairs,
        in the order the model returned them.
    """
    log.info('Generating {} guesses for each of {} questions'.format(
        n_guesses, len(questions)))

    log.info('Converting text to embedding indices...')
    index_rows = []
    for text in questions:
        tokens = tokenize_question(text)
        index_rows.append(
            convert_text_to_embeddings_indices(tokens, self.embedding_lookup))

    log.info('Computing question lengths...')
    # Lengths are taken before tf_format rewrites the rows.
    row_lengths = compute_lengths(index_rows)

    log.info('Converting questions to tensorflow format...')
    # NOTE(review): the third argument is presumably the padding index
    # (number of embedding rows) -- confirm against tf_format.
    formatted = np.array(
        tf_format(index_rows, self.max_len, self.embeddings.shape[0]))

    # The model object is rebuilt on every call and kept on the instance.
    self.model = AuxDanModel(
        self.dan_params,
        self.max_len,
        self.n_classes,
        self.n_ans_type_classes,
        self.n_gender_classes,
        self.n_category_classes,
    )

    log.info('Starting Tensorflow model guessing...')
    guess_labels, guess_scores = self.model.guess(formatted, row_lengths, n_guesses)

    log.info(
        'Guess generation and fetching top guesses done, converting to output format'
    )
    return [
        [(self.i_to_class[label], score) for label, score in zip(label_row, score_row)]
        for label_row, score_row in zip(guess_labels, guess_scores)
    ]
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
    """Return the single most probable guess for each question.

    Questions are tokenized, mapped to embedding indices, batched with
    ``batchify`` (no truncation, no shuffling) and scored by ``self.model``.
    The model output is softmaxed and only the arg-max class is kept.

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Accepted for interface compatibility; this method does
            not read it and always emits exactly one guess per question.

    Returns:
        One single-element list per question holding
        ``(self.i_to_class[prediction], probability)``.
    """
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(text), self.embedding_lookup)
        for text in questions
    ]
    for indices in x_test:
        if len(indices) > 0:
            continue
        # NOTE(review): if `log` is a stdlib logger, `warn` is a deprecated
        # alias of `warning` -- left unchanged here.
        log.warn('Found an empty question, adding an UNK token to it so that NaNs do not occur')
        indices.append(self.embedding_lookup['UNK'])

    x_test = np.array(x_test)
    # batchify requires labels; dummy zeros are supplied and then ignored.
    y_test = np.zeros(len(x_test))

    _, x_batches, offset_batches, _ = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False
    )

    self.model.eval()
    self.model.cuda()

    guesses = []
    for b, x_batch in enumerate(x_batches):
        t_x = Variable(x_batch, volatile=True)
        t_offset = Variable(offset_batches[b], volatile=True)
        probs = F.softmax(self.model(t_x, t_offset))
        best_scores, best_preds = torch.max(probs, 1)
        best_scores = best_scores.data.cpu().numpy()
        best_preds = best_preds.data.cpu().numpy()
        guesses.extend(
            [(self.i_to_class[pred], score)]
            for pred, score in zip(best_preds, best_scores)
        )

    return guesses
def guess(self, questions, max_n_guesses):
    """Return the single highest-scoring guess for each question.

    Questions are tokenized, mapped to embedding indices and batched with
    ``batchify`` (no truncation, no shuffling). Each batch is passed to
    ``self.model`` together with its lengths and masks, and the model output
    is used directly as the score matrix (no softmax is applied here).

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Accepted for interface compatibility; this method does
            not read it and always emits exactly one guess per question.

    Returns:
        One single-element list per question holding
        ``(self.i_to_class[prediction], score)``.
    """
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(text), self.embedding_lookup)
        for text in questions
    ]
    for indices in x_test:
        if len(indices) > 0:
            continue
        log.warn('Found an empty question, adding an UNK token to it so that NaNs do not occur')
        indices.append(self.embedding_lookup['UNK'])

    x_test = np.array(x_test)
    # batchify requires labels; dummy zeros are supplied and then ignored.
    y_test = np.zeros(len(x_test))

    _, x_batches, lengths, masks, _ = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False
    )

    self.model.eval()
    self.model.cuda()

    guesses = []
    for b, x_batch in enumerate(x_batches):
        t_x = Variable(x_batch, volatile=True)
        probs = self.model(t_x, lengths[b], masks[b])
        best_scores, best_preds = torch.max(probs, 1)
        best_scores = best_scores.data.cpu().numpy()
        best_preds = best_preds.data.cpu().numpy()
        guesses.extend(
            [(self.i_to_class[pred], score)]
            for pred, score in zip(best_preds, best_scores)
        )

    return guesses
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
    """Rank classes by predicted probability for every question.

    Questions are tokenized, mapped to embedding indices, formatted with
    ``nn.tf_format`` and scored by ``self.model.predict_proba``.

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Upper bound on guesses kept per question; ``None``
            keeps every class (it is used directly as a slice bound).

    Returns:
        One list per question of ``(self.i_to_class[label], probability)``
        pairs ordered from most to least probable.
    """
    log.info('Generating {} guesses for each of {} questions'.format(max_n_guesses, len(questions)))

    index_rows = [
        nn.convert_text_to_embeddings_indices(tokenize_question(text), self.embedding_lookup)
        for text in questions
    ]
    model_input = np.array(nn.tf_format(index_rows, self.max_len, 0))
    class_probabilities = self.model.predict_proba(model_input, batch_size=self.batch_size)

    # Negating the row makes argsort yield labels in descending probability.
    top_labels_per_row = (
        (row, np.argsort(-row)[:max_n_guesses]) for row in class_probabilities
    )
    return [
        list(zip(
            [self.i_to_class[label] for label in top_labels],
            np.copy(row[top_labels]),
        ))
        for row, top_labels in top_labels_per_row
    ]
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
    """Rank classes by softmax probability for every question.

    Questions are tokenized and batched with ``batchify`` (fixed batch size
    of 32, no shuffling). Each batch is moved to the GPU, scored by
    ``self.model`` and softmaxed.

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Upper bound on guesses kept per question; ``None``
            keeps every class (it is used directly as a slice bound).

    Returns:
        One list per question of ``(self.i_to_class[label], probability)``
        pairs ordered from most to least probable.
    """
    # batchify requires labels; dummy zeros are supplied and then ignored.
    y_data = np.zeros((len(questions)))
    x_data = [tokenize_question(text) for text in questions]

    guesses = []
    for x_batch, _, length_batch in batchify(x_data, y_data, shuffle=False, batch_size=32):
        logits = self.model(x_batch.cuda(), length_batch.cuda())
        probs = F.softmax(logits).data.cpu().numpy()
        # Negating makes argsort yield labels in descending probability.
        ranking = np.argsort(-probs, axis=1)
        for row_probs, row_ranking in zip(probs, ranking):
            guesses.append([
                (self.i_to_class[label], row_probs[label])
                for label in row_ranking[:max_n_guesses]
            ])

    return guesses
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
    """Rank classes by softmax probability for every question.

    Questions are tokenized and batched with ``batchify`` (fixed batch size
    of 32, no shuffling). Each batch is moved to the GPU, scored by
    ``self.model`` and softmaxed.

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Upper bound on guesses kept per question; ``None``
            keeps every class (it is used directly as a slice bound).

    Returns:
        One list per question of ``(self.i_to_class[label], probability)``
        pairs ordered from most to least probable.
    """
    # batchify requires labels; dummy zeros are supplied and then ignored.
    y_data = np.zeros((len(questions)))
    x_data = [tokenize_question(text) for text in questions]

    guesses = []
    for x_batch, _, length_batch in batchify(x_data, y_data, shuffle=False, batch_size=32):
        logits = self.model(x_batch.cuda(), length_batch.cuda())
        probs = F.softmax(logits).data.cpu().numpy()
        # Negating makes argsort yield labels in descending probability.
        ranking = np.argsort(-probs, axis=1)
        for row_probs, row_ranking in zip(probs, ranking):
            guesses.append([
                (self.i_to_class[label], row_probs[label])
                for label in row_ranking[:max_n_guesses]
            ])

    return guesses
def run(self):
    """Annotate one batch of questions with TagMe and pickle the result.

    Loads the batch dictionary from ``output/tagme/batches.pickle``, picks
    the entry keyed by ``self.question_batch``, annotates every text in each
    question's ``text`` mapping, and writes a
    ``{qnum: {text_key: annotation_dict}}`` mapping to
    ``output/tagme/tagged_batch_<question_batch>.pickle``.
    """
    tagme.GCUBE_TOKEN = TAGME_GCUBE_TOKEN

    batches_path = 'output/tagme/batches.pickle'
    with open(batches_path, 'rb') as f:
        batch_questions = pickle.load(f)[self.question_batch]

    dict_annotations = {}
    for question in batch_questions:
        # Text is re-joined from tokenize_question output so the annotated
        # text matches the preprocessing used for the neural models.
        dict_annotations[question.qnum] = {
            key: annotation_to_dict(tagme.annotate(' '.join(tokenize_question(text))))
            for key, text in question.text.items()
        }

    output_path = 'output/tagme/tagged_batch_{}.pickle'.format(self.question_batch)
    with open(output_path, 'wb') as f:
        pickle.dump(dict_annotations, f)
def guess(self, questions: List[QuestionText], max_n_guesses: Optional[int]):
    """Return the single most probable guess for each question.

    Questions are tokenized, mapped to embedding indices and batched with
    ``batchify`` (no truncation, no shuffling). Each batch is run through
    ``self.model`` together with a hidden state, the output is softmaxed and
    the arg-max class is kept.

    Args:
        questions: Question texts to guess on.
        max_n_guesses: Accepted for interface compatibility; this method does
            not read it and always emits exactly one guess per question.

    Returns:
        One single-element list per question holding
        ``(self.i_to_class[prediction], probability)``.
    """
    x_test = [
        convert_text_to_embeddings_indices(tokenize_question(text), self.embedding_lookup)
        for text in questions
    ]
    for indices in x_test:
        if len(indices) > 0:
            continue
        log.warn(
            'Found an empty question, adding an UNK token to it so that NaNs do not occur'
        )
        indices.append(self.embedding_lookup['UNK'])

    x_test = np.array(x_test)
    # batchify requires labels; dummy zeros are supplied and then ignored.
    y_test = np.zeros(len(x_test))

    _, x_batches, lengths, _, sort_batches = batchify(
        self.batch_size, x_test, y_test, truncate=False, shuffle=False)

    self.model.eval()
    self.model.cuda()

    guesses = []
    hidden = self.model.init_hidden(self.batch_size)
    for b, x_batch in enumerate(x_batches):
        t_x = Variable(x_batch, volatile=True)
        length_batch = lengths[b]
        sort = sort_batches[b]

        if len(length_batch) == self.batch_size:
            hidden = repackage_hidden(hidden, reset=True)
        else:
            # A batch shorter than batch_size (e.g. the last one) needs a
            # hidden state sized to match it.
            hidden = self.model.init_hidden(len(length_batch))

        out, hidden = self.model(t_x, length_batch, hidden)
        best_scores, best_preds = torch.max(F.softmax(out), 1)

        # NOTE(review): argsort of `sort` presumably inverts the reordering
        # batchify applied within the batch -- confirm against batchify.
        restore_order = np.argsort(sort)
        best_scores = best_scores.data.cpu().numpy()[restore_order]
        best_preds = best_preds.data.cpu().numpy()[restore_order]

        guesses.extend(
            [(self.i_to_class[pred], score)]
            for pred, score in zip(best_preds, best_scores)
        )

    return guesses