Example #1
    def guess(self, questions: List[str],
              n_guesses: Optional[int]) -> List[List[Tuple[str, float]]]:
        log.info('Generating {} guesses for each of {} questions'.format(
            n_guesses, len(questions)))
        log.info('Converting text to embedding indices...')
        x_test = [
            convert_text_to_embeddings_indices(tokenize_question(q),
                                               self.embedding_lookup)
            for q in questions
        ]
        log.info('Computing question lengths...')
        x_test_lengths = compute_lengths(x_test)
        log.info('Converting questions to tensorflow format...')
        x_test = tf_format(x_test, self.max_len, self.embeddings.shape[0])
        x_test = np.array(x_test)
        self.model = AuxDanModel(self.dan_params, self.max_len, self.n_classes,
                                 self.n_ans_type_classes,
                                 self.n_gender_classes,
                                 self.n_category_classes)
        log.info('Starting Tensorflow model guessing...')
        guess_labels, guess_scores = self.model.guess(x_test, x_test_lengths,
                                                      n_guesses)
        log.info('Guess generation and fetching top guesses done, '
                 'converting to output format')
        all_guesses = []
        for i_row, score_row in zip(guess_labels, guess_scores):
            guesses = []
            for label, score in zip(i_row, score_row):
                guesses.append((self.i_to_class[label], score))
            all_guesses.append(guesses)
        return all_guesses
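
All of these snippets lean on the same two helpers, which are not shown on this page. A minimal sketch of plausible implementations, for orientation only (the names are real, the bodies are assumptions):

    from typing import Dict, List

    def tokenize_question(text: str) -> List[str]:
        # Hypothetical sketch: lowercase and split on whitespace; the real
        # project presumably applies proper tokenization and text cleaning.
        return text.lower().split()

    def convert_text_to_embeddings_indices(words: List[str],
                                           embedding_lookup: Dict[str, int]) -> List[int]:
        # Map each token to its embedding row index, dropping
        # out-of-vocabulary words.
        return [embedding_lookup[w] for w in words if w in embedding_lookup]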
Example #2
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
        x_test = [convert_text_to_embeddings_indices(
            tokenize_question(q), self.embedding_lookup)
            for q in questions
        ]
        for r in x_test:
            if len(r) == 0:
                log.warning('Found an empty question, adding an UNK token to it so that NaNs do not occur')
                r.append(self.embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.zeros(len(x_test))

        _, t_x_batches, t_offset_batches, t_y_batches = batchify(
            self.batch_size, x_test, y_test, truncate=False, shuffle=False
        )

        self.model.eval()
        self.model.cuda()
        guesses = []
        for b in range(len(t_x_batches)):
            # volatile=True is legacy (pre-0.4) PyTorch: it disables autograd
            # for inference and was later superseded by torch.no_grad().
            t_x = Variable(t_x_batches[b], volatile=True)
            t_offset = Variable(t_offset_batches[b], volatile=True)
            out = self.model(t_x, t_offset)
            probs = F.softmax(out, dim=1)  # softmax over the class dimension
            scores, preds = torch.max(probs, 1)
            scores = scores.data.cpu().numpy()
            preds = preds.data.cpu().numpy()
            for p, s in zip(preds, scores):
                guesses.append([(self.i_to_class[p], s)])

        return guesses
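
Note that Variable(..., volatile=True) dates this snippet to pre-0.4 PyTorch. On current versions the same inference loop would sit inside torch.no_grad(); a sketch of the equivalent, assuming the same batch tensors as above (predict_batches is a hypothetical name):

    import torch
    import torch.nn.functional as F

    def predict_batches(model, t_x_batches, t_offset_batches):
        # Hypothetical modern rewrite of the loop above: torch.no_grad()
        # replaces volatile=True, and softmax names its class dimension.
        model.eval()
        preds_out, scores_out = [], []
        with torch.no_grad():
            for t_x, t_offset in zip(t_x_batches, t_offset_batches):
                probs = F.softmax(model(t_x, t_offset), dim=1)
                scores, preds = torch.max(probs, dim=1)
                preds_out.append(preds.cpu().numpy())
                scores_out.append(scores.cpu().numpy())
        return preds_out, scores_out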
Example #3
    def guess(self, questions, max_n_guesses):
        x_test = [convert_text_to_embeddings_indices(
            tokenize_question(q), self.embedding_lookup)
            for q in questions
        ]
        for r in x_test:
            if len(r) == 0:
                log.warning('Found an empty question, adding an UNK token to it so that NaNs do not occur')
                r.append(self.embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.zeros(len(x_test))

        _, t_x_batches, lengths, masks, t_y_batches = batchify(
            self.batch_size, x_test, y_test,
            truncate=False, shuffle=False
        )

        self.model.eval()
        self.model.cuda()
        guesses = []
        for b in range(len(t_x_batches)):
            t_x = Variable(t_x_batches[b], volatile=True)
            length_batch = lengths[b]
            mask_batch = masks[b]

            # This model is assumed to return class probabilities directly,
            # so no softmax is applied before taking the max.
            probs = self.model(t_x, length_batch, mask_batch)
            scores, preds = torch.max(probs, 1)
            scores = scores.data.cpu().numpy()
            preds = preds.data.cpu().numpy()
            for p, s in zip(preds, scores):
                guesses.append([(self.i_to_class[p], s)])

        return guesses
Example #4
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]) -> List[List[Tuple[Answer, float]]]:
        log.info('Generating {} guesses for each of {} questions'.format(
            max_n_guesses, len(questions)))
        x_test = [nn.convert_text_to_embeddings_indices(
            tokenize_question(q), self.embedding_lookup)
            for q in questions]
        x_test = np.array(nn.tf_format(x_test, self.max_len, 0))
        class_probabilities = self.model.predict_proba(x_test, batch_size=self.batch_size)
        guesses = []
        for row in class_probabilities:
            sorted_labels = np.argsort(-row)[:max_n_guesses]
            sorted_guesses = [self.i_to_class[i] for i in sorted_labels]
            sorted_scores = np.copy(row[sorted_labels])
            guesses.append(list(zip(sorted_guesses, sorted_scores)))
        return guesses
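
np.argsort(-row)[:max_n_guesses] fully sorts every class score just to keep the top few. With a large answer set, np.argpartition selects the same top k in linear time; a small sketch (assumes k is a concrete int, unlike the Optional max_n_guesses above):

    import numpy as np

    def top_k_indices(row: np.ndarray, k: int) -> np.ndarray:
        # Select the k largest entries without sorting the whole row,
        # then order just those k by descending score.
        k = min(k, row.size)
        if k <= 0:
            return np.empty(0, dtype=int)
        idx = np.argpartition(-row, k - 1)[:k]
        return idx[np.argsort(-row[idx])]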
Example #5
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]) -> List[List[Tuple[Page, float]]]:
        y_data = np.zeros((len(questions)))
        x_data = [tokenize_question(q) for q in questions]
        batches = batchify(x_data, y_data, shuffle=False, batch_size=32)
        guesses = []
        for x_batch, y_batch, length_batch in batches:
            out = self.model(x_batch.cuda(), length_batch.cuda())
            probs = F.softmax(out, dim=1).data.cpu().numpy()
            preds = np.argsort(-probs, axis=1)
            n_examples = probs.shape[0]
            for i in range(n_examples):
                example_guesses = []
                for p in preds[i][:max_n_guesses]:
                    example_guesses.append((self.i_to_class[p], probs[i][p]))
                guesses.append(example_guesses)

        return guesses
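
The nested loop that pairs class labels with probabilities recurs in several of these guessers; it could be factored into one helper. A sketch, with i_to_class passed in explicitly (rows_to_guesses is a hypothetical name):

    import numpy as np

    def rows_to_guesses(probs, i_to_class, max_n_guesses):
        # For each row of class probabilities, pair the top labels with
        # their scores, best first; mirrors the inner loop above.
        order = np.argsort(-probs, axis=1)[:, :max_n_guesses]
        return [[(i_to_class[c], probs[r, c]) for c in order[r]]
                for r in range(probs.shape[0])]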
Example #6
    def run(self):
        tagme.GCUBE_TOKEN = TAGME_GCUBE_TOKEN
        with open('output/tagme/batches.pickle', 'rb') as f:
            batch_dict = pickle.load(f)
        batch_questions = batch_dict[self.question_batch]
        dict_annotations = {}
        for q in batch_questions:
            annotated_sentences = {}
            for s, text in q.text.items():
                # This ensures that preprocessing matches the neural models
                text = ' '.join(tokenize_question(text))
                annotation = annotation_to_dict(tagme.annotate(text))
                annotated_sentences[s] = annotation
            dict_annotations[q.qnum] = annotated_sentences

        out_path = 'output/tagme/tagged_batch_{}.pickle'.format(self.question_batch)
        with open(out_path, 'wb') as f:
            pickle.dump(dict_annotations, f)
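
annotation_to_dict is not shown on this page. Assuming the response objects of the tagme package, a plausible sketch that flattens each annotation into a picklable dictionary (the attribute names follow that package; treat them as assumptions):

    def annotation_to_dict(response):
        # Hypothetical: each annotation carries a mention span, the linked
        # Wikipedia title, and a confidence score.
        return [
            {
                'begin': a.begin,
                'end': a.end,
                'mention': a.mention,
                'entity_title': a.entity_title,
                'score': a.score,
            }
            for a in response.get_annotations()
        ]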
Example #7
    def guess(self, questions: List[QuestionText],
              max_n_guesses: Optional[int]):
        x_test = [
            convert_text_to_embeddings_indices(tokenize_question(q),
                                               self.embedding_lookup)
            for q in questions
        ]
        for r in x_test:
            if len(r) == 0:
                log.warning(
                    'Found an empty question, adding an UNK token to it so that NaNs do not occur'
                )
                r.append(self.embedding_lookup['UNK'])
        x_test = np.array(x_test)
        y_test = np.zeros(len(x_test))

        _, t_x_batches, lengths, t_y_batches, sort_batches = batchify(
            self.batch_size, x_test, y_test, truncate=False, shuffle=False)

        self.model.eval()
        self.model.cuda()
        guesses = []
        hidden = self.model.init_hidden(self.batch_size)
        for b in range(len(t_x_batches)):
            t_x = Variable(t_x_batches[b], volatile=True)
            length_batch = lengths[b]
            sort = sort_batches[b]

            if len(length_batch) != self.batch_size:
                # This can happen for the last batch, which may be shorter than batch_size
                hidden = self.model.init_hidden(len(length_batch))
            else:
                hidden = repackage_hidden(hidden, reset=True)

            out, hidden = self.model(t_x, length_batch, hidden)
            probs = F.softmax(out, dim=1)
            scores, preds = torch.max(probs, 1)
            # Undo the length sort (presumably applied by batchify for packing)
            # so scores and predictions line up with the original question order.
            scores = scores.data.cpu().numpy()[np.argsort(sort)]
            preds = preds.data.cpu().numpy()[np.argsort(sort)]
            for p, s in zip(preds, scores):
                guesses.append([(self.i_to_class[p], s)])

        return guesses
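
The trailing np.argsort(sort) inverts the length-sorting permutation that batchify presumably applied for sequence packing, so guesses come back in the original question order. A tiny illustration of why the argsort of a permutation is its inverse:

    import numpy as np

    sort = np.array([2, 0, 1])               # sorted row i came from original index sort[i]
    sorted_vals = np.array(['c', 'a', 'b'])  # original order was ['a', 'b', 'c']
    inverse = np.argsort(sort)               # positions that restore the original order
    assert list(sorted_vals[inverse]) == ['a', 'b', 'c']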