def test_reasonable_predictions(self):
    model = Comparison(**self.default_config(n_epochs=3))

    # fake dataset generation
    animals = [
        "dog", "cat", "horse", "cow", "pig", "sheep", "goat", "chicken", "guinea pig",
        "donkey", "turkey", "duck", "camel", "goose", "llama", "rabbit", "fox"
    ]
    numbers = [
        "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten",
        "eleven", "twelve", "thirteen", "fourteen", "fifteen", "sixteen"
    ]

    n_per = 50
    similar = []
    different = []
    for dataset in [animals, numbers]:
        for i in range(n_per // 2):
            similar.append([random.choice(dataset), random.choice(dataset)])
    for i in range(n_per):
        different.append([random.choice(animals), random.choice(numbers)])

    targets = np.asarray(["similar"] * len(similar) + ["different"] * len(different))
    data = similar + different

    x_tr, x_te, t_tr, t_te = train_test_split(data, targets, test_size=0.3)
    model.finetune(*list_transpose(x_tr), t_tr)

    predictions = model.predict(*list_transpose(x_te))
    accuracy = np.mean([pred == true for pred, true in zip(predictions, t_te)])
    naive_baseline = max(np.mean(targets == "similar"), np.mean(targets == "different"))
    self.assertGreater(accuracy, naive_baseline)
@classmethod
def finetune_grid_search(cls, Xs, Y, *, test_size, config=None, eval_fn=None, probs=False, return_all=False):
    """
    Performs grid search over config items defined using "GridSearchable" objects and returns either full results
    or the config object that corresponds to the best results. The default config contains grid searchable objects
    for the most important parameters to search over.

    :param Xs: Input text. Either [num_samples] or [sequence, num_samples] for single or multi input models respectively.
    :param Y: Targets. A list of [num_samples] labels that correspond one-to-one with the samples in Xs.
    :param test_size: Int or float. If an int is given, that number of samples is held out for validation; if a float
        is given, that fraction of samples is held out.
    :param config: A config object, or None to use the default config.
    :param eval_fn: An eval function that takes 2 inputs (prediction, truth) and returns a float, where higher is better.
    :param probs: If True, eval_fn is passed probability outputs from predict_proba; otherwise the output of predict is used.
    :param return_all: If True, all results are returned; if False, only the best config is returned.
    :return: By default, the best config object. If return_all is True, a list of tuples of the form
        [(config, eval_fn output), ... ].
    """
    if isinstance(Xs[0], str):
        Xs = [Xs]
    config = config or get_default_config()
    config.val_size = 0.0
    eval_fn = eval_fn or cls.get_eval_fn()

    trainXs, testXs, trainY, testY = train_test_split(
        list_transpose(Xs), Y, test_size=test_size, shuffle=True
    )
    trainXs = list_transpose(trainXs)
    testXs = list_transpose(testXs)

    gs = config.get_grid_searchable()
    ranged_keys = gs.keys()
    ranged_iterators = gs.values()
    grid_gen = itertools.product(*ranged_iterators)

    results = []
    for grid_item in grid_gen:
        config_ = deepcopy(config)
        config_.update(dict(zip(ranged_keys, grid_item)))
        instance = cls(config=config_)
        instance.finetune(*trainXs, Y=trainY)
        if probs:
            res = instance.predict_proba(*testXs)
        else:
            res = instance.predict(*testXs)
        results.append((config_, eval_fn(res, testY)))
        del instance

    if return_all:
        return results
    return max(results, key=lambda x: x[1])[0]
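# --- Illustrative usage sketch (not part of the library source) ---
# Shows one way finetune_grid_search might be called. `Classifier` is assumed
# here to be a model class exposing this classmethod, and the toy texts/labels
# below are invented for demonstration only.
import numpy as np

texts = ["cheap pills, click now", "meeting moved to 10am", "you won a free prize", "quarterly report attached"]
labels = ["spam", "ham", "spam", "ham"]

def accuracy(preds, truth):
    # eval_fn contract: takes (prediction, truth) and returns a float where higher is better.
    return float(np.mean([p == t for p, t in zip(preds, truth)]))

best_config = Classifier.finetune_grid_search(
    texts, labels, test_size=0.25, eval_fn=accuracy, probs=False, return_all=False
)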
def predict_proba(self, questions, answers):
    """
    Produces a probability distribution over classes for each example in the input.

    :param questions: List or array of text, shape [batch]
    :param answers: List or array of text, shape [batch, n_answers]
    :returns: list of dictionaries. Each dictionary maps from a class label to its assigned class probability.
    """
    answers = list_transpose(answers)
    raw_probas = self._predict_proba(zip(questions, answers))
    formatted_predictions = []
    for probas, *answers_per_sample in zip(raw_probas, *answers):
        formatted_predictions.append(dict(zip(answers_per_sample, probas)))
    return formatted_predictions
def predict(self, question, answers, max_length=None):
    """
    Produces a list of most likely class labels as determined by the fine-tuned model.

    :param question: List or array of text, shape [batch]
    :param answers: List or array of text, shape [batch, n_answers]
    :param max_length: the number of byte-pair encoded tokens to be included in the document representation.
        Providing more than `max_length` tokens as input will result in truncation.
    :returns: list of class labels.
    """
    answers = list_transpose(answers)
    raw_ids = BaseModel.predict(self, question, answers, max_length=max_length)
    return [ans[i] for ans, i in zip(zip(*answers), raw_ids)]
def predict_proba(self, question, answers, max_length=None):
    """
    Produces a probability distribution over classes for each example in the input.

    :param question: List or array of text, shape [batch]
    :param answers: List or array of text, shape [batch, n_answers]
    :param max_length: the number of byte-pair encoded tokens to be included in the document representation.
        Providing more than `max_length` tokens as input will result in truncation.
    :returns: list of dictionaries. Each dictionary maps from a class label to its assigned class probability.
    """
    answers = list_transpose(answers)
    raw_probas = self._predict_proba(question, answers, max_length)
    formatted_predictions = []
    for probas, *answers_per_sample in zip(raw_probas, *answers):
        formatted_predictions.append(dict(zip(answers_per_sample, probas)))
    return formatted_predictions
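# --- Illustrative call pattern (a sketch, not library code) ---
# Demonstrates how the predict / predict_proba methods above might be used.
# `model` is assumed to be an already fine-tuned multiple-choice style
# instance; the questions and answers are invented for the example.
questions = ["What sound does a dog make?", "How many legs does a spider have?"]
answers = [
    ["woof", "moo", "quack"],
    ["eight", "two", "four"],
]  # shape [batch, n_answers]

labels = model.predict(questions, answers)        # e.g. ["woof", "eight"]
probas = model.predict_proba(questions, answers)  # e.g. [{"woof": 0.92, "moo": 0.05, "quack": 0.03}, ...]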
def finetune(self, question, answers, correct_answer, batch_size=None, fit_lm_only=False):
    """
    :param question: List or array of text, shape [batch]
    :param answers: List or array of text, shape [batch, n_answers], must contain the correct answer for each entry.
    :param correct_answer: List or array of correct answers, shape [batch]. Each entry is either an integer index
        into the corresponding answers list or the string of the correct answer itself.
    :param batch_size: integer number of examples per batch. When N_GPUS > 1, this number corresponds to the number
        of training examples provided to each GPU.
    """
    answer_idx = []
    if not len(correct_answer) == len(answers) == len(question):
        raise ValueError(
            "Questions, answers and correct_answer are not all the same length: {}, {}, {}"
            .format(len(question), len(answers), len(correct_answer))
        )

    for correct, others in zip(correct_answer, answers):
        if isinstance(correct, int):
            if correct < 0 or correct >= len(others):
                raise ValueError(
                    "Correct answer is of type int but is invalid with value {} for answers of len {}"
                    .format(correct, len(others))
                )
            answer_idx.append(correct)
        else:
            try:
                ans_idx = others.index(correct)
                answer_idx.append(ans_idx)
            except ValueError:
                raise ValueError(
                    "Correct answer {} is not contained in possible answers {}".format(correct, others)
                )

    answers = list_transpose(answers)
    self.num_answers = len(answers)
    arr_encoded = self._text_to_ids(question, answers)
    labels = None if fit_lm_only else answer_idx
    return self._training_loop(arr_encoded, Y=labels, batch_size=batch_size)
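# --- Illustrative fine-tuning sketch (not part of the library source) ---
# Shows how finetune above might be invoked. `model` is assumed to be a
# multiple-choice style instance; the data is invented. correct_answer may mix
# string answers (looked up in `answers`) with integer indices per sample.
questions = ["What sound does a dog make?", "How many legs does a spider have?"]
answers = [
    ["woof", "moo", "quack"],
    ["eight", "two", "four"],
]
correct_answer = ["woof", 0]  # string for the first sample, index for the second

model.finetune(questions, answers, correct_answer, batch_size=2)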