Ejemplo n.º 1
0
def eval_single_task(model_path, dataset_id, task, evaluator, embeddings,
                     mappings, data):
    """Evaluate a stored single-task BiLSTM model on one dataset's test split.

    The model is loaded from ``model_path``, given the supplied mappings,
    embeddings and data, and asked to predict labels for the test matrix of
    ``dataset_id``. Gold and predicted label sequences are then handed to
    ``evaluator.eval`` together with the raw train and test matrices.

    Args:
        model_path: Location passed to ``BiLSTM.loadModel``.
        dataset_id: Key into ``data``; also used to construct the ``Dataset``.
        task: Task name. ``'NER'`` is looked up under the key ``'NER_BIO'``;
            any other task is looked up under its own name.
        evaluator: Object whose ``eval`` method receives the results.
        embeddings: Forwarded to ``model.setMappings``.
        mappings: Forwarded to ``model.setMappings``.
        data: Mapping from dataset id to a dict that holds ``'trainMatrix'``
            and ``'testMatrix'``.
    """
    tagger = BiLSTM.loadModel(model_path)

    # Build the dataset description before wiring everything into the model.
    dataset = Dataset(dataset_id)
    dataset_dict = dataset.to_dict(task)
    tagger.setMappings(mappings, embeddings)
    tagger.setDataset(dataset_dict, data)

    # NER labels are stored under a BIO-suffixed key, other tasks as-is.
    if task == 'NER':
        label_key = task + '_BIO'
    else:
        label_key = task
    index_to_label = tagger.idx2Labels[label_key]

    def to_labels(index_sentences):
        """Translate nested lists of label indices into label strings."""
        return [[index_to_label[index] for index in sentence]
                for sentence in index_sentences]

    splits = data[dataset_id]
    train_matrix = splits['trainMatrix']
    test_matrix = splits['testMatrix']

    # Gold indices come straight from the test matrix, predictions from the
    # model; both are keyed by the same label name.
    gold_indices = [sentence[label_key] for sentence in test_matrix]
    predicted_indices = tagger.predictLabels(test_matrix)[label_key]

    gold_labels = to_labels(gold_indices)
    predicted_labels = to_labels(predicted_indices)

    evaluator.eval(dataset.name, dataset.lang, task, gold_labels,
                   predicted_labels, train_matrix, test_matrix)
    print(f'Evaluated single_task - {dataset_id} - {task}')
Ejemplo n.º 2
0
def generate_fold_for_datasets():
    """Write stratified cross-validation fold indices for every dataset.

    For each dataset, ``nb_folds`` runs are generated, each being a freshly
    shuffled stratified ``nb_folds``-fold split. Every fold is stored as JSON
    in ``FOLDS_PATH/<dataset_name>/run<run>_fold<fold>.txt`` with the keys
    ``"train_indices"`` and ``"test_indices"``.

    If ``FOLDS_PATH`` already exists, the user is asked for confirmation on
    stdin and nothing is written unless the answer is exactly ``y``.
    """
    nb_folds = 10

    # Ask before clobbering folds from a previous invocation.
    if os.path.isdir(FOLDS_PATH):
        if input("Folds already exist, overwrite them? y/n: ") != 'y':
            print("Folds creation aborted.")
            return

    all_names = (Dataset.get_dataset_names()
                 + Dataset.interesting_2d_datasets())
    for dataset_name in all_names:
        print("Creating folds for dataset ", dataset_name)
        dataset = Dataset(dataset_name)
        dataset_dir = os.path.join(FOLDS_PATH, dataset_name)
        os.makedirs(dataset_dir, exist_ok=True)

        for run_nb in range(nb_folds):
            # A new shuffling splitter per run, so each run gets its own
            # partition of the data.
            splitter = StratifiedKFold(n_splits=nb_folds, shuffle=True)
            labels = dataset.target
            # Only the labels drive the stratification; the feature argument
            # is a placeholder of matching length.
            placeholder = np.zeros(len(labels))

            splits = splitter.split(placeholder, labels)
            for fold_nb, (train_indices, test_indices) in enumerate(splits):
                to_write = {
                    "train_indices": train_indices.tolist(),
                    "test_indices": test_indices.tolist(),
                }
                fold_path = os.path.join(
                    dataset_dir, "run{}_fold{}.txt".format(run_nb, fold_nb))
                with open(fold_path, mode='w') as fold_file:
                    json.dump(to_write, fold_file)
Ejemplo n.º 3
0
    def train(self,
              data: Dataset,
              epochs: int,
              batch_size: int,
              tracker: PerformanceTracker = None,
              learning_rate=0.001) -> None:
        """Train this model with Adam and validate after every epoch.

        Args:
            data: Dataset providing ``get_val``, ``num_possible_batches`` and
                ``get_next_batch``. Batches are drawn from a deep copy made at
                the start of each epoch, not from ``data`` itself.
            epochs: Number of passes over the training batches.
            batch_size: Batch size requested from ``data`` and forwarded to
                ``validate``.
            tracker: Optional collector for per-epoch statistics. When it is
                ``None`` (the default) training and validation still run, but
                no statistics are recorded or printed through a tracker.
            learning_rate: Learning rate handed to ``optim.Adam``.

        Raises:
            ValueError: If ``epochs`` is positive but ``data`` reports no
                batches for ``batch_size``; there would be nothing to train on.
        """
        val_X, val_y = data.get_val()
        # Axis 3 is moved to position 1 — presumably channels-last to
        # channels-first; TODO confirm against the data loader.
        val_X = torch.from_numpy(val_X).permute(0, 3, 1, 2).float()
        val_y = torch.from_numpy(val_y).long()
        # Validation tensors are not moved to the device here; the original
        # note says `validate` does that batch by batch.

        model = self.to(self.used_device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        num_batches = data.num_possible_batches(batch_size)
        if epochs > 0 and num_batches < 1:
            # Fail with a clear message instead of an unbound-variable error
            # when the epoch summary is printed.
            raise ValueError(
                'No training batches available for batch_size={}'.format(
                    batch_size))

        for epoch in range(epochs):
            train_losses = []
            train_accuracies = []
            # Work on a copy so that consuming batches never changes `data`.
            epoch_data = deepcopy(data)

            for _ in tqdm(range(num_batches)):
                batch_X, batch_y = epoch_data.get_next_batch(batch_size)
                batch_X = torch.from_numpy(batch_X).permute(0, 3, 1, 2).float()
                batch_y = torch.from_numpy(batch_y).long()
                batch_X = batch_X.to(self.used_device)
                batch_y = batch_y.to(self.used_device)

                model.zero_grad()
                pred_y = model(batch_X)
                loss = self.loss_function(pred_y, batch_y)
                train_losses.append(loss.item())
                loss.backward()
                optimizer.step()

                pred_y_indices = torch.argmax(pred_y, dim=1)
                num_correct = int((pred_y_indices == batch_y).int().sum())
                # Divide by the number of samples actually in this batch, so a
                # batch smaller than `batch_size` does not deflate accuracy.
                samples_in_batch = batch_y.shape[0]
                if samples_in_batch:
                    accuracy = num_correct / samples_in_batch
                else:
                    accuracy = 0.0
                train_accuracies.append(accuracy)

            print('Batch prediction:')
            print(pred_y)
            val_loss, val_acc = model.validate(val_X, val_y, batch_size)
            # The tracker is optional; only report when one was supplied.
            if tracker is not None:
                tracker.add_train(mean(train_losses), mean(train_accuracies))
                tracker.add_val(val_loss, val_acc)
                tracker.print_stats(epoch)
Ejemplo n.º 4
0
def run_cobras_on_dataset(dataset_name, number_of_queries):
    """Run COBRAS with similarity prediction on one dataset and print stats.

    A ``WeakQuerier`` in ``'local_nondet'`` mode with ``max_prob=1`` answers
    the queries. After fitting, the runtime and final clustering, the number
    of DK queries, the prediction statistics from the clusterer's logger, the
    ARI of the final clustering and the number of merging and splitting
    phases are printed.

    Args:
        dataset_name: Name used to construct the ``Dataset``.
        number_of_queries: Forwarded to ``WeakQuerier``.
    """
    dataset = Dataset(dataset_name)

    clusterer = COBRAS(similarity_pred=True)
    querier = WeakQuerier(dataset.data,
                          dataset.target,
                          number_of_queries,
                          'local_nondet',
                          max_prob=1)
    fit_output = clusterer.fit(dataset.data, None, None, querier)
    # Only the clusterings and runtimes are reported below.
    clusterings, runtimes, _mls, _cls, _dks = fit_output
    logger: COBRASLogger = clusterer.logger

    print("COBRAS finished in {}s with result {}".format(
        runtimes[-1], clusterings[-1]))
    print("Number of DK queries: {}".format(querier.total_DK))

    n_predicted = len(logger.predicted_constraints)
    print("# of predicted constraints: {}".format(n_predicted))
    if n_predicted != 0:
        # Skipped for zero predictions to avoid dividing by zero.
        print("% of correct predictions: {}%".format(
            logger.n_correct_preds * 100 / n_predicted))
    print("ARI: ",
          metrics.adjusted_rand_score(dataset.target, clusterings[-1]))

    phases = logger.algorithm_phases
    for caption, phase in (("Merging ", "merging"),
                           ("Splitting ", "splitting")):
        print(caption, phases.count(phase))
Ejemplo n.º 5
0
def baseline_cobras():
    """Run the COBRAS baseline test locally and compare two stored tests.

    A 10-times-10-fold test named ``"handle_low_nocap"`` is registered for the
    2D and quick datasets with a weak querier limited to 100 queries, and is
    executed on 4 cores. Afterwards the ARIs of two named tests are calculated
    and compared on 6 cores with the same query budget.
    """
    n_queries = 100
    datasets = (Dataset.interesting_2d_datasets()
                + Dataset.get_quick_dataset_names())

    tests = TestCollection()
    algorithm_settings = cobras_algorithm_settings_to_string()
    querier_settings = weak_querier_settings_to_string(
        'local_nondet', max_prob=1, max_queries=n_queries)
    tests.add_10_times_10_fold_test("handle_low_nocap",
                                    "COBRAS",
                                    algorithm_settings,
                                    datasets,
                                    "weak_querier",
                                    querier_settings)
    run_tests_local(tests, nb_of_cores=4)

    # NOTE(review): the tests compared here are not the one registered above
    # ("handle_low_nocap") — confirm that this is intentional.
    calculate_aris_and_compare_for_tests(
        "final Cobras cap",
        ["cobras_no_uncertainty_s3", "cobras_no_uncertainty"],
        ["COBRAS: Max 8 SIs", "COBRAS: No max"],
        query_budget=n_queries,
        nb_of_cores=6)
Ejemplo n.º 6
0
def run_dees():
    """Fit a default COBRAS clusterer on the "ionosphere" dataset.

    Queries are answered by a ``LabelQuerier`` built from the dataset's
    target with 100 as its second argument. The result of ``fit`` is
    discarded.
    """
    ionosphere = Dataset("ionosphere")
    label_querier = LabelQuerier(ionosphere.target, 100)

    # This strategy is built but not passed to the default clusterer below.
    # It belongs to an alternative configuration:
    # COBRAS(cluster_algo=KmeansFixedRepresentative(),
    #        superinstance_builder=SuperInstance_select_representative_Builder(),
    #        splitlevel_strategy=splitstrat)
    splitstrat = StandardSplitLevelEstimationStrategyAlwayskmeans(
        SelectMostInstancesHeuristic())

    cobras = COBRAS()
    # Printed once setup is complete, before the fit starts.
    print("done")
    cobras.fit(ionosphere.data, None, None, label_querier)
Ejemplo n.º 7
0
 def run(self):
     """Compute ARIs for all stored intermediate clusterings and save them.

     Reads the six-element JSON result at ``self.clustering_path``, passes
     its clusterings and train indices together with the dataset's target to
     ``intermediate_results_to_ARIs``, and writes the outcome as JSON to
     ``self.result_path``, creating the parent directory if needed.
     """
     target = Dataset(self.dataset_name).target

     with open(self.clustering_path, mode='r') as clustering_file:
         stored = json.load(clustering_file)
     # Only the clusterings and the train indices are needed here.
     clusterings, _runtimes, _ml, _cl, _dn, train_indices = stored

     ari_values = intermediate_results_to_ARIs(clusterings, target,
                                               train_indices)

     result_dir = os.path.dirname(self.result_path)
     os.makedirs(result_dir, exist_ok=True)
     with open(self.result_path, mode='w') as ari_file:
         json.dump(ari_values, ari_file)
Ejemplo n.º 8
0
    def run(self):
        """Run the configured algorithm on the dataset and store its result.

        The algorithm and querier are built from their name/parameter pairs,
        the algorithm is fitted on the dataset restricted to the train indices
        read from ``self.fold_path``, and the fit result with the train
        indices appended is written as JSON to ``self.result_path``.

        Two debug outputs (prediction statistics and accuracy per number of
        constraints) are present but switched off by local flags.
        """

        def write_json(path, payload):
            """Create the parent directory of ``path`` and dump ``payload``."""
            os.makedirs(os.path.dirname(path), exist_ok=True)
            with open(path, mode="w") as result_file:
                json.dump(payload, result_file)

        algorithm = algorithm_info_to_object(self.algorithm_name,
                                             self.algorithm_parameters)
        querier_builder = querier_info_to_object(self.querier_name,
                                                 self.querier_parameters)
        dataset = Dataset(self.dataset_name)
        train_indices = fold_path_to_train_indices(self.fold_path)
        querier = querier_builder.build_querier(dataset)

        result = algorithm.fit(dataset.data, dataset.number_of_classes(),
                               train_indices, querier)

        # Debug toggle: dump prediction statistics from the logger. The
        # output path drops the first 17 characters of the result path.
        save_pred_results = False
        if save_pred_results:
            logger = algorithm.logger
            stats = (querier.total_DK, len(logger.predicted_constraints),
                     logger.n_correct_preds)
            write_json(self.result_path[17:], stats)

        # Debug toggle: dump the logger's accuracy per number of constraints
        # to the same shortened path.
        save_rf_accuracy = False
        if save_rf_accuracy:
            write_json(self.result_path[17:],
                       algorithm.logger.accuracy_per_n_constraints)

        # The string "None" stands in for a missing set of train indices in
        # the stored result.
        if train_indices is None:
            train_indices = "None"
        full_result = result + (train_indices, )
        write_json(self.result_path, full_result)
Ejemplo n.º 9
0
    def run(self):
        """Compute the ARI of the last stored clustering and save it as JSON.

        Reads the six-element JSON result at ``self.clustering_path`` and
        scores its final clustering against the dataset's target. The train
        indices are passed to ``get_ARI`` unless they are stored as the
        string ``"None"``. The score is written to ``self.result_path``,
        creating the parent directory if needed.
        """
        target = Dataset(self.dataset_name).target

        with open(self.clustering_path, mode='r') as clustering_file:
            stored = json.load(clustering_file)
        clusterings, _runtime, _ml, _cl, _dn, train_indices = stored
        final_clustering = clusterings[-1]

        # The file stores the string "None" when there are no train indices.
        if train_indices != "None":
            ari = get_ARI(final_clustering, target,
                          train_indices=train_indices)
        else:
            ari = get_ARI(final_clustering, target)

        result_dir = os.path.dirname(self.result_path)
        os.makedirs(result_dir, exist_ok=True)
        with open(self.result_path, mode='w') as ari_file:
            json.dump(ari, ari_file)
Ejemplo n.º 10
0
def een_test_2(datatset_names):
    """Run COBRAS+ on each dataset and print prediction statistics.

    For every dataset name, a ``COBRAS(cobras_plus=True)`` clusterer is
    fitted with a ``LabelQuerier`` and a fresh ``COBRASLogger``. The number
    of predicted constraints, how many were correct or incorrect according to
    ``calculate_correct_predictions``, and the combined length of the
    returned ``ml`` and ``cl`` sequences are printed.

    Args:
        datatset_names: Iterable of names accepted by ``Dataset``.
    """
    summary = (
        "Total predictions: {}\nCorrect predictions: {}\nFalse predictions: {}"
    )
    for dataset_name in datatset_names:
        dataset = Dataset(dataset_name)
        label_querier = LabelQuerier(dataset.target, 100)
        cobras = COBRAS(cobras_plus=True)
        logger = COBRASLogger()
        clusterings, _runtimes, ml, cl, _dk = cobras.fit(
            dataset.data, None, None, label_querier, logger)

        # Not used further; kept so that an empty result still fails here.
        _final_clustering = clusterings[-1]
        predicted = logger.predicted_constraints
        total_predictions = len(predicted)

        n_correct, n_incorrect = calculate_correct_predictions(
            predicted, dataset.target)

        print(summary.format(total_predictions, n_correct, n_incorrect))
        print(len(ml) + len(cl))
Ejemplo n.º 11
0
def een_test_1(dataset_names):
    """Run COBRAS+ on each dataset, plot every clustering and print counts.

    For every dataset name, a ``COBRAS(cobras_plus=True)`` clusterer is
    fitted with a ``LabelQuerier`` and a fresh ``COBRASLogger``. Each returned
    clustering is plotted under the name ``clustering_<index>``, then the
    number of queried and predicted constraints and the lengths of the
    returned ``ml`` and ``cl`` sequences are printed.

    Args:
        dataset_names: Iterable of names accepted by ``Dataset``.
    """
    for dataset_name in dataset_names:
        dataset = Dataset(dataset_name)
        label_querier = LabelQuerier(dataset.target, 100)
        cobras = COBRAS(cobras_plus=True)
        logger = COBRASLogger()

        clusterings, _runtimes, ml, cl, _dk = cobras.fit(
            dataset.data, None, None, label_querier, logger)

        # One plot per clustering, numbered in the order they were returned.
        for index, clustering in enumerate(clusterings):
            plot_clustering(dataset, clustering, "clustering_" + str(index),
                            True)

        report = (
            ("Queries done: ", logger.queried_constraints),
            ("predicted constraints: ", logger.predicted_constraints),
            ("MLs: ", ml),
            ("CLs: ", cl),
        )
        for caption, items in report:
            print(caption, len(items))