def valid(self):
        test_iter = Clip_Iterator(c.VALID_DIR_CLIPS)
        evaluator = Evaluator(self.global_step)
        i = 0
        for data in test_iter.sample_valid(self._batch):
            in_data = data[:, :self._in_seq, ...]
            if c.IN_CHANEL == 3:
                gt_data = data[:,
                               self._in_seq:self._in_seq + self._out_seq, :, :,
                               1:-1]
            elif c.IN_CHANEL == 1:
                gt_data = data[:, self._in_seq:self._in_seq + self._out_seq,
                               ...]
            else:
                raise NotImplementedError
            if c.NORMALIZE:
                in_data = normalize_frames(in_data)
                gt_data = normalize_frames(gt_data)

            mse, mae, gdl, pred = self.g_model.valid_step(in_data, gt_data)
            evaluator.evaluate(gt_data, pred)
            self.logger.info(f"Iter {self.global_step} {i}: \n\t "
                             f"mse:{mse:.4f} \n\t "
                             f"mae:{mae:.4f} \n\t "
                             f"gdl:{gdl:.4f}")
            i += 1
        evaluator.done()
Example #2
def evaluateWillsSamplerParallel(windowSize,
                                 filename='./data/creditcard.csv',
                                 max_window_ct=3):
    parallel_counts = [1, 2, 4, 8, 16, 32]
    filename = './data/creditcard.csv'
    sample_sizes = [30, 40, 60]
    for sample_size in sample_sizes:
        for parallel_count in parallel_counts:
            hopper, column_names = buildFromCSV(filename, windowSize, "Time")
            windows = hopper.hopper()
            sampler = WillsSampler(sample_size, (2, 29),
                                   eta,
                                   parallel_count=parallel_count)
            samples = dict()
            num_windows = 0
            for window in windows:
                samples[num_windows] = sampler.sample(window).deheapify()
                num_windows += 1
                if num_windows > max_window_ct: break

            storage_filename = sampler.persistent_filename(
                filename, windowSize)
            # store dataset and return evaluation metrics on it
            sampler.persist_sample_set(samples, storage_filename, column_names,
                                       num_windows)
            e = Evaluator(samples, sampler)
            e.save(storage_filename + "evaluator.csv")
    return
Example #3
def main():
    args = parse_arguments()

    print("#" * 80)
    print("Model:                         ", args.model_class)
    print("Parameters:                    ", args.model_parameters)
    print("X:                             ", args.x_filepath)
    print("Y:                             ", args.y_filepath)
    print("Splits:                        ", args.n_splits)
    print("Random State:                  ", args.random_state)
    print("Model Filepath:                ", args.model_filepath)
    print("Raw Evaluation Filepath:       ", args.raw_model_score_filepath)
    print("Aggregate Evaluation Filepath: ",
          args.aggregated_model_score_filepath)

    model = initialize_model(args.model_class, args.model_parameters)

    X = np.load(args.x_filepath)

    Y = np.load(args.y_filepath)

    evaluator = Evaluator(args.n_splits)

    train_model(model, X, Y, evaluator, args.n_splits, args.random_state)

    evaluator.save(args.raw_model_score_filepath,
                   args.aggregated_model_score_filepath)

    joblib.dump(model, args.model_filepath)

    print("#" * 80)
Example #4
    def __init__(self, model, params, space):
        self.__model = model
        self.__params = params
        self.__space = space
        self.__evaluator = Evaluator(self.__model)
        self.__train_df = None
        self.__test_df = None
Example #5
def evaluateWillsSamplerClusters(windowSize,
                                 filename='./data/creditcard.csv',
                                 max_window_ct=3):
    parallel_counts = [1, 4, 16]
    cluster_choices = [1, 6, 11, 21, 31, 51]
    cluster_centers_collection = loadClusters(cluster_choices)
    filename = './data/creditcard.csv'
    sample_sizes = [30, 40, 60, 100]
    for sample_size in sample_sizes:
        for num_centers, cluster_centers in cluster_centers_collection.items():
            for parallel_count in parallel_counts:
                hopper, column_names = buildFromCSV(filename, windowSize,
                                                    "Time")
                windows = hopper.hopper()
                sampler = WillsSampler(sample_size, (2, 29),
                                       eta,
                                       parallel_count=parallel_count,
                                       cluster_centers=cluster_centers)
                samples = dict()
                num_windows = 0
                for window in windows:
                    samples[num_windows] = sampler.sample(window).deheapify()
                    num_windows += 1
                    if num_windows > max_window_ct: break

                storage_filename = sampler.persistent_filename(
                    filename, windowSize)
                # store dataset and return evaluation metrics on it
                sampler.persist_sample_set(samples, storage_filename,
                                           column_names, num_windows)
                e = Evaluator(samples, sampler)
                e.save(storage_filename + "evaluator.csv")
    return
Example #6
def build_ideal_window_hopping_set(windowSize,
                                   AbstractBaseSampler,
                                   filename='./data/creditcard.csv',
                                   max_window_ct=3):
    filename = './data/creditcard.csv'
    sample_sizes = [30, 40, 60]
    for sample_size in sample_sizes:
        hopper, column_names = buildFromCSV(filename, windowSize, "Time")
        windows = hopper.hopper()
        sampler = AbstractBaseSampler(sample_size, (2, 29),
                                      eta)  # sample size, column range, eta
        samples = dict()
        num_windows = 0
        for window in windows:
            samples[num_windows] = sampler.sample(window)
            num_windows += 1
            if num_windows > max_window_ct: break

        storage_filename = _persistentFileName(str(sampler), filename,
                                               windowSize, sample_size)
        # store dataset and return evaluation metrics on it
        sampler.persist_sample_set(samples, storage_filename, column_names,
                                   num_windows)
        e = Evaluator(samples, sampler)
        e.save(storage_filename + "evaluator.csv")
    return
Example #7
    def run_benchmark(self, iter, mode="Valid"):
        if mode == "Valid":
            time_interval = c.RAINY_VALID
            stride = 20
        else:
            time_interval = c.RAINY_TEST
            stride = 1
        test_iter = Iterator(time_interval=time_interval,
                             sample_mode="sequent",
                             seq_len=c.IN_SEQ + c.OUT_SEQ,
                             stride=1)
        evaluator = Evaluator(iter)
        i = 1
        while not test_iter.use_up:
            data, date_clip, *_ = test_iter.sample(batch_size=c.BATCH_SIZE)
            in_data = np.zeros(shape=(c.BATCH_SIZE, c.IN_SEQ, c.H, c.W, c.IN_CHANEL))
            gt_data = np.zeros(shape=(c.BATCH_SIZE, c.OUT_SEQ, c.H, c.W, 1))
            if type(data) == type([]):
                break
            in_data[...] = data[:, :c.IN_SEQ, ...]

            if c.IN_CHANEL == 3:
                gt_data[...] = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, :, :, 1:-1]
            elif c.IN_CHANEL == 1:
                gt_data[...] = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, ...]
            else:
                raise NotImplementedError

            # in_date = date_clip[0][:c.IN_SEQ]

            if c.NORMALIZE:
                in_data = normalize_frames(in_data)
                gt_data = normalize_frames(gt_data)

            mse, mae, gdl, pred = self.model.valid_step(in_data, gt_data)
            evaluator.evaluate(gt_data, pred)
            logging.info(f"Iter {iter} {i}: \n\t mse:{mse} \n\t mae:{mae} \n\t gdl:{gdl}")
            i += 1
            if i % stride == 0:
                if c.IN_CHANEL == 3:
                    in_data = in_data[:, :, :, :, 1:-1]

                for b in range(c.BATCH_SIZE):
                    predict_date = date_clip[b][c.IN_SEQ]
                    logging.info(f"Save {predict_date} results")
                    if mode == "Valid":
                        save_path = os.path.join(c.SAVE_VALID, str(iter), predict_date.strftime("%Y%m%d%H%M"))
                    else:
                        save_path = os.path.join(c.SAVE_TEST, str(iter), predict_date.strftime("%Y%m%d%H%M"))

                    path = os.path.join(save_path, "in")
                    save_png(in_data[b], path)

                    path = os.path.join(save_path, "pred")
                    save_png(pred[b], path)

                    path = os.path.join(save_path, "out")
                    save_png(gt_data[b], path)
        evaluator.done()
Example #8
    def __init__(self, crf, gibbs=False, cd=False, n_samps=5, burn=5, interval=5):
        self.crf = crf
        self.gibbs = gibbs
        self.cd = gibbs and cd
        self.E_f = self.exp_feat_gibbs if gibbs else self.exp_feat
        self.n_samples = n_samps
        self.burn = burn
        self.interval = interval
        self.ev = Evaluator()
Example #9
    def __init__(self,
                 init_pop,
                 growth_time=2 * 60,
                 mut_prob=0.5,
                 pop_size=30):
        self._init_pop = init_pop
        self._mut_prob = mut_prob
        self._evaluator = Evaluator(growth_time)
        self._nsgaii_sorter = NSGAII(2, None, None)
        self._pop_size = pop_size
Example #10
def main():
    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        torch.cuda.empty_cache()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        
    print('using device:', device)
    dtype = torch.float32
    
    '''    
    filename = 'curr_model_soumya_mat_it_es'
    f = open(filename, 'rb')
    model = pickle.load(f)
    f.close()

    '''
    n_epochs = 5
    n_refinement = 5
    batch_size = 32
    
    model = UMWE(dtype, device, batch_size, n_epochs, n_refinement)
    model.build_model()
    model.discrim_fit()
    filename = 'curr_model_soumya_mat_it_es'
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()

    eval_ = Evaluator(model)
    print(eval_.clws('es', 'en'))
    print(eval_.clws('en', 'es'))
    print(eval_.clws('it', 'en'))
    eval_.word_translation('es', 'en')
    eval_.word_translation('en', 'es')
    eval_.word_translation('it', 'en')

    model.mpsr_refine()
    print(eval_.clws('es', 'en'))
    print(eval_.clws('en', 'es'))
    print(eval_.clws('it', 'en'))
    eval_.word_translation('es', 'en')
    eval_.word_translation('en', 'es')
    eval_.word_translation('it', 'en')
    
    
    filename = 'curr_model_soumya_mpsr_it_es'
    f = open(filename, 'wb')
    pickle.dump(model, f)
    f.close()
    
    for lang in model.src_langs.values():
        model.export_embeddings(lang, model.embs, "txt", "20th")
Example #11
def get_classifier_evaluation(prediction,
                              test,
                              classifier_name,
                              data_name,
                              b=2):
    """
    This function gets the evaluation of a classifier: it prints the number of
    errors and their text, plots the ROC curve, and returns the measure scores.
    """
    evaluation = Evaluator(prediction, test, b)
    return evaluation.get_evaluation(classifier_name, data_name)
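A hedged usage sketch for `get_classifier_evaluation` above: the classifier, data, and name strings are hypothetical placeholders, and the `Evaluator` class the snippet relies on is assumed to be importable and to accept 1-D prediction/label arrays.

# Hypothetical usage sketch (placeholder data and names, not from the source).
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=200, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)

y_pred = clf.predict(X_test)
# b keeps its default value from the signature above
scores = get_classifier_evaluation(y_pred, y_test, "logreg", "synthetic", b=2)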
Example #12
def val_eval(model, validation_data, loss_fn):
    model.eval()
    eval = Evaluator()
    x_val = validation_data.X.to('cuda')
    y_val = validation_data.Y.to('cuda')
    z_val = validation_data.Z.to('cuda')
    yhat = model(x_val)
    val_loss = loss_fn(yhat, y_val)
    eval.update_counter(yhat, y_val, z_val)
    eval.update_loss(0, 0, 0)
    return val_loss, eval.total_percenage[0]
Example #13
def label_evaluation(test_data, predicted_labels):
    gold_labels = flatten([flatten(i["gold_labels"]) for i in test_data])
    gold_labels = [1 if i else 0 for i in gold_labels]
    metric_evaluation = Evaluator()
    metric_evaluation.compute_all(gold_labels, predicted_labels)
    log.write("Confusion Matrix :")
    log.write(metric_evaluation.confusion_matrix)
    log.write("Accuracy     = %f" % metric_evaluation.accuracy)
    log.write("Precision    = %f" % metric_evaluation.precision)
    log.write("Recall       = %f" % metric_evaluation.recall)
    log.write("F1 Score     = %f" % metric_evaluation.f1_score)
Example #14
    def get_evaluation_metrics(self, df_original, df_imputed, target,
                               mask_missing, m_prop, verbose):
        """
        Generate evaluation metrics for datasets

        :param df_original:
        :param df_imputed:
        :param target:
        :param mask_missing:
        :param m_prop:
        :param verbose:
        :return:
        """
        results = dict()
        results['prop'] = m_prop
        results['strategy'] = self.strategy_abbr
        # todo: refactor it with score factory
        if self.strategy_abbr not in ['constant', 'emb']:
            results['rmse'] = Evaluator().get_compare_metrics(
                df_original, df_imputed, mask_missing)
        if self.strategy_abbr not in ['emb']:
            results['uce'] = Evaluator().uce(df_original, df_imputed)
            results['silhouette'] = Evaluator().silhouette(df_imputed)

        # todo: add pipeline for regression with auto detect the target type
        sce_or = Evaluator().sce(df_original, target)
        sce_im = Evaluator().sce(df_imputed, target)
        results['sce'] = sce_im - sce_or
        results['f1'] = Evaluator().f1_score(df_imputed, target)
        # if verbose:
        #     self.logger.info(f'UCE - clustering error between original and imputed datasets = ', np.round(results['uce'], 5))
        #     self.logger.info(f'RMSE score between original values and imputed = ', np.round(results['rmse'], 5))
        #     self.logger.info(f'SCE - classification error between original and imputed datasets', np.round(results['sce'], 5))
        return results
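A hedged call sketch for `get_evaluation_metrics` above; `imputer_strategy`, the two DataFrames, and the mask are hypothetical placeholders for whatever imputation strategy object defines this method.

# Hypothetical call sketch (placeholder objects, not from the source).
# imputer_strategy is assumed to expose get_evaluation_metrics as defined above.
metrics = imputer_strategy.get_evaluation_metrics(
    df_original=df_raw,          # reference DataFrame before values were masked
    df_imputed=df_filled,        # the same data after imputation
    target="label",              # target column passed to sce()/f1_score()
    mask_missing=missing_mask,   # mask passed to get_compare_metrics()
    m_prop=0.2,                  # stored as results['prop']
    verbose=False,
)
print(metrics["strategy"], metrics["sce"], metrics["f1"])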
Example #15
    def generate_callbacks(self):
        callbacks = []

        tbpath = os.path.join(self.out_path, "tensorboard")
        symtbpath = os.path.join(args.output, "tensorboard", args.tag)
        if not os.path.exists(tbpath):
            os.makedirs(tbpath)
        if not os.path.exists(symtbpath):
            os.symlink(tbpath, symtbpath)
            print(f"Symlinked {tbpath} -> {symtbpath}")
        log_files_list = os.listdir(tbpath)
        if log_files_list != []:
            for fn in log_files_list:
                print(f"Deleting {os.path.join(tbpath, fn)}")
                shutil.rmtree(os.path.join(tbpath, fn))
        checkpath = os.path.join(self.out_path, 'checkpoint/')
        if not os.path.exists(checkpath):
            os.makedirs(checkpath)

        tb_callback = tf.keras.callbacks.TensorBoard(log_dir=tbpath,
                                                     update_freq='epoch',
                                                     write_graph=True,
                                                     write_images=True)
        callbacks.append(tb_callback)

        check_name = os.path.join(checkpath, f'{args.model}_{args.tag}.hdf5')
        if self.data == 'opportunity':
            monitorname = f"out_{self.label_names[0]}_fmeasure"
            if len(self.label_names) == 1:
                monitorname = 'fmeasure'
        elif self.data == 'deap':
            monitorname = f"val_out_{self.label_names[0]}_accuracy"
        check_callback = tf.keras.callbacks.ModelCheckpoint(
            check_name,
            monitor=monitorname,
            save_best_only=True,
            mode='max',
            save_freq='epoch',
            save_weights_only=False)
        callbacks.append(check_callback)

        if self.data == 'opportunity':
            evaluator = Evaluator(self.label_names)
            eval_dir = os.path.join(outpath, 'evaluation')
            if not os.path.isdir(eval_dir):
                os.makedirs(eval_dir)
            eval_callback = EvaluationCallback(self.val_data, self.label_names,
                                               self.num_classes, eval_dir)
            callbacks.append(eval_callback)
        return callbacks
Example #16
def main(argv):
    del argv

    labels = read_class_labels()
    evaluator = Evaluator(labels)
    with PredictionWriter(labels, FLAGS.dest) as pwriter:
        pwriter.write_headers()
        for filepath in glob.glob(FLAGS.source + '/**/*.wav', recursive=True):
            filename = os.path.basename(filepath)
            predictions = process_file(filepath, FLAGS.ckpt, FLAGS.labels)
            true_label = read_true_label(filepath)
            evaluator.record(predictions, true_label)
            pwriter.write_row(filename, predictions)
    evaluator.print_eval()
Example #17
    def evaluate(self, X_test, Y_test, Y_test_classes):
        if not self.model:
            raise Exception("Load or fit new model first")

        score, acc = self.model.evaluate(X_test, Y_test, batch_size=3)
        print("Test accuracy:", acc)

        evaluator = Evaluator()
        predictions_encoded = self.model.predict(X_test)
        predictions = self.lb.inverse_transform(
            [np.argmax(pred) for pred in predictions_encoded])
        evaluator.accuracy(Y_test_classes, predictions)
        # evaluator.classification_report(Y_test_classes, predictions)
        evaluator.confusion_matrix(Y_test_classes, predictions)
Example #18
def run_experiment(args: dict[str, str]):
    if args["models"] == "all":
        args["models"] = ALL_MODEL_NAMES
    if args["datasets"] == "all":
        args["datasets"] = ALL_DATASET_NAMES

    models = setup_models(args["models"].split(), args["location"], daner_path=args["daner"])
    log(f"Succesfully set up {len(models)} models")

    datasets = setup_datasets(args["datasets"].split(), wikiann_path=args["wikiann"], plank_path=args["plank"])
    log(f"Sucessfully acquired {len(datasets)} NER datasets")

    for model in models:
        for dataset in datasets:
            e = Evaluator(model, dataset)
            res = e.run()
            res.save(os.path.join(args["location"], "-".join((model.name, dataset.name))))
def run_predictions(input_path, output_path, thresholds_file, num_skip,
                    check_existing):
    """Creates thread pool which will concurrently run the prediction for every
    protein map in the 'input_path'

    Parameters
    ----------
    input_path: str
        Path of the input directory where the different protein directories are
        located

    output_path: str
        Path of the folder where all generated files will be stored

    thresholds_file: str
        Path of the JSON file which contains the threshold values for the input
        files

    num_skip: int
        The number of prediction steps that should be skipped

    check_existing: bool
        If set, prediction steps are only executed if their results do not
        already exist in the output path
    """
    # Create list of parameters for every prediction
    params_list = [
        (emdb_id, input_path, output_path, thresholds_file, num_skip,
         check_existing) for emdb_id in filter(
             lambda d: os.path.isdir(input_path + d), os.listdir(input_path))
    ]

    start_time = time()
    pool = Pool(min(cpu_count(), len(params_list)))
    results = pool.map(run_prediction, params_list)

    # Filter 'None' results
    results = filter(lambda r: r is not None, results)

    evaluator = Evaluator(input_path)
    for emdb_id, predicted_file, gt_file, execution_time in results:
        evaluator.evaluate(emdb_id, predicted_file, gt_file, execution_time)

    evaluator.create_report(output_path, time() - start_time)
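A hedged invocation sketch for `run_predictions` above; the directory and file names are placeholders, not taken from the source.

if __name__ == "__main__":
    # Placeholder paths; the trailing slash matches the string concatenation
    # used inside run_predictions when listing entry directories.
    run_predictions(
        input_path="./input_maps/",        # one sub-directory per entry
        output_path="./predictions/",      # generated files land here
        thresholds_file="./thresholds.json",
        num_skip=0,                        # do not skip any prediction step
        check_existing=True,               # reuse results already in output_path
    )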
Example #20
    def valid_clips(self, step):
        test_iter = Clip_Iterator(c.VALID_DIR_CLIPS)
        evaluator = Evaluator(step)
        i = 0
        for data in test_iter.sample_valid(c.BATCH_SIZE):
            in_data = data[:, :c.IN_SEQ, ...]
            if c.IN_CHANEL == 3:
                gt_data = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, :, :, 1:-1]
            elif c.IN_CHANEL == 1:
                gt_data = data[:, c.IN_SEQ:c.IN_SEQ + c.OUT_SEQ, ...]
            else:
                raise NotImplementedError
            if c.NORMALIZE:
                in_data = normalize_frames(in_data)
                gt_data = normalize_frames(gt_data)

            mse, mae, gdl, pred = self.model.valid_step(in_data, gt_data)
            evaluator.evaluate(gt_data, pred)
            logging.info(f"Iter {step} {i}: \n\t mse:{mse} \n\t mae:{mae} \n\t gdl:{gdl}")
            i += 1
        evaluator.done()
    def __init__(self,
                 model: Model,
                 optimizer_name: str = "Adagrad",
                 batch_size: int = 256,
                 learning_rate: float = 1e-2,
                 decay1: float = 0.9,
                 decay2: float = 0.99,
                 regularizer_name: str = "N3",
                 regularizer_weight: float = 5e-2,
                 verbose: bool = True):
        self.model = model
        self.batch_size = batch_size
        self.verbose = verbose

        # build all the supported optimizers using the passed params (learning rate and decays if Adam)
        supported_optimizers = {
            'Adagrad':
            optim.Adagrad(params=self.model.parameters(), lr=learning_rate),
            'Adam':
            optim.Adam(params=self.model.parameters(),
                       lr=learning_rate,
                       betas=(decay1, decay2)),
            'SGD':
            optim.SGD(params=self.model.parameters(), lr=learning_rate)
        }

        # build all the supported regularizers using the passed regularizer_weight
        supported_regularizers = {
            'N3': N3(weight=regularizer_weight),
            'N2': N2(weight=regularizer_weight)
        }

        # choose the Torch Optimizer object to use, based on the passed name
        self.optimizer = supported_optimizers[optimizer_name]

        # choose the regularizer
        self.regularizer = supported_regularizers[regularizer_name]

        # create the evaluator to use between epochs
        self.evaluator = Evaluator(self.model)
Example #22
    def __init__(self,
                 model: TuckER,
                 batch_size: int = 128,
                 learning_rate: float = 0.03,
                 decay: float = 1.0,
                 label_smoothing: float = 0.1,
                 verbose: bool = True):
        self.model = model
        self.dataset = self.model.dataset
        self.batch_size = batch_size
        self.label_smoothing = label_smoothing
        self.verbose = verbose
        self.learning_rate = learning_rate
        self.decay_rate = decay

        self.loss = torch.nn.BCELoss()
        self.optimizer = optim.Adam(params=self.model.parameters(),
                                    lr=learning_rate)
        self.scheduler = optim.lr_scheduler.ExponentialLR(
            self.optimizer, decay)

        # create the evaluator to use between epochs
        self.evaluator = Evaluator(self.model)
Example #23
def main():
    USE_GPU = True
    if USE_GPU and torch.cuda.is_available():
        torch.cuda.empty_cache()
        device = torch.device('cuda')
    else:
        device = torch.device('cpu')
        
    print('using device:', device)
    dtype = torch.float32
    
# =============================================================================
#     filename = 'curr_model'
#     f = open(filename, 'rb')
#     model = pickle.load(f)
#     f.close()
#     
# =============================================================================
    model = UMWE(dtype, device, 32, 2)
    model.build_model()
    # model.discrim_fit()
    # filename = 'curr_model'
    # f = open(filename, 'wb')
    # pickle.dump(model, f)
    # f.close()
# =============================================================================
    model.mpsr_refine()
# =============================================================================
# =============================================================================
    # for lang in model.src_langs.values():
        # model.export_embeddings(lang, model.embs, "txt")
# =============================================================================
    model.export_embeddings('es', model.embs, "txt")
    eval_ = Evaluator(model)
    print(eval_.clws('es', 'en'))
    eval_.word_translation('es', 'en')
Example #24
def run_evaluation():
    with open("queries.txt", "r") as queries_file:
        queries = list(map(str.strip, queries_file.readlines()))
    print(Evaluator().evaluate_to_latex(queries, "query.csv", "like.csv", relevance_cutoff=2))  
Example #25
dataset = IHDP(replications=args.reps)
scores = np.zeros((args.reps, 3))
scores_test = np.zeros((args.reps, 3))

M = None
d = 20  # latent space dimension
lamba = 1e-4  # weight decay
nh, h = 5, 200  # number and size of hidden layers

for i, (train, valid, test, contfeats,
        binfeats) in enumerate(dataset.get_train_valid_test()):
    print('\nReplication {}/{}'.format(i + 1, args.reps))
    (xtr, ttr, ytr), (y_cftr, mu0tr, mu1tr) = train
    (xva, tva, yva), (y_cfva, mu0va, mu1va) = valid
    (xte, tte, yte), (y_cfte, mu0te, mu1te) = test
    evaluator_test = Evaluator(yte, tte, y_cf=y_cfte, mu0=mu0te, mu1=mu1te)

    # Reorder features with binary first and continuous after
    perm = binfeats + contfeats
    xtr, xva, xte = xtr[:, perm], xva[:, perm], xte[:, perm]

    xalltr = np.concatenate([xtr, xva], axis=0)
    talltr = np.concatenate([ttr, tva], axis=0)
    yalltr = np.concatenate([ytr, yva], axis=0)

    evaluator_train = Evaluator(yalltr,
                                talltr,
                                y_cf=np.concatenate([y_cftr, y_cfva], axis=0),
                                mu0=np.concatenate([mu0tr, mu0va], axis=0),
                                mu1=np.concatenate([mu1tr, mu1va], axis=0))
Example #26
        fold_importance_df["fold"] = n_fold + 1
        feature_importance_df = pd.concat(
            [feature_importance_df, fold_importance_df], axis=0)

        predictions += clf.predict(
            X_test, num_iteration=clf.best_iteration) / folds.n_splits

    print("CV score (Validation): {:<8.5f}".format(roc_auc_score(Y_train,
                                                                 oof)))
    print("CV score (Test): {:<8.5f}".format(roc_auc_score(
        Y_test, predictions)))

    y_pred = np.zeros(predictions.shape[0])
    y_pred[predictions >= 0.1] = 1

    eval = Evaluator()
    eval.evaluate(Y_test, y_pred)

    cols = (feature_importance_df[[
        "feature", "importance"
    ]].groupby("feature").mean().sort_values(by="importance",
                                             ascending=False)[:1000].index)
    best_features = feature_importance_df.loc[
        feature_importance_df.feature.isin(cols)]

    plt.figure(figsize=(14, 26))
    sns.barplot(x="importance",
                y="feature",
                data=best_features.sort_values(by="importance",
                                               ascending=False))
    plt.title('LightGBM Features (averaged over folds)')
from sklearn.model_selection import (
    KFold,
    cross_val_predict,
    cross_val_score,
    LeaveOneOut,
    GridSearchCV,
)
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier
from sklearn.svm import SVC

import utils
import random
from evaluation import Evaluator
from feature_extraction import tfidf_features_1, bag_of_words_features_1

evaluator = Evaluator()


class PopularityModel:
    def name(self):
        return "Popularity model"

    def get_most_representative_class(self, Y_train):
        """Return most representative class"""
        item_counts = Y_train[utils.col_to_predict].value_counts()
        most_representative = item_counts.idxmax()
        return most_representative

    def predict(self, train, test):
        most_representative_class = self.get_most_representative_class(train)
        return [most_representative_class for _ in range(len(test))]
Example #28
nh, h = args.nh, args.h_dim  # number and size of hidden layers
batch_size = args.batch_size
epochs = args.epochs
lr = args.lr
drop_ratio = args.drop_ratio
drop_type = args.drop_type
gnoise = args.gnoise
arg_info = vars(args)
for i, (train, valid, test, contfeats, binfeats) in enumerate(dataset.get_train_valid_test()):
    print('\nReplication {}/{}'.format(i + 1, args.reps))
    # data preparation
    if task == 'jobs':
        (xtr, ttr, ytr), etr = train
        (xva, tva, yva), eva = valid
        (xte, tte, yte), ete = test
        evaluator_test = Evaluator(yte, tte, e=ete, task=task)
    else:
        (xtr, ttr, ytr), (y_cftr, mu0tr, mu1tr) = train
        (xva, tva, yva), (y_cfva, mu0va, mu1va) = valid
        (xte, tte, yte), (y_cfte, mu0te, mu1te) = test
        evaluator_test = Evaluator(yte, tte, y_cf=y_cfte, mu0=mu0te, mu1=mu1te, task=task)
    num_train = len(xtr) + len(xva)
    num_test = len(xte)
    # reorder features with binary first and continuous after
    perm = binfeats + contfeats
    xtr, xva, xte = xtr[:, perm], xva[:, perm], xte[:, perm]
    """Add noise"""
    # add gaussian noise
    if gnoise > 0:
        gnoise_train = np.random.normal(scale=gnoise, size=xtr.shape)
        gnoise_valid = np.random.normal(scale=gnoise, size=xva.shape)
Example #29
                        type=float,
                        default=0.1,
                        help="Amount of label smoothing.")

    args = parser.parse_args()
    #torch.backends.cudnn.deterministic = True
    #seed = 20
    #np.random.seed(seed)
    #torch.manual_seed(seed)
    #if torch.cuda.is_available:
    #    torch.cuda.manual_seed_all(seed)

    dataset_name = args.dataset
    dataset = Dataset(dataset_name)

    tucker = TuckER(dataset=dataset, entity_dimension=args.entity_dimension, relation_dimension=args.relation_dimension,
                   input_dropout=args.input_dropout, hidden_dropout_1=args.hidden_dropout_1,
                   hidden_dropout_2=args.hidden_dropout_2, init_random=True) # type: TuckER

    optimizer = BCEOptimizer(model=tucker, batch_size=args.batch_size, learning_rate=args.learning_rate,
                             decay=args.decay_rate, label_smoothing=args.label_smoothing)

    optimizer.train(train_samples=dataset.train_samples, max_epochs=args.max_epochs, evaluate_every=10,
                    save_path=os.path.join(MODEL_PATH, "TuckER_" + dataset_name + ".pt"),
                    valid_samples=dataset.valid_samples)

    print("Evaluating model...")
    mrr, h1 = Evaluator(model=tucker).eval(samples=dataset.test_samples, write_output=False)
    print("\tTest Hits@1: %f" % h1)
    print("\tTest Mean Reciprocal Rank: %f" % mrr)
    input_data = BucketIterator(
        dataset=data,
        batch_size=1,
        train=True,
    )

    training_batches = next(
        iter(
            BucketIterator(
                dataset=training_data,
                batch_size=1,
                train=True,
                sort_key=lambda x: interleave_keys(len(x.src), len(x.trg)))))

    predictor = Predictor(model)
    evaluator = Evaluator(training_data.english.vocab,
                          training_data.french.vocab)

    # evaluator.add_sentences(input_data.trg[0], predictor.predict(input_data))
    for i in range((len(data) // model.batch_size) + 1):
        sentence = next(iter(input_data))
        predicted_sentence, _ = predictor.predict(sentence)
        evaluator.add_sentences(sentence.trg[0], predicted_sentence, eos_token)
    #
    # for i in range((len(data) // batch_size) + 1):
    #     sentence = next(iter(input_data))
    #     src, trg = evaluator.convert_sentences(sentence)
    #     file.write(' '.join(src) + '\n')
    #     file.write(' '.join(trg) + '\n')
    #     file.write('\n')

    print('bleu:', evaluator.bleu())