Example #1
def main():
    input_lang, output_lang, pairs = prepare_data('ques',
                                                  'ans',
                                                  '../debug.json',
                                                  reverse=False)
    encoder = EncoderRNN(input_lang.n_words, hidden_size).to(device)
    attn_decoder = AttnDecoderRNN(hidden_size,
                                  output_lang.n_words,
                                  dropout_p=0.1,
                                  max_length=1000).to(device)

    rate = 0.9
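    # keep the first 90% of pairs for training and hold out the last 10% for testing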
    pairs_train, pairs_test = pairs[0:int(len(pairs) *
                                          rate)], pairs[int(len(pairs) *
                                                            rate):]
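    # restore the epoch-0 checkpoints written by the training run (cf. Example #12)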
    encoder.load_state_dict(torch.load('model/encoder-0.model'))
    encoder.eval()
    attn_decoder.load_state_dict(torch.load('model/decoder-0.model'))
    attn_decoder.eval()
    evaluate_all(encoder,
                 attn_decoder,
                 pairs_test,
                 max_length=1000,
                 input_lang=input_lang,
                 output_lang=output_lang,
                 n=len(pairs_test))
    # show_plot(loss_history)
    print('done test')
Example #2
def main():
    input_lang, output_lang, pairs = prepare_data('ques',
                                                  'ans',
                                                  '../test.json',
                                                  reverse=False)
    model = Transformer(
        src_vocab_size=input_lang.n_words,
        src_max_len=MAX_LENGTH,
        tgt_vocab_size=output_lang.n_words,
        tgt_max_len=MAX_LENGTH,
    ).to(device)

    rate = 0.9
    pairs_train, pairs_test = pairs[0:int(len(pairs) *
                                          rate)], pairs[int(len(pairs) *
                                                            rate):]
    model.load_state_dict(torch.load('model/transformer-0.model'))
    model.eval()
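    # NOTE: evaluation below runs on pairs_train; pairs_test is computed but never used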
    evaluate_all(model,
                 pairs_train,
                 max_length=100,
                 input_lang=input_lang,
                 output_lang=output_lang,
                 n=len(pairs_train))
    # show_plot(loss_history)
    print('done test')
Example #3
def main():
    print(HELLO_MSG)
    args = parse_args()
    print(args)
    if args.action == "run":
        launch_on_all_nodes()
    elif args.action == "eval":
        evaluate_all()
Example #4
def human_performance(args, params):
    x_gt, ground_truths, processor, dl = load.load_test(
        params, fit_processor=True)
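    # pool ground truths and probabilities across the individual human reviews (see Example #11)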
    ground_truths, probs = human_gt_and_probs(
        params, x_gt, ground_truths, processor, review_indiv=True)
    evaluate.evaluate_all(
            ground_truths, probs, processor.classes,
            model_title='Human Performance Average', plot_flag=args.plot)
Example #5
def train(model, data, words, params):
    start_time = time.time()

    counter = 0
    try:
        for eidx in xrange(params.epochs):

            kf = utils.get_minibatches_idx(len(data), params.batchsize, shuffle=True)
            uidx = 0
            for _, train_index in kf:

                uidx += 1

                batch = [data[t] for t in train_index]
                for i in batch:
                    i[0].populate_embeddings(words)
                    i[1].populate_embeddings(words)

                (g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x, p2mask) = getpairs(model, batch, params)
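                # g1/g2 are the paired sentences; p1/p2 are presumably their negative examples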

                cost = model.train_function(g1x, g2x, p1x, p2x, g1mask, g2mask, p1mask, p2mask)

                if np.isnan(cost) or np.isinf(cost):
                    print 'NaN detected'

                if (utils.checkIfQuarter(uidx, len(kf))):
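                    # every quarter of an epoch: optionally checkpoint and run the full evaluation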
                    if (params.save):
                        counter += 1
                        utils.saveParams(model, params.outfile + str(counter) + '.pickle')
                    if (params.evaluate):
                        evaluate_all(model, words)
                        sys.stdout.flush()

                #undo batch to save RAM
                for i in batch:
                    i[0].representation = None
                    i[1].representation = None
                    i[0].unpopulate_embeddings()
                    i[1].unpopulate_embeddings()

                #print 'Epoch ', (eidx+1), 'Update ', (uidx+1), 'Cost ', cost

            if (params.save):
                counter += 1
                utils.saveParams(model, params.outfile + str(counter) + '.pickle')

            if (params.evaluate):
                evaluate_all(model, words)

            print 'Epoch ', (eidx + 1), 'Cost ', cost

    except KeyboardInterrupt:
        print "Training interupted"

    end_time = time.time()
    print "total time:", (end_time - start_time)
Example #7
def agreement(args, params):
    _, ground_truths, classes = load.load_test(params)
    NUM_REPETITIONS = 10
    gt_all = []
    probs_all = []
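    # repeat the random reviewer sampling to average out selection noise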
    for i in range(NUM_REPETITIONS):
        gt, probs = get_ground_truths_and_human_probs(ground_truths,
                                                      params["num_reviewers"])
        gt_all.append(gt)
        probs_all.append(probs)
    ground_truths = np.concatenate(tuple(gt_all), axis=1)
    probs = np.concatenate(tuple(probs_all), axis=0)
    evaluate.evaluate_all(ground_truths,
                          probs,
                          classes,
                          model_title='Human Agreement')
Example #8
def run_and_evaluate(**suite_params):

    suite = EntailmentSuite(**suite_params)
    try:
        type = eval(suite.cfgparser.get('DEFAULT', 'type'))
    except:
        logging.info(
            "Warning: type of experiment not specified.  Assuming cross-validation."
        )
        type = "cv"
    if type == "heldout":
        suite = EntailmentSuiteHeldOut(**suite_params)
    elif type == "traintest":
        suite = EntailmentSuiteTrainTest(**suite_params)
    elif type == "heldoutstrict":
        suite = EntailmentSuiteHeldOutStrict(**suite_params)
    suite.start()

    experiments = suite.cfgparser.sections()
    for experiment in experiments:
        logging.info("Running experiment: %s", experiment)
        experiment_path = os.path.join(
            eval(suite.cfgparser.get('DEFAULT', 'path')), experiment)
        params = suite.get_params(experiment_path)
        path = os.path.join(params['path'], params['name'])
        rows = evaluate.evaluate_all(path)
        evaluate.write_summary(rows, os.path.join(path, 'analysis.csv'))
Example #10
    def test(self, dataloader):
        epoch = self.best_valid_epoch
        # rebuild the network, optionally wrapped for multi-GPU inference
        if self.data_parallel:
            model = nn.DataParallel(self.custom_net(
                **self.custom_net_args)).to(self.device).eval()
        else:
            model = self.custom_net(**self.custom_net_args).to(
                self.device).eval()
        params_path = os.path.join(self.params_dir, self.descriptor)
        print(
            'For test set predictions, loading model params from params_path=',
            params_path)
        # restore the checkpointed weights and score the whole test set
        check_point = torch.load(params_path)
        model.load_state_dict(check_point['params'])
        with torch.no_grad():
            epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch = self.iterate_through_batches(
                model, dataloader, epoch, training=False)
        self.eval_results_test = evaluate.evaluate_all(self.eval_results_test,
                                                       epoch,
                                                       self.label_meanings,
                                                       gr_truth_epoch,
                                                       pred_epoch)
        self.plot_roc_and_pr_curves('test', epoch, pred_epoch, gr_truth_epoch)
        self.save_all_pred_probs('test', epoch, pred_epoch, gr_truth_epoch,
                                 volume_accs_epoch)
        print("{:5s} {:<3d} {:11s} {:.3f}".format('Epoch', epoch, 'Test Loss',
                                                  epoch_loss))
Example #11
def human_gt_and_probs(params, x_gt, ground_truths, processor, review_indiv=False):
    gt_all = []
    probs_all = []
    for i in TEST_REVIEWS:
        params["epi_ext"] = "_rev" + str(i) + ".episodes.json"
        x_rev, probs, dl = load.load_x_y_with_processor(params, processor)
        gt_i, rev_i = get_matching_indices(x_gt, x_rev)
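        # keep only the episodes present in both the ground-truth set and this reviewer's set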
        gt = ground_truths[:, gt_i]
        probs = probs[rev_i]
        if review_indiv is True:
            evaluate.evaluate_all(
                gt, probs, processor.classes,
                model_title='Human Performance with review ' + str(i))
        gt_all.append(gt)
        probs_all.append(probs)
    ground_truths = np.concatenate(tuple(gt_all), axis=1)
    probs = np.concatenate(tuple(probs_all), axis=0)
    return ground_truths, probs
Example #12
def main():
    input_lang, output_lang, pairs = prepare_data('ques', 'ans', '../data.json', reverse=False)
    encoder = Encoder(input_lang.n_words, MAX_LENGTH).to(device)
    attn_decoder = AttnDecoderRNN(hidden_size, output_lang.n_words, dropout_p=0.1, max_length=MAX_LENGTH).to(device)

    rate = 0.9
    epoch = 10
    # first 90% of pairs for training, last 10% held out for testing
    pairs_train, pairs_test = pairs[0:int(len(pairs) * rate)], pairs[int(len(pairs) * rate):]
    for i in range(epoch):
        encoder.train()
        attn_decoder.train()
        train(encoder, attn_decoder, len(pairs_train), pairs=pairs_train,
              input_lang=input_lang, output_lang=output_lang, print_every=10)
        # evaluate on the held-out pairs after every epoch
        encoder.eval()
        attn_decoder.eval()
        evaluate_all(encoder, attn_decoder, pairs_test, max_length=MAX_LENGTH,
                     input_lang=input_lang, output_lang=output_lang, n=len(pairs_test))
        # checkpoint both halves of the model once per epoch
        torch.save(encoder.state_dict(), 'model/encoder-' + str(i) + '.model')
        torch.save(attn_decoder.state_dict(), 'model/decoder-' + str(i) + '.model')
    # show_plot(loss_history)
    print('done training')
Example #13
    def valid(self, dataloader, epoch):
        model = self.model.eval()
        # score the validation set without tracking gradients
        with torch.no_grad():
            epoch_loss, pred_epoch, gr_truth_epoch, volume_accs_epoch = self.iterate_through_batches(
                model, dataloader, epoch, training=False)
        self.valid_loss[epoch] = epoch_loss
        self.eval_results_valid = evaluate.evaluate_all(
            self.eval_results_valid, epoch, self.label_meanings,
            gr_truth_epoch, pred_epoch)
        self.early_stopping_check(epoch, pred_epoch, gr_truth_epoch,
                                  volume_accs_epoch)
        print("{:5s} {:<3d} {:11s} {:.3f}".format('Epoch', epoch, 'Valid Loss',
                                                  epoch_loss))
Example #14
def run_and_evaluate(**suite_params):
    suite = EntailmentSuite(**suite_params)
    suite.start()

    experiments = suite.cfgparser.sections()
    for experiment in experiments:
        logging.info("Running experiment: %s", experiment)
        experiment_path = os.path.join(eval(suite.cfgparser.get('DEFAULT', 'path')),
                                       experiment)
        params = suite.get_params(experiment_path)
        path = os.path.join(params['path'],
                            params['name'])
        rows = evaluate.evaluate_all(path)
        evaluate.write_summary(rows, os.path.join(path, 'analysis.csv'))
Example #15
def predict():
    """
    Uses the models to make the predictions and displays results on the page
    """
    article = get_article(request.form['article_url'])

    if article is None:
        return "The article URL is invalid/not supported. " + \
            "Try one of the following news sources: {}".format(TRUSTED_SOURCES)

    predictions = evaluate_all(article.text, article.title, BASE_PATH, True)
    sentiment = predictions['sentiment']
    fake = predictions['fake']
    category = predictions['category']
    emotion = predictions['emotion']

    # rescale the sentiment score (apparently on a 0-4 scale) to a percentage
    sentiment = '{:.1f}% positive'.format(float(sentiment) * 100 / 4)

    return render_template('index.html',
                           sentiment=sentiment,
                           fake=fake,
                           emotion=emotion,
                           category=category)
Example #16
def plot_bleu(model_filter=None, token_filter=None, opt_filter=None):

    # Calculate all scores first
    bleu_scores = evaluate_all(model_filter, token_filter, opt_filter)
    print('Finished calculating BLEU scores: %s' %
          "\n".join(bleu_scores.keys()))

    file_out = "plots/bleu"
    if model_filter is not None:
        file_out += '_' + model_filter
    if token_filter is not None:
        file_out += '_' + token_filter
    if opt_filter is not None:
        file_out += '_' + opt_filter

    labels = []
    xs = []
    ys = []

    plt.figure()  # reset the plot
    with open(file_out + '.csv',
              mode='w') as csv_file:  # overwrite any existing file
        csv_writer = csv.writer(csv_file,
                                delimiter=',',
                                quotechar='"',
                                quoting=csv.QUOTE_MINIMAL)
        # For each, lookup the val_loss and plot them
        for model_name, model_class in models.items():
            if model_filter is None or model_filter == model_name:
                for token_id, tokenizer in tokenizers.items():
                    if token_filter is None or token_filter == token_id:
                        for opt_id, optimizer in optimizer_opts.items():
                            if opt_filter is None or opt_filter == opt_id:
                                # save each one
                                label = model_name + '_' + token_id + '_' + opt_id
                                filename = label + '_' + version
                                try:
                                    history = genfromtxt('checkpoints/' +
                                                         filename + '.csv',
                                                         delimiter=',',
                                                         skip_header=1)
                                    bleu = bleu_scores[filename]
                                    val_loss = history[:, 2][-1]
                                    csv_writer.writerow(
                                        [filename, bleu, val_loss])
                                    print("bleu=%s, val_loss=%s" %
                                          (bleu, val_loss))
                                    labels.append(label)
                                    xs.append(bleu)
                                    ys.append(val_loss)
                                    # if isinstance(bleu, numbers.Number):
                                    #   plt.plot(bleu, val_loss, label=filename, markersize=12)
                                    print("Plotted: " + filename)
                                except UserWarning as uw:
                                    # print(uw)
                                    traceback.print_exc()
                                    # No model trained yet
                                    print('No val_loss history: ' + filename)
                                except Exception as e:
                                    # print(e)
                                    traceback.print_exc()
                                    # No model trained yet
                                    print('No model logs for: ' + filename)

    if not labels:
        print("No matching data for filter %s, %s, %s" %
              (model_filter, token_filter, opt_filter))

    else:
        plt.scatter(xs, ys, marker='o')
        for label, x, y in zip(labels, xs, ys):
            plt.annotate(label,
                         xy=(x, y),
                         xytext=(-20, 20),
                         textcoords='offset points',
                         ha='right',
                         va='bottom',
                         bbox=dict(boxstyle='round,pad=0.5',
                                   fc='yellow',
                                   alpha=0.5),
                         arrowprops=dict(arrowstyle='->',
                                         connectionstyle='arc3,rad=0'))

        # summarize history for loss
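        # note: style.use mainly affects elements drawn after this call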
        plt.style.use('seaborn-whitegrid')
        plt.title('BLEU-1 vs val_loss')
        plt.ylabel('val_loss')
        plt.xlabel('BLEU')
        # plt.show()
        plt.subplots_adjust(left=0.2, right=0.9, top=0.9, bottom=0.1)

        plt.savefig(file_out + '.png')
        print("Wrote plot to " + file_out)
Example #17
        for l in [0, 1, 10, 100]:
            params['CLR_c k={} l={}'.format(
                k, l)] = [CLRcRegressor(k, l, constr_id=constr_id), X, y]
            params['CLR_c k={} l={} ens=10'.format(k, l)] = [
                RegressorEnsemble(CLRcRegressor(k, l, constr_id=constr_id)), X,
                y
            ]
            params['kplane k={} l={} w=size'.format(
                k, l)] = [KPlaneRegressor(k, l, weighted='size'), X, y]
            params['kplane k={} l={} w=size ens=10'.format(k, l)] = [
                RegressorEnsemble(KPlaneRegressor(k, l, weighted='size')), X, y
            ]

    # benchmark every configured regressor, writing all metrics to a single CSV
    results = evaluate_all(
        params,
        file_name="results/{}-tmp1.csv".format(args.dataset),
        n_jobs=args.n_jobs,
        gl_parallel=args.global_parallel,
    )
    results = results.sort_values('test_mse_mean')

    add_params = {}
    # assuming ensembles are always best
    algos = [CLRpRegressor, KPlaneRegressor]
    algo_names = ['CLR_p', 'kplane']
    for algo, algo_name in zip(algos, algo_names):
        for idx in results.index:
            if algo_name in idx:
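                # recover k, l and the boolean flags by parsing the row label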
                k = int(idx.split()[1].split('=')[1])
                l = int(idx.split()[2].split('=')[1])
                w = idx.split()[3].split('=')[1] == 'True'
                f = idx.split()[4].split('=')[1] == 'True'
Example #18
import evaluate
if __name__ == '__main__':
    evaluate.evaluate_all(models_dir='models/robotics_klcoeff')
Example #19
    def train(self, data, words, params):

        start_time = time.time()
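        # run one full evaluation before training as a baseline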
        evaluate_all(self, words)

        counter = 0

        try:
            for eidx in xrange(params.epochs):

                kf = self.get_minibatches_idx(len(data),
                                              params.batchsize,
                                              shuffle=True)
                uidx = 0

                for _, train_index in kf:

                    uidx += 1
                    batch = [data[t] for t in train_index]

                    for i in batch:
                        i[0].populate_embeddings(words, True)
                        i[1].populate_embeddings(words, True)

                    (g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x,
                     p2mask) = self.getpairs(batch, params)

                    cost = self.train_function(g1x, g2x, p1x, p2x, g1mask,
                                               g2mask, p1mask, p2mask)

                    if np.isnan(cost) or np.isinf(cost):
                        print 'NaN detected'

                    if utils.check_if_quarter(uidx, len(kf)):
                        if params.save:
                            counter += 1
                            self.save_params(
                                params.outfile + str(counter) + '.pickle',
                                words)
                        if params.evaluate:
                            evaluate_all(self, words)

                    for i in batch:
                        i[0].representation = None
                        i[1].representation = None
                        i[0].unpopulate_embeddings()
                        i[1].unpopulate_embeddings()

                if params.save:
                    counter += 1
                    self.save_params(params.outfile + str(counter) + '.pickle',
                                     words)

                if params.evaluate:
                    evaluate_all(self, words)

                print 'Epoch ', (eidx + 1), 'Cost ', cost

        except KeyboardInterrupt:
            print "Training interupted"

        end_time = time.time()
        print "total time:", (end_time - start_time)
Example #20
    def train(self, data, words, params):

        start_time = time.time()
        evaluate_all(self, words)

        counter = 0
        try:
            for eidx in xrange(params.epochs):

                kf = self.get_minibatches_idx(len(data),
                                              params.batchsize,
                                              shuffle=True)
                uidx = 0
                for _, train_index in kf:

                    uidx += 1
                    batch = [data[t] for t in train_index]

                    for i in batch:
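                        # with probability params.scramble, presumably shuffle each sentence's word order (augmentation)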
                        if params.scramble:
                            n = np.random.binomial(1, params.scramble, 1)[0]
                            if n > 0:
                                self.scramble(i[0], words)
                                self.scramble(i[1], words)
                            else:
                                i[0].populate_embeddings(words)
                                i[1].populate_embeddings(words)
                        else:
                            i[0].populate_embeddings(words)
                            i[1].populate_embeddings(words)

                    (g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x,
                     p2mask) = self.get_pairs(batch, params)
                    cost = self.train_function(g1x, g2x, p1x, p2x, g1mask,
                                               g2mask, p1mask, p2mask)

                    if np.isnan(cost) or np.isinf(cost):
                        print 'NaN detected. Exiting.'
                        sys.exit(0)

                    if (check_quarter(uidx, len(kf))):
                        if (params.save):
                            counter += 1
                            self.save_params(params.outfile + str(counter) +
                                             '.pickle')
                        if (params.evaluate):
                            evaluate_all(self, words)

                    #undo batch to save RAM
                    for i in batch:
                        i[0].representation = None
                        i[1].representation = None
                        i[0].unpopulate_embeddings()
                        i[1].unpopulate_embeddings()

                if (params.save):
                    counter += 1
                    self.save_params(params.outfile + str(counter) + '.pickle')

                if (params.evaluate):
                    evaluate_all(self, words)

                print 'Epoch ', (eidx + 1), 'Cost ', cost

        except KeyboardInterrupt:
            print "Training interupted"

        end_time = time.time()
        print "total time:", (end_time - start_time)
Example #21
    def train(self, data, words, params):

        start_time = time.time()
        evaluate_all(self, words, params)

        old_v = 0
        try:

            for eidx in xrange(params.epochs):

                kf = self.get_minibatches_idx(len(data),
                                              params.batchsize,
                                              shuffle=True)
                lkf = len(kf)
                uidx = 0

                while (len(kf) > 0):
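                    # pack up to mb_batchsize minibatches into one megabatch, presumably widening the pool for negative sampling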

                    megabatch = []
                    idxs = []
                    idx = 0
                    for i in range(params.mb_batchsize):
                        if len(kf) > 0:
                            arr = [data[t] for t in kf[0][1]]
                            curr_idxs = [i + idx for i in range(len(kf[0][1]))]
                            kf.pop(0)
                            megabatch.extend(arr)
                            idxs.append(curr_idxs)
                            idx += len(curr_idxs)
                    uidx += len(idxs)

                    for i in megabatch:
                        if params.wordtype == "words":
                            if params.scramble > 0:
                                n = np.random.binomial(1, params.scramble,
                                                       1)[0]
                                if n > 0:
                                    i[0].populate_embeddings_scramble(words)
                                    i[1].populate_embeddings_scramble(words)
                                else:
                                    i[0].populate_embeddings(words, True)
                                    i[1].populate_embeddings(words, True)
                            else:
                                i[0].populate_embeddings(words, True)
                                i[1].populate_embeddings(words, True)
                        else:
                            i[0].populate_embeddings_ngrams(words, 3, True)
                            i[1].populate_embeddings_ngrams(words, 3, True)

                    (g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x,
                     p2mask) = self.get_pairs(megabatch, params)

                    cost = 0
                    for i in idxs:
                        cost += self.train_function(g1x[i], g2x[i], p1x[i],
                                                    p2x[i], g1mask[i],
                                                    g2mask[i], p1mask[i],
                                                    p2mask[i])

                    cost = cost / len(idxs)

                    if np.isnan(cost) or np.isinf(cost):
                        print 'NaN detected'

                    if utils.check_if_quarter(uidx - len(idxs), uidx, lkf):
                        if params.evaluate:
                            v = evaluate_all(self, words, params)
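                        # checkpoint only on improvement; note v is defined only when params.evaluate is set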
                        if params.save:
                            if v > old_v:
                                old_v = v
                                self.save_params(params.outfile + '.pickle',
                                                 words)

                    for i in megabatch:
                        i[0].representation = None
                        i[1].representation = None
                        i[0].unpopulate_embeddings()
                        i[1].unpopulate_embeddings()

                if params.evaluate:
                    v = evaluate_all(self, words, params)

                if params.save:
                    if v > old_v:
                        old_v = v
                        self.save_params(params.outfile + '.pickle', words)

                print 'Epoch ', (eidx + 1), 'Cost ', cost

        except KeyboardInterrupt:
            print "Training interupted"

        end_time = time.time()
        print "total time:", (end_time - start_time)
Example #22
    def train(self, data, words, params):

        start_time = time.time()
        evaluate_all(self, words, params)

        old_v = 0
        try:

            for eidx in range(params.epochs):

                kf = self.get_minibatches_idx(len(data), params.batchsize, shuffle=True)
                lkf = len(kf)
                uidx = 0
                sentence_samples = []

                while(len(kf) > 0):

                    megabatch = []
                    idxs = []
                    idx = 0
                    for i in range(params.mb_batchsize):
                        if len(kf) > 0:
                            arr = [data[t] for t in kf[0][1]]
                            curr_idxs = [i + idx for i in range(len(kf[0][1]))]
                            kf.pop(0)
                            megabatch.extend(arr)
                            idxs.append(curr_idxs)
                            idx += len(curr_idxs)
                    uidx += len(idxs)

                    for i in megabatch:
                        if params.wordtype == "words":
                            if params.scramble > 0:
                                n = np.random.binomial(1, params.scramble, 1)[0]
                                if n > 0:
                                    i[0].populate_embeddings_scramble(words)
                                    i[1].populate_embeddings_scramble(words)
                                else:
                                    i[0].populate_embeddings(words, True)
                                    i[1].populate_embeddings(words, True)
                            else:
                                i[0].populate_embeddings(words, True)
                                i[1].populate_embeddings(words, True)
                        else:
                            i[0].populate_embeddings_ngrams(words, 3, True)
                            i[1].populate_embeddings_ngrams(words, 3, True)

                    # get_pairs also returns the raw sentence strings, used only by the commented-out sampling code below
                    ((g1x, g1mask, g2x, g2mask, p1x, p1mask, p2x, p2mask),
                     (g1_s, g2_s, p1_s, p2_s)) = self.get_pairs(megabatch, params)
                    cost = 0
                    for i in idxs:
                       # cc1,cc2 = self.cost_each_data(g1x[i], g2x[i], p1x[i], p2x[i], g1mask[i], g2mask[i], p1mask[i], p2mask[i])
                        cost += self.train_function(g1x[i], g2x[i], p1x[i], p2x[i], g1mask[i], g2mask[i], p1mask[i], p2mask[i])
                       # for j in range(len(i)):
                       #     try:
                       #         sentence_samples.append({'orig':g1_s[i[j]],'para':g2_s[i[j]], 'neg_orign':p1_s[i[j]], 'neg_para':p2_s[i[j]], 'orig_cost':str(cc1[j]),'para_cost':str(cc2[j])})
                       #     except IndexError:
                       #         print(j,i[j])
                    cost = cost / len(idxs)

                    if np.isnan(cost) or np.isinf(cost):
                        print('NaN detected')

                    if utils.check_if_quarter(uidx-len(idxs), uidx, lkf):
                        if params.evaluate:
                            v = evaluate_all(self, words, params)
                        if params.save:
                            if v > old_v:
                                old_v = v
                                self.save_params(params.outfile + '.pickle', words)

                    for i in megabatch:
                        i[0].representation = None
                        i[1].representation = None
                        i[0].unpopulate_embeddings()
                        i[1].unpopulate_embeddings()

                if params.evaluate:
                    v = evaluate_all(self, words, params)

                if params.save:
                    if v > old_v:
                        old_v = v
                        self.save_params(params.outfile + '.pickle', words)

                print('Epoch ', (eidx + 1), 'Cost ', cost)
                #with open("../data/sampled_samples/%s_epoch_%d.json" % (params.model, eidx),'w') as f:
                #    json_result = json.dumps(sentence_samples, indent=2)
                #    f.write(json_result)

        except KeyboardInterrupt:
            print("Training interupted")

        end_time = time.time()
        print("total time:", (end_time - start_time))
Example #23
from evaluate import evaluate_all
evaluate_all()
Example #24
    y = abalone_data.iloc[:, 8].as_matrix().astype(np.float)
    constr_id = 10
  elif args.dataset == 'auto-mpg':
    data = pd.read_csv('data/auto-mpg.data', header=None, sep=r'\s+', na_values='?')
    data = data.dropna()
    X = pd.get_dummies(data.iloc[:,1:-1], columns=[7]).as_matrix().astype(np.float)
    y = data[0].as_matrix().astype(np.float)
    constr_id = 5
  else:
    print("Dataset is not supported")
    sys.exit(0)
  X, y = preprocess_data(X, y)

  params = {}

  for k in [2, 8]:
    for ens in range(1, 21):
      params['kplane k={} ens={}'.format(k, ens)] = [
        RegressorEnsemble(KPlaneRegressor(k, 100), n_estimators=ens), X, y, 10, 20]
      params['CLR_p k={} ens={}'.format(k, ens)] = [
        RegressorEnsemble(CLRpRegressor(k, 10, weighted=True), n_estimators=ens), X, y, 10, 20]
      params['CLR_c k={} ens={}'.format(k, ens)] = [
        RegressorEnsemble(CLRcRegressor(k, 10, constr_id=constr_id), n_estimators=ens), X, y, 10, 20]

  # compare ensemble sizes from 1 to 20 for each algorithm
  results = evaluate_all(
    params,
    file_name="results_ens/{}.csv".format(args.dataset),
    n_jobs=args.n_jobs,
    gl_parallel=args.global_parallel,
  )
Example #25
                    for min_samples_leaf in [1, 10, 30, 50]:
                        params['rf md={}, mf={}, mss={}, msl={}'.format(
                            max_depth, max_features, min_samples_split,
                            min_samples_leaf)] = [
                                RandomForestRegressor(
                                    n_estimators=30,
                                    max_depth=max_depth,
                                    max_features=max_features,
                                    min_samples_leaf=min_samples_leaf,
                                    min_samples_split=min_samples_split,
                                    n_jobs=n_jobs), X, y, 3, 1
                            ]

        results = evaluate_all(
            params,
            file_name="results/patient-claims-rf.csv",
            n_jobs=args.n_jobs,
            gl_parallel=args.global_parallel,
        )

    if args.run_clrs:
        print("Run clrs")
        if args.n_jobs == 1:
            for k in [2, 4, 6, 8]:
                for l in [0, 1, 10, 100, 1000, 10000]:
                    tm = time.time()
                    gen_clrs(k, l, X, y, max_iter=5, n_estimators=10)
                    print("k={}, l={}, time={}".format(k, l, time.time() - tm))
                    tm = time.time()
                    gen_clrs(k,
                             l,
                             X,
Example #26
    def predict(self, data, words, params):
        start_time = time.time()
        # "prediction" here just runs the full evaluation suite and reports elapsed time
        evaluate_all(self, words, params)
        end_time = time.time()
        print "total time:", (end_time - start_time)