Example #1
def pred_evaluation(prepare_data, data, iterator):
    """
    Compute recall@20 and mrr@20
    prepare_data: usual prepare_data for that dataset.
    """
    recall = 0.0
    mrr = 0.0
    evaluation_point_count = 0
    # pred_res = []
    # att = []

    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index])
        preds = sess.run(output_probs,
                         feed_dict={input_items: x,
                                    target_items: y,
                                    input_item_mask: mask,
                                    keep_prob_1: 1.0,
                                    keep_prob_2: 1.0})
        # weights = f_weight(x, mask)
        targets = y
        ranks = (preds.T > np.diag(preds.T[targets])).sum(axis=0) + 1
        rank_ok = (ranks <= 20)

        # pred_res += list(rank_ok)
        recall += rank_ok.sum()
        mrr += (1.0 / ranks[rank_ok]).sum()
        evaluation_point_count += len(ranks)
        # att.append(weights)
    recall = numpy_floatX(recall) / evaluation_point_count
    mrr = numpy_floatX(mrr) / evaluation_point_count
    eval_score = (recall, mrr)
    return eval_score
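Note: the one-line rank computation above is a vectorized trick worth unpacking: preds has shape (n_samples, n_items), so preds.T[targets] is (n_samples, n_samples), its diagonal holds each sample's score for its own target item, and the broadcast comparison counts how many items outscore that target. A minimal NumPy sketch on toy data (all names here are illustrative):

import numpy as np

preds = np.array([[0.1, 0.7, 0.2],   # sample 0: item 1 scores highest
                  [0.5, 0.3, 0.2]])  # sample 1: item 0 scores highest
targets = np.array([1, 2])           # true next items

# Diagonal of preds.T[targets] = each sample's score for its target item.
target_scores = np.diag(preds.T[targets])           # [0.7, 0.2]
ranks = (preds.T > target_scores).sum(axis=0) + 1   # [1, 3]
print(ranks)  # target is ranked 1st for sample 0 and 3rd for sample 1

numpy_floatX, as in the Theano LSTM tutorial this code follows, simply casts to the configured float dtype (numpy.asarray(data, dtype=config.floatX)).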
Example #2
def predict(_args, lex_test, idxs_test, f_classify, groundtruth_test, batchsize=1, graph=False, dep=None, weighted=False, print_prediction=False, prediction_file=None):
    ''' Predict labels for the test set using f_classify and compare
    them against the ground truth.

    Returns a dictionary 'results' that contains
    f1 : F1 score or accuracy
    p : precision
    r : recall
    '''
    predictions_test = []
    if print_prediction:
        assert prediction_file is not None
        pred_file = open(prediction_file, 'w')
    if batchsize > 1:
        nb_idxs = get_minibatches_idx(len(lex_test), batchsize, shuffle=False)
        for i, tr_idxs in enumerate(nb_idxs):
            words = [lex_test[ii] for ii in tr_idxs]
            eidxs = [idxs_test[ii] for ii in tr_idxs]
            #labels = [groundtruth_test[ii] for ii in tr_idxs]
            orig_eidxs = eidxs
            if graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, masks, eidxs = prepare_data(words, eidxs, masks, maxlen=200)
            if weighted or not graph:
                pred_all = f_classify(x, masks, *eidxs)
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
            else:
                pred_all = f_classify(x, masks.sum(axis=-1), *eidxs)
                predictions_test.extend(list(numpy.argmax(pred_all, axis=1)))
            if print_prediction:
                for idx, p in zip(tr_idxs, pred_all): 
                    pred_file.write(str(idx) + '\t' + str(p[1]) + '\n')
    else:
        for i, (word, idxs) in enumerate(zip(lex_test, idxs_test)):
            idxs = conv_idxs(idxs, len(word))
            if graph:
                assert dep is not None
                if weighted:
                    predictions_test.append(f_classify(word, dep[i], *idxs))  #.sum(axis=-1) 
                else:
                    predictions_test.append(f_classify(word, dep[i].sum(axis=-1), *idxs))  #.sum(axis=-1) 
            else:
                predictions_test.append(f_classify(word, *idxs))
    print('in predict,', len(predictions_test), len(groundtruth_test))
    if print_prediction:
        pred_file.close()
    #results = eval_logitReg_F1(predictions_test, groundtruth_test) 
    results = eval_logitReg_accuracy(predictions_test, groundtruth_test)
    return results, predictions_test
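Note: eval_logitReg_accuracy (and the commented-out eval_logitReg_F1) are defined elsewhere in this codebase. A minimal stand-in consistent with how predict calls it and with the f1/p/r keys promised in the docstring (the exact return shape is an assumption):

def eval_logitReg_accuracy(predictions, groundtruth):
    # Hypothetical sketch: plain accuracy over two equal-length label
    # lists, reported under the keys the docstring above describes.
    correct = sum(int(p == g) for p, g in zip(predictions, groundtruth))
    acc = float(correct) / max(len(groundtruth), 1)
    return {'f1': acc, 'p': acc, 'r': acc}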
Example #3
def construct_feeddict(self,
                       batch_data,
                       batch_label,
                       keepprob,
                       state,
                       starting=False):
    # Pad the batch, then feed inputs, mask, targets, lengths, dropout
    # keep probability, and the initial RNN state into the placeholders.
    x, mask, y, lengths = data_process.prepare_data(batch_data, batch_label)
    feed = {
        self.x_input: x,
        self.mask_x: mask,
        self.y_target: y,
        self.len_x: lengths,
        self.keep_prob: keepprob,
        self.state: state,
        self.starting: starting
    }
    return feed
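Note: data_process.prepare_data is not shown here. A minimal batch-major sketch consistent with the four values unpacked above (padding scheme and dtypes are assumptions; other examples in this list use a time-major variant instead):

import numpy as np

def prepare_data(seqs, labels):
    # Pad variable-length sessions into a dense (n_samples, maxlen)
    # batch plus a 0/1 mask, and keep the true lengths.
    lengths = [len(s) for s in seqs]
    maxlen = max(lengths)
    x = np.zeros((len(seqs), maxlen), dtype='int64')
    mask = np.zeros((len(seqs), maxlen), dtype='float32')
    for i, s in enumerate(seqs):
        x[i, :lengths[i]] = s
        mask[i, :lengths[i]] = 1.0
    return x, mask, np.asarray(labels), np.asarray(lengths)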
Example #4
def pred_evaluation(f_pred_prob,
                    prepare_data,
                    data,
                    iterator,
                    item_freqs,
                    k=20):
    """
    Compute recall@20 and mrr@20
    f_pred_prob: Theano fct computing the prediction
    prepare_data: usual prepare_data for that dataset.
    """
    recall = 0.0
    mrr = 0.0
    evaluation_point_count = 0
    preds_freqs = []
    preds_items = []
    for _, valid_index in iterator:
        x, mask, y = prepare_data([data[0][t] for t in valid_index],
                                  np.array(data[1])[valid_index])
        preds = f_pred_prob(x, mask)
        #print(preds.shape)
        targets = y
        ranks = (preds.T > np.diag(preds.T[targets])).sum(axis=0) + 1
        rank_ok = (ranks <= k)
        recall += rank_ok.sum()
        mrr += (1.0 / ranks[rank_ok]).sum()
        evaluation_point_count += len(ranks)
        for i in range(preds.shape[0]):
            series = pd.Series(data=preds[i])
            s = series.nlargest(k).index.values
            for r in s:
                preds_items.append(r)
                preds_freqs.append(item_freqs[r])

    recall = numpy_floatX(recall) / evaluation_point_count
    mrr = numpy_floatX(mrr) / evaluation_point_count
    coverage = len(set(preds_items)) / len(item_freqs)
    popularity = np.mean(preds_freqs) / max(item_freqs.values())
    eval_score = (recall, mrr, coverage, popularity)
    return eval_score
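Note: besides recall and MRR, this variant reports catalog coverage (the share of distinct items that ever appear in a top-k list) and a normalized popularity bias. A toy run of those two formulas, assuming item_freqs maps item_id -> interaction count:

item_freqs = {0: 50, 1: 30, 2: 15, 3: 5}
preds_items = [1, 0, 1, 2]      # every item that appeared in a top-k list
preds_freqs = [30, 50, 30, 15]  # their catalog frequencies

coverage = len(set(preds_items)) / len(item_freqs)   # 3/4 = 0.75
popularity = (sum(preds_freqs) / len(preds_freqs)) / max(item_freqs.values())
print(coverage, popularity)  # 0.75 0.625; lower popularity means less bias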
Example #5
        # Get new shuffled index for the training set.
        kf = get_minibatches_idx(len(train[0]), batch_size, shuffle=True)
        kf_valid = get_minibatches_idx(len(valid[0]), valid_batch_size)
        kf_test = get_minibatches_idx(len(test[0]), valid_batch_size)

        for _, train_index in kf:
            uidx += 1

            # Select the random examples for this minibatch
            y = [train[1][t] for t in train_index]
            x = [train[0][t] for t in train_index]

            # Get the data in numpy.ndarray format
            # This swaps the axes, returning arrays of shape
            # (maxlen, n_samples).
            x, mask, y = prepare_data(x, y)
            n_samples += x.shape[1]  # samples are on axis 1 after the swap

            _, loss = sess.run([optimizer, ce_loss],
                               feed_dict={input_items: x,
                                          target_items: y,
                                          input_item_mask: mask,
                                          keep_prob_1: 0.75,
                                          keep_prob_2: 0.5})
            epoch_loss.append(loss)
            if np.isnan(loss) or np.isinf(loss):
                print('bad loss detected: ', loss)

            if np.mod(uidx, dispFreq) == 0:
                print('Epoch ', eidx, 'Update ', uidx, 'Loss ', np.mean(epoch_loss))

        if is_valid:
            valid_evaluation = pred_evaluation(prepare_data, valid, kf_valid)
            test_evaluation = pred_evaluation(prepare_data, test, kf_test)
            history_errs.append([valid_evaluation, test_evaluation])
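Note: get_minibatches_idx yields (batch_number, index_array) pairs, which is why every loop above unpacks it as for _, train_index in kf. A sketch matching the Theano LSTM tutorial version this code follows:

import numpy as np

def get_minibatches_idx(n, minibatch_size, shuffle=False):
    # Split range(n) into consecutive chunks, optionally shuffling
    # first; a trailing short batch is kept rather than dropped.
    idx_list = np.arange(n, dtype="int32")
    if shuffle:
        np.random.shuffle(idx_list)
    minibatches = [idx_list[s:s + minibatch_size]
                   for s in range(0, n, minibatch_size)]
    return list(zip(range(len(minibatches)), minibatches))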
Example #6
def train_single(train_lex,
                 train_idxs,
                 train_y,
                 _args,
                 f_cost,
                 f_update,
                 epoch_id,
                 learning_rate,
                 nsentences,
                 batchsize=1,
                 dep=None,
                 weighted=False):
    ''' This function is called from the main method and is primarily
    responsible for updating the parameters. Because create_relation_circuit
    builds f_cost, f_update, etc. dynamically, this function needs to stay
    flexible and cannot be moved into a library.
    Look at lstm_dependency_parsing_simplification.py for more pointers.
    '''

    # Non-batched version
    def train_instance(words, idxs, sample_weights, label, learning_rate,
                       f_cost, f_update):
        'Called only for its side effects; unlikely to be useful elsewhere.'
        if words.shape[0] < 2:
            return 0.0

        # need to change here, add sample weights
        inputs = idxs + [words, sample_weights, label]
        iter_cost = f_cost(*inputs)  #words, id1, id2, labels)
        f_update(learning_rate)
        return iter_cost

    # Mini-batch version
    def train_batch(words, masks, idxs, sample_weights, label, learning_rate,
                    f_cost, f_update):
        if words.shape[0] < 2:
            return 0.0

        # need to change here, add sample weights
        inputs = idxs + [words, masks, sample_weights, label]
        iter_cost = f_cost(*inputs)  #words, id1, id2, labels)
        f_update(learning_rate)
        return iter_cost

    ## main body of train

    # generate the weights according to the train label distribution
    total_pos = 0
    total_neg = 0
    for y in train_y:
        if y[0] == 0 and y[1] == 1:
            total_pos += 1
        else:
            total_neg += 1

    print("total pos: %d neg:%d \n" % (total_pos, total_neg))

    sample_weights = [0] * (total_neg + total_pos)
    for idx, y in enumerate(train_y):
        if y[0] == 0 and y[1] == 1:
            sample_weights[idx] = 0.5 * (total_neg + total_pos) / (total_pos)
        else:
            sample_weights[idx] = 0.5 * (total_neg + total_pos) / (total_neg)

    if dep:
        shuffle([train_lex, train_idxs, train_y, sample_weights, dep],
                _args.seed)
    else:
        shuffle([train_lex, train_idxs, train_y, sample_weights], _args.seed)

    if nsentences < len(train_lex):
        train_lex = train_lex[:nsentences]
        train_idxs = train_idxs[:nsentences]
        train_y = train_y[:nsentences]
        sample_weights = sample_weights[:nsentences]

    tic = time.time()
    aggregate_cost = 0.0
    temp_cost_arr = [0.0] * 2

    # Decide whether to use mini-batches or not.
    # No mini-batch
    if batchsize == 1:
        for i, (words, idxs, label, weight) in enumerate(
                zip(train_lex, train_idxs, train_y, sample_weights)):
            if len(words) < 2:
                continue
            #assert len(words) == len(labels) #+ 2
            idxs = conv_idxs(idxs, len(words))
            if _args.graph:
                assert dep is not None
                if weighted:
                    aggregate_cost += train_batch(words, dep[i], idxs, weight,
                                                  label, learning_rate, f_cost,
                                                  f_update)
                else:
                    aggregate_cost += train_batch(words, dep[i].sum(axis=-1),
                                                  idxs, weight, label,
                                                  learning_rate, f_cost,
                                                  f_update)
            else:
                aggregate_cost += train_instance(words, idxs, weight, label,
                                                 learning_rate, f_cost,
                                                 f_update)
            if _args.verbose == 2 and i % 10 == 0:
                print('[learning] epoch %i >> %2.2f%%' %
                      (epoch_id, (i + 1) * 100. / nsentences), end=' ')
                print('completed in %.2f (sec). << avg loss: %.2f <<\r' %
                      (time.time() - tic, aggregate_cost / (i + 1)), end='')
                sys.stdout.flush()
    # Mini-batch
    else:
        nb_idxs = get_minibatches_idx(len(train_lex), batchsize, shuffle=False)
        nbatches = len(nb_idxs)
        for i, tr_idxs in enumerate(nb_idxs):
            words = [train_lex[ii] for ii in tr_idxs]
            eidxs = [train_idxs[ii] for ii in tr_idxs]
            labels = [train_y[ii] for ii in tr_idxs]
            weights = [sample_weights[ii] for ii in tr_idxs]

            orig_eidxs = eidxs
            if _args.graph:
                assert dep is not None
                masks = [dep[ii] for ii in tr_idxs]
            else:
                masks = None
            x, masks, eidxs, weight = prepare_data(words,
                                                   eidxs,
                                                   masks,
                                                   weights,
                                                   maxlen=200)

            #print 'mask shape:', masks.shape
            if weighted or dep is None:
                iter_cost = train_batch(x, masks, eidxs, weight, labels,
                                        learning_rate, f_cost, f_update)
                aggregate_cost += iter_cost  #[0]
            else:
                aggregate_cost += train_batch(x, masks.sum(axis=-1), eidxs,
                                              weight, labels, learning_rate,
                                              f_cost, f_update)
            if _args.verbose == 2:
                print('[learning] epoch %i >> %2.2f%%' %
                      (epoch_id, (i + 1) * 100. / nbatches), end=' ')
                print('completed in %.2f (sec). << avg loss: %.2f <<\r' %
                      (time.time() - tic, aggregate_cost / (i + 1)), end='')
                sys.stdout.flush()
    if _args.verbose == 2:
        print('\n>> Epoch completed in %.2f (sec) <<' %
              (time.time() - tic), 'training cost: %.2f' % aggregate_cost)
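Note: the 0.5 * N / N_class weighting above balances the two classes exactly, since each class then contributes half of the total sample weight. A quick numeric check:

total_pos, total_neg = 10, 90
n = total_pos + total_neg
w_pos = 0.5 * n / total_pos   # 5.0
w_neg = 0.5 * n / total_neg   # 0.555...
assert abs(total_pos * w_pos - total_neg * w_neg) < 1e-9  # both sum to 50.0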
Example #7
def train_gru(args):
    #patience=100, saveto='gru_model.npz',
    #is_valid=True, is_save=False, reload_model=None, test_size=-1
    exps = pd.read_csv('exp.csv')
    for i, row in exps.iterrows():
        gc.collect()
        args['expname'] = row['name']
        args['sessionid'] = row['SessionID']
        args['itemid'] = row['ItemID']
        args['data_folder'] = row['path']
        args['valid_data'] = row['test']
        args['train_data'] = row['train']
        args['freq'] = row['freq']

        print('Train:', args['train_data'], ' -- Test:', args['valid_data'],
              ' -- Freq:', args['freq'])
        with open("LOGGER_" + args['expname'] + ".txt", "a") as myfile:
            myfile.write(row['train'] + ", " + row['test'] + "\n")
        # split patterns to train_patterns and test_patterns
        print('Start Data Preprocessing: Training Set')
        train, itemsIDs, freqs, old_new = load_sequence(
            args['data_folder'] + '/' + args['train_data'],
            args['itemid'],
            args['sessionid'],
            itemsIDs=[])
        args['n_items'] = len(itemsIDs) + 1
        freqs[0] = 0
        print('Start Data Preprocessing: Testing Set')
        valid, _, _, _ = load_sequence(args['data_folder'] + '/' +
                                       args['valid_data'],
                                       args['itemid'],
                                       args['sessionid'],
                                       Train=False,
                                       itemsIDs=itemsIDs,
                                       freq=args['freq'],
                                       old_new=old_new)
        print("%d train examples." % len(train[0]))
        print("%d valid examples." % len(valid[0]))
        # Model options
        params = init_params(args)
        tparams = init_tparams(params)
        (use_noise, x, mask, y, f_pred_prob, cost) = build_model(tparams, args)
        all_params = list(tparams.values())
        updates = adam(cost, all_params, args['lr'])
        train_function = theano.function(inputs=[x, mask, y],
                                         outputs=cost,
                                         updates=updates)
        uidx = 0  # the number of update done
        cPid = os.getpid()
        command_memory = "python memoryLogger.py " + str(
            cPid) + " " + args['expname'] + "train"
        #memory_task = subprocess.Popen(command_memory, stdout=subprocess.PIPE, shell=True)
        try:
            t1 = time.time()
            for eidx in range(args['epoch']):
                n_samples = 0
                epoch_loss = []
                # Get new shuffled index for the training set.
                kf = get_minibatches_idx(len(train[0]),
                                         int(args['batch_size']),
                                         shuffle=True)
                for _, train_index in kf:
                    uidx += 1
                    use_noise.set_value(1.)
                    # Select the random examples for this minibatch
                    y = [train[1][t] for t in train_index]
                    x = [train[0][t] for t in train_index]
                    x, mask, y = prepare_data(x, y)
                    n_samples += x.shape[1]
                    loss = train_function(x, mask, y)
                    epoch_loss.append(loss)
                    if np.isnan(loss) or np.isinf(loss):
                        print('bad loss detected: ', loss)
                        return 1., 1., 1.
                print('Epoch ', eidx, 'Loss ', np.mean(epoch_loss))

                # Report intermediate result to the tuner
                nni.report_intermediate_result(np.mean(epoch_loss))
                logger.debug('test loss %g', np.mean(epoch_loss))
                logger.debug('Pipe send intermediate result done')

                use_noise.set_value(0.)
                if eidx % 3 == 0:
                    kf_valid = get_minibatches_idx(len(valid[0]),
                                                   int(args['batch_size']))
                    valid_eval = pred_evaluation(f_pred_prob, prepare_data,
                                                 valid, kf_valid, freqs)
                    print('Valid Recall@20:', valid_eval[0], '\nValid Mrr@20:',
                          valid_eval[1])

            # Report intermediate result to the tuner
            nni.report_final_result(np.mean(epoch_loss))
            logger.debug('Final loss is %g', np.mean(epoch_loss))
            logger.debug('Send final result done')

        except KeyboardInterrupt:
            print("Training interupted")
        #memory_task.kill()
        train_time = time.time() - t1
        use_noise.set_value(0.)
        t1 = time.time()
        Ks = [1, 3, 5, 10, 30]
        hit = [0, 0, 0, 0, 0]
        MRR = [0, 0, 0, 0, 0]
        cov = [0, 0, 0, 0, 0]
        pop = [0, 0, 0, 0, 0]
        command_memory = "python memoryLogger.py " + str(
            cPid) + " " + args['expname'] + "test"
        #memory_task = subprocess.Popen(command_memory, stdout=subprocess.PIPE, shell=True)
        for k in range(len(Ks)):
            kf_valid = get_minibatches_idx(len(valid[0]),
                                           int(args['batch_size']))
            results = pred_evaluation(f_pred_prob, prepare_data, valid,
                                      kf_valid, freqs, Ks[k])
            hit[k] = results[0]
            MRR[k] = results[1]
            cov[k] = results[2]
            pop[k] = results[3]
        test_time = time.time() - t1
        #memory_task.kill()
        print('==================================================')
        print('Recall:', hit, '\nMRR:', MRR, '\nCoverage:', cov,
              '\nPopularity:', pop)
        print('\ntrain_time:', train_time, '\nTest time:', test_time / len(Ks))
        print('End Model Predictions')

        # Print experiment to the logger
        print('===================================================')
        print("LOGGER_" + args['expname'])
        print(','.join(str(v) for v in hit + MRR))
        print("\nCOV:" + ','.join(str(v) for v in cov))
        print("\nPOP:" + ','.join(str(v) for v in pop))
        print("\nTrainTime:" + str(train_time))
        print("\nTestTime:" + str(test_time))

        with open("LOGGER_" + args['expname'] + ".txt", "a") as myfile:
            myfile.write(
                str(hit[0]) + ',' + str(hit[1]) + ',' + str(hit[2]) + ',' +
                str(hit[3]) + ',' + str(hit[4]) + ',' + str(MRR[0]) + ',' +
                str(MRR[1]) + ',' + str(MRR[2]) + ',' + str(MRR[3]) + ',' +
                str(MRR[4]))
            myfile.write("\nCOV:" + str(cov[0]) + ',' + str(cov[1]) + ',' +
                         str(cov[2]) + ',' + str(cov[3]) + ',' + str(cov[4]))
            myfile.write("\nPOP:" + str(pop[0]) + ',' + str(pop[1]) + ',' +
                         str(pop[2]) + ',' + str(pop[3]) + ',' + str(pop[4]))
            myfile.write("\nTrainTime:" + str(train_time))
            myfile.write("\nTestTime:" + str(test_time))
            myfile.write("\n############################################\n")