Example #1
    def get_paginated_questions():
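        """Return one page of formatted questions, plus categories, as JSON."""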
        try:
            categories = util.get_categories(formatted=True)
            current_category = request.args.get('category')

            questions = util.get_questions(formatted=False)
            page = request.args.get('page', 1, type=int)
            start = QUESTIONS_PER_PAGE * (page - 1)
            end = start + QUESTIONS_PER_PAGE

            questions_on_selected_page = [question.format()
                                          for question in questions[start:end]]

            if not questions_on_selected_page:
                raise ResourceNotFound

            return jsonify({
                "success": True,
                "questions": questions_on_selected_page,
                "total_questions": len(questions),
                "categories": categories,
                "current_category": current_category
                })

        except ResourceNotFound:
            abort(404)

        except Exception:
            abort(500)
Example #2
def results_by_category(limit=None):
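    """Evaluate every parameter setting for each category in worker processes and yield the best result per category."""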

    for category in get_categories()[:limit]:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):
            results.clear()
            jobs = []
            X_train, X_test, y_train, y_test = load_train_test(source, category)

            for param in params:
                p = multiprocessing.Process(target=worker, args=(X_train, y_train,
                                                                 X_test, y_test,
                                                                 param, results))
                jobs.append(p)
                p.start()
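                # Throttle: keep at most 4 worker processes running at once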
                while len(jobs) >= 4:
                    jobs[0].join()
                    jobs = [j for j in jobs if j.is_alive()]

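            # Wait for any remaining worker processes to finish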
            while len(jobs) != 0:
                jobs[0].join()
                jobs = [j for j in jobs if j.is_alive()]

            print(results)
            best_score = max(results.keys())
            yield [category, best_score] + results[best_score]
Example #3
    def __init__(self,
                 x,
                 y,
                 lang='pt',
                 batch_size=32,
                 process_x=lambda x: x,
                 process_y=lambda y: y,
                 separate_val=validation,
                 sampler=None):
        # self.h5 = h5py.File(h5_file, 'r', libver='latest')['test']
        self.lang = lang
        # self.lang_dset = self.h5[lang]
        self.process_x = process_x
        self.process_y = process_y
        self.num_classes = len(util.get_categories())
        self.x, self.y = x, y
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])
        self.separate_val = separate_val
        if self.separate_val:
            # Hold out the pre-computed validation indices for this language
            self.val_indices = np.load(DATA_PATH + 'val_index_' + self.lang +
                                       '.npy')
            self.indices = np.setdiff1d(self.indices, self.val_indices)
        self.sampler = sampler
        if self.sampler is not None:
            # Resample the training indices (e.g. to balance classes); the
            # resampled labels are not needed, so they are discarded
            self.indices, y = self.sampler.fit_resample(
                self.indices.reshape(-1, 1), self.y[self.indices])
            del y
            gc.collect()
            self.indices = self.indices.reshape(-1)
        np.random.shuffle(self.indices)
Example #4
def results_by_category(limit=None):

    for category in get_categories()[:limit]:
        source = os.path.join(TRAINING_CATEGORIES, category)
        if os.path.exists(source):

            X_train, X_test, y_train, y_test = load_train_test(source, category)
            f1s = [category]
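            # Fit each model in ms and record its F1 score on the test split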
            for m in ms:
                m.fit(X_train, y_train)
                y_pred = m.predict(X_test)
                f1s.append(metrics.f1_score(y_test, y_pred))

            yield f1s
Example #5
    def get_categories():
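        """Return all available categories as JSON; abort with 404 if none exist."""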
        try:
            categories = util.get_categories(formatted=True)

            if not categories:
                raise ResourceNotFound

            return jsonify({
                "success": True,
                "categories": categories,
                "total_categories": len(categories)
                })

        except ResourceNotFound:
            abort(404)

        except Exception:
            abort(500)
Example #6
from keras.layers import Concatenate, Input, Dropout, SpatialDropout1D
from keras.models import Model
from attention import SeqSelfAttention
from util import CyclicLR
from sklearn.model_selection import StratifiedKFold
import keras.backend as K

NAME = "bi_lstm_gru_selfatt_kfold"

PARAMS = {
    'sequence_len': padding_len,
    'embedding_dim': 200,
    'epochs': 3,
    'batch_size': 256,
    'loss': 'categorical_crossentropy',
    'num_classes': len(util.get_categories()),
    'class_weights': None,
    'sampler': None,
    'k-folds': 4
}

PATH = DATA_PATH + 'models/' + NAME + '/'

DEPENDENCIES = {
    'categorical_recall': keras_metrics.categorical_recall(),
    'balanced_recall': util.balanced_recall,
    'SeqSelfAttention': SeqSelfAttention,
    'f1': util.f1
}

def load_model(path, extras={}):
Example #7
def load_images_from_files(filenames):
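    """Read each image with OpenCV and return the filenames, their categories, and the image arrays."""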
    images = []
    for path in filenames:
        images.append(cv2.imread(path, cv2.IMREAD_UNCHANGED))
    assert len(filenames) == len(images)
    return filenames, get_categories(filenames), images
Example #8
def train_classifier(feature_name, train_batch_num, base_npz_dir,
                     test_batches):
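    """Train a classifier over batched feature files and return the mean test-batch accuracy."""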
    test_acc = []
    base_path = util.get_base_path()
    categories = util.get_categories()
    train_batches = range(0, train_batch_num)
    #test_batches = range(train_batch_num,train_batch_num+1) JC edit
    set_name = 'setb50k'
    label_set_name = set_name
    subset = ''  #'_pca1'
    classifier_paramstring = ''
    if do_norm: classifier_paramstring += 'N'
    if props['C'] != 0:
        classifier_paramstring += 'C%d' % props['C']
    out_fn = os.path.join(
        base_npz_dir, feature_name, '%s%s_%s%s_%d-%d.pickle' %
        (classifier_type, classifier_paramstring, set_name, subset,
         train_batches[0], train_batches[-1]))
    if do_norm:
        out_fn_norm = os.path.join(
            base_npz_dir, feature_name,
            'norm_%s%s_%d.pickle' % (set_name, subset, train_batches[0]))
    print('Training %s...' % out_fn)

    if classifier_type == 'sgd_svm':
        is_incremental = True
    else:
        is_incremental = False

    norm = dict()
    clf = None

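    # Process the training batches first, then the test batches, one feature file per batch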
    for i_batch, train_batch in enumerate(list(train_batches) + list(test_batches)):
        fn = os.path.join(base_npz_dir, feature_name,
                          '%s_%05d%s.npz' % (set_name, train_batch, subset))
        print('Processing feature file %s.' % fn)
        print(fn)
        with np.load(fn) as file_contents:

            data = file_contents['data']

        true_labels, _ = util.load_labels(label_set_name, train_batch)

        if do_norm:
            if i_batch == 0:
                # Initial batch to determine mean and variance for normalization
                norm['mean'] = np.expand_dims(data.mean(axis=0), 0)
                norm['std'] = np.expand_dims(data.std(axis=0), 0)
                norm['std'] = np.maximum(norm['std'], 0.01)
                with open(out_fn_norm, 'wb') as fid:
                    pickle.dump(norm, fid)

            data -= norm['mean']
            data /= norm['std']
            print('Data after normalization: Mean %f, Std %f' % (data.mean(
                axis=0).mean(axis=0), data.std(axis=0).mean(axis=0)))

        if is_incremental:
            # Incremental: Do training every training iteration
            # Do testing not just on test but also during training before feeding the new training data
            do_train = (i_batch < len(train_batches))
            do_test = (i_batch > 0)
            use_data = data
            use_true_labels = true_labels
        else:
            # Non-incremental: Train once when all training batches have been collected
            do_train = (i_batch == len(train_batches) - 1)
            do_test = (i_batch >= len(train_batches))
            # data collection phase
            if not do_test:
                if i_batch == 0:
                    data_all = data
                    all_true_labels = true_labels
                else:
                    data_all = np.concatenate((data_all, data), axis=0)
                    all_true_labels = np.concatenate(
                        (all_true_labels, true_labels), axis=0)
            use_data = data_all
            use_true_labels = all_true_labels

        print('  use data %s.' % str(use_data.shape))
        print('  use labels %s' % str(use_true_labels.shape))

        if do_test:
            # After some batch training has been done, predict performance
            pred_labels = clf.predict(data)
            acc = float(sum(pred_labels == true_labels)) / true_labels.size
            test_acc.append(acc)
            print('  Batch accuracy: %.1f%%' % (acc * 100))

        if do_train:
            if classifier_type == 'sgd_svm':
                clf = train_sgd(clf, 'hinge', use_data, use_true_labels)
            elif classifier_type == 'svm':
                clf = train_svm(clf, use_data, use_true_labels, props)
            pred_labels = clf.predict(use_data)
            acc = float(
                sum(pred_labels == use_true_labels)) / use_true_labels.size
            print('  Train accuracy: %.1f%%' % (acc * 100))
            # Dump classifier data at every iteration
            with open(out_fn, 'wb') as fid:
                pickle.dump(clf, fid)
    return np.mean(test_acc)
Example #9
def categories():
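    """Return the categories as JSON when the client requests JSON; otherwise return a 404 page."""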
    if should_return_json():
        return jsonify(dict(ok=True, data=get_categories()))
    else:
        return page_not_found()
Example #10
import matplotlib.pyplot as plt
import numpy as np
import util as ut

# Get the rating information
ratings_array, rating_info = ut.get_categories()

# Generate histogram of all ratings in the dataset
plt.hist(ratings_array, bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of All Ratings')
plt.xlabel('Ratings')
plt.savefig('histograms/41.png')
plt.show()

# Generate histogram of ratings of 10 best movies
plt.hist(rating_info[0], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of 10 Best Movies')
plt.xlabel('Ratings')
plt.savefig('histograms/43.png')
plt.show()

# Generate histogram of ratings of 10 most popular movies
plt.hist(rating_info[1], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of 10 Most Popular Movies')
plt.xlabel('Ratings')
plt.savefig('histograms/42.png')
plt.show()

# Generate histogram of ratings of Animation movies
plt.hist(rating_info[2], bins=np.arange(0.5, 6.5, 1))
plt.title('Histogram of Ratings of Animation Movies')
Example #11
        'text_cnn_att': 1
    }
    # weights normalized
    model_weights = {k: (v / 45) for k, v in model_weights_int.items()}
    model_list = list(model_weights_int.keys())

    # weights for each epoch, according to the number of epochs trained
    weigths_epoch = {
        1: [1],
        2: [0.35, 0.65],
        3: [0.15, 0.35, 0.5],
        4: [0.1, 0.2, 0.3, 0.4],
        5: [0.1, 0.15, 0.2, 0.25, 0.3]
    }

    num_classes = len(util.get_categories())

    # Load test data for each language
    data = {}
    for lang in ['es', 'pt']:
        X_test = util.get_X_test(data_type='keras_tokenized_tri',
                                 lang=lang,
                                 file_type="dump")
        index = np.load(DATA_PATH + 'test_index_' + lang + '.npy')
        data[lang] = {'index': index, 'X_test': process_x(X_test)}
        del X_test, index
        gc.collect()

    paths = {}
    for model_name in model_list:
        PATH = DATA_PATH + 'models/' + model_name + '/'