def _init_loaders(self):
        """Create the training and validation data loaders from ``self.config``.

        Both loaders are produced by ``DatasetManager.get_dataloader`` and
        share ``train_process_config``.  The validation loader is built from
        ``testlist_configs`` and is requested with ``shuffle=False``.
        """
        config = self.config
        self.train_loader = DatasetManager.get_dataloader(
            config.datalist_config.trainlist_config,
            config.train_process_config,
        )
        self.val_loader = DatasetManager.get_dataloader(
            config.datalist_config.testlist_configs,
            config.train_process_config,
            shuffle=False,
        )
    def test_should_get_dataset(self):
        """``get_dataset`` returns the stored entry for a single identifier."""
        expected_entry = {
            "source": "./tests/resources/local_data/train.csv",
            "description": "my little dataset local"
        }
        manager = DatasetManager("./tests/resources/local_data")
        self.assertDictEqual(manager.get_dataset("local_test"),
                             expected_entry)
    def test_should_read_yaml_from_dir(self):
        """``get_datasets`` returns the expected single-entry mapping."""
        manager = DatasetManager("./tests/resources/one_data")
        self.assertDictEqual(
            manager.get_datasets(),
            {
                "one_test": {
                    "source": "http://source/teste",
                    "description": "my little dataset"
                }
            })
    def test_should_print_ascii(self):
        self.maxDiff = None
        result = """+---------------------+------------+-----------------------------------------------------------------------------+
|     description     | identifier |                                    source                                   |
+---------------------+------------+-----------------------------------------------------------------------------+
|  my little dataset  |  one_test  | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv |
| my little dataset 2 |  two_test  | https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv |
+---------------------+------------+-----------------------------------------------------------------------------+"""
        data = DatasetManager("./tests/resources/multiple_data")
        printer = Printer(data.get_datasets())
        self.assertEqual(result, printer.__repr__())
Exemple #5
0
def main():
    """Train on one part of ``ade20k_train`` and predict on a sample of the rest.

    The dataset is split with ``ratio=0.2``; the model is trained on the
    first part, then run on a random subset of 1000 items drawn from the
    second part.  The predictions are not used any further and nothing is
    returned.
    """
    epochs = 10
    ratio = 0.2
    subset_size = 1000

    dataset_manager = DatasetManager()
    model = MaskRCNN(os.path.abspath("workspace"))

    datasetA, datasetB = dataset_manager.split_dataset('ade20k_train',
                                                       ratio=ratio)
    # Use the named setting instead of repeating the literal 10.
    weights = model.train(datasetA, weights=None, epochs=epochs)
    small_datasetB = dataset_manager.random_subset(datasetB, subset_size)
    model.predict(small_datasetB, weights)
Exemple #6
0
 def setUp(self):
     """Load the split lists and metadata files, then build the manager.

     All five files are opened by relative path, i.e. from the current
     working directory.  NOTE: ``pickle.load`` executes arbitrary code, so
     these pickle files must come from a trusted source.
     """
     def load_pickle(path):
         with open(path, 'rb') as handle:
             return pickle.load(handle)

     def load_json(path):
         with open(path) as json_data:
             return json.load(json_data)

     self.training_dict = load_pickle('training_set_list.pickle')
     self.validation_dict = load_pickle('validation_set_list.pickle')
     self.test_dict = load_pickle('test_set_list.pickle')
     self.genres = load_json('genres.json')
     self.dataset = load_json('labels.json')
     self.dataset_manager = DatasetManager(
         self.training_dict, self.validation_dict, self.test_dict,
         self.genres, self.dataset)
     self.batch_size = 50
Exemple #7
0
def evaluate(experiment_name, step=''):
    """Restore a saved model and log its classification metrics on the test set.

    Args:
        experiment_name: Sub-directory of ``<CURRENT_DIR>/../checkpoint`` that
            holds the checkpoints of the model to evaluate.
        step: Step of the checkpoint to restore (file prefix ``step-<step>``).
            With the default ``''`` the latest checkpoint reported by
            ``tf.train.latest_checkpoint`` is used.

    Raises:
        IOError: If no checkpoint can be found for ``experiment_name``.
    """
    logging.info('*' * 50)
    logging.info('RUNNING EVALUATION FOR MODEL: %s', experiment_name)
    checkpoint_dir = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                  experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if interesting_checkpoint is None:
            # latest_checkpoint() returns None when nothing was saved; fail
            # here instead of trying to import the bogus file "None.meta".
            raise IOError('No checkpoint found in {}'.format(checkpoint_dir))
    else:
        interesting_checkpoint = os.path.join(checkpoint_dir,
                                              'step-{}'.format(step))
    dataset_manager = DatasetManager()
    dataset_manager.boot()

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)

        # The tensors are the same for every batch, so look them up once.
        tf_input = gr.get_tensor_by_name('input/tf_input:0')
        tf_predictions = gr.get_tensor_by_name('prediction:0')

        # Using the session itself as the context manager (instead of
        # ``.as_default()``) also closes it when the block exits.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)
            list_predictions = []
            list_labels = []
            for docs, labels in dataset_manager.get_test_by_batch(
                    batch_size=FLAGS.BATCH_SIZE):
                prediction = sess.run(tf_predictions,
                                      feed_dict={tf_input: docs})
                list_predictions.extend(prediction)
                list_labels.extend(labels)
                logging.debug('-- Prediction length: %s/%s',
                              len(list_predictions),
                              dataset_manager.test_y.shape[0])
            logging.info('-- Report for model: %s', experiment_name)
            logging.info(
                classification_report(y_true=list_labels,
                                      y_pred=list_predictions,
                                      digits=4))
            logging.info(
                confusion_matrix(y_true=list_labels, y_pred=list_predictions))
Exemple #8
0
    def get_mat_id(self, mat_id_name='mat_id'):
        """
        Get material ID numbers of the underlying mesh elements.

        The IDs are read from the cell scalars named `mat_id_name` of the
        first output of `self.source`. Returns None when no source is set.
        """
        if self.source is None:
            return None

        manager = DatasetManager(dataset=self.source.outputs[0])
        return manager.cell_scalars[mat_id_name]
    def test_should_create_dataset_with_custom_data(self):
        """``create_dataset`` writes ``<identifier>.yaml`` and it loads back.

        Every filesystem check goes through ``self.os``, the same object that
        is injected into the manager as ``fs``, so the test inspects the
        filesystem the manager actually wrote to.
        """
        data = DatasetManager(self.trash_dir, fs=self.os)
        identifier = "data_name_custom"
        dataset = {
            "identifier": identifier,
            "description": "description",
            "source": "/tmp/test.csv"
        }
        data.create_dataset(**dataset)
        self.assertTrue(
            self.os.isfile("{}/{}.yaml".format(self.trash_dir, identifier)))

        # Two entries are expected: the new file plus, presumably, one
        # pre-existing entry in the trash dir -- TODO confirm against setUp.
        self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)
        loaded_dataset = data.get_datasets()
        self.assertEqual(list(loaded_dataset.keys()), [identifier])

        datasource_configs = loaded_dataset.get(identifier)
        self.assertEqual(datasource_configs["description"],
                         dataset["description"])
        self.assertEqual(datasource_configs["source"], dataset["source"])
    def test_should_read_multiple_yaml_from_dir(self):
        """``get_datasets`` exposes every identifier found in the directory.

        Only the identifiers are compared (sorted, so the check does not
        depend on the order in which the entries were loaded).
        """
        data = DatasetManager("./tests/resources/multiple_data", fs=self.os)
        result = sorted(data.get_datasets().keys())
        self.assertListEqual(["one_test", "two_test"], result)
Exemple #11
0
def main():
    """Train on one part of ``ade20k_train``, then keep refining on the rest.

    After the initial training (10 epochs) the loop repeatedly predicts on a
    random subset of 100 items from the second part, filters the predictions
    with ``annotator.filter``, builds a dataset with those annotations and
    trains for one more epoch.  The loop has no exit condition.
    """
    workspace = os.path.abspath("workspace")

    manager = DatasetManager()
    detector = MaskRCNN(workspace)

    first_part, second_part = manager.split_dataset('ade20k_train', ratio=0.2)

    weights = detector.train(first_part, weights=None, epochs=10)

    while True:
        sample = manager.random_subset(second_part, 100)
        kept = annotator.filter(detector.predict(sample, weights))
        refined = manager.create_dataset_with_new_annotations(
            second_part, kept)

        weights = detector.train(refined, weights, epochs=1)
Exemple #12
0
def predict(list_docs, experiment_name, step='', batch_size=64):
    """Predict labels for raw documents with a saved model.

    Args:
        list_docs: Documents to classify; they are passed through
            ``preprocessor.preprocess`` and vectorised before prediction.
        experiment_name: Sub-directory of ``<CURRENT_DIR>/../checkpoint`` that
            holds the checkpoints of the model to use.
        step: Step of the checkpoint to restore (file prefix ``step-<step>``).
            With the default ``''`` the latest checkpoint is used.
        batch_size: Number of documents fed to the model per ``sess.run``.

    Returns:
        A list with one entry per input document, each prediction mapped
        through ``dataset_manager.LABEL_UNMAPPING``.

    Raises:
        IOError: If no checkpoint can be found for ``experiment_name``.
    """
    logging.info('*' * 50)
    logging.info('RUNNING PREDICT FOR MODEL: %s', experiment_name)
    checkpoint_dir = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                  experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if interesting_checkpoint is None:
            # latest_checkpoint() returns None when nothing was saved; fail
            # here instead of trying to import the bogus file "None.meta".
            raise IOError('No checkpoint found in {}'.format(checkpoint_dir))
    else:
        interesting_checkpoint = os.path.join(checkpoint_dir,
                                              'step-{}'.format(step))
    dataset_manager = DatasetManager()
    dataset_manager.boot()

    list_preprocessed_sentences = preprocessor.preprocess(list_docs)

    list_vecs = dataset_manager.text2vec.doc_to_vec(list_preprocessed_sentences)
    # NOTE(review): debugging output kept as-is; it prints the documents
    # reconstructed from their vectors.
    print(dataset_manager.text2vec.vec_to_doc(list_vecs))
    list_vecs = dataset_manager.equalize_vector_length_to_np(
        list_vectors=list_vecs,
        max_length=model_v6.SENTENCE_LENGTH_MAX)

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s', interesting_checkpoint)
        saver = tf.train.import_meta_graph('{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s', interesting_checkpoint)

        # The tensors are the same for every batch, so look them up once.
        tf_input = gr.get_tensor_by_name('input/tf_input:0')
        tf_predictions = gr.get_tensor_by_name('prediction:0')

        # Using the session itself as the context manager (instead of
        # ``.as_default()``) also closes it when the block exits.
        with tf.Session(config=tf.ConfigProto(allow_soft_placement=True)) as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s', interesting_checkpoint)
            list_predictions = []

            # Ceiling division: a partial last batch gets its own step, but
            # no empty batch is run when the number of documents is an exact
            # multiple of batch_size (or zero).
            num_steps = (len(list_vecs) + batch_size - 1) // batch_size
            logging.info('There will be %s steps', num_steps)
            for i in range(num_steps):
                batch = list_vecs[i * batch_size:(i + 1) * batch_size]
                prediction = sess.run(tf_predictions,
                                      feed_dict={tf_input: batch})
                list_predictions.extend(
                    dataset_manager.LABEL_UNMAPPING[p] for p in prediction)

            return list_predictions
    def test_should_create_dataset(self):
        """``create_dataset`` writes ``<identifier>.yaml`` and it loads back."""
        identifier = "data_name"
        spec = {
            "identifier": identifier,
            "description": "description",
            "source": "/tmp/test.csv",
        }
        manager = DatasetManager(self.trash_dir, fs=self.os)
        manager.create_dataset(**spec)

        loaded = manager.get_datasets()
        stored = loaded.get(identifier)

        yaml_path = "{}/{}.yaml".format(self.trash_dir, identifier)
        self.assertTrue(self.os.isfile(yaml_path))
        self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)

        self.assertEqual(list(loaded.keys())[0], identifier)
        self.assertEqual(stored.get("description"), spec["description"])
        self.assertEqual(stored.get("source"), spec["source"])
    def test_should_print_html(self):
        self.maxDiff = None
        result = """<table>
    <tr>
        <th>description</th>
        <th>identifier</th>
        <th>source</th>
    </tr>
    <tr>
        <td>my little dataset</td>
        <td>one_test</td>
        <td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td>
    </tr>
    <tr>
        <td>my little dataset 2</td>
        <td>two_test</td>
        <td>https://raw.githubusercontent.com/pcsanwald/kaggle-titanic/master/train.csv</td>
    </tr>
</table>"""
        data = DatasetManager("./tests/resources/multiple_data")
        printer = Printer(data.get_datasets())
        self.assertEqual(result, printer._repr_html_())
Exemple #15
0
    def create_source(self):
        """
        Create a VTK source from data in a SfePy-supported file.

        The dataset is filled with the data read for `self.step` (if any)
        and, when `self.mat_id_name` is set, with a cell array of material
        IDs concatenated from `self.mesh.mat_ids`.

        Notes
        -----
        All data need to be set here, otherwise time stepping will not
        work properly - data added by user later will be thrown away on
        time step change.
        """
        if self.io is None:
            self.read_common(self.filename)

        dataset = self.create_dataset()

        try:
            step_data = self.io.read_data(self.step)
        except ValueError:
            # A ValueError from the reader is treated as "no data".
            pass
        else:
            if step_data is not None:
                self.add_data_to_dataset(dataset, step_data)

        if self.mat_id_name is not None:
            ids = nm.concatenate(self.mesh.mat_ids)
            if self.single_color:
                # Collapse every ID above the minimum onto the maximum, so
                # that at most two distinct values remain.
                lowest, highest = ids.min(), ids.max()
                ids[ids > lowest] = highest

            DatasetManager(dataset=dataset).add_array(
                ids, self.mat_id_name, 'cell')

        return VTKDataSource(data=dataset)
Exemple #16
0
    def create_source(self):
        """
        Create a VTK source from data in a SfePy-supported file.

        The dataset is filled with the data read for `self.step` (if any)
        and, when `self.mat_id_name` is set, with a cell array of material
        IDs taken from `self.mesh.cmesh.cell_groups`.

        Notes
        -----
        All data need to be set here, otherwise time stepping will not
        work properly - data added by user later will be thrown away on
        time step change.
        """
        if self.io is None:
            self.read_common(self.filename)

        dataset = self.create_dataset()

        try:
            out = self.io.read_data(self.step)
        except ValueError:
            # A ValueError from the reader is treated as "no data".
            out = None

        if out is not None:
            self.add_data_to_dataset(dataset, out)

        if self.mat_id_name is not None:
            mat_id = self.mesh.cmesh.cell_groups
            if self.single_color:
                # Work on a copy: assigning into `cell_groups` directly
                # would overwrite the group IDs stored in the mesh itself.
                # Assumes `cell_groups` is a NumPy array -- TODO confirm.
                mat_id = mat_id.copy()
                # Collapse every ID above the minimum onto the maximum.
                rm = mat_id.min(), mat_id.max()
                mat_id[mat_id > rm[0]] = rm[1]

            dm = DatasetManager(dataset=dataset)
            dm.add_array(mat_id, self.mat_id_name, 'cell')

        src = VTKDataSource(data=dataset)
        return src
Exemple #17
0
def add_subdomains_surface(obj,
                           position,
                           mat_id_name='mat_id',
                           threshold_limits=(None, None),
                           **kwargs):
    """Add two translucent surfaces coloured by the `mat_id_name` cell scalars.

    The first surface (opacity 0.3) is drawn from a threshold filter whose
    lower/upper limits are `threshold_limits` moved inwards by 0.1 (a limit
    of None leaves that bound unset); the second (opacity 0.2) is drawn from
    the unfiltered data.  Both are placed at `position` and use the 'Blues'
    lookup table.  Extra keyword arguments are accepted but not used.

    Returns the tuple ``(thresholded surface, unfiltered surface)``.
    """
    cell_ids = DatasetManager(dataset=obj.outputs[0]).cell_scalars[mat_id_name]
    id_range = (cell_ids.min(), cell_ids.max())

    # Two separate attribute filters on the same object: one feeds the
    # unfiltered surface, the other feeds the threshold filter.
    plain_attr = mlab.pipeline.set_active_attribute(obj)
    plain_attr.cell_scalars_name = mat_id_name

    filtered_attr = mlab.pipeline.set_active_attribute(obj)
    filtered_attr.cell_scalars_name = mat_id_name

    threshold = mlab.pipeline.threshold(filtered_attr)
    threshold.threshold_filter.progress = 1.0
    if threshold_limits[0] is not None:
        threshold.lower_threshold = threshold_limits[0] + 0.1
    if threshold_limits[1] is not None:
        threshold.upper_threshold = threshold_limits[1] - 0.1

    threshold.auto_reset_lower = False
    threshold.auto_reset_upper = False

    surface = mlab.pipeline.surface(threshold, opacity=0.3)
    surface.actor.actor.position = position

    lut_manager = surface.parent.scalar_lut_manager
    lut_manager.lut_mode = 'Blues'
    # The lookup table is reversed when the ID range spans exactly one.
    if id_range[1] - id_range[0] == 1:
        lut_manager.reverse_lut = True

    surface2 = mlab.pipeline.surface(plain_attr, opacity=0.2)
    surface2.actor.actor.position = position
    surface2.parent.scalar_lut_manager.lut_mode = 'Blues'

    return surface, surface2
 def test_should_remove_dataset(self):
     """``remove_dataset`` deletes the ``<identifier>.yaml`` it created.

     Every filesystem check goes through ``self.os``, the same object that
     is injected into the manager as ``fs``, so the test inspects the
     filesystem the manager actually works on.
     """
     data = DatasetManager(self.trash_dir, fs=self.os)
     identifier = "data_name"
     dataset = {
         "identifier": identifier,
         "description": "description",
         "source": "/tmp/test.csv"
     }
     yaml_path = "{}/{}.yaml".format(self.trash_dir, identifier)

     data.create_dataset(**dataset)
     self.assertTrue(self.os.isfile(yaml_path))
     self.assertEqual(len(self.os.listdir(self.trash_dir)), 2)

     data.remove_dataset(identifier)
     self.assertFalse(self.os.isfile(yaml_path))
     self.assertEqual(len(self.os.listdir(self.trash_dir)), 1)
# Tag that is embedded in the checkpoint and result file names below.
params = "pd_fixed_trainratio80_outcome_all_data_singletask"
#params = "lstmsize%s_dropout%s_shared%s_specialized%s"%(lstmsize, dropout, n_shared_layers, n_specialized_layers)
checkpoint_prefix = os.path.join(
    output_dir, "checkpoints/model_%s_%s" % (dataset_name, params))
# NOTE: glob.glob() returns matches in arbitrary order, so [-1] is not
# guaranteed to be the most recent checkpoint; it raises IndexError when no
# file matches.
model_filename = glob.glob("%s*.hdf5" % checkpoint_prefix)[-1]
#model_filename = "code/output_files/models/model_28-1.51.h5"
results_file = os.path.join(
    output_dir, "evaluation_results/results_%s_%s_%s.csv" %
    (cls_method, dataset_name, params))

##### MAIN PART ######

print('Preparing data...')
start = time.time()

# Read the dataset and split it into train and test parts.
dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
train, test = dataset_manager.split_data(
    data, train_ratio, split=data_split_type
)  # to reproduce results of Tax et al., use 'ordered' instead of 'temporal'

# Encode both parts with the same encoder method.
dt_train = dataset_manager.encode_data_with_label_all_data(train)
dt_test = dataset_manager.encode_data_with_label_all_data(test)
if normalize_over == "train":
    dataset_manager.calculate_divisors(dt_train)
elif normalize_over == "all":
    dt_all = dataset_manager.extract_timestamp_features(data)
    dt_all = dataset_manager.extract_duration_features(dt_all)
    dataset_manager.calculate_divisors(dt_all)
else:
# Output location and settings; `params` is the tag embedded in the result
# file name.
output_dir = "results"
n_estimators = 1000
max_features = 0.5
params = "nestimators%s_maxfeatures%s" % (n_estimators, max_features)

##### MAIN PART ######

# One pass per dataset; `datasets`, `cls_method`, `train_ratio` and
# `sample_size` are defined outside this excerpt.
for dataset_name in datasets:

    results_file = os.path.join(
        output_dir, "evaluation_results/results_%s_%s_%s.csv" %
        (cls_method, dataset_name, params))

    print("Loading data...")
    start = time.time()
    dataset_manager = DatasetManager(dataset_name)
    data = dataset_manager.read_dataset()
    train, test = dataset_manager.split_data(data,
                                             train_ratio,
                                             split="temporal")
    # Only the training part is sub-sampled; the test part is used whole.
    train = dataset_manager.get_train_sample(train, sample_size)
    #train, val = dataset_manager.get_train_val_data(train, sample_size, val_sample_size)
    print("Done: %s" % (time.time() - start))

    print('Encoding data...')
    start = time.time()
    dt_train = dataset_manager.encode_data(train)
    #dt_val = dataset_manager.encode_data(val)
    dt_test = dataset_manager.encode_data(test)
    #X, y = dataset_manager.generate_3d_data(dt_train, max_len)
    #X_val, y_val = dataset_manager.generate_3d_data(dt_val, max_len)
Exemple #21
0
from flask import Flask, send_file
from flask_restful import Resource, Api, reqparse
from flask_cors import CORS
import numpy as np
from data_backend import Dataset as HDF_Dataset
from dataset_manager import DatasetManager
from thumbnailer import Thumbnailer
from utils import merge_overlapping_filters

# Directory handed to DatasetManager and to every HDF_Dataset.
DATASET_PATH = "./datasets"
dataset_manager = DatasetManager(DATASET_PATH)
API_BASE_STR = "/api/v1"

# Init thumbnails (clean directory)
thumbnailer = Thumbnailer("./thumbnails")
thumbnailer.clean()

dataset_list = []
for dset_index, name in enumerate(dataset_manager.get_dataset_names()):
    dset = HDF_Dataset(DATASET_PATH, name)
    dataset_list.append({
        "id": dset_index,
        "name": name,
        "device": {
            "name": dset.device.name,
            "version": dset.device.version
        },
        "subsets": [
            {
                "id": subset_index,
                "name": subset,
print('Preparing data...')
start = time.time()

# Positional command-line arguments: dataset name, embedding type and
# embedding dimension (converted to int).
dataset_name = argv[1]
embedding_type = argv[2]
embedding_dim = int(argv[3])

scale_model = "row"

# Fixed settings for this script.
train_ratio = 0.8
val_ratio = 0.2
activation = "sigmoid"
optimizer = "adam"
nb_epoch = 50

dataset_manager = DatasetManager(dataset_name)
data = dataset_manager.read_dataset()
# The second part of the temporal split is discarded here; the first part
# is then split again at random into train and validation.
train, _ = dataset_manager.split_data_strict(data,
                                             train_ratio,
                                             split="temporal")
train, val = dataset_manager.split_val(train, val_ratio, split="random")

# "none" selects the encoder without embeddings; any other value is passed
# on to the embedding-based encoder.
if embedding_type == "none":
    dt_train = dataset_manager.encode_data_with_label_all_data(train)
    dt_val = dataset_manager.encode_data_with_label_all_data(val)
else:
    dt_train = dataset_manager.encode_data_with_label_all_data_act_res_embedding(
        train,
        embedding_type=embedding_type,
        embedding_dim=embedding_dim,
        scale_model=scale_model)
Exemple #23
0
    lstmsize, lstmsize2, int(dropout * 100), int(
        learning_rate * 100000), nb_epoch, batch_size, sample_size)

##### MAIN PART ######

# One pass per dataset; `datasets`, `output_dir`, `params`, `train_ratio`,
# `sample_size`, `val_sample_size` and `max_len` are defined outside this
# excerpt.
for dataset_name in datasets:

    # Per-dataset output locations, tagged with `params`.
    results_file = os.path.join(
        output_dir,
        "evaluation_results/results_lstm_%s_%s.csv" % (dataset_name, params))
    checkpoint_prefix = os.path.join(
        output_dir, "checkpoints/weights_%s_%s" % (dataset_name, params))

    print("Loading data...")
    start = time.time()
    dataset_manager = DatasetManager(dataset_name)
    data = dataset_manager.read_dataset()
    train, test = dataset_manager.split_data(data,
                                             train_ratio,
                                             split="temporal")
    # The training part is further divided into train and validation data.
    train, val = dataset_manager.get_train_val_data(train, sample_size,
                                                    val_sample_size)
    print("Done: %s" % (time.time() - start))

    print('Encoding data...')
    start = time.time()
    dt_train = dataset_manager.encode_data(train)
    dt_val = dataset_manager.encode_data(val)
    dt_test = dataset_manager.encode_data(test)
    # Only train and validation data are turned into (X, y) pairs here.
    X, y = dataset_manager.generate_3d_data(dt_train, max_len)
    X_val, y_val = dataset_manager.generate_3d_data(dt_val, max_len)
def run(experiment_name):
    """Train a population of models and record their accuracy / l1 scale.

    A population of ``POPULATION_SIZE`` models sharing the same input
    placeholders is built.  Each of the ``POPULATION_STEPS`` rounds:

    1. the last ``WORST_THRES`` models copy from the first model,
    2. every model after the first ``BEST_THRES`` is perturbed,
    3. all models train on ``ITERATIONS`` batches,
    4. all models are evaluated on one further batch and the list is
       re-sorted by accuracy, best first.

    The recorded histories are written as JSON to the file ``temp`` in the
    current working directory.

    Args:
        experiment_name: Not used by this function.
    """
    BEST_THRES = 3
    WORST_THRES = 3
    POPULATION_STEPS = 500
    ITERATIONS = 100
    POPULATION_SIZE = 10
    accuracy_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    l1_scale_hist = np.zeros((POPULATION_SIZE, POPULATION_STEPS))
    best_accuracy_hist = np.zeros((POPULATION_STEPS, ))
    best_l1_scale_hist = np.zeros((POPULATION_STEPS, ))

    with tf.Graph().as_default() as gr:

        with tf.variable_scope('input'):
            tf_input = tf.placeholder(
                dtype=tf.int32,
                shape=[
                    None, model_population_based_tunning.SENTENCE_LENGTH_MAX
                ],
                name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32,
                                       shape=[None],
                                       name='tf_labels')

        # The history arrays are indexed by model_id, so the population must
        # have exactly POPULATION_SIZE members.
        models = [
            create_model(
                i, is_included_regularization=FLAGS.IS_INCLUDED_REGULARIZATION)
            for i in range(POPULATION_SIZE)
        ]
        # It will help us with creation of different scope_name for each model
        for index, model in enumerate(models):
            with tf.variable_scope(str(index)):
                model.boot(tf_input, tf_labels)

        logging.info('Graph size: %s', utils.count_trainable_variables())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        # Using the session itself as the context manager (instead of
        # ``.as_default()``) also closes it when the block exits.
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            dataset_manager = DatasetManager()
            dataset_manager.boot()

            dataset_generator = dataset_manager.get_batch(
                batch_size=FLAGS.BATCH_SIZE,
                number_epochs=10 * FLAGS.NUMBER_EPOCHS)
            for i in range(POPULATION_STEPS):

                # Copy best
                sess.run([
                    m.get_copy_from_op(models[0])
                    for m in models[-WORST_THRES:]
                ])
                # Perturb others
                sess.run([m.l1_scale_perturb_op for m in models[BEST_THRES:]])
                # Training
                for _ in range(ITERATIONS):
                    docs, labels = next(dataset_generator)
                    sess.run([m.tf_optimizer for m in models],
                             feed_dict={
                                 tf_input: docs,
                                 tf_labels: labels
                             })
                docs, labels = next(dataset_generator)
                # Evaluate
                l1_scales = sess.run({m: m.l1_scale for m in models})
                accuracies = sess.run({m: m.tf_acc
                                       for m in models},
                                      feed_dict={
                                          tf_input: docs,
                                          tf_labels: labels
                                      })
                models.sort(key=lambda m: accuracies[m], reverse=True)
                # Logging
                best_accuracy_hist[i] = accuracies[models[0]]
                best_l1_scale_hist[i] = l1_scales[models[0]]
                for m in models:
                    l1_scale_hist[m.model_id, i] = l1_scales[m]
                    accuracy_hist[m.model_id, i] = accuracies[m]
            with open('temp', 'w') as output_f:
                # json cannot serialise NumPy arrays; convert them to nested
                # Python lists first.
                json.dump(
                    {
                        'accuracy_hist': accuracy_hist.tolist(),
                        'l1_scale_hist': l1_scale_hist.tolist(),
                        'best_accuracy_hist': best_accuracy_hist.tolist(),
                        'best_l1_scale_hist': best_l1_scale_hist.tolist()
                    }, output_f)
Exemple #25
0
    def add_data_to_dataset(self, dataset, data):
        """
        Add point and cell data to the dataset.

        Parameters
        ----------
        dataset : object
            The dataset that is wrapped in a `DatasetManager` and receives
            the arrays.
        data : dict
            Maps array names to objects with `mode` ('vertex' or 'cell')
            and `data` attributes. Items with any other mode are ignored.

        Raises
        ------
        ValueError
            If a vertex or cell array has a shape that is not handled here.

        Notes
        -----
        Vertex data must be 2D with 1, 2 or 3 columns; cell data must be
        4D. 2-component vectors are padded with a zero column, and tensors
        are written out with 9 components per cell.
        """
        dim = self.dim
        # (dim + 1) * dim is always even, so the integer division is exact
        # and `sym` stays an int on both Python 2 and Python 3.
        sym = (dim + 1) * dim // 2

        dm = DatasetManager(dataset=dataset)
        for key, val in data.items():
            vd = val.data
            if val.mode == 'vertex':
                if vd.shape[1] == 1:
                    aux = vd.reshape((vd.shape[0],))

                elif vd.shape[1] == 2:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                    aux = nm.c_[vd, zz]

                elif vd.shape[1] == 3:
                    aux = vd

                else:
                    # The shape is wrapped in a 1-tuple: formatting a bare
                    # multi-element tuple with a single %s raises TypeError.
                    raise ValueError('unknown vertex data format! (%s)'
                                     % (vd.shape,))

                dm.add_array(aux, key, 'point')

            elif val.mode == 'cell':
                ne, _, nr, nc = vd.shape
                if (nr == 1) and (nc == 1):
                    aux = vd.reshape((ne,))

                elif (nr == dim) and (nc == 1):
                    if dim == 3:
                        aux = vd.reshape((ne, dim))
                    else:
                        zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                        aux = nm.c_[vd.squeeze(), zz]

                elif (((nr == sym) or (nr == (dim * dim))) and (nc == 1)) \
                         or ((nr == dim) and (nc == dim)):
                    vd = vd.squeeze()

                    if dim == 3:
                        if nr == sym:
                            aux = vd[:,[0,3,4,3,1,5,4,5,2]]
                        elif nr == (dim * dim):
                            aux = vd[:,[0,3,4,6,1,5,7,8,2]]
                        else:
                            aux = vd.reshape((vd.shape[0], dim*dim))
                    else:
                        zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                        if nr == sym:
                            aux = nm.c_[vd[:,[0,2]], zz, vd[:,[2,1]],
                                        zz, zz, zz, zz]
                        elif nr == (dim * dim):
                            aux = nm.c_[vd[:,[0,2]], zz, vd[:,[3,1]],
                                        zz, zz, zz, zz]
                        else:
                            aux = nm.c_[vd[:,0,[0,1]], zz, vd[:,1,[0,1]],
                                        zz, zz, zz, zz]

                else:
                    # Previously an unhandled shape fell through and a bare
                    # shape entry (an int) was added instead of an array.
                    raise ValueError('unknown cell data format! (%s)'
                                     % (vd.shape,))

                dm.add_array(aux, key, 'cell')
Exemple #26
0
def run(experiment_name):
    """Build the training graph, train the model and write summaries/checkpoints.

    Args:
        experiment_name: Used to name the output directories, all relative
            to ``CURRENT_DIR``: ``../summary/train_<name>``,
            ``../summary/test_<name>`` and ``../checkpoint/<name>``.

    While iterating over the training batches, train and test summaries are
    written whenever the global step is a multiple of 10, and a checkpoint
    is saved whenever it is a multiple of 200.
    """
    with tf.Graph().as_default() as gr:
        # Placeholders for the int32 inputs and labels fed on every step.
        with tf.variable_scope('input'):
            tf_input = tf.placeholder(dtype=tf.int32,
                                      shape=[None, model.SENTENCE_LENGTH_MAX],
                                      name='tf_input')
            tf_labels = tf.placeholder(dtype=tf.int32,
                                       shape=[None],
                                       name='tf_labels')

        tf_logits = model.inference(tf_input)
        tf_loss = model.loss(tf_logits, tf_labels)

        tf_optimizer, tf_global_step = model.optimize(tf_loss)
        model.measure_acc(tf_logits, tf_labels)

        # Merges every summary op registered in the graph so far, so it has
        # to come after the model-building calls above.
        tf_all_summary = tf.summary.merge_all()

        tf_train_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'train_' + experiment_name),
                                                graph=gr)
        tf_test_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'summary', 'test_' + experiment_name),
                                               graph=gr)

        tf_embedding_writer = tf.summary.FileWriter(logdir=os.path.join(
            CURRENT_DIR, '..', 'checkpoint', experiment_name))

        # Visual word embedding
        config = projector.ProjectorConfig()
        embedding = config.embeddings.add()
        embedding.tensor_name = 'embedding/word_embeddings'  # Reference model_v6.py
        embedding.metadata_path = os.path.join(CURRENT_DIR, 'data',
                                               DatasetManager.VOCAB_FILE)
        projector.visualize_embeddings(tf_embedding_writer, config)

        # 0.03 hours is just under two minutes.
        saver = tf.train.Saver(max_to_keep=5,
                               keep_checkpoint_every_n_hours=0.03)

        logging.info('Graph size: %s', utils.count_trainable_variables())

        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.GPU)
        # NOTE(review): ``.as_default()`` does not close the session when the
        # block exits -- confirm whether that is intended.
        with tf.Session(config=tf.ConfigProto(
                gpu_options=gpu_options,
                allow_soft_placement=True,
                log_device_placement=FLAGS.LOG_DEVICE_PLACEMENT)).as_default(
                ) as sess:
            sess.run(tf.global_variables_initializer())
            sess.run(tf.local_variables_initializer())

            dataset_manager = DatasetManager()
            dataset_manager.boot()

            for docs, labels in dataset_manager.get_batch(
                    batch_size=FLAGS.BATCH_SIZE,
                    number_epochs=FLAGS.NUMBER_EPOCHS):
                # One optimisation step on the current batch.
                _, global_step = sess.run([tf_optimizer, tf_global_step],
                                          feed_dict={
                                              tf_input: docs,
                                              tf_labels: labels
                                          })
                summary_interval_step = 10
                # Train summary, computed on the batch that was just used.
                if global_step % summary_interval_step == 0:
                    logging.debug('Global step: %s', global_step)
                    train_summary_data = sess.run(tf_all_summary,
                                                  feed_dict={
                                                      tf_input: docs,
                                                      tf_labels: labels
                                                  })
                    tf_train_writer.add_summary(train_summary_data,
                                                global_step=global_step)

                # Test summary, computed on a shuffled sample of the test set.
                if global_step % summary_interval_step == 0:
                    docs_test, labels_test = dataset_manager.get_test_set(
                        FLAGS.TEST_SIZE, is_shuffled=True)
                    test_summary_data = sess.run(tf_all_summary,
                                                 feed_dict={
                                                     tf_input: docs_test,
                                                     tf_labels: labels_test
                                                 })
                    tf_test_writer.add_summary(test_summary_data,
                                               global_step=global_step)

                # Periodic checkpoint; the directory is created on demand.
                if global_step % 200 == 0:
                    path_to_save = os.path.join(CURRENT_DIR, '..',
                                                'checkpoint', experiment_name)
                    if not os.path.exists(path_to_save):
                        os.makedirs(path_to_save)
                    saved_file = saver.save(sess,
                                            save_path=os.path.join(
                                                path_to_save, 'step'),
                                            global_step=global_step,
                                            write_meta_graph=True)
                    logging.debug('Saving model at %s', saved_file)
Exemple #27
0
class DataLoadingTests(unittest.TestCase):
    """Batch-loading and label-encoding checks for ``DatasetManager``.

    The fixtures (three pickled split lists plus ``genres.json`` and
    ``labels.json``) are opened with relative paths, so the tests must be
    run from the directory that contains them.
    """

    def setUp(self):
        """Load the fixture files and build the manager under test."""
        # NOTE: pickle.load can execute arbitrary code; these files are
        # assumed to be trusted, locally generated fixtures.
        with open('training_set_list.pickle', 'rb') as handle:
            self.training_dict = pickle.load(handle)
        with open('validation_set_list.pickle', 'rb') as handle:
            self.validation_dict = pickle.load(handle)
        with open('test_set_list.pickle', 'rb') as handle:
            self.test_dict = pickle.load(handle)
        with open('genres.json') as json_data:
            self.genres = json.load(json_data)
        with open('labels.json') as json_data:
            self.dataset = json.load(json_data)
        self.dataset_manager = DatasetManager(self.training_dict,
                                              self.validation_dict,
                                              self.test_dict, self.genres,
                                              self.dataset)
        self.batch_size = 50

    def _cursor_near_end(self, items):
        """Return a cursor half a batch before the end of ``items``.

        Floor division keeps the cursor an integer; true division would
        yield a float under Python 3.
        """
        return len(items) - self.batch_size // 2

    def _image_shape(self):
        """Expected shape of the image part of one batch."""
        return (self.batch_size, 227, 227, 3)

    def _label_shape(self):
        """Expected shape of the label part of one batch."""
        return (self.batch_size, 26)

    def test_normal_training_image_load(self):
        """A training batch has the expected image shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "train")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_normal_training_labels_load(self):
        """A training batch has the expected label shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "train")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_last_traninig_image_load(self):
        """Images keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_train = self._cursor_near_end(
            self.dataset_manager.training_list)
        images = self.dataset_manager.next_batch(self.batch_size, "train")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_last_traninig_labels_load(self):
        """Labels keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_train = self._cursor_near_end(
            self.dataset_manager.training_list)
        images = self.dataset_manager.next_batch(self.batch_size, "train")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_normal_validation_image_load(self):
        """A validation batch has the expected image shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "val")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_normal_validation_labels_load(self):
        """A validation batch has the expected label shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "val")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_last_validation_image_load(self):
        """Images keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_val = self._cursor_near_end(
            self.dataset_manager.validation_list)
        images = self.dataset_manager.next_batch(self.batch_size, "val")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_last_validation_labels_load(self):
        """Labels keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_val = self._cursor_near_end(
            self.dataset_manager.validation_list)
        images = self.dataset_manager.next_batch(self.batch_size, "val")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_normal_test_image_load(self):
        """A test batch has the expected image shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "test")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_normal_test_labels_load(self):
        """A test batch has the expected label shape."""
        images = self.dataset_manager.next_batch(self.batch_size, "test")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_last_test_image_load(self):
        """Images keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_test = self._cursor_near_end(
            self.dataset_manager.test_list)
        images = self.dataset_manager.next_batch(self.batch_size, "test")
        self.assertEqual(images[0].shape, self._image_shape())

    def test_last_test_labels_load(self):
        """Labels keep their full shape when less than a batch remains."""
        self.dataset_manager.cur_test = self._cursor_near_end(
            self.dataset_manager.test_list)
        images = self.dataset_manager.next_batch(self.batch_size, "test")
        self.assertEqual(images[1].shape, self._label_shape())

    def test_create_label_vector(self):
        """Six genres map to the expected 26-entry 0/1 vector."""
        label_vector = self.dataset_manager.create_label_vector(
            [" Action", " Documentary", " Drama", " Horror", " News", " War"])
        self.assertEqual(label_vector, [
            1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
            0, 0, 1, 0
        ])

    def test_create_label_vector_end(self):
        """Adding " Western" also sets the final entry of the vector."""
        label_vector = self.dataset_manager.create_label_vector([
            " Action", " Documentary", " Drama", " Horror", " News", " War",
            " Western"
        ])
        self.assertEqual(label_vector, [
            1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0,
            0, 0, 1, 1
        ])

    def test_no_duplicate_between_test_and_train(self):
        """The training and test splits share no element."""
        # Relies on the unpickled training split exposing ``intersection``
        # (presumably a set -- TODO confirm against the fixture generator).
        self.assertEqual(self.training_dict.intersection(self.test_dict),
                         set())
    def test_should_remove_unknown_dataset(self):
        """Removing the dataset "unknown_dataset" must raise ``IOError``."""
        manager = DatasetManager("./tests/resources/local_data", fs=self.os)
        # Callable form of assertRaises: equivalent to the context manager.
        self.assertRaises(IOError, manager.remove_dataset, "unknown_dataset")
Exemple #29
0
    def add_data_to_dataset(self, dataset, data):
        """Add point and cell data to the dataset.

        Every value in `data` is converted to a layout accepted by
        ``DatasetManager.add_array`` and stored under its key:

        - 'vertex' mode: 2D arrays with 1, 2 or 3 columns; 2-column data
          get a zero third column appended.
        - 'cell' mode: 4D arrays whose last two axes are ``(nr, nc)``;
          scalars, vectors, symmetric-storage tensors, full-storage
          tensors and ``dim x dim`` matrices are handled, with 2D data
          zero-padded to 3 (vectors) or 9 (tensors) columns.

        Values with any other mode are ignored.

        Parameters
        ----------
        dataset : object
            The dataset handed to ``DatasetManager(dataset=...)``.
        data : dict-like
            Maps array names to objects with `.mode` and `.data`
            attributes.

        Raises
        ------
        ValueError
            If a vertex or cell array has an unsupported shape.
        """
        dim = self.dim
        # Number of independent components of a symmetric dim x dim tensor.
        # (dim + 1) * dim is always even, so floor division is exact and
        # keeps `sym` an integer under Python 3 as well.
        sym = (dim + 1) * dim // 2

        dm = DatasetManager(dataset=dataset)
        for key, val in data.items():
            vd = val.data
            if val.mode == 'vertex':
                if vd.shape[1] == 1:
                    aux = vd.reshape((vd.shape[0],))

                elif vd.shape[1] == 2:
                    zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                    aux = nm.c_[vd, zz]

                elif vd.shape[1] == 3:
                    aux = vd

                else:
                    # Wrap the shape in a 1-tuple: passing the shape tuple
                    # directly to '%' raises TypeError.
                    raise ValueError('unknown vertex data format! (%s)'
                                     % (vd.shape,))

                dm.add_array(aux, key, 'point')

            elif val.mode == 'cell':
                ne, _, nr, nc = vd.shape
                if (nr == 1) and (nc == 1):
                    aux = vd.reshape((ne,))

                elif (nr == dim) and (nc == 1):
                    if dim == 3:
                        aux = vd.reshape((ne, dim))
                    else:
                        zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                        aux = nm.c_[vd.squeeze(), zz]

                elif (((nr == sym) or (nr == (dim * dim))) and (nc == 1)) \
                         or ((nr == dim) and (nc == dim)):
                    vd = vd.squeeze()

                    if dim == 3:
                        if nr == sym:
                            # Expand symmetric storage to 9 components.
                            aux = vd[:,[0,3,4,3,1,5,4,5,2]]
                        elif nr == (dim * dim):
                            # Reorder full storage to 9 components.
                            aux = vd[:,[0,3,4,6,1,5,7,8,2]]
                        else:
                            aux = vd.reshape((vd.shape[0], dim*dim))
                    else:
                        # 2D tensors: pad with zero columns to 9 components.
                        zz = nm.zeros((vd.shape[0], 1), dtype=vd.dtype)
                        if nr == sym:
                            aux = nm.c_[vd[:,[0,2]], zz, vd[:,[2,1]],
                                        zz, zz, zz, zz]
                        elif nr == (dim * dim):
                            aux = nm.c_[vd[:,[0,2]], zz, vd[:,[3,1]],
                                        zz, zz, zz, zz]
                        else:
                            aux = nm.c_[vd[:,0,[0,1]], zz, vd[:,1,[0,1]],
                                        zz, zz, zz, zz]

                else:
                    # Mirror the vertex branch instead of passing a stale
                    # value to add_array().
                    raise ValueError('unknown cell data format! (%s)'
                                     % (vd.shape,))

                dm.add_array(aux, key, 'cell')
def main():
    """Evaluate the restored AlexNet model on the test split and one image.

    Loads the dataset split lists and label metadata from the current
    working directory, restores the checkpoint
    ``saved_models/models/model_dropout05_mean_square_error.ckpt``, prints
    the mean of the per-batch mean average precision over the test split,
    and finally prints the genre scores for
    ``saved_models/images_tests/yellow.jpg`` sorted from highest to lowest.

    Raises:
        IOError: if the sample image cannot be read.
    """
    # Load dataset manager
    # NOTE: pickle.load can execute arbitrary code; these files are assumed
    # to be trusted, locally generated artifacts.
    with open('training_set_list.pickle', 'rb') as handle:
        training_set = pickle.load(handle)
    with open('validation_set_list.pickle', 'rb') as handle:
        validation_set = pickle.load(handle)
    with open('test_set_list.pickle', 'rb') as handle:
        test_set = pickle.load(handle)
    with open('genres.json') as json_data:
        genres = json.load(json_data)
    with open('labels.json') as json_data:
        labels = json.load(json_data)
    dataset_manager = DatasetManager(training_set, validation_set, test_set,
                                     genres, labels)

    batch_size = 1
    n_classes = 26

    # Graph input
    x = tf.placeholder(tf.float32, [batch_size, 227, 227, 3])
    # Never fed during evaluation; kept so the graph is built as before.
    y = tf.placeholder(tf.float32, [None, n_classes])
    keep_var = tf.placeholder(tf.float32)

    # Model
    pred = Model.alexnet(x, keep_var)  # definition of the network architecture

    # Init
    init = tf.global_variables_initializer()

    # Saver used to restore the model checkpoint
    saver = tf.train.Saver()

    # Launch the graph
    with tf.Session() as sess:
        sess.run(init)

        saver.restore(
            sess, "saved_models/models/model_dropout05_mean_square_error.ckpt")
        print('Model Restored')

        # Ceiling division: one batch per full group plus one for a
        # remainder, and no extra batch when the size divides evenly.
        n_test = len(dataset_manager.test_list)
        n_batches = (n_test + batch_size - 1) // batch_size

        test_map_global = 0.
        test_count = 0
        # test accuracy by group of batch_size images
        for _ in range(n_batches):
            batch_tx, batch_ty = dataset_manager.next_batch(batch_size, 'test')
            test_output = sess.run(pred, feed_dict={x: batch_tx, keep_var: 1})
            test_map_global += mean_average_precision(test_output, batch_ty)
            test_count += 1
        if test_count:
            # Skip the division for an empty test split.
            test_map_global /= test_count
        print("Global Test Accuracy = {:.4f}".format(test_map_global))

        # Load one image
        image_path = 'saved_models/images_tests/yellow.jpg'
        img = cv2.imread(image_path)
        if img is None:
            # cv2.imread signals failure by returning None, not by raising.
            raise IOError('Could not read image: {}'.format(image_path))

        img = cv2.resize(img, (227, 227))
        img = img.astype(np.float32)
        # Subtract the per-channel constants used for this model's input.
        img -= np.array([104., 117., 124.])
        print(img)
        test_output = sess.run(pred,
                               feed_dict={
                                   x: np.reshape(img, (1, 227, 227, 3)),
                                   keep_var: 1
                               })
        score_dict = dict(zip(genres, test_output[0]))
        # sorted + reversed (not reverse=True) keeps the original tie order.
        ranked = sorted(score_dict.items(), key=lambda item: item[1])
        print(list(reversed(ranked)))
Exemple #31
0
def predict(list_sentences,
            output_file,
            experiment_name,
            step='',
            list_labels=None):
    """Run a saved model on sentences and write the results to a CSV file.

    The sentences are preprocessed, vectorised and padded to
    ``model_v1.SENTENCE_LENGTH_MAX``, then fed to the graph restored from
    the checkpoint directory ``<CURRENT_DIR>/../checkpoint/<experiment_name>``.

    Args:
        list_sentences: raw sentences to classify.
        output_file: path of the CSV file to write.
        experiment_name: name of the checkpoint sub-directory.
        step: checkpoint step to restore; the empty string (default)
            selects the latest checkpoint.
        list_labels: optional ground-truth labels. When non-empty, a
            classification report is logged and a ``label`` column is
            written.

    Raises:
        IOError: if no checkpoint is found for ``experiment_name`` when
            ``step`` is empty.
    """
    if list_labels is None:
        # Avoid a shared mutable default argument.
        list_labels = []

    dataset_manager = DatasetManager()
    dataset_manager.boot()
    list_preprocessed_sentences = preprocessor.preprocess(list_sentences)
    list_vecs = dataset_manager.text2vec.doc_to_vec(
        list_preprocessed_sentences)
    list_vecs = dataset_manager.equalize_vector_length_to_np(
        list_vectors=list_vecs, max_length=model_v1.SENTENCE_LENGTH_MAX)
    list_labels = dataset_manager.convert_labels_to_np(list_labels)
    has_labels = len(list_labels) != 0

    checkpoint_dir = os.path.join(CURRENT_DIR, '..', 'checkpoint',
                                  experiment_name)
    if step == '':
        interesting_checkpoint = tf.train.latest_checkpoint(checkpoint_dir)
        if interesting_checkpoint is None:
            # latest_checkpoint returns None when nothing is found; fail
            # here rather than on a bogus "None.meta" path below.
            raise IOError(
                'No checkpoint found in {}'.format(checkpoint_dir))
    else:
        interesting_checkpoint = os.path.join(checkpoint_dir,
                                              'step-{}'.format(step))

    with tf.Graph().as_default() as gr:
        logging.info('-- Restoring graph for model: %s',
                     interesting_checkpoint)
        saver = tf.train.import_meta_graph(
            '{}.meta'.format(interesting_checkpoint))
        logging.info('-- Restored graph for model named: %s',
                     interesting_checkpoint)

        # Use the session itself as the context manager: unlike
        # ``as_default()``, it closes the session on exit.
        with tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True)) as sess:
            saver.restore(sess=sess, save_path=interesting_checkpoint)
            logging.info('-- Restored variables for model named: %s',
                         interesting_checkpoint)

            tf_input = gr.get_tensor_by_name('input/tf_input:0')
            tf_predictions = gr.get_tensor_by_name('prediction:0')

            prediction = sess.run(tf_predictions,
                                  feed_dict={tf_input: list_vecs})

            if has_labels:
                logging.info('-- Report for model: %s', experiment_name)
                logging.info(
                    classification_report(y_true=list_labels,
                                          y_pred=prediction))

            result_dict = dict()
            result_dict['sentence'] = list_sentences
            result_dict['pre-processed'] = list_preprocessed_sentences
            result_dict[
                'pre-processed_recover'] = dataset_manager.text2vec.vec_to_doc(
                    list_vecs)
            result_dict['predict'] = prediction

            if has_labels:
                result_dict['label'] = list_labels

            pd.DataFrame(result_dict).to_csv(output_file, index=False)
            logging.debug('Saved result at %s', output_file)
    def test_should_get_dataset_unknown(self):
        """Fetching the dataset "unknown_test" must raise ``IOError``."""
        manager = DatasetManager("./tests/resources/local_data")
        # Callable form of assertRaises: equivalent to the context manager.
        self.assertRaises(IOError, manager.get_dataset, "unknown_test")