Example #1
def _getaveragedocsize(model, customclient=None):
    # Estimate the average document size (in bytes) in the model's sample collection.
    if customclient is None:
        client = mongoclient()
    else:
        client = customclient
    try:
        stats = client['IWLearn'].command('collstats', model.sampletype.__name__ + 's')
        # Return twice the reported average object size, or fall back to 100000 bytes if the stat is missing.
        return stats['avgObjSize'] * 2.0 if 'avgObjSize' in stats else 100000
    finally:
        # Close the client only if it was created here, not one supplied by the caller.
        if customclient is None:
            client.close()
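For context, a minimal sketch of how this helper is used to size batches (Example #4 below does essentially this); maxRAM and model are placeholders here, not part of the example:

maxRAM = 2 * 1024 ** 3                                 # RAM budget in bytes (example value)
average_doc_size = DataSet._getaveragedocsize(model)   # 'model' is a placeholder model instance
batch_size = int(maxRAM / average_doc_size)            # number of samples fitting into the budget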
Example #2
def get(model_id):
    """
    :param model_id: id of the model document in the IWLearn Models collection
    :return: the model object unpickled from the file referenced by that document
    """
    collection = mongoclient()['IWLearn']['Models']

    doc = collection.find_one(
        filter={'_id': ObjectId(model_id)},
        projection={'_id': 0, 'filepath': 1})

    with open(doc['filepath'], 'rb') as f:
        logging.info('load model from %s', doc['filepath'])
        return cPickle.load(f)
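A minimal usage sketch, assuming a model was stored earlier so that a matching document exists in the Models collection; the id string is a placeholder:

model_id = '5c3e8c2f9b1e8a0012345678'   # placeholder ObjectId string
model = get(model_id)                   # unpickles the model from the file path stored in MongoDB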
Example #3
def __init__(self):
    self.rule = RelocationRule()
    # Handle to the Predictions collection of the Tutorial database.
    self.collection = mongo.mongoclient()['Tutorial']['Predictions']
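A hedged sketch of how such a collection handle might be used afterwards; the field names and variables are illustrative assumptions, not part of the example:

prediction = {'sample_id': sample_id, 'score': score}   # hypothetical fields
self.collection.insert_one(prediction)                  # persist into Tutorial.Predictions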
Example #4
    def generate(experiment_name,
                 model,
                 maxRAM=None,
                 numclasses=None,
                 part_size=None,
                 customclient=None,
                 **kwargs):
        """
        Generate a new dataset, or extend an existing one, by loading samples from MongoDB and caching them on disk.
        The disk cache is split sample-wise into parts, each part containing at most part_size samples. Inside each
        part, one file per feature is saved in .npy format (numpy.save).

        The separation into parts allows both generating datasets larger than RAM and reading from such datasets
        during training.

        Saving features into separate files allows features to be easily added to or removed from existing
        datasets.

        :param experiment_name: The name of the subdirectory containing the cached dataset.
        :param model: The model this dataset is created for. The model defines both the features to be extracted
        from the samples and the shape of the input matrix. The model also defines the type of samples to load in
        case a query (filter or pipeline) is passed via kwargs.
        :param maxRAM: Maximum RAM to use for generating the dataset. If you don't pass batch_size in kwargs, the
        average sample size is calculated and maxRAM is used to determine the maximal batch_size fitting into that
        limit. If maxRAM is not passed, 50% of the physical RAM of this machine is used.
        :param numclasses: In case the dataset is for a classifier, pass the number of classes. This information
        cannot be retrieved from the model when model.output_shape == (1,).
        :param part_size: The number of samples contained in each part of the dataset written separately to disk.
        Usually you don't need to set it, as the part size is chosen automatically. If your samples are extremely
        small or extremely large, though, you may need to tweak this parameter. Parts below 4 MiB in size tend to
        work best.
        :param customclient: Optionally, a MongoDB client to use (for example, a mock client for tests).
        :param **kwargs: Arguments passed to the MongoDB find (or aggregate) method used to load the samples,
        e.g. filter, batch_size or projection.
        :return: a new DataSet instance
        """

        if customclient is None:
            client = mongoclient()
        else:
            client = customclient

        try:
            coll = client['IWLearn'][model.sampletype.__name__ + 's']

            if 'batch_size' not in kwargs:
                if maxRAM is None:
                    # Default to half of the physical RAM of this machine.
                    maxRAM = int(0.5 * os.sysconf('SC_PAGE_SIZE') *
                                 os.sysconf('SC_PHYS_PAGES'))

                # Derive the batch size from the RAM budget and the average sample document size.
                average_doc_size = DataSet._getaveragedocsize(model, client)
                batch_size = int(maxRAM / average_doc_size)
            else:
                batch_size = kwargs['batch_size']

            if 'filter' in kwargs:
                # Plain query: forward kwargs to find(), injecting the computed batch_size.
                if 'batch_size' not in kwargs:
                    kwargs['batch_size'] = batch_size
                cursor = coll.find(**kwargs)
            elif 'pipeline' in kwargs:
                # Aggregation: the batch size is passed via the cursor option of aggregate().
                if 'cursor' not in kwargs:
                    kwargs['cursor'] = {'batchSize': batch_size}

                cursor = coll.aggregate(**kwargs)
            else:
                raise Exception('provide either a filter or a pipeline in kwargs')

            logging.info('Determined batch_size is %d', batch_size)

            if DataSet._generateImpl(
                    experiment_name, model,
                    lambda: model.sampletype.fromjson(cursor.next()),
                    part_size, numclasses) == 0:
                raise Exception('Cannot generate set: no samples')
        except Exception as e:
            logging.error(e)
        finally:
            # Close the client only if it was created here, not one supplied by the caller.
            if customclient is None and client is not None:
                client.close()
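A minimal usage sketch of the method above, assuming generate is exposed as a static method of DataSet (as the DataSet._getaveragedocsize call suggests), with a hypothetical MyModel class and a 'label' field in the sample documents; these names are placeholders, not part of the example:

model = MyModel()                            # placeholder model defining sampletype and input shape
dataset = DataSet.generate(
    'my_experiment',                         # subdirectory for the on-disk cache
    model,
    maxRAM=2 * 1024 ** 3,                    # cap dataset generation at roughly 2 GiB
    numclasses=2,                            # e.g. a binary classifier
    filter={'label': {'$exists': True}},     # forwarded to the MongoDB find() call
    projection={'features': 1, 'label': 1})  # forwarded to find() as well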