Ejemplo n.º 1
0
    def partition_mapobjects(self, mapobject_type_name, n):
        '''Splits mapobjects into partitions of size `n`.

        Parameters
        ----------
        mapobject_type_name: str
            name of the selected
            :class:`MapobjectType <tmlib.models.mapobject.MapobjectType>`
        n: int
            number of mapobjects per partition

        Returns
        -------
        List[List[int]]
            mapobject IDs

        Note
        ----
        Mapobjects are ordered by ID.
        '''
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            # Resolve the type name to its database ID first.
            type_row = (
                session.query(tm.MapobjectType.id)
                .filter_by(name=mapobject_type_name)
                .one()
            )
            # Deterministic ordering by ID makes partitions reproducible.
            id_rows = (
                session.query(tm.Mapobject.id)
                .filter_by(mapobject_type_id=type_row.id)
                .order_by(tm.Mapobject.id)
                .all()
            )
            ids = [row.id for row in id_rows]
            return create_partitions(ids, n)
Ejemplo n.º 2
0
    def partition_mapobjects(self, mapobject_type_name, n):
        '''Splits mapobjects into partitions of size `n`.

        Parameters
        ----------
        mapobject_type_name: str
            name of the selected
            :class:`MapobjectType <tmlib.models.mapobject.MapobjectType>`
        n: int
            number of mapobjects per partition

        Returns
        -------
        List[List[int]]
            mapobject IDs

        Note
        ----
        Mapobjects are ordered by ID.
        '''
        with tm.utils.ExperimentSession(self.experiment_id) as session:
            # Look up the mapobject type by its unique name.
            mapobject_type = session.query(tm.MapobjectType.id) \
                .filter_by(name=mapobject_type_name) \
                .one()
            # Build the ID query ordered by ID so partitioning is stable,
            # then collect the scalar IDs by iterating the query directly.
            query = session.query(tm.Mapobject.id) \
                .filter_by(mapobject_type_id=mapobject_type.id) \
                .order_by(tm.Mapobject.id)
            all_ids = [record.id for record in query]
            return create_partitions(all_ids, n)
Ejemplo n.º 3
0
def parallelize_query(func, args):
    '''Parallelize database query. This can be useful for targeting different
    shards of a distributed table. The number of parallel connections depends
    on the value of :attr:`POOL_SIZE <tmlib.models.utils.POOL_SIZE>`.

    Parameters
    ----------
    func: function
        a function that establishes a database connection and executes a given
        SQL query; function must return a list
    args: Union[list, generator]
        arguments that should be parsed to the function

    Returns
    -------
    list
        aggregated output of `function` return values

    Warning
    -------
    Don't use this function for distributed processing on the cluster, since
    this would establish too many simultaneous database connections.
    '''
    logger.debug('execute query in %d parallel threads', POOL_SIZE)
    # Materialize `args` so generators (allowed per the docstring) can be
    # measured with len() and safely partitioned.
    args = list(args)
    if not args:
        return []
    # Ceiling integer division: plain `/` yields a float under Python 3 and
    # would produce a batch size of 0 whenever len(args) < POOL_SIZE.
    n = -(-len(args) // POOL_SIZE)
    arg_batches = create_partitions(args, n)

    # Results are written by index so output order matches input order
    # regardless of thread completion order.
    output = [None] * len(arg_batches)

    def wrapper(func, args, index):
        output[index] = func(args)

    threads = []
    for i, batch in enumerate(arg_batches):
        logger.debug('start thread #%d', i)
        # TODO: use queue or generator?
        t = Thread(target=wrapper, args=(func, batch, i))
        # TODO: use Event
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    return list(chain(*output))
Ejemplo n.º 4
0
 def _delete_cascade(cls, connection, mapobject_ids):
     '''Deletes the given mapobject records from the "mapobjects" table in
     batches, using the provided database `connection`.
     '''
     logger.debug('delete mapobjects')
     # NOTE: Using ANY with an ARRAY is more performant than using IN.
     # TODO: Ideally we would like to join with mapobject_types.
     # However, at the moment there seems to be no way to DELETE entries
     # from a distributed table with a complex WHERE clause.
     # If the number of objects is too large this will lead to issues.
     # Therefore, we delete rows in batches.
     # This will DELETE all records of referenced tables as well.
     # FIXME: How to cast to correct BigInteger type in $$ escaped query?
     sql = '''
          DELETE FROM mapobjects
          WHERE id = ANY(%(mapobject_ids)s)
     '''
     # Delete in chunks of 100k IDs to keep each statement manageable.
     for id_batch in create_partitions(mapobject_ids, 100000):
         connection.execute(
             _compile_distributed_query(sql), {'mapobject_ids': id_batch}
         )
Ejemplo n.º 5
0
def parallelize_query(func, args):
    '''Parallelize database query. This can be useful for targeting different
    shards of a distributed table. The number of parallel connections depends
    on the value of :attr:`POOL_SIZE <tmlib.models.utils.POOL_SIZE>`.

    Parameters
    ----------
    func: function
        a function that establishes a database connection and executes a given
        SQL query; function must return a list
    args: Union[list, generator]
        arguments that should be parsed to the function

    Returns
    -------
    list
        aggregated output of `function` return values

    Warning
    -------
    Don't use this function for distributed processing on the cluster, since
    this would establish too many simultaneous database connections.
    '''
    logger.debug('execute query in %d parallel threads', POOL_SIZE)
    # Materialize `args` so generators (allowed per the docstring) can be
    # measured with len() and safely partitioned.
    args = list(args)
    if not args:
        return []
    # Ceiling integer division: plain `/` yields a float under Python 3 and
    # would produce a batch size of 0 whenever len(args) < POOL_SIZE.
    n = -(-len(args) // POOL_SIZE)
    arg_batches = create_partitions(args, n)

    # Results are written by index so output order matches input order
    # regardless of thread completion order.
    output = [None] * len(arg_batches)

    def wrapper(func, args, index):
        output[index] = func(args)

    threads = []
    for i, batch in enumerate(arg_batches):
        logger.debug('start thread #%d', i)
        # TODO: use queue or generator?
        t = Thread(target=wrapper, args=(func, batch, i))
        # TODO: use Event
        t.start()
        threads.append(t)

    for t in threads:
        t.join()

    return list(chain(*output))
Ejemplo n.º 6
0
 def _delete_cascade(cls, connection, mapobject_ids):
     '''Deletes the given mapobject records from the "mapobjects" table in
     batches, using the provided database `connection`.
     '''
     logger.debug('delete mapobjects')
     # NOTE: Using ANY with an ARRAY is more performant than using IN.
     # TODO: Ideally we would like to join with mapobject_types.
     # However, at the moment there seems to be no way to DELETE entries
     # from a distributed table with a complex WHERE clause.
     # If the number of objects is too large this will lead to issues.
     # Therefore, we delete rows in batches.
     # Batch size of 100k IDs keeps each DELETE statement manageable.
     mapobject_id_partitions = create_partitions(mapobject_ids, 100000)
     # This will DELETE all records of referenced tables as well.
     # FIXME: How to cast to correct BigInteger type in $$ escaped query?
     sql = '''
          DELETE FROM mapobjects
          WHERE id = ANY(%(mapobject_ids)s)
     '''
     # One parameterized DELETE per batch of IDs.
     for mids in mapobject_id_partitions:
         connection.execute(
             _compile_distributed_query(sql), {'mapobject_ids': mids}
         )
Ejemplo n.º 7
0
 def _create_batches(li, n):
     '''Splits `li` into partitions of size `n` by delegating to
     :func:`utils.create_partitions`.
     '''
     batches = utils.create_partitions(li, n)
     return batches
Ejemplo n.º 8
0
 def _create_batches(li, n):
     '''Partitions the sequence `li` into batches of size `n`; thin wrapper
     around :func:`utils.create_partitions`.
     '''
     return utils.create_partitions(
         li, n
     )