Example #1
    def setUp(self):
        overrides['server'] = 'disco://localhost'
        overrides['dump'] = False
        overrides['nest'] = False
        self.settings = Settings()
        self.ddfs = self.settings['ddfs']
        self.table = ensure_tables()
Example #2
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        fields=[
                            '=$token', '%url', '+%site_id', '@cpm_millis',
                            '+#ad_id', '+$date', '+@time'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          fields=[
                              '=$token', '+@1isActive', '+%site_id', '@amount',
                              '+#account_id', '+%city', '+%2state', '+#2metro',
                              '$ip', '*keyword', '+$date'
                          ],
                          partition='date',
                          force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, phile='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, phile='fixtures/pixel.json')
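
The compact fields= specification above packs index and type information into prefix characters. A hedged reading, inferred only by comparing this snippet with Example #5 (which creates the same IMPS table in the readable columns= form): '=' marks a wide index, '+' an index, '$' a string, '%' a trie, '@' a uint and '#' an int. Under that assumption, the IMPS definition above corresponds to:

# Equivalent create() call using the readable column syntax (mapping inferred from
# Example #5; not an authoritative reference for the encoded field prefixes)
imps = Table.create(IMPS,
                    columns=['wide index string token', 'trie url',
                             'index trie site_id', 'uint cpm_millis',
                             'index int ad_id', 'index string date',
                             'index uint time'],
                    partition='date',
                    force=True)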
Example #3
def stat(where, limit=16, **kwargs):
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    # print job_blobs
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # scale everything to an integer between 0 and 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
Example #4
def delete(table_or_expr, **kwargs):
    """
    Delete data and partitions for a given table, keep the table definition.

    :type table_or_expr: :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param table_or_expr: A table object or an expression with only a partition column

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    .. warning::
        Given a table object, all partitions will be deleted. Use a Hustle expression to delete
        a specific range of partitions, e.g. 'impression.date < 2014-01-01'.
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    if not isinstance(table_or_expr, (Expr, Table)):
        raise ValueError("The first argument must be a table or an expression.")

    if isinstance(table_or_expr, Expr) and not table_or_expr.is_partition:
        raise ValueError(
            "Column in the expression must be a partition column.")

    tags = _get_tags(table_or_expr, ddfs)
    for tag in tags:
        ddfs.delete(tag)
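
A minimal usage sketch, taken directly from the docstring's warning; the impression table and its date partition column are illustrative:

# Delete only the partitions before 2014-01-01, keeping the table definition
delete(impression.date < '2014-01-01')

# Delete all partitions (and their data) for the table
delete(impression)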
Example #5
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        columns=[
                            'wide index string token', 'trie url',
                            'index trie site_id', 'uint cpm_millis',
                            'index int ad_id', 'index string date',
                            'index uint time', 'bit click',
                            'index bit impression', 'bit conversion'
                        ],
                        partition='date',
                        force=True)
    pixels = Table.create(PIXELS,
                          columns=[
                              'wide index string token', 'index bit isActive',
                              'index trie site_id', 'uint amount',
                              'index int account_id', 'index trie city',
                              'index trie16 state', 'index int16 metro',
                              'string ip', 'lz4 keyword', 'index string date'
                          ],
                          partition='date',
                          force=True)
    pixel_hlls = Table.create(PIXELS_HLL,
                              columns=[
                                  'index bit isActive', 'index trie site_id',
                                  'index int account_id', 'index trie city',
                                  'index trie16 state', 'index string date',
                                  'binary hll'
                              ],
                              partition='date',
                              force=True)
    ips = Table.create(IPS,
                       columns=['index trie16 exchange_id', 'index uint32 ip'],
                       force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)

    tags = ddfs.list("hustle:%s:" % PIXELS)
    if len(tags) == 0:
        # insert the files
        insert(pixels, File='fixtures/pixel.json')

    tags = ddfs.list("hustle:%s:" % IPS)
    if len(tags) == 0:
        # insert the files
        insert(ips, File='fixtures/ip.json')

    tags = ddfs.list("hustle:%s:" % PIXELS_HLL)
    if len(tags) == 0:
        # insert the files
        insert_hll(pixel_hlls, file='./fixtures/pixel.json', hll_field='token')
Example #6
    def create(cls, name, columns=(), fields=(), partition=None, force=False, **kwargs):
        """
        Create a new :class:`Table <hustle.Table>`, replace existing table if force=True.

        :type  name: string
        :param name: the name of the table to create

        :type  columns: sequence of string
        :param columns: the list of *columns* and their extended index/type information

        :type  fields: sequence of string
        :param fields: the list of *columns* and their encoded index/type information

        :type  partition: string
        :param partition: the name of the column to act as the partition for this table

        :type  force: bool
        :param force: overwrite the existing DDFS base tag with this schema

        If *columns* is set, the *fields* parameter is ignored.

        Example::

            pixels = Table.create('pixels',
                  columns=['index string token', 'index uint8 isActive', 'index site_id', 'uint32 amount',
                           'index int32 account_id', 'index city', 'index trie16 state', 'index int16 metro',
                           'string ip', 'lz4 keyword', 'index string date'],
                  partition='date',
                  force=True)

        .. warning::
            This function will not delete or update existing data in any way.  If you use :code:`force=True` to change
            the schema, make sure you either make the change backward compatible (by only adding new columns) or
            delete and reload your data.

        .. seealso::
            For a good example of creating a partitioned Hustle database see :ref:`integrationtests`.
            For detailed schema design docs, look no further than :ref:`schemadesign`.
        """
        from hustle.core.settings import Settings
        settings = Settings(**kwargs)
        ddfs = settings['ddfs']

        if ddfs.exists(cls.base_tag(name)):
            print "Table already exists..."
            if force:
                print "   Overwriting schema..."
            else:
                return None

        if len(columns):
            fields = cls.parse_column_specs(columns)

        ddfs.setattr(cls.base_tag(name), '_fields_', ujson.dumps(fields))
        ddfs.setattr(cls.base_tag(name), '_partition_', ujson.dumps(partition))
        return cls(name=name, fields=fields, partition=partition)
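
The schema written here can be read back from the base tag's DDFS attributes, exactly as Example #8 does in from_tag(). A small sketch, assuming a default Settings() and the 'pixels' table from the docstring example:

# Sketch: inspect the schema that create() stored on the DDFS base tag
import ujson
from hustle import Table
from hustle.core.settings import Settings

ddfs = Settings()['ddfs']
fields = ujson.loads(ddfs.getattr(Table.base_tag('pixels'), '_fields_'))
partition = ujson.loads(ddfs.getattr(Table.base_tag('pixels'), '_partition_'))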
Example #7
def _create_job(*project, **kwargs):
    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    ddfs = settings['ddfs']
    partition = settings.get('partition', 0)
    if partition < 0:
        partition = 0
    nest = settings.get('nest', False)

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print "  Invalid query:\n    %s" % e
        return None

    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     nest=nest)
    return job, job_blobs, name
Example #8
    def from_tag(cls, name, **kwargs):
        """
        Instantiate a named :class:`Table <hustle.Table>` based on meta data from a *DDFS* tag.

        :type  name: string
        :param name: the name of the table
        """
        from hustle.core.settings import Settings
        settings = Settings(**kwargs)
        ddfs = settings['ddfs']

        partition = ujson.loads(ddfs.getattr(cls.base_tag(name), '_partition_'))
        fields = ujson.loads(ddfs.getattr(cls.base_tag(name), '_fields_'))
        return cls(name=name, fields=fields, partition=partition)
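
A short usage sketch; the table name 'impressions' is hypothetical, and select() with its where/limit keywords is the one shown in the other examples:

# Re-instantiate an existing table from its DDFS tag and query it
imps = Table.from_tag('impressions')
select(imps.ad_id, where=imps, limit=10)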
Example #9
    def tag(self, **kwargs):
        from hustle.core.settings import Settings
        if not self.tagged:
            settings = Settings(**kwargs)
            ddfs = settings['ddfs']
            # check whether the table already exists
            t = self.create(self._name, fields=self._fields, force=False, **kwargs)
            try:
                ddfs.tag(self.base_tag(self._name), self._blobs or [])
                self.tagged = True
            except Exception:
                print('Error tagging result %s' % self._name)
                raise
            return t
Example #10
def ensure_tables():
    overrides['server'] = 'disco://localhost'
    overrides['dump'] = False
    overrides['nest'] = False
    settings = Settings()
    ddfs = settings['ddfs']

    imps = Table.create(IMPS,
                        fields=['=$token', '%url', '+%site_id', '@cpm_millis', '+#ad_id', '+$date', '+@time'],
                        partition='date',
                        force=True)

    tags = ddfs.list("hustle:%s:" % IMPS)
    if len(tags) == 0:
        # insert the files
        insert(imps, File='fixtures/imps.json', preprocess=imp_process)
    return imps
Example #11
def drop(table, **kwargs):
    """
    Drop all data, partitions, and table definition for a given table.

    :type table: :class:`Table <hustle.Table>`
    :param table: A table object

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    if not isinstance(table, Table):
        raise ValueError("Only table is allowed here.")

    delete(table, **kwargs)
    ddfs.delete(Table.base_tag(table._name))
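
A sketch contrasting drop() with delete(); the pixels table object is hypothetical:

delete(pixels)   # removes all partitions and data, keeps the table definition
drop(pixels)     # removes data, partitions, and the base tag holding the schema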
Example #12
def stat(where, limit=16, **kwargs):
    """
    Fetch statistical information for a selected :class:`Table <hustle.Table>` or :class:`Expr <hustle.core.marble.Expr>`.

    :type where: sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

    :type limit: int
    :param limit: the maximum number of blobs to sample from the where clause (default is 16)

    Returns a dict of column cardinality scores (0-100) for the indexed columns in the table.
    """
    from hustle.core.settings import Settings
    from hustle.core.stat import StatPipe
    from disco.core import result_iterator
    from collections import defaultdict

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']
    job_blobs = set(tuple(sorted(w)) for w in _get_blobs(where, ddfs, limit))
    # print job_blobs
    job = StatPipe(settings['server'])
    job.run(name="stat_" + where._name, input=job_blobs, **settings)
    res = job.wait()

    # first we need the total, so that we can calculate weighted average
    total = float(sum(v['_'] for _, v in result_iterator(res)))
    final = defaultdict(int)
    for _, cols in result_iterator(res):
        weight = cols.pop('_') / total
        for col, card in cols.iteritems():
            final[col] += card * weight

    # scale everything to an integer between 0 and 100
    really_final = {}
    for key in final:
        card = int(final[key] * 100)
        if card > 0:
            really_final[key] = card
    really_final['_'] = int(total)

    return really_final
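
A minimal usage sketch, assuming the imps table used throughout these examples; per the code above, the result maps indexed column names to cardinality scores and keeps the sampled total under the '_' key:

card = stat(imps, limit=8)
total = card.pop('_')             # sampled row total
for col, score in sorted(card.items()):
    print col, score              # per-column cardinality score, 0..100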
Example #13
    def create(cls, name, fields=(), partition=None, force=False, **kwargs):
        """
        Create a new :class:`Table <hustle.Table>`, replace existing table if force=True.

        :type  name: string
        :param name: the name of the table to create

        :type  fields: sequence of string
        :param fields: the list of *columns* and their encoded index/type information

        :type  partition: string
        :param partition: the name of the column to act as the partition for this table

        :type  force: bool
        :param force: overwrite the existing DDFS base tag with this schema

        .. warning::
            This function will not delete or update existing data in any way.  If you use :code:`force=True` to change
            the schema, make sure you either make the change backward compatible (by only adding new columns) or
            delete and reload your data.

        .. seealso::
            For a good example of creating a partitioned Hustle database see
            :ref:`integrationtests`
        """
        from hustle.core.settings import Settings
        settings = Settings(**kwargs)
        ddfs = settings['ddfs']

        if ddfs.exists(cls.base_tag(name)):
            print "Table already exists..."
            if force:
                print "   Overwriting schema..."
            else:
                return None

        ddfs.setattr(cls.base_tag(name), '_fields_', ujson.dumps(fields))
        ddfs.setattr(cls.base_tag(name), '_partition_', ujson.dumps(partition))
        return cls(name=name, fields=fields, partition=partition)
Example #14
def get_tables(**kwargs):
    """
    Return the visible Hustle tables on the currently configured DDFS server.  Hustle finds tables by looking
    for DDFS tags that have a *hustle:* prefix.

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    tags = settings["ddfs"].list(_TAG_PREFIX)
    uniqs = set()
    for tag in tags:
        l = tag.find(':')
        if l > 0:
            ctag = tag[l + 1:]
            r = ctag.find(':')
            if r > 0:
                uniqs.add(ctag[:r])
            else:
                uniqs.add(ctag)

    return sorted(uniqs)
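
A usage sketch; the server override follows the pattern used in the test fixtures above:

# List all visible Hustle tables on a specific Disco master
for name in get_tables(server='disco://localhost'):
    print name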
Example #15
def get_partitions(table, **kwargs):
    """
    Get partitions for a given table.

    :type table: :class:`Table <hustle.Table>` | string
    :param table: a table object or a table name

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings["ddfs"]

    if isinstance(table, Marble):
        tablename = table._name
    else:
        tablename = table

    tags = ddfs.list(Table.base_tag(tablename) + ":")
    uniqs = set()
    for tag in tags:
        l = tag.find(':')
        r = tag.rfind(':')
        if r != l:
            uniqs.add(tag)
    return sorted(uniqs)
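
A usage sketch with a hypothetical table name; the returned tags follow the hustle:<table>:<partition> convention built by part_tag() in the insert examples:

for tag in get_partitions('impressions'):
    print tag     # tags of the form hustle:impressions:<partition value>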
Example #16
def insert(table, phile=None, streams=None, preprocess=None,
           maxsize=100 * 1024 * 1024, tmpdir='/tmp', decoder=None,
           lru_size=10000, **kwargs):
    """
    Insert data into a Hustle :class:`Table <hustle.Table>`.

    Create a :class:`Marble <hustle.core.marble.Marble>` file from the input file or streams according to the
    schema of the table.  Push this file (or these files) into *DDFS* under the appropriate (possibly partitioned)
    *DDFS* tags.

    Note that a call to :func:`insert() <hustle.insert>` may actually create and push more than one file,
    depending on how many partition values exist in the input.  Be careful.

    For a good example of inserting into a partitioned Hustle database see :ref:`insertguide`

    :type  table: :class:`Table <hustle.Table>`
    :param table: the table to perform the insert on

    :type  phile: string
    :param phile: the file path to open

    :type  streams: sequence of iterable
    :param streams: as an alternative to the *phile* argument, you can specify a list of generators as input

    :type  preprocess: function
    :param preprocess: a function that accepts and returns a dict()

        The input is transformed into a :class:`dict` by the *decoder* param, then the *preprocess* function is
        called for every record.  This gives you the opportunity to transform, filter or otherwise clean your
        data before it is inserted into the :class:`Marble <hustle.core.marble.Marble>`

    :type  maxsize: int
    :param maxsize: the initial size in bytes of the *LMDB* memory mapped file

        Note that the actual underlying LMDB file will grow as data is added to it - this setting is just for its
        initial size.

    :type  tmpdir: string
    :param tmpdir: the temporary directory to write the LMDB memory mapped file

        Note that choosing a directory on an SSD drive will nicely increase throughput.

    :type  decoder: function
    :param decoder: accepts a line of raw input from the input and returns a :class:`dict`

        The dict is expected to have keys that correspond to the column names in the table you are inserting to.  There
        are two built-in decoders in Hustle: :func:`json_decoder() <hustle.core.marble.json_decoder>` (default) and
        :func:`kv_decoder() <hustle.core.marble.kv_decoder>` for processing JSON and Disco *chain* input files,
        respectively.

    :type  lru_size: int
    :param lru_size: the size in records of the LRU cache for holding bitmapped indexes

        You probably won't have to worry about this unless you find your insert is running out of memory or is too
        slow when inserting gigantic files or on nodes with limited memory resources.
    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']

    if not decoder:
        decoder = json_decoder

    # print 'committed'

    def part_tag(name, partition=None):
        rval = "hustle:" + name
        if partition:
            rval += ':' + str(partition)
        return rval
    if phile:
        streams = [open(phile)]
    lines, partition_files = table._insert(streams, preprocess=preprocess,
                                           maxsize=maxsize, tmpdir=tmpdir,
                                           decoder=decoder, lru_size=lru_size)
    if partition_files is not None:
        for part, pfile in partition_files.iteritems():
            tag = part_tag(table._name, part)
            ddfs.push(tag, [pfile])
            print 'pushed %s, %s to %s' % (part, tag, ddfs)
            os.unlink(pfile)
    return table._name, lines
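
A usage sketch for this variant of insert(), reusing the imps table and fixture file from the ensure_tables examples; the scrub() preprocessor is hypothetical and follows the documented accept-and-return-a-dict contract:

def scrub(record):
    # hypothetical cleanup step applied to every decoded record
    record['site_id'] = record.get('site_id', '').lower()
    return record

table_name, lines = insert(imps, phile='fixtures/imps.json', preprocess=scrub)
print 'inserted %s records into %s' % (lines, table_name)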
Example #17
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is either:

    * an iterator over the resulting tuples when :code:`nest==False`
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None when :code:`nest==False and dump==True` (this is the default CLI interaction)

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in the
        :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        The where clause also supports *in* and *not in* statements by using the special operators "<<" and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right-hand value of "<<" and ">>" can be any type of iterable, where each element must be
        a valid single right-hand value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables in the where clause is typically reserved for when you use a *join clause*.

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform the relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In particular, if
        two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join:

        if True, the tables specified in the *where clause* are joined in a full cross-product.  Note that if
        both *full_join* and *join* are specified, *join* will be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make the select call either blocking (default) or non-blocking.  If False, select() returns
        a :class:`Future <hustle.Future>` object instead of waiting for the result

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query; note that it must be used with the "nest" option. If this
        option is not specified, a random name will be given to the result of this nested query.

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize. 0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the Disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query-related data. This only works when "dump=True" and "profile=False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    """

    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    if partition < 0:
        partition = 0
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            print "The tag name %s already exists. Try another tag name" \
                  " or drop the old one" % tag
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print "  Invalid query:\n    %s" % e
        return None

    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)

    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    if block:
        blobs = job.wait()
        if nest:
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols,
                        width=80,
                        cols=len(cols),
                        alignments=[
                            _ALG_RIGHT if c.is_numeric else _ALG_LEFT
                            for c in project
                        ])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        return Future(job.name, job, settings['server'], nest, *project)
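
A short sketch combining the documented nest, tag and dump options; star(), the pix table and the isActive column come from the docstring's own examples, and the tag name is hypothetical:

# Build a named intermediate result, then aggregate over it and dump to stdout
active_pix = select(*star(pix), where=pix.isActive > 0, nest=True, tag='active_pixels')
select(h_sum(active_pix.amount), where=active_pix, dump=True)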
Example #18
def insert(table,
           File=None,
           streams=None,
           preprocess=None,
           maxsize=100 * 1024 * 1024,
           tmpdir='/tmp',
           decoder=None,
           lru_size=10000,
           header=False,
           partition_filter=None,
           purge_local=True,
           **kwargs):
    """
    Insert data into a Hustle :class:`Table <hustle.Table>`.

    Create a :class:`Marble <hustle.core.marble.Marble>` file from the input file or streams according to the
    schema of the table.  Push this file (or these files) into *DDFS* under the appropriate (possibly partitioned)
    *DDFS* tags.

    Note that a call to :func:`insert() <hustle.insert>` may actually create and push more than one file,
    depending on how many partition values exist in the input.  Be careful.

    For a good example of inserting into a partitioned Hustle database see :ref:`insertguide`

    :type  table: :class:`Table <hustle.Table>`
    :param table: the table to perform the insert on

    :type  File: string
    :param File: the file path to open

    :type  streams: sequence of iterable
    :param streams: as an alternative to the *File* argument, you can specify a list of generators as input

    :type  preprocess: function
    :param preprocess: a function that acts as transformer or filter and returns a boolean or None

        The input is transformed into a :class:`dict` by the *decoder* param, then the *preprocess* function is
        called for every record.  This gives you the opportunity to transform, filter or otherwise clean your
        data before it is inserted into the :class:`Marble <hustle.core.marble.Marble>`.

        As a transformer, it modifies the original data in place; the return value should be either None or True.
        As a filter, it returns a boolean to flag whether the current data record should be inserted or not.

    :type  maxsize: int
    :param maxsize: the initial size in bytes of the *LMDB* memory mapped file

        Note that the actual underlying LMDB file will grow as data is added to it - this setting is just for its
        initial size.

    :type  tmpdir: string
    :param tmpdir: the temporary directory to write the LMDB memory mapped file

        Note that choosing a directory on an SSD drive will nicely increase throughput.

    :type  decoder: function
    :param decoder: accepts a line of raw input from the input and returns a :class:`dict`

        The dict is expected to have keys that correspond to the column names in the table you are inserting to.  There
        are two built-in decoders in Hustle: :func:`json_decoder() <hustle.core.marble.json_decoder>` (default) and
        :func:`kv_decoder() <hustle.core.marble.kv_decoder>` for processing JSON and Disco *chain* input files,
        respectively.

    :type  lru_size: int
    :param lru_size: the size in records of the LRU cache for holding bitmapped indexes

        You probably won't have to worry about this unless you find your insert is running out of memory or is too
        slow when inserting gigantic files or on nodes with limited memory resources.

    :type header: boolean
    :param header: whether or not the streams contain a header (as with CSV)

        If you are using CSV and it contains a header with the column names, set this so it gets skipped. This only works
        if the header is on the first line; otherwise you will skip the first line of data.

    :type partition_filter: a single value or a list of values
    :param partition_filter: a single value or a list of partition values you want to filter your *streams*

        If set, this filter restricts the insert to only the partition(s) given. Useful for reloads where a single
        file may hold data for multiple partitions.

    :type purge_local: boolean
    :param purge_local: whether or not to delete the local marble after creation

        If you want to do additional processing with the marble after it has been pushed to DDFS, set this flag to False and it
        will not be automatically cleaned up after successful insertion.
        Setting this to False will also return the partition file information.

    """
    from hustle.core.settings import Settings
    settings = Settings(**kwargs)
    ddfs = settings['ddfs']

    if not decoder:
        decoder = json_decoder

    def part_tag(name, partition=None):
        rval = "hustle:" + name
        if partition:
            rval += ':' + str(partition)
        return rval

    if File:
        streams = [open(File)]
    lines, partition_files = table._insert(streams,
                                           preprocess=preprocess,
                                           maxsize=maxsize,
                                           tmpdir=tmpdir,
                                           decoder=decoder,
                                           lru_size=lru_size,
                                           header=header,
                                           partition_filter=partition_filter)
    if partition_files is not None:
        for part, pfile in partition_files.iteritems():
            tag = part_tag(table._name, part)
            st = os.stat(pfile)
            ddfs.push(tag, [pfile])
            print 'pushed %s(%.2fG), %s to %s' % \
                (part, st.st_size * 1.0 / 1073741824, tag, ddfs)
            if purge_local:
                os.unlink(pfile)
    return table._name, lines, partition_files
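
A usage sketch for a single-partition CSV reload; the pixels table, file path and column layout are hypothetical, and the decoder follows the documented line-in, dict-out contract:

def csv_decoder(line):
    # hypothetical CSV layout: token,site_id,amount,date
    token, site_id, amount, date = line.rstrip('\n').split(',')
    return {'token': token, 'site_id': site_id,
            'amount': int(amount), 'date': date}

insert(pixels,
       File='fixtures/pixel.csv',
       decoder=csv_decoder,
       header=True,                    # skip the CSV header row
       partition_filter='2014-01-13',  # only load records for this partition
       purge_local=True)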
Example #19
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is either:

    * a list of urls containing the result records.  This is the same as normal results from Disco
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in the
        :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables in the where clause is typically reserved for when you use a *join clause*.

    :type join: sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform the relational join operation on for the query

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    """

    from hustle import _get_blobs
    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    ddfs = settings['ddfs']
    autodump = settings['dump']
    partition = settings.get('partition', 0)
    if partition < 0:
        partition = 0
    nest = settings.get('nest', False)
    try:
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print "  Invalid query:\n    %s" % e
        return None

    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     nest=nest)

    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    blobs = job.wait()
    if nest:
        rtab = job.get_result_schema(project)
        rtab._blobs = blobs
        return rtab
    elif autodump:
        # the result will be just dumped to stdout
        cols = [c.name for c in project]
        _print_separator(80)
        _print_line(cols, width=80, cols=len(cols),
                   alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT for c in project])
        _print_separator(80)
        dump(blobs, 80)
        return
    return blobs
Example #20
def select(*project, **kwargs):
    """
    Perform a relational query, by selecting rows and columns from one or more tables.

    The return value is either:

    * an iterator over the resulting tuples when :code:`nest==False`
    * a :class:`Table <hustle.Table>` instance when :code:`nest==True`
    * None when :code:`nest==False and dump==True` (this is the default CLI interaction)

    For all of the examples below, *imps* and *pix* are instances of :class:`Table <hustle.Table>`.

    :type project: list of :class:`Column <hustle.core.marble.Column>` | :class:`Aggregation <hustle.core.marble.Aggregation>`
    :param project: a positional argument list of columns and aggregate expressions to return in the result

        A simple projection::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps)

        Selects three columns from the *imps* table.

        Hustle also allows for *aggregation functions* such as :func:`h_sum() <hustle.h_sum>`,
        :func:`h_count <hustle.h_count>`, :func:`h_min() <hustle.h_min>`, :func:`h_max() <hustle.h_max>`,
        :func:`h_avg <hustle.h_avg>` as in this example which sums the :code:`imps.cpm_millis`
        column::

            select(imps.ad_id, h_sum(imps.cpm_millis), h_count(), where=imps.date == '2014-01-27')

        Note that Hustle doesn't have a *group by* clause.  In this query, the output will be *grouped* by the
        :code:`imps.ad_id` column implicitly.  Note that in Hustle, if there is an aggregation function present in the
        :code:`project` param, the query results will be *grouped* by all non-aggregation columns present.

    :type where: (optional) sequence of :class:`Table <hustle.Table>` | :class:`Expr <hustle.core.marble.Expr>`
    :param where: the Tables to fetch data from, as well as the conditions in the *where clause*

        This serves two purposes: to specify the tables that are to be queried and to allow for the
        selection of data under specific criteria with our Python DSL selection syntax, much like SQL's *where
        clause*::

            # simple projection with restriction
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.date == '2014-01-27')

        Note the :code:`==` operation between the :code:`imps.date` column and the date string.
        The :class:`Column <hustle.core.marble.Column>`
        class overrides all of Python's comparison operators, which, along with the *&*, *|* and *~* logical
        operators allows you to build arbitrarily complex column selection expressions like this::

            select(imps.ad_id, imps.date, imps.cpm_millis,
                    where=((imps.date >= '2014-01-21') & (imps.date <= '2014-01-23')) |
                          ~(imps.site_id == 'google.com'))

        Note that for these expressions, the column must come first.  This means that the following expression is
        **illegal**::

            select(imps.ad_id, imps.date, imps.cpm_millis, where='2014-01-27' == imps.date)

        The where clause also supports *in* and *not in* statements by using the special operators "<<" and ">>" respectively::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id << [1000, 1005])
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps.ad_id >> [1000, 1005])

        Note that the right-hand value of "<<" and ">>" can be any type of iterable, where each element must be
        a valid single right-hand value.

        In addition, multiple tables can be specified in the where clause like this::

            select(imps.ad_id, pix.amount, where=(imps.date < '2014-01-13', pix))

        which specifies an expression, :code:`imps.date < '2014-01-13'` and a :class:`Table <hustle.Table>` tuple.
        This query will simply return all of the *ad_id* values in *imps* for dates less than January 13th followed
        by all of the *amount* values in the *pix* table.

        Using multiple tables in the where clause is typically reserved for when you use a *join clause*.

    :type join: string | sequence of exactly length 2 of :class:`Column <hustle.core.marble.Column>`
    :param join: specifies the columns to perform the relational join operation on for the query

        The join columns can be specified either as a list of 2 columns, or a list of 2 strings. In particular, if
        two columns have the same names, a single string is valid as well.

        Here's an example of a Hustle join::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join=(imps.site_id, pix.site_id))

        or equivalently::

            select(imps.ad_id, imps.site_id, h_sum(pix.amount), h_count(),
                   where=(imps.date < '2014-01-13', pix.date < '2014-01-13'),
                   join='site_id')

        which joins the *imps* and *pix* tables on their common *site_id* column, then returns the sum of the
        *pix.amount* columns and a count, grouped by the *ad_id* and the *site_id*.  The equivalent query in SQL
        is::

            select i.ad_id, i.site_id, sum(p.amount), count(*)
            from imps i
            join pix p on i.site_id = p.site_id
            where i.date < '2014-01-13' and p.date < '2014-01-13'
            group by i.ad_id, i.site_id

    :type full_join: bool
    :param full_join:

        if True, the tables specified in the *where clause* are joined in a full cross-product.  Note that if
        both *full_join* and *join* are specified, *join* will be ignored.

    :type order_by: string | :class:`Column <hustle.core.marble.Column>` | int |
        (sequence of string | :class:`Column <hustle.core.marble.Column>` | int)
    :param order_by: the column(s) to sort the result by

        The sort columns can be specified either as a Column or a list of Columns.  Alternatively, you can specify
        a column by using a string with either the name of the column or the *table.column* string notation.
        Furthermore, you can also represent the column using a zero based index of the *projected* columns.  This
        last case would be used for *Aggregations*.  Here are a few examples::

            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=imps.date)
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=(imps.date, imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by='imps.date')
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', imps.ad_id))
            select(imps.ad_id, imps.date, imps.cpm_millis, where=imps, order_by=('date', 2))
            select(imps.ad_id, imps.date, h_sum(imps.cpm_millis), where=imps, order_by=2)

    :type desc: boolean
    :param desc: affects sort order of the *order_by clause* to descending (default ascending)

    :type distinct: boolean
    :param distinct: indicates whether to remove duplicates in results

    :type limit: int
    :param limit: limits the total number of records in the output

    :type block: boolean
    :param block: make the select call either blocking (default) or non-blocking.  If False, select() returns
        a :class:`Future <hustle.Future>` object instead of waiting for the result

    :type nest: boolean (default = False)
    :param nest: specify that the return value is a :class:`Table <hustle.Table>` to be used in another query

        This allows us to build nested queries.  You may want to do this to join more than two tables, or to reuse
        the results of a query in more than one subsequent query.  For example::

            active_pix = select(*star(pix), where=pix.isActive > 0, nest=True)
            select(h_sum(active_pix.amount), where=active_pix)

    :type tag: string (default = None)
    :param tag: specify the tag name for a nested query; note that it must be used with the "nest" option. If this
        option is not specified, a random name will be given to the result of this nested query.

    :type max_cores: int (default = 0)
    :param max_cores: specify the max number of cores (disco workers) this query could utilize. 0 means no limit

    :type profile: boolean (default = False)
    :param profile: specify whether to generate the Disco job's profile

    :type purge: boolean (default = True)
    :param purge: specify whether to purge the query-related data. This only works when "dump=True" and "profile=False".

    :type kwargs: dict
    :param kwargs: custom settings for this query see :mod:`hustle.core.settings`

    """

    from hustle.core.settings import Settings
    from hustle.core.pipeline import SelectPipe
    from hustle.core.util import ensure_list

    settings = Settings(**kwargs)
    wheres = ensure_list(settings.pop('where', ()))
    order_by = ensure_list(settings.pop('order_by', ()))
    join = settings.pop('join', ())
    full_join = settings.pop('full_join', False)
    distinct = settings.pop('distinct', False)
    desc = settings.pop('desc', False)
    limit = settings.pop('limit', None)
    wide = settings.pop('wide', False)
    nest = settings.pop('nest', False)
    tag = settings.pop('tag', None)
    block = settings.pop('block', True)
    autodump = settings.pop('dump', False)
    pre_order_stage = settings.pop('pre_order_stage', ())
    ddfs = settings['ddfs']
    partition = settings.pop('partition', 0)
    max_cores = settings.pop('max_cores', 0)
    profile = settings.pop('profile', False)
    purge = settings.pop('purge', True)
    if partition < 0:
        partition = 0
    if tag:
        t = Table.from_tag(tag)
        if t is not None:
            print "The tag name %s already exists. Try another tag name" \
                  " or drop the old one" % tag
            return

    try:
        # if join is a string, extract the actual join columns.
        # do it here to make the query checker happy.
        join = _resolve_join(wheres, join)
        check_query(project, join, order_by, limit, wheres)
    except ValueError as e:
        print "  Invalid query:\n    %s" % e
        return None

    name = '-'.join([where._name for where in wheres])[:64]
    job_blobs = set()
    for where in wheres:
        job_blobs.update(tuple(sorted(w)) for w in _get_blobs(where, ddfs))

    job = SelectPipe(settings['server'],
                     wheres=wheres,
                     project=project,
                     order_by=order_by,
                     join=join,
                     full_join=full_join,
                     distinct=distinct,
                     desc=desc,
                     limit=limit,
                     partition=partition,
                     wide=wide,
                     nest=nest,
                     tag=tag,
                     pre_order_stage=pre_order_stage,
                     max_cores=max_cores,
                     profile=profile)

    job.run(name='select_from_%s' % name, input=job_blobs, **settings)
    if block:
        blobs = job.wait()
        if nest:
            rtab = job.get_result_schema(project)
            rtab._blobs = blobs
            return rtab
        elif autodump:
            # the result will be just dumped to stdout
            cols = [c.name for c in project]
            _print_separator(80)
            _print_line(cols, width=80, cols=len(cols),
                        alignments=[_ALG_RIGHT if c.is_numeric else _ALG_LEFT
                                    for c in project])
            _print_separator(80)
            cat(_query_iterator(blobs), 80)
            if purge and not profile:
                settings['server'].purge(_safe_str(job.name))
            return
        return QueryResult(job.name, blobs, settings['server'])
    else:
        return Future(job.name, job, settings['server'], nest, *project)
Example #21
    def setUp(self):
        overrides['server'] = 'disco://localhost'
        overrides['dump'] = False
        overrides['nest'] = False
        self.settings = Settings()
Example #22
def insert_hll(table,
               file=None,
               streams=None,
               preprocess=None,
               maxsize=100 * 1024 * 1024,
               tmpdir='/tmp',
               decoder=ujson.decode,
               lru_size=10000,
               hll_field=None,
               **kwargs):
    from cardunion import Cardunion
    import os

    settings = Settings(**kwargs)
    ddfs = settings['ddfs']

    def part_tag(name, partition=None):
        rval = "hustle:" + name
        if partition:
            rval += ':' + str(partition)
        return rval

    def hll_iter(strms):
        buf = {}
        fields = table._field_names
        fields.remove('hll')
        #  fields.remove('maxhash')

        for stream in strms:
            for line in stream:
                try:
                    data = decoder(line)
                except Exception as e:
                    print "Exception decoding record (skipping): %s %s" % (
                        e, line)
                else:
                    if preprocess:
                        if not preprocess(data):
                            continue
                    key = ujson.dumps([data[f] for f in fields])
                    if key not in buf:
                        hll = Cardunion(12)
                        buf[key] = hll
                    else:
                        hll = buf[key]

                    hll.add(data[hll_field])

        for key, hll in buf.iteritems():
            data = dict(zip(fields, ujson.loads(key)))
            data['hll'] = hll.dumps()
            yield data

    if file:
        streams = [open(file)]
    lines, partition_files = table._insert([hll_iter(streams)],
                                           maxsize=maxsize,
                                           tmpdir=tmpdir,
                                           decoder=lambda x: x,
                                           lru_size=lru_size)
    if partition_files is not None:
        for part, pfile in partition_files.iteritems():
            tag = part_tag(table._name, part)
            ddfs.push(tag, [pfile])
            print 'pushed %s, %s' % (part, tag)
            os.unlink(pfile)
    return table._name, lines
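
A usage sketch mirroring the insert_hll() call in Example #5; the active_only() filter is hypothetical and relies on the preprocess behaviour visible in hll_iter() above (a falsy return skips the record):

def active_only(record):
    return record.get('isActive') == 1

insert_hll(pixel_hlls,
           file='fixtures/pixel.json',
           hll_field='token',          # distinct tokens are accumulated per group key
           preprocess=active_only)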