Example 1
    def cook_recipe(self):
        """
        Full pipeline.
        """
        # indicate that the recipe is running.
        self.recipe.last_run = dates.now()
        self.recipe.status = "running"
        db.session.add(self.recipe)
        db.session.commit()

        # generate a job id
        job_id = gen_uuid()

        # import the sous chef here to get the timeout
        # and raise import errors before it attempts to run
        # in the queue
        sc = import_sous_chef(self.sous_chef_path)

        # stash kwargs
        kw_key = self.stash_kw(job_id)

        # send it to the queue
        self.q.enqueue(run_sous_chef,
                       self.sous_chef_path,
                       self.recipe.id,
                       kw_key,
                       job_id=job_id,
                       timeout=sc.timeout,
                       result_ttl=self.kw_ttl)

        # return the job id
        return job_id
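
None of the examples on this page define gen_uuid() itself; it reads like a thin helper around Python's uuid module whose only job is to return a unique string for job ids, redis keys, source ids and SQL aliases. A minimal sketch, assuming a random hex UUID is sufficient (the real helper may differ):

import uuid

def gen_uuid():
    # assumed implementation: a random, collision-resistant hex string.
    # the surrounding examples only rely on uniqueness, so uuid4 is enough.
    return uuid.uuid4().hex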
Example 2
def _provenance(obj, recipe, type='event'):
    """
    Determine provenance for events or content items.
    Handle source ids for events.
    """
    if not recipe:
        obj['provenance'] = 'manual'
        obj['recipe_id'] = None

        if type == 'event':
            src_id = obj.get('source_id')
            if not src_id:
                src_id = gen_uuid()
            obj['source_id'] = "manual:{}".format(src_id)

    else:
        if type == 'event':
            # recipe-generated events must pass in a source id
            if 'source_id' not in obj:
                raise RequestError(
                    'Recipe generated events must include a source_id.')
            # reformat the source id.
            obj['source_id'] = "{}:{}"\
                .format(str(recipe.slug), str(obj['source_id']))
        obj['provenance'] = 'recipe'
        obj['recipe_id'] = recipe.id
    return obj
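
A hedged usage sketch of both branches of _provenance(); the recipe stub is hypothetical and only provides the two attributes the function reads (id and slug):

class FakeRecipe(object):
    # hypothetical stand-in for a real Recipe model.
    id = 7
    slug = 'twitter-list-to-event'

# manual event: no recipe, so provenance is 'manual' and a
# "manual:<uuid>" source_id is generated via gen_uuid().
manual = _provenance({'title': 'hand-entered event'}, None, type='event')

# recipe-generated event: must already carry a source_id, which is
# prefixed with the recipe slug, e.g. 'twitter-list-to-event:tweet-123'.
auto = _provenance({'source_id': 'tweet-123'}, FakeRecipe(), type='event')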
Example 3
    def format(self, obj):
        """
        For now all of these options are standard to twitter events.
        """
        # set the status.
        obj['status'] = self.options.get('event_status', 'pending')

        # prepare url (these are formatted as redirects).
        obj['url'] = url.prepare(obj['url'], expand=False, canonicalize=False)

        # ignore bad domains / org's own domains.
        if self._is_bad_domain(obj['url']):
            return

        # extract and merge article data.
        if url.is_article(obj['url']):
            data = article.extract(obj['url'], type=None)
            if data:
                obj.update(data)
                obj.pop('type', None)
                obj.pop('site_name', None)
                obj.pop('favicon', None)

        # set source id:
        _id = obj.pop('id', obj.get('url', gen_uuid()))
        if ":" in _id:
            _id = _id.split(':')[-1]
        obj['source_id'] = _id

        # TODO: Make formatting more elegant.
        if self.options.get('set_event_title', None):
            obj['title'] = self.options.get(
                'set_event_title').format(**self._fmt(obj))

        if self.options.get('set_event_description', None):
            obj['description'] = self.options.get(
                'set_event_description').format(**self._fmt(obj))

        if self.options.get('set_event_tag_ids', None) and \
           len(self.options.get('set_event_tag_ids')):

            obj['tag_ids'] = self.options.get('set_event_tag_ids')

        # hack because the app can't handle this field being a list.
        if self.options.get('set_event_content_items', None):
            if 'content_item_ids' not in obj:
                obj['content_item_ids'] = []
            for c in self.options.get('set_event_content_items', []):
                if isinstance(c, dict):
                    if c.get('id', None):
                        obj['content_item_ids'].append(c.get('id'))
                elif isinstance(c, int):
                    obj['content_item_ids'].append(c)
        # filter links.
        if self.options.get('must_link', False) \
           and not len(obj.get('links', [])):
            return None
        return obj
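
The source-id step above keeps only the text after the last colon, so a prefixed id collapses to its trailing token while plain ids pass through untouched; a quick illustration with made-up values:

# hypothetical ids, showing the split-on-colon normalization above.
for _id in ('twitter:624713493', 'abc123'):
    if ':' in _id:
        _id = _id.split(':')[-1]
    print(_id)  # prints 624713493, then abc123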
Example 4
    def metric_summary_query(self, metric):
        return \
            """ SELECT array_agg(metric) as metric_arr,
                       ROUND(avg(metric), 2) as mean,
                       ROUND(min(metric), 2) as min,
                       ROUND(median(metric), 2) as median,
                       ROUND(max(metric), 2) as max
                       FROM ({0}) AS "{1}"
            """.format(self.init_query(metric), gen_uuid())
Example 5
    def metric_summary_query(self, metric):
        return \
            """ SELECT array_agg(metric) as metric_arr,
                       ROUND(avg(metric), 2) as mean,
                       ROUND(min(metric), 2) as min,
                       ROUND(median(metric), 2) as median,
                       ROUND(max(metric), 2) as max
                       FROM ({}) AS "{}"
            """.format(self.init_query(metric), gen_uuid())
Example 6
    def metric_query(self, metric):
        kw = {
            'name': metric.get('name'),
            'percentiles': self.select_percentiles,
            'summary_query': self.metric_summary_query(metric),
            'alias': gen_uuid()
        }
        return \
            """SELECT '{name}' as metric,
                      mean, median, min, max,
                      {percentiles}
               FROM (\n{summary_query}\n) AS "{alias}"
            """.format(**kw)
Example 7
def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')

    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(
        queued=kw.pop('queued', True),
        job_id=job_id,
        timeout=kw.pop('q_timeout', 1000),
        serializer=kw.pop('q_serializer', 'json'),
        result_ttl=kw.pop('q_result_ttl', 60),
        kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
        name=kw.pop('q_name', 'bulk'),
        max_workers=kw.pop('q_max_workers', MAX_WORKERS),
        job_key_fmt=kw.pop('q_job_key', 'rq:{src}:bulk:'.format(**kw)+"{}"),
        chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE)
    )
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to pass only a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # serializable with the chosen serializer (json or pickle).
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}

    if qkw['serializer'] == 'json':
        job = obj_to_json(job)

    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)

    rds.set(job_key, job, ex=qkw['kwargs_ttl'])

    q.enqueue(bulkworker, job_id, **qkw)
    return job_id
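
A hedged usage sketch of bulkload(); the q_* keyword names come straight from the defaults above, but the 'rss' source and the article payload are made up for illustration:

# hypothetical payload and source name.
articles = [{'url': 'http://example.com/story-1'},
            {'url': 'http://example.com/story-2'}]

# queued path: stashes the payload in redis and returns the rq job id.
job_id = bulkload(articles, src='rss', q_timeout=300, q_serializer='json')

# synchronous path: skips the queue and runs ingest.source() inline.
result = bulkload(articles, src='rss', queued=False)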
Example 8
def bulkload(data, **kw):
    """
    Bulk Load any data.
    """
    kw['src'] = kw.pop('q_src', kw.pop('src', None))
    if not kw['src']:
        raise ValueError('Missing src.')

    job_id = gen_uuid()

    # set queue defaults
    qkw = dict(queued=kw.pop('queued', True),
               job_id=job_id,
               timeout=kw.pop('q_timeout', 1000),
               serializer=kw.pop('q_serializer', 'json'),
               result_ttl=kw.pop('q_result_ttl', 60),
               kwargs_ttl=kw.pop('q_kwargs_ttl', 120),
               name=kw.pop('q_name', 'bulk'),
               max_workers=kw.pop('q_max_workers', MAX_WORKERS),
               job_key_fmt=kw.pop('q_job_key',
                                  'rq:{src}:bulk:'.format(**kw) + "{}"),
               chunk_size=kw.pop('q_chunk_size', MAX_CHUNK_SIZE))
    kw.update({'queued': qkw.get('queued', True)})

    # if this is not a queued job, just run ingest.
    if not qkw.get('queued'):
        return ingest.source(data, **kw)

    q = queues.get(qkw.pop('name', 'bulk'))

    # store the data + kwargs in redis temporarily.
    # this makes the enqueuing process much, much more
    # efficient by allowing us to pass only a single key
    # into the queue rather than a massive dump of data.
    # however, it also means that all kwargs must be
    # serializable with the chosen serializer (json or pickle).
    job_key = qkw['job_key_fmt'].format(job_id)
    job = {'data': data, 'kw': kw}

    if qkw['serializer'] == 'json':
        job = obj_to_json(job)

    elif qkw['serializer'] == 'pickle':
        job = obj_to_pickle(job)

    rds.set(job_key, job, ex=qkw['kwargs_ttl'])

    q.enqueue(bulkworker, job_id, **qkw)
    return job_id
Example 9
def _event_provenance(o, org_id, session):
    """
    if there's not a recipe_id set a random source id +
    set the recipe_id as "None" and preface the source_id
    as "manual".

    if there is recipe_id, add in the
    sous-chef-name to ensure that there
    aren't duplicate events generated by
    multiple child recipes of the same
    sous-chef
    """

    if 'recipe_id' not in o or not o['recipe_id']:
        o['source_id'] = "manual:{}".format(gen_uuid())
        o['provenance'] = 'manual'
        o['recipe_id'] = None

    else:
        # recipe-generated events must pass in a source id
        if 'source_id' not in o:
            raise RequestError(
                'Recipe-generated events must include a source_id.')

        # fetch the associated recipe
        r = session.query(Recipe)\
            .filter_by(id=o['recipe_id'])\
            .filter_by(org_id=org_id)\
            .first()

        if not r:
            raise RequestError(
                'Recipe id "{recipe_id}" does not exist.'
                .format(**o))

        # reformat the source id.
        o['source_id'] = "{}:{}"\
            .format(str(r.slug), str(o['source_id']))

        # set this event as non-manual
        o['provenance'] = 'recipe'

    return o
Example 10
def _event_provenance(o, org_id, session):
    """
    if there's not a recipe_id set a random source id +
    set the recipe_id as "None" and preface the source_id
    as "manual".

    if there is recipe_id, add in the
    sous-chef-name to ensure that there
    aren't duplicate events generated by
    multiple child recipes of the same
    sous-chef
    """

    if 'recipe_id' not in o or not o['recipe_id']:
        o['source_id'] = "manual:{}".format(gen_uuid())
        o['provenance'] = 'manual'
        o['recipe_id'] = None

    else:
        # recipe-generated events must pass in a source id
        if 'source_id' not in o:
            raise RequestError(
                'Recipe-generated events must include a source_id.')

        # fetch the associated recipe
        r = session.query(Recipe)\
            .filter_by(id=o['recipe_id'])\
            .filter_by(org_id=org_id)\
            .first()

        if not r:
            raise RequestError(
                'Recipe id "{recipe_id}" does not exist.'.format(**o))

        # reformat the source id.
        o['source_id'] = "{}:{}"\
            .format(str(r.slug), str(o['source_id']))

        # set this event as non-manual
        o['provenance'] = 'recipe'

    return o
Example 11
    def run(self, data, **kw):

        # store the data + kwargs in redis temporarily.
        # this makes the enqueuing process much, much more
        # efficient by allowing us to pass only a single key
        # into the queue rather than a massive dump of data.
        # however, it also means that all kwargs must be
        # serializable (via pickle here).
        job_id = gen_uuid()
        kwargs_key = self.kwargs_key.format(job_id)
        kwargs = {'data': data, 'kw': kw}
        self.redis.set(kwargs_key, obj_to_pickle(kwargs), ex=self.kwargs_ttl)

        # send the job to the task queue
        self.q.enqueue(
            self.load_all, kwargs_key,
            job_id=job_id, timeout=self.timeout,
            result_ttl=self.result_ttl)

        return job_id
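
The worker side of this pattern isn't shown in these examples; presumably the queued callable re-reads the stashed payload from redis by key and unpacks it. A minimal sketch under that assumption (the redis-py client and pickle as the inverse of obj_to_pickle are both assumptions):

import pickle
import redis

rds = redis.Redis()

def load_all(kwargs_key):
    # hypothetical worker-side counterpart to run(): pull the stashed
    # payload back out of redis and unpack it into data + kwargs.
    raw = rds.get(kwargs_key)
    if raw is None:
        raise ValueError('kwargs under {} expired or were never set'
                         .format(kwargs_key))
    payload = pickle.loads(raw)
    data, kw = payload['data'], payload['kw']
    # ... hand data and kw off to the real loading logic here.
    return data, kw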
Example 12
    def run(self, data, **kw):

        # store the data + kwargs in redis temporarily.
        # this makes the enqueuing process much, much more
        # efficient by allowing us to pass only a single key
        # into the queue rather than a massive dump of data.
        # however, it also means that all kwargs must be
        # serializable (via pickle here).
        job_id = gen_uuid()
        kwargs_key = self.kwargs_key.format(job_id)
        kwargs = {'data': data, 'kw': kw}
        self.redis.set(kwargs_key, obj_to_pickle(kwargs), ex=self.kwargs_ttl)

        # send the job to the task queue
        self.q.enqueue(self.load_all,
                       kwargs_key,
                       job_id=job_id,
                       timeout=self.timeout,
                       result_ttl=self.result_ttl)

        return job_id
Example 13
    def cook_recipe(self):
        """
        Full pipeline.
        """
        # generate a job id
        job_id = gen_uuid()

        # import the sous chef here to get the timeout
        # and raise import errors before it attempts to run
        # in the queue
        _sc = sc_exec.from_import_path(self.sous_chef_path)

        # send it to the queue
        if not self.passthrough:

            # stash kwargs
            kw_key = self.stash_kw(job_id)

            # indicate that the recipe has been queued.
            self.recipe.status = "queued"
            db.session.add(self.recipe)
            db.session.commit()

            self.q.enqueue(run,
                           self.sous_chef_path,
                           self.recipe.id,
                           kw_key,
                           job_id=job_id,
                           timeout=_sc.timeout,
                           result_ttl=self.kw_ttl)

            # return the job id
            return job_id

        # directly stream the results out.
        return run(self.sous_chef_path,
                   self.recipe.id,
                   kw_key=None,
                   **self.sous_chef_kwargs)
Example 14
    def cook_recipe(self):
        """
        Full pipeline.
        """
        # generate a job id
        job_id = gen_uuid()

        # import the sous chef here to get the timeout
        # and raise import errors before it attempts to run
        # in the queue
        sc = import_sous_chef(self.sous_chef_path)

        # stash kwargs
        kw_key = self.stash_kw(job_id)

        # send it to the queue
        if not self.passthrough:

            # indicate that the recipe has been queued.
            self.recipe.status = "queued"
            db.session.add(self.recipe)
            db.session.commit()

            self.q.enqueue(
                run_sous_chef,
                self.sous_chef_path,
                self.recipe.id,
                kw_key,
                job_id=job_id,
                timeout=sc.timeout,
                result_ttl=self.kw_ttl,
            )

            # return the job id
            return job_id

        # directly stream the results out.
        return run_sous_chef(self.sous_chef_path, self.recipe.id, kw_key)
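
These last two versions route the passthrough case slightly differently, but from the caller's perspective the return value is either an rq job id to poll or the sous chef's output itself. A hedged sketch of how a caller might branch on that (the construction of the cooker object isn't shown in these examples):

def handle_cook(cooker):
    # hypothetical caller; `cooker` is whatever object exposes
    # cook_recipe() and the passthrough flag used above.
    result = cooker.cook_recipe()
    if cooker.passthrough:
        # passthrough: result is the sous chef's output, streamed directly.
        return list(result)
    # queued: result is just the rq job id to poll against.
    return {'job_id': result}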