Example #1
    def load_all(self, kwargs_key):
        """
        Do the work.
        """
        start = time.time()
        try:
            # create a session specific to this task
            session = gen_session()

            # get the inputs from redis
            kwargs = self.redis.get(kwargs_key)
            if not kwargs:
                raise InternalServerError(
                    'An unexpected error occurred while processing bulk upload.'
                )

            kwargs = pickle_to_obj(kwargs)
            data = kwargs.get('data')
            kw = kwargs.get('kw')

            # delete them
            self.redis.delete(kwargs_key)

            outputs = []
            errors = []

            fx = partial(self._load_one, **kw)

            if self.concurrent:
                pool = Pool(min([len(data), self.max_workers]))
                for res in pool.imap_unordered(fx, data):
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)
            else:
                for item in data:
                    res = fx(item)
                    if isinstance(res, Exception):
                        errors.append(res)
                    else:
                        outputs.append(res)

            # surface any accumulated errors
            if errors:
                self._handle_errors(errors)

            # add objects and execute
            if self.returns == 'model':
                for o in outputs:
                    if o is not None:
                        try:
                            session.add(o)
                            session.commit()
                        except Exception as e:
                            self._handle_errors(e)

            # execute each query
            elif self.returns == 'query':
                for query in outputs:
                    if query is not None:
                        try:
                            session.execute(query)
                        except Exception as e:
                            self._handle_errors(e)

            try:
                session.commit()

            except Exception as e:
                session.rollback()
                session.close()
                self._handle_errors(e)

            # return true if everything worked.
            session.close()
            return True

        except JobTimeoutException:
            end = time.time()
            return InternalServerError(
                'Bulk loading timed out after {} seconds'.format(end - start))
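
load_all() assumes a producer has already staged its inputs in Redis as a pickled dict with 'data' and 'kw' keys. A minimal sketch of that producer side, using pickle.dumps as a stand-in for whatever serializer pairs with pickle_to_obj (the real helper is not shown in these snippets):

import pickle

def stage_bulk_load(redis_conn, kwargs_key, data, **kw):
    # serialize the task inputs under the key load_all() will read
    payload = pickle.dumps({'data': data, 'kw': kw})
    redis_conn.set(kwargs_key, payload)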
Example #2
    def set_session(self):
        if hasattr(self, 'session'):
            self.session.close()
        self.session = gen_session()
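
Every example here leans on gen_session() from newslynx.core to hand each task or greenlet its own database session. Its definition is not shown above; a minimal sketch of what such a factory typically looks like in SQLAlchemy (the engine URL and module layout are assumptions):

from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

# one engine per process; the URL is a placeholder
engine = create_engine('postgresql://localhost/newslynx')
Session = sessionmaker(bind=engine)

def gen_session():
    # each call returns a fresh session with its own transaction
    return Session()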
Example #3
def ingest(obj,
           org_id,
           url_fields=['body'],
           requires=['url', 'type'],
           extract=True,
           kill_session=True):
    """
    Ingest a Content Item.
    """

    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:

            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'.format(**obj))

        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)

    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        obj['created'] = ingest_util.prepare_str(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + authors + links
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too
    # links = obj.pop('links', {})

    # determine event provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:

        # create content item
        c = ContentItem(org_id=org_id, **obj)

    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # extract urls and normalize urls asynchronously.
    # urls = ingest_util.extract_urls(
    #     obj,
    #     url_fields,
    #     source=data.get('url'),
    #     links=_links)

    # detect content_items
    # if len(_links):
    #     c = _associate_content_items(c, org_id, _links)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
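
A hedged usage sketch for the content-item ingest() above, with field names drawn from the checks in the function body; the org id and field values are placeholders:

# hypothetical call; skips extraction and supplies the fields directly
item = ingest(
    {
        'url': 'http://example.com/story',
        'type': 'article',   # must pass validate_content_item_types
        'title': 'A Story',
        'body': '<p>...</p>',
    },
    org_id=1,
    extract=False,
)
print(item.id)  # the created or updated ContentItem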
Example #4
import unittest

from newslynx.exc import SousChefSchemaError
from newslynx.models import sous_chef_schema, SousChef
from newslynx.core import gen_session

db_session = gen_session()


class TestSousChefJSONSchema(unittest.TestCase):

    def test_good_schema(self):
        sc = {
            "name": "Twitter List",
            "slug": "twitter-list",
            "description": "Extracts events from a twitter list.",
            "runs": "newslynx.sc.events.twitter.List",
            "creates": "events",
            "options": {
                "owner_screen_name": {
                    "input_type": "text",
                    "value_types": ["string"],
                    "accepts_list": True,
                    "required": True,
                    "help": {
                        "placeholder": "cspan"
                    },
                },
                "min_followers": {
                    "input_type": "number",
                    "value_types": ["numeric"],
Example #5
    def set_session(self):
        self.session = gen_session()
Example #6
def ingest(
        obj,
        org_id,
        org_domains,
        url_fields=['title', 'body', 'description'],
        requires=['title'],
        must_link=False,
        kill_session=True):
    """
    Ingest an Event.
    """

    # distinct session for this eventlet.
    session = gen_session()

    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + content_item_ids + links
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:

        # create event
        e = Event(org_id=org_id, **obj)

    # else, update it
    else:
        # if it was previously deleted, refuse to recreate it.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))

        for k, v in obj.items():
            setattr(e, k, v)

    # normalize links and filter them against the org's domains
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # don't commit the event if we only want
    # events that link to content items
    if not has_content_items and must_link:
        if kill_session:
            session.close()
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e
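
A hedged usage sketch of the event ingest() above, illustrating the must_link behavior; the org id, domains, and field values are placeholders, and source_id is assumed to be filled in by _event_provenance:

# hypothetical call: only keep the event if one of its links
# resolves to a known content item for this org
event = ingest(
    {
        'title': 'Our story was cited',   # the only required field
        'url': 'http://other-site.com/post',
        'links': ['http://example.com/story'],
    },
    org_id=1,
    org_domains=['example.com'],
    must_link=True,
)
if event is None:
    print('skipped: no linked content items')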