def ingest(obj, org_id, url_fields=None, requires=None, extract=True,
           kill_session=True):
    """
    Ingest a Content Item.

    :param obj: dict of content-item fields; mutated in place.
    :param org_id: id of the organization to associate the item with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param url_fields: fields to scan for urls (default ``['body']``;
        currently unused in this function).
    :param requires: keys that must be present in ``obj``
        (default ``['url', 'type']``).
    :param extract: when True, pull extraction results from
        ``extract_cache``; when False, sanitize the raw fields instead.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``ContentItem``.
    :raises RequestError: if extraction failed for this url/type.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['body']
    if requires is None:
        requires = ['url', 'type']

    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:
            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'.format(**obj))
        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)
    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        # CONSISTENCY FIX: parse the creation date (as the Event ingest
        # in this module does) rather than treating it as a raw string.
        obj['created'] = ingest_util.prepare_date(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + authors
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too

    # determine content-item provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:
        c = ContentItem(org_id=org_id, **obj)
    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors, avoiding duplicates.
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
def ingest(
        obj,
        org_id,
        org_domains,
        url_fields=None,
        requires=None,
        must_link=False,
        kill_session=True):
    """
    Ingest an Event.

    :param obj: dict of event fields; mutated in place.
    :param org_id: id of the organization to associate the event with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param org_domains: domains used to normalize/filter extracted links.
    :param url_fields: fields to scan for urls
        (default ``['title', 'body', 'description']``).
    :param requires: keys that must be present in ``obj``
        (default ``['title']``).
    :param must_link: when True, only persist events that link to
        content items; otherwise return None without committing.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``Event``, or None when
        ``must_link`` is set and no content items were associated.
    :raises RequestError: on an explicit "deleted" status.
    :raises UnprocessableEntityError: if the event exists and was
        previously deleted.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['title', 'body', 'description']
    if requires is None:
        requires = ['title']

    # distinct session for this eventlet.
    session = gen_session()
    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status; a new event may never be born "deleted".
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + content_item_ids + links
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:
        e = Event(org_id=org_id, **obj)
    # else, update it
    else:
        # if it's deleted, issue a message.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))
        for k, v in obj.items():
            setattr(e, k, v)

    # extract urls and normalize urls asynchronously.
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # dont commit event if we're only looking
    # for events that link to content_items
    if not has_content_items and must_link:
        # BUG FIX: this early exit previously leaked the session;
        # honor kill_session here too.
        if kill_session:
            session.close()
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e
def ingest(obj, org_id, url_fields=None, requires=None, extract=True,
           kill_session=True):
    """
    Ingest a Content Item.

    :param obj: dict of content-item fields; mutated in place.
    :param org_id: id of the organization to associate the item with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param url_fields: fields to scan for urls (default ``['body']``;
        currently unused in this function).
    :param requires: keys that must be present in ``obj``
        (default ``['url', 'type']``).
    :param extract: when True, pull extraction results from
        ``extract_cache``; when False, sanitize the raw fields instead.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``ContentItem``.
    :raises RequestError: if extraction failed for this url/type.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['body']
    if requires is None:
        requires = ['url', 'type']

    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:
            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'
                .format(**obj))
        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)
    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        # CONSISTENCY FIX: parse the creation date (as the Event ingest
        # in this module does) rather than treating it as a raw string.
        obj['created'] = ingest_util.prepare_date(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + authors
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too

    # determine content-item provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:
        c = ContentItem(org_id=org_id, **obj)
    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors, avoiding duplicates.
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
def ingest(
        obj,
        org_id,
        org_domains,
        url_fields=None,
        requires=None,
        must_link=False,
        kill_session=True):
    """
    Ingest an Event.

    :param obj: dict of event fields; mutated in place.
    :param org_id: id of the organization to associate the event with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param org_domains: domains used to normalize/filter extracted links.
    :param url_fields: fields to scan for urls
        (default ``['title', 'body', 'description']``).
    :param requires: keys that must be present in ``obj``
        (default ``['title']``).
    :param must_link: when True, only persist events that link to
        content items; otherwise return None without committing.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``Event``, or None when
        ``must_link`` is set and no content items were associated.
    :raises RequestError: on an explicit "deleted" status.
    :raises UnprocessableEntityError: if the event exists and was
        previously deleted.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['title', 'body', 'description']
    if requires is None:
        requires = ['title']

    # distinct session for this eventlet.
    session = gen_session()
    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status; a new event may never be born "deleted".
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + content_item_ids + links
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:
        e = Event(org_id=org_id, **obj)
    # else, update it
    else:
        # if it's deleted, issue a message.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))
        for k, v in obj.items():
            setattr(e, k, v)

    # extract urls and normalize urls asynchronously.
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # dont commit event if we're only looking
    # for events that link to content_items
    if not has_content_items and must_link:
        # BUG FIX: this early exit previously leaked the session;
        # honor kill_session here too.
        if kill_session:
            session.close()
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e