コード例 #1
0
 def valid_url(self, key, opt):
     """
     Validate a url field.

     :param key: name of the option being validated; only used in the
         error message.
     :param opt: the candidate value, passed to ``url.validate``.
     :returns: ``opt`` unchanged when ``url.validate(opt)`` is truthy.
     :raises RecipeSchemaError: when ``url.validate(opt)`` is falsy.
     """
     if url.validate(opt):
         return opt
     # Raise instead of returning the error: a returned exception instance
     # would be handed back as though it were the validated option value.
     raise RecipeSchemaError(
         "{} should be a 'url' field but was passed '{}'.".format(key, opt))
コード例 #2
0
 def valid_url(self, key, opt):
     """
     Validate a url field.

     :param key: name of the option being validated; only used in the
         error message.
     :param opt: the candidate value, passed to ``url.validate``.
     :returns: ``opt`` unchanged when ``url.validate(opt)`` is truthy.
     :raises RecipeSchemaError: when ``url.validate(opt)`` is falsy.
     """
     if url.validate(opt):
         return opt
     # Raise instead of returning the error: a returned exception instance
     # would be handed back as though it were the validated option value.
     raise RecipeSchemaError(
         "{} should be a 'url' field but was passed '{}'."
         .format(key, opt))
コード例 #3
0
ファイル: ingest.py プロジェクト: abelsonlive/newslynx-core
def _prepare(obj, requires=None, recipe=None, type='event', org_id=None, extract=True):
    """
    Prepare a content item or an event.

    ``obj`` is modified in place: incoming ``id`` / ``org_id`` keys are
    dropped, url / date / text fields are passed through the ``_prepare_*``
    helpers, ``org_id`` is set from the argument, and ``thumbnail`` and
    ``domain`` are filled in.

    :param obj: mapping describing the item; must support ``get``, ``pop``
        and item assignment.
    :param requires: required field names, forwarded to ``_check_requires``.
        Defaults to an empty list.
    :param recipe: forwarded to ``_provenance``.
    :param type: ``'event'`` or ``'content_item'``; selects which validation
        and url handling is applied.
    :param org_id: value stored under ``obj['org_id']``.
    :param extract: when truthy, and ``type`` is ``'content_item'`` and the
        object has a url, data from ``extract_cache`` is merged into ``obj``.
    :returns: the object handed back by ``_provenance``, with the thumbnail
        and domain fields set on it.
    :raises RequestError: for an event with an unknown or ``'deleted'``
        status, or a content item with an unknown (or missing) type.
    """
    # avoid a shared mutable default; an omitted ``requires`` still reaches
    # ``_check_requires`` as an empty list.
    if requires is None:
        requires = []

    # check required fields
    _check_requires(obj, requires, type=type)

    # validate status
    if type == 'event' and 'status' in obj:
        if obj['status'] not in EVENT_STATUSES:
            raise RequestError(
                'Invalid event status: {status}'.format(status=obj['status']))
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status of "deleted."')

    # validate type
    if type == 'content_item':
        item_type = obj.get('type', None)
        if item_type not in CONTENT_ITEM_TYPES:
            # format the looked-up value explicitly: ``format(**obj)`` raised
            # KeyError rather than RequestError when 'type' was missing.
            raise RequestError(
                'Invalid content item type: {type}'.format(type=item_type))

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)
    obj.pop('org_id', None)

    # normalize the url; only content items are canonicalized.
    if type == 'event':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=False)

    elif type == 'content_item':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=True)

    # sanitize creation date, dropping the key entirely when nothing usable
    # comes back.
    obj['created'] = _prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = _prepare_str(obj, 'title', obj['url'])
    obj['description'] = _prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = _prepare_str(obj, 'body', obj['url'])

    # set org id
    obj['org_id'] = org_id

    # check img url
    if not url.validate(obj.get('img_url', None)):
        obj['img_url'] = None

    # determine provenance.
    obj = _provenance(obj, recipe, type)

    # if type is content items and we're extracting. do it.
    if type == 'content_item' and extract and obj.get('url', None):
        cr = extract_cache.get(obj.get('url'), type=obj.get('type', None))

        if not cr.value:
            extract_cache.invalidate(
                obj.get('url'), type=obj.get('type', None))
            # nothing was extracted: still derive the thumbnail from the
            # supplied image, exactly as the non-extracting path does.
            obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

        # merge extracted data with object.
        else:
            for k, v in cr.value.items():
                # fill in anything the object is missing.
                if not obj.get(k, None):
                    obj[k] = v
                # preference extracted data
                if k in ('description', 'body'):
                    obj[k] = v
                # merge extracted authors; ``obj[k]`` is always present
                # here because of the fill-in step above.
                elif k == 'authors':
                    for vv in v or []:
                        if vv not in obj[k]:
                            obj[k].append(vv)

            # swap bad images: fall back to the extracted image when the
            # object's own image yields no thumbnail.
            tn = _prepare_thumbnail(obj, 'img_url')
            if not tn:
                img = cr.value.get('img_url', None)
                if img:
                    obj['img_url'] = img
                    tn = _prepare_thumbnail(obj, 'img_url')
            # always record the outcome so every path sets ``thumbnail``.
            obj['thumbnail'] = tn
    else:
        obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

    # set domain
    obj['domain'] = url.get_domain(obj['url'])

    # return prepped object
    return obj
コード例 #4
0
ファイル: ingest.py プロジェクト: newslynx/newslynx-core
def _prepare(obj,
             requires=None,
             recipe=None,
             type='event',
             org_id=None,
             extract=True):
    """
    Prepare a content item or an event.

    ``obj`` is modified in place: incoming ``id`` / ``org_id`` keys are
    dropped, url / date / text fields are passed through the ``_prepare_*``
    helpers, ``org_id`` is set from the argument, and ``thumbnail`` and
    ``domain`` are filled in.

    :param obj: mapping describing the item; must support ``get``, ``pop``
        and item assignment.
    :param requires: required field names, forwarded to ``_check_requires``.
        Defaults to an empty list.
    :param recipe: forwarded to ``_provenance``.
    :param type: ``'event'`` or ``'content_item'``; selects which validation
        and url handling is applied.
    :param org_id: value stored under ``obj['org_id']``.
    :param extract: when truthy, and ``type`` is ``'content_item'`` and the
        object has a url, data from ``extract_cache`` is merged into ``obj``.
    :returns: the object handed back by ``_provenance``, with the thumbnail
        and domain fields set on it.
    :raises RequestError: for an event with an unknown or ``'deleted'``
        status, or a content item with an unknown (or missing) type.
    """
    # avoid a shared mutable default; an omitted ``requires`` still reaches
    # ``_check_requires`` as an empty list.
    if requires is None:
        requires = []

    # check required fields
    _check_requires(obj, requires, type=type)

    # validate status
    if type == 'event' and 'status' in obj:
        if obj['status'] not in EVENT_STATUSES:
            raise RequestError(
                'Invalid event status: {status}'.format(status=obj['status']))
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status of "deleted."')

    # validate type
    if type == 'content_item':
        item_type = obj.get('type', None)
        if item_type not in CONTENT_ITEM_TYPES:
            # format the looked-up value explicitly: ``format(**obj)`` raised
            # KeyError rather than RequestError when 'type' was missing.
            raise RequestError(
                'Invalid content item type: {type}'.format(type=item_type))

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)
    obj.pop('org_id', None)

    # normalize the url; only content items are canonicalized.
    if type == 'event':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=False)

    elif type == 'content_item':
        obj['url'] = _prepare_url(obj, 'url', expand=True, canonicalize=True)

    # sanitize creation date, dropping the key entirely when nothing usable
    # comes back.
    obj['created'] = _prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = _prepare_str(obj, 'title', obj['url'])
    obj['description'] = _prepare_str(obj, 'description', obj['url'])
    obj['body'] = _prepare_str(obj, 'body', obj['url'])

    # set org id
    obj['org_id'] = org_id

    # check img url
    if not url.validate(obj.get('img_url', None)):
        obj['img_url'] = None

    # determine provenance.
    obj = _provenance(obj, recipe, type)

    # if type is content items and we're extracting. do it.
    if type == 'content_item' and extract and obj.get('url', None):
        cr = extract_cache.get(obj.get('url'), type=obj.get('type', None))

        if not cr.value:
            extract_cache.invalidate(obj.get('url'),
                                     type=obj.get('type', None))
            # nothing was extracted: still derive the thumbnail from the
            # supplied image, exactly as the non-extracting path does.
            obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

        # merge extracted data with object.
        else:
            for k, v in cr.value.items():
                # fill in anything the object is missing.
                if not obj.get(k, None):
                    obj[k] = v
                # preference extracted data
                if k in ('description', 'body'):
                    obj[k] = v
                # merge extracted authors; ``obj[k]`` is always present
                # here because of the fill-in step above.
                elif k == 'authors':
                    for vv in v or []:
                        if vv not in obj[k]:
                            obj[k].append(vv)

            # swap bad images: fall back to the extracted image when the
            # object's own image yields no thumbnail.
            tn = _prepare_thumbnail(obj, 'img_url')
            if not tn:
                img = cr.value.get('img_url', None)
                if img:
                    obj['img_url'] = img
                    tn = _prepare_thumbnail(obj, 'img_url')
            # always record the outcome so every path sets ``thumbnail``.
            obj['thumbnail'] = tn
    else:
        obj['thumbnail'] = _prepare_thumbnail(obj, 'img_url')

    # set domain
    obj['domain'] = url.get_domain(obj['url'])

    # return prepped object
    return obj