def ingest(obj, org_id, url_fields=None, requires=None, extract=True,
           kill_session=True):
    """
    Ingest a Content Item.

    :param obj: dict of content-item fields; mutated in place.
    :param org_id: id of the organization to associate the item with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param url_fields: fields to scan for urls (default ``['body']``;
        currently unused in this function).
    :param requires: keys that must be present in ``obj``
        (default ``['url', 'type']``).
    :param extract: when True, pull extraction results from
        ``extract_cache``; when False, sanitize the raw fields instead.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``ContentItem``.
    :raises RequestError: if extraction failed for this url/type.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['body']
    if requires is None:
        requires = ['url', 'type']

    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:
            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'.format(**obj))
        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)
    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        # CONSISTENCY FIX: parse the creation date (as the Event ingest
        # in this module does) rather than treating it as a raw string.
        obj['created'] = ingest_util.prepare_date(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + authors
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too

    # determine content-item provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:
        c = ContentItem(org_id=org_id, **obj)
    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors, avoiding duplicates.
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
def ingest(
        obj,
        org_id,
        org_domains,
        url_fields=None,
        requires=None,
        must_link=False,
        kill_session=True):
    """
    Ingest an Event.

    :param obj: dict of event fields; mutated in place.
    :param org_id: id of the organization to associate the event with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param org_domains: domains used to normalize/filter extracted links.
    :param url_fields: fields to scan for urls
        (default ``['title', 'body', 'description']``).
    :param requires: keys that must be present in ``obj``
        (default ``['title']``).
    :param must_link: when True, only persist events that link to
        content items; otherwise return None without committing.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``Event``, or None when
        ``must_link`` is set and no content items were associated.
    :raises RequestError: on an explicit "deleted" status.
    :raises UnprocessableEntityError: if the event exists and was
        previously deleted.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['title', 'body', 'description']
    if requires is None:
        requires = ['title']

    # distinct session for this eventlet.
    session = gen_session()
    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status; a new event may never be born "deleted".
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + content_item_ids + links
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:
        e = Event(org_id=org_id, **obj)
    # else, update it
    else:
        # if it's deleted, issue a message.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))
        for k, v in obj.items():
            setattr(e, k, v)

    # extract urls and normalize urls asynchronously.
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # dont commit event if we're only looking
    # for events that link to content_items
    if not has_content_items and must_link:
        # BUG FIX: this early exit previously leaked the session;
        # honor kill_session here too.
        if kill_session:
            session.close()
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e
def ingest(obj, org_id, url_fields=None, requires=None, extract=True,
           kill_session=True):
    """
    Ingest a Content Item.

    :param obj: dict of content-item fields; mutated in place.
    :param org_id: id of the organization to associate the item with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param url_fields: fields to scan for urls (default ``['body']``;
        currently unused in this function).
    :param requires: keys that must be present in ``obj``
        (default ``['url', 'type']``).
    :param extract: when True, pull extraction results from
        ``extract_cache``; when False, sanitize the raw fields instead.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``ContentItem``.
    :raises RequestError: if extraction failed for this url/type.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['body']
    if requires is None:
        requires = ['url', 'type']

    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:
            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'
                .format(**obj))
        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)
    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        # CONSISTENCY FIX: parse the creation date (as the Event ingest
        # in this module does) rather than treating it as a raw string.
        obj['created'] = ingest_util.prepare_date(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + authors
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too

    # determine content-item provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:
        c = ContentItem(org_id=org_id, **obj)
    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors, avoiding duplicates.
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
def ingest(
        obj,
        org_id,
        org_domains,
        url_fields=None,
        requires=None,
        must_link=False,
        kill_session=True):
    """
    Ingest an Event.

    :param obj: dict of event fields; mutated in place.
    :param org_id: id of the organization to associate the event with
        (overridden by an ``org_id`` key inside ``obj`` if present).
    :param org_domains: domains used to normalize/filter extracted links.
    :param url_fields: fields to scan for urls
        (default ``['title', 'body', 'description']``).
    :param requires: keys that must be present in ``obj``
        (default ``['title']``).
    :param must_link: when True, only persist events that link to
        content items; otherwise return None without committing.
    :param kill_session: close the db session before returning.
    :return: the created or updated ``Event``, or None when
        ``must_link`` is set and no content items were associated.
    :raises RequestError: on an explicit "deleted" status.
    :raises UnprocessableEntityError: if the event exists and was
        previously deleted.
    """
    # BUG FIX: the defaults were mutable lists shared across calls;
    # use None sentinels instead.
    if url_fields is None:
        url_fields = ['title', 'body', 'description']
    if requires is None:
        requires = ['title']

    # distinct session for this eventlet.
    session = gen_session()
    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status; a new event may never be born "deleted".
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tag_ids + content_item_ids + links
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:
        e = Event(org_id=org_id, **obj)
    # else, update it
    else:
        # if it's deleted, issue a message.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))
        for k, v in obj.items():
            setattr(e, k, v)

    # extract urls and normalize urls asynchronously.
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # dont commit event if we're only looking
    # for events that link to content_items
    if not has_content_items and must_link:
        # BUG FIX: this early exit previously leaked the session;
        # honor kill_session here too.
        if kill_session:
            session.close()
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e