def create(self, docs, **kwargs):
    new_guids = []
    provider = self.get_provider()
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        try:
            archived_doc = self.fetch(doc['guid'])
        except FileNotFoundError as ex:
            raise ProviderError.externalProviderError(ex, provider)
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = archived_doc['_id']
        dest_doc[FAMILY_ID] = archived_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc.get('_id'))
    return new_guids
def create(self, docs, **kwargs):
    new_guids = []
    provider = get_resource_service('ingest_providers').find_one(source='aapmm', req=None)
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
        dest_doc[INGEST_ID] = archived_doc['_id']
        dest_doc[FAMILY_ID] = archived_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc.get('_id'))
    return new_guids
def create(self, docs, **kwargs): new_guids = [] provider = get_resource_service("ingest_providers").find_one(source="aapmm", req=None) if provider and "config" in provider and "username" in provider["config"]: self.backend.set_credentials(provider["config"]["username"], provider["config"]["password"]) for doc in docs: if not doc.get("desk"): # if no desk is selected then it is bad request raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.") try: archived_doc = self.backend.find_one_raw(doc["guid"], doc["guid"]) except FileNotFoundError as ex: raise ProviderError.externalProviderError(ex, provider) dest_doc = dict(archived_doc) new_id = generate_guid(type=GUID_TAG) new_guids.append(new_id) dest_doc["_id"] = new_id generate_unique_id_and_name(dest_doc) if provider: dest_doc["ingest_provider"] = str(provider[superdesk.config.ID_FIELD]) dest_doc[config.VERSION] = 1 send_to(doc=dest_doc, update=None, desk_id=doc.get("desk"), stage_id=doc.get("stage")) dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED) dest_doc[INGEST_ID] = archived_doc["_id"] dest_doc[FAMILY_ID] = archived_doc["_id"] remove_unwanted(dest_doc) set_original_creator(dest_doc) superdesk.get_resource_service(ARCHIVE).post([dest_doc]) insert_into_versions(dest_doc.get("_id")) return new_guids
def create(self, docs, **kwargs):
    new_guids = []
    provider = get_resource_service('ingest_providers').find_one(source='aapmm', req=None)
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(dest_doc, doc.get('desk'), doc.get('stage'))
        dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
        dest_doc[INGEST_ID] = archived_doc['_id']
        dest_doc[FAMILY_ID] = archived_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc.get('_id'))
    return new_guids
def fetch(self, docs, id=None, **kwargs):
    id_of_fetched_items = []
    for doc in docs:
        id_of_item_to_be_fetched = doc.get('_id') if id is None else id
        desk_id = doc.get('desk')
        stage_id = doc.get('stage')
        ingest_service = get_resource_service('ingest')
        ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched)
        if not ingest_doc:
            raise SuperdeskApiError.notFoundError(
                'Failed to find ingest item with _id: %s' % id_of_item_to_be_fetched)
        if not is_workflow_state_transition_valid('fetch_from_ingest', ingest_doc[config.CONTENT_STATE]):
            raise InvalidStateTransitionError()
        if doc.get('macro'):  # there is a macro so transform it
            ingest_doc = get_resource_service('macros').execute_macro(ingest_doc, doc.get('macro'))
        archived = utcnow()
        ingest_service.patch(id_of_item_to_be_fetched, {'archived': archived})
        dest_doc = dict(ingest_doc)
        new_id = generate_guid(type=GUID_TAG)
        id_of_fetched_items.append(new_id)
        dest_doc['_id'] = new_id
        dest_doc['guid'] = new_id
        dest_doc['destination_groups'] = doc.get('destination_groups')
        generate_unique_id_and_name(dest_doc)
        dest_doc[config.VERSION] = 1
        send_to(dest_doc, desk_id, stage_id)
        dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
        dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        self.__fetch_items_in_package(dest_doc, desk_id, stage_id,
                                      doc.get('state', STATE_FETCHED),
                                      doc.get('destination_groups'))
        get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(doc=dest_doc)
        build_custom_hateoas(custom_hateoas, dest_doc)
        doc.update(dest_doc)
    if kwargs.get('notify', True):
        push_notification('item:fetch', fetched=1)
    return id_of_fetched_items
def create(self, docs, **kwargs):
    search_provider = get_resource_service('search_providers').find_one(
        search_provider=PROVIDER_NAME, req=None)
    if not search_provider or search_provider.get('is_closed', False):
        raise SuperdeskApiError.badRequestError(
            'No search provider found or the search provider is closed.')
    if 'config' in search_provider:
        self.backend.set_credentials(search_provider['config'])
    new_guids = []
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        try:
            archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
        except FileNotFoundError as ex:
            raise ProviderError.externalProviderError(ex, search_provider)
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc[config.ID_FIELD] = new_id
        generate_unique_id_and_name(dest_doc)
        if search_provider:
            dest_doc['ingest_provider'] = str(search_provider[config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = archived_doc[config.ID_FIELD]
        dest_doc[FAMILY_ID] = archived_doc[config.ID_FIELD]
        dest_doc[ITEM_OPERATION] = ITEM_FETCH
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc[config.ID_FIELD])
    get_resource_service('search_providers').system_update(
        search_provider[config.ID_FIELD], {'last_item_update': utcnow()}, search_provider)
    return new_guids
def fetch(self, docs, id=None, **kwargs):
    id_of_fetched_items = []
    for doc in docs:
        id_of_item_to_be_fetched = doc.get('_id') if id is None else id
        desk_id = doc.get('desk')
        stage_id = doc.get('stage')
        ingest_service = get_resource_service('ingest')
        ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched)
        if not ingest_doc:
            raise SuperdeskApiError.notFoundError(
                'Failed to find ingest item with _id: %s' % id_of_item_to_be_fetched)
        if not is_workflow_state_transition_valid('fetch_from_ingest', ingest_doc[config.CONTENT_STATE]):
            raise InvalidStateTransitionError()
        if doc.get('macro'):  # there is a macro so transform it
            ingest_doc = get_resource_service('macros').execute_macro(ingest_doc, doc.get('macro'))
        archived = utcnow()
        ingest_service.patch(id_of_item_to_be_fetched, {'archived': archived})
        dest_doc = dict(ingest_doc)
        new_id = generate_guid(type=GUID_TAG)
        id_of_fetched_items.append(new_id)
        dest_doc['_id'] = new_id
        dest_doc['guid'] = new_id
        dest_doc['destination_groups'] = doc.get('destination_groups')
        generate_unique_id_and_name(dest_doc)
        dest_doc[config.VERSION] = 1
        send_to(dest_doc, desk_id, stage_id)
        dest_doc[config.CONTENT_STATE] = doc.get('state', STATE_FETCHED)
        dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        self.__fetch_items_in_package(dest_doc, desk_id, stage_id,
                                      doc.get('state', STATE_FETCHED),
                                      doc.get('destination_groups'))
        get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(doc=dest_doc)
        build_custom_hateoas(custom_hateoas, dest_doc)
        doc.update(dest_doc)
    if kwargs.get('notify', True):
        push_notification('item:fetch', fetched=1)
    return id_of_fetched_items
def fetch(self, docs, id=None, **kwargs): id_of_fetched_items = [] for doc in docs: id_of_item_to_be_fetched = doc.get("_id") if id is None else id desk_id = doc.get("desk") stage_id = doc.get("stage") ingest_service = get_resource_service("ingest") ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched) if not ingest_doc: raise SuperdeskApiError.notFoundError( "Fail to found ingest item with _id: %s" % id_of_item_to_be_fetched ) if not is_workflow_state_transition_valid("fetch_from_ingest", ingest_doc[ITEM_STATE]): raise InvalidStateTransitionError() if doc.get("macro"): # there is a macro so transform it ingest_doc = get_resource_service("macros").execute_macro(ingest_doc, doc.get("macro")) archived = utcnow() ingest_service.patch(id_of_item_to_be_fetched, {"archived": archived}) dest_doc = dict(ingest_doc) new_id = generate_guid(type=GUID_TAG) id_of_fetched_items.append(new_id) dest_doc["_id"] = new_id dest_doc["guid"] = new_id generate_unique_id_and_name(dest_doc) dest_doc[config.VERSION] = 1 send_to(doc=dest_doc, desk_id=desk_id, stage_id=stage_id) dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED) dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc["_id"] dest_doc[ITEM_OPERATION] = ITEM_FETCH remove_unwanted(dest_doc) set_original_creator(dest_doc) self.__fetch_items_in_package(dest_doc, desk_id, stage_id, doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)) get_resource_service(ARCHIVE).post([dest_doc]) insert_into_versions(doc=dest_doc) build_custom_hateoas(custom_hateoas, dest_doc) doc.update(dest_doc) if kwargs.get("notify", True): push_notification("item:fetch", fetched=1) return id_of_fetched_items
def ingest_items_for(self, desk, no_of_stories, skip_index):
    desk_id = desk['_id']
    stage_id = desk['incoming_stage']
    bucket_size = min(100, no_of_stories)
    no_of_buckets = len(range(0, no_of_stories, bucket_size))
    for x in range(0, no_of_buckets):
        skip = x * bucket_size * skip_index
        logger.info('Page : {}, skip: {}'.format(x + 1, skip))
        cursor = get_resource_service('published').get_from_mongo(None, {})
        cursor.skip(skip)
        cursor.limit(bucket_size)
        items = list(cursor)
        logger.info('Inserting {} items'.format(len(items)))
        archive_items = []
        for item in items:
            dest_doc = dict(item)
            new_id = generate_guid(type=GUID_TAG)
            dest_doc[app.config['ID_FIELD']] = new_id
            dest_doc['guid'] = new_id
            generate_unique_id_and_name(dest_doc)
            dest_doc[app.config['VERSION']] = 1
            dest_doc[ITEM_STATE] = CONTENT_STATE.FETCHED
            user_id = desk.get('members', [{'user': None}])[0].get('user')
            dest_doc['original_creator'] = user_id
            dest_doc['version_creator'] = user_id
            from apps.tasks import send_to
            send_to(dest_doc, desk_id=desk_id, stage_id=stage_id, user_id=user_id)
            dest_doc[app.config['VERSION']] = 1  # Above step increments the version and needs to reset
            dest_doc[FAMILY_ID] = item['_id']
            remove_unwanted(dest_doc)
            archive_items.append(dest_doc)
        get_resource_service(ARCHIVE).post(archive_items)
        for item in archive_items:
            insert_into_versions(id_=item[app.config['ID_FIELD']])
def create(self, docs, **kwargs):
    search_provider = get_resource_service('search_providers').find_one(search_provider=PROVIDER_NAME, req=None)
    if not search_provider or search_provider.get('is_closed', False):
        raise SuperdeskApiError.badRequestError('No search provider found or the search provider is closed.')
    if 'config' in search_provider:
        self.backend.set_credentials(search_provider['config'])
    new_guids = []
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        try:
            archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
        except FileNotFoundError as ex:
            raise ProviderError.externalProviderError(ex, search_provider)
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc[config.ID_FIELD] = new_id
        generate_unique_id_and_name(dest_doc)
        if search_provider:
            dest_doc['ingest_provider'] = str(search_provider[config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = archived_doc[config.ID_FIELD]
        dest_doc[FAMILY_ID] = archived_doc[config.ID_FIELD]
        dest_doc[ITEM_OPERATION] = ITEM_FETCH
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc[config.ID_FIELD])
    get_resource_service('search_providers').system_update(search_provider[config.ID_FIELD],
                                                           {'last_item_update': utcnow()}, search_provider)
    return new_guids
def create(self, docs, **kwargs):
    new_guids = []
    provider = get_resource_service('ingest_providers').find_one(source='aapmm', req=None)
    if provider and 'config' in provider and 'username' in provider['config']:
        self.backend.set_credentials(provider['config']['username'], provider['config']['password'])
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError("Destination desk cannot be empty.")
        try:
            archived_doc = self.backend.find_one_raw(doc['guid'], doc['guid'])
        except FileNotFoundError as ex:
            raise ProviderError.externalProviderError(ex, provider)
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = archived_doc['_id']
        dest_doc[FAMILY_ID] = archived_doc['_id']
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc.get('_id'))
    return new_guids
def create(self, docs, **kwargs):
    new_guids = []
    provider = self.get_provider()
    for doc in docs:
        if not doc.get('desk'):  # if no desk is selected then it is bad request
            raise SuperdeskApiError.badRequestError(_("Destination desk cannot be empty."))
        try:
            archived_doc = self.fetch(doc['guid'])
        except FileNotFoundError as ex:
            raise ProviderError.externalProviderError(ex, provider)
        dest_doc = dict(archived_doc)
        new_id = generate_guid(type=GUID_TAG)
        new_guids.append(new_id)
        dest_doc['_id'] = new_id
        generate_unique_id_and_name(dest_doc)
        if provider:
            dest_doc['ingest_provider'] = str(provider[superdesk.config.ID_FIELD])
        dest_doc[config.VERSION] = 1
        send_to(doc=dest_doc, update=None, desk_id=doc.get('desk'), stage_id=doc.get('stage'))
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = archived_doc['_id']
        dest_doc[FAMILY_ID] = archived_doc['_id']
        dest_doc[ITEM_OPERATION] = ITEM_FETCH
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        superdesk.get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(dest_doc.get('_id'))
    if new_guids:
        get_resource_service('search_providers').system_update(
            provider.get(config.ID_FIELD), {'last_item_update': utcnow()}, provider)
    return new_guids
def ingest_items_for(self, desk, no_of_stories, skip_index):
    desk_id = desk['_id']
    stage_id = desk['incoming_stage']
    bucket_size = min(100, no_of_stories)
    no_of_buckets = len(range(0, no_of_stories, bucket_size))
    for x in range(0, no_of_buckets):
        skip = x * bucket_size * skip_index
        logger.info('Page : {}, skip: {}'.format(x + 1, skip))
        cursor = get_resource_service('published').get_from_mongo(None, {})
        cursor.skip(skip)
        cursor.limit(bucket_size)
        items = list(cursor)
        logger.info('Inserting {} items'.format(len(items)))
        archive_items = []
        for item in items:
            dest_doc = dict(item)
            new_id = generate_guid(type=GUID_TAG)
            dest_doc[app.config['ID_FIELD']] = new_id
            dest_doc['guid'] = new_id
            generate_unique_id_and_name(dest_doc)
            dest_doc[app.config['VERSION']] = 1
            dest_doc[ITEM_STATE] = CONTENT_STATE.FETCHED
            user_id = desk.get('members', [{'user': None}])[0].get('user')
            dest_doc['original_creator'] = user_id
            dest_doc['version_creator'] = user_id
            from apps.tasks import send_to
            send_to(dest_doc, desk_id=desk_id, stage_id=stage_id, user_id=user_id)
            dest_doc[app.config['VERSION']] = 1  # Above step increments the version and needs to reset
            dest_doc[FAMILY_ID] = item['_id']
            remove_unwanted(dest_doc)
            archive_items.append(dest_doc)
        get_resource_service(ARCHIVE).post(archive_items)
        for item in archive_items:
            insert_into_versions(id_=item[app.config['ID_FIELD']])
def ingest_items_for(self, desk, no_of_stories, skip_index):
    desk_id = desk['_id']
    stage_id = desk['incoming_stage']
    bucket_size = min(100, no_of_stories)
    no_of_buckets = len(range(0, no_of_stories, bucket_size))
    for x in range(0, no_of_buckets):
        skip = x * bucket_size * skip_index
        self.logger.info('Page : {}, skip: {}'.format(x + 1, skip))
        cursor = get_resource_service('text_archive').get_from_mongo(None, {})
        cursor.skip(skip)
        cursor.limit(bucket_size)
        items = list(cursor)
        self.logger.info('Inserting {} items'.format(len(items)))
        archive_items = []
        for item in items:
            dest_doc = dict(item)
            new_id = generate_guid(type=GUID_TAG)
            dest_doc['_id'] = new_id
            dest_doc['guid'] = new_id
            generate_unique_id_and_name(dest_doc)
            dest_doc[app.config['VERSION']] = 1
            dest_doc['state'] = 'fetched'
            user_id = desk.get('members', [{'user': None}])[0].get('user')
            dest_doc['original_creator'] = user_id
            dest_doc['version_creator'] = user_id
            send_to(dest_doc, desk_id=desk_id, stage_id=stage_id, user_id=user_id)
            dest_doc[FAMILY_ID] = item['_id']
            remove_unwanted(dest_doc)
            archive_items.append(dest_doc)
        get_resource_service(ARCHIVE).post(archive_items)
        for item in archive_items:
            insert_into_versions(id_=item['_id'])
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError as ex:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']
            generate_unique_id_and_name(item)
            item['ingest_id'] = id
            item['source'] = self._get_head_value(doc, 'Agency')
            # self._addkeywords('AsiaPulseCodes', doc, item)
            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline
            # item['service'] = self._get_head_value(doc, 'Service')
            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]
            # self._addkeywords('CompanyCodes', doc, item)
            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source
            item['headline'] = self._get_head_value(doc, 'Headline')
            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except:
                    pass
            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key
            # self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except:
                    pass
            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}
            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)
            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception as ex:
            print('Exception parsing DC document {}'.format(id))
    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Post batch to Superdesk took {:.2f}'.format(time.time() - s))
    except Exception as ex:
        if getattr(ex, 'code', None) == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those id's
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception as ex:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        print(doc.get('href'))
        id = doc.find('dcdossier').get('id')
        if int(id) < self._id:
            self._id = int(id)
        item = {}
        item['guid'] = doc.find('dcdossier').get('guid')
        # if the item has been modified in the archive then it is due to a kill
        # there is an argument that this item should not be imported at all
        if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
            item[ITEM_STATE] = CONTENT_STATE.KILLED
        else:
            item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
        value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
        item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
        item['versioncreated'] = item['firstcreated']
        generate_unique_id_and_name(item)
        item['ingest_id'] = id
        item['source'] = self._get_head_value(doc, 'Agency')
        self._addkeywords('AsiaPulseCodes', doc, item)
        byline = self._get_head_value(doc, 'Byline')
        if byline:
            item['byline'] = byline
        # item['service'] = self._get_head_value(doc, 'Service')
        category = self._get_head_value(doc, 'Category')
        if not category:
            publication_name = self._get_head_value(doc, 'PublicationName')
            if publication_name in pubnames:
                category = pubnames[publication_name]
        if category:
            anpacategory = {}
            anpacategory['qcode'] = category
            for anpa_category in self._anpa_categories['items']:
                if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                    anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                    break
            item['anpa_category'] = [anpacategory]
        self._addkeywords('CompanyCodes', doc, item)
        type = self._get_head_value(doc, 'Format')
        if type == 'x':
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        elif type == 't':
            item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
        else:
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['keyword'] = self._get_head_value(doc, 'Keyword')
        item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
        original_source = self._get_head_value(doc, 'Author')
        if original_source:
            item['original_source'] = original_source
        item['headline'] = self._get_head_value(doc, 'Headline')
        code = self._get_head_value(doc, 'SubjectRefNum')
        if code and len(code) == 7:
            code = '0' + code
        if code and code in subject_codes:
            item['subject'] = []
            item['subject'].append({'qcode': code, 'name': subject_codes[code]})
            try:
                process_iptc_codes(item, None)
            except:
                pass
        slug = self._get_head_value(doc, 'SLUG')
        if slug:
            item['slugline'] = slug
        else:
            item['slugline'] = self._get_head_value(doc, 'Keyword')
        # self._addkeywords('Takekey', doc, item)
        take_key = self._get_head_value(doc, 'Takekey')
        if take_key:
            item['anpa_take_key'] = take_key
        self._addkeywords('Topic', doc, item)
        self._addkeywords('Selectors', doc, item)
        el = doc.find('dcdossier/document/body/BodyText')
        if el is not None:
            story = el.text
            if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                story = story.replace('\n ', '<br><br>')
                story = story.replace('\n', '<br>')
                item['body_html'] = story
            else:
                item['body_html'] = story
            try:
                item['word_count'] = get_text_word_count(item['body_html'])
            except:
                pass
        item['pubstatus'] = 'usable'
        item['allow_post_publish_actions'] = False
        res = superdesk.get_resource_service('published')
        original = res.find_one(req=None, guid=item['guid'])
        if not original:
            item['_id'] = item['guid']
            res.post([item])
        else:
            res.patch(original['_id'], item)
        if self._limit:
            self._limit -= 1
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        print(doc.get('href'))
        id = doc.find('dcdossier').get('id')
        if int(id) < self._id:
            self._id = int(id)
        item = {}
        item['guid'] = doc.find('dcdossier').get('guid')
        # if the item has been modified in the archive then it is due to a kill
        # there is an argument that this item should not be imported at all
        if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
            item[ITEM_STATE] = CONTENT_STATE.KILLED
        else:
            item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
        value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
        item['firstcreated'] = utc.normalize(value) if value.tzinfo else value
        item['versioncreated'] = item['firstcreated']
        generate_unique_id_and_name(item)
        item['ingest_id'] = id
        item['source'] = self._get_head_value(doc, 'Agency')
        self._addkeywords('AsiaPulseCodes', doc, item)
        byline = self._get_head_value(doc, 'Byline')
        if byline:
            item['byline'] = byline
        # item['service'] = self._get_head_value(doc, 'Service')
        category = self._get_head_value(doc, 'Category')
        if not category:
            publication_name = self._get_head_value(doc, 'PublicationName')
            if publication_name in pubnames:
                category = pubnames[publication_name]
        if category:
            anpacategory = {}
            anpacategory['qcode'] = category
            for anpa_category in self._anpa_categories['items']:
                if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                    anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                    break
            item['anpa_category'] = [anpacategory]
        self._addkeywords('CompanyCodes', doc, item)
        type = self._get_head_value(doc, 'Format')
        if type == 'x':
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        elif type == 't':
            item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
        else:
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
        item['keyword'] = self._get_head_value(doc, 'Keyword')
        item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
        original_source = self._get_head_value(doc, 'Author')
        if original_source:
            item['original_source'] = original_source
        item['headline'] = self._get_head_value(doc, 'Headline')
        code = self._get_head_value(doc, 'SubjectRefNum')
        if code and len(code) == 7:
            code = '0' + code
        if code and code in subject_codes:
            item['subject'] = []
            item['subject'].append({'qcode': code, 'name': subject_codes[code]})
            try:
                process_iptc_codes(item, None)
            except:
                pass
        slug = self._get_head_value(doc, 'SLUG')
        if slug:
            item['slugline'] = slug
        else:
            item['slugline'] = self._get_head_value(doc, 'Keyword')
        # self._addkeywords('Takekey', doc, item)
        take_key = self._get_head_value(doc, 'Takekey')
        if take_key:
            item['anpa_take_key'] = take_key
        self._addkeywords('Topic', doc, item)
        self._addkeywords('Selectors', doc, item)
        el = doc.find('dcdossier/document/body/BodyText')
        if el is not None:
            story = el.text
            if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                story = story.replace('\n ', '<br><br>')
                story = story.replace('\n', '<br>')
                item['body_html'] = story
            else:
                item['body_html'] = story
            try:
                item['word_count'] = get_text_word_count(item['body_html'])
            except:
                pass
        item['pubstatus'] = 'usable'
        # this is required for the archived service additional lookup
        item['item_id'] = item['guid']
        item[config.VERSION] = 1
        res = superdesk.get_resource_service('archived')
        original = res.find_one(req=None, guid=item['guid'])
        if not original:
            item['_id'] = item['guid']
            res.post([item])
        else:
            res.patch(original['_id'], item)
        if self._limit:
            self._limit -= 1
def fetch(self, docs, id=None, **kwargs):
    id_of_fetched_items = []
    for doc in docs:
        id_of_item_to_be_fetched = doc.get(config.ID_FIELD) if id is None else id
        desk_id = doc.get('desk')
        stage_id = doc.get('stage')
        ingest_service = get_resource_service('ingest')
        ingest_doc = ingest_service.find_one(req=None, _id=id_of_item_to_be_fetched)
        if not ingest_doc:
            raise SuperdeskApiError.notFoundError(
                _('Failed to find ingest item with _id: {id}').format(id=id_of_item_to_be_fetched))
        if not is_workflow_state_transition_valid('fetch_from_ingest', ingest_doc[ITEM_STATE]):
            raise InvalidStateTransitionError()
        if doc.get('macro'):  # there is a macro so transform it
            ingest_doc = get_resource_service('macros').execute_macro(ingest_doc, doc.get('macro'))
        archived = utcnow()
        ingest_service.patch(id_of_item_to_be_fetched, {'archived': archived})
        dest_doc = dict(ingest_doc)
        if doc.get('target'):
            dest_doc.update(doc.get('target'))
        new_id = generate_guid(type=GUID_TAG)
        id_of_fetched_items.append(new_id)
        dest_doc[config.ID_FIELD] = new_id
        dest_doc[GUID_FIELD] = new_id
        generate_unique_id_and_name(dest_doc)
        dest_doc[config.VERSION] = 1
        dest_doc['versioncreated'] = archived
        send_to(doc=dest_doc, desk_id=desk_id, stage_id=stage_id)
        dest_doc[ITEM_STATE] = doc.get(ITEM_STATE, CONTENT_STATE.FETCHED)
        dest_doc[INGEST_ID] = dest_doc[FAMILY_ID] = ingest_doc[config.ID_FIELD]
        dest_doc[ITEM_OPERATION] = ITEM_FETCH
        remove_unwanted(dest_doc)
        set_original_creator(dest_doc)
        self.__fetch_items_in_package(dest_doc, desk_id, stage_id,
                                      doc.get(ITEM_STATE, CONTENT_STATE.FETCHED))
        self.__fetch_associated_items(dest_doc, desk_id, stage_id,
                                      doc.get(ITEM_STATE, CONTENT_STATE.FETCHED))
        desk = get_resource_service('desks').find_one(req=None, _id=desk_id)
        if desk and desk.get('default_content_profile'):
            dest_doc['profile'] = desk['default_content_profile']
        if dest_doc.get('type', 'text') in MEDIA_TYPES:
            dest_doc['profile'] = None
        get_resource_service(ARCHIVE).post([dest_doc])
        insert_into_versions(doc=dest_doc)
        build_custom_hateoas(custom_hateoas, dest_doc)
        superdesk.item_fetched.send(self, item=dest_doc, ingest_item=ingest_doc)
        doc.update(dest_doc)
    if kwargs.get('notify', True):
        ingest_doc.update({'task': dest_doc.get('task')})
        push_item_move_notification(ingest_doc, doc, 'item:fetch')
    return id_of_fetched_items
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            format = self._get_head_value(doc, 'Format')
            if format == 't':
                item[FORMAT] = FORMATS.PRESERVED
            else:
                item[FORMAT] = FORMATS.HTML
            # item[FORMAT] = FORMATS.HTML
            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError as ex:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']
            generate_unique_id_and_name(item)
            item['ingest_id'] = id
            last_line = None
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                lines = story.split('\n')
                if len(lines) > 0:
                    last_line = lines[-1]
                if item.get(FORMAT) == FORMATS.HTML:
                    story = story.replace('\n ', '<p></p>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = '<p>' + story + '</p>'
                else:
                    item['body_html'] = '<pre>' + story + '</pre>'
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except:
                    pass
            else:
                # Items with no body are ignored
                continue
            item['source'] = self._get_head_value(doc, 'Agency')
            # if the source document contains no agency then by definition it is unknown
            if item['source'] is None:
                item['source'] = 'UNKNOWN'
            else:
                # check if the source of the document was Newscentre
                dc_unique = doc.find('dcdossier').get('unique')
                if dc_unique.startswith('NC.') and last_line is not None:
                    # The AFR summary articles all have agency values 25 chars long
                    if len(item['source']) == 25:
                        item['source'] = 'AAP'
                    # is it a numeric Agency
                    elif self._get_head_value(doc, 'Agency').isdigit():
                        sign_off = last_line.split(' ')
                        if len(sign_off) > 0:
                            item['source'] = sign_off[0].upper()
                        else:
                            item['source'] = sign_off.upper()
                # clean up what we have extracted
                if item['source'].startswith('AAP'):
                    item['source'] = 'AAP'
                else:
                    # make sure it is one of the known values
                    if item['source'] not in {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW',
                                              'NZA', 'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}:
                        print('Source : {}'.format(item['source']))
                        item['source'] = 'UNKNOWN'
            # self._addkeywords('AsiaPulseCodes', doc, item)
            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline
            # item['service'] = self._get_head_value(doc, 'Service')
            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]
            self._addkeywords('CompanyCodes', doc, item)
            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source
            item['headline'] = self._get_head_value(doc, 'Headline')
            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except:
                    pass
            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key
            self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)
            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}
            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)
            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception as ex:
            print('Exception parsing DC document {}'.format(id))
    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Post batch to Superdesk took {:.2f}'.format(time.time() - s))
    except Exception as ex:
        if getattr(ex, 'code', None) == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those id's
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception as ex:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            format = self._get_head_value(doc, 'Format')
            if format == 't':
                item[FORMAT] = FORMATS.PRESERVED
            else:
                item[FORMAT] = FORMATS.HTML
            # item[FORMAT] = FORMATS.HTML
            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError as ex:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']
            generate_unique_id_and_name(item)
            item['ingest_id'] = id
            last_line = None
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                lines = story.split('\n')
                if len(lines) > 0:
                    last_line = lines[-1]
                if item.get(FORMAT) == FORMATS.HTML:
                    story = story.replace('\n ', '<p></p>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = '<p>' + story + '</p>'
                else:
                    item['body_html'] = '<pre>' + story + '</pre>'
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except:
                    pass
            else:
                # Items with no body are ignored
                continue
            item['source'] = self._get_head_value(doc, 'Agency')
            # if the source document contains no agency then by definition it is unknown
            if item['source'] is None:
                item['source'] = 'UNKNOWN'
            else:
                # check if the source of the document was Newscentre
                dc_unique = doc.find('dcdossier').get('unique')
                if dc_unique.startswith('NC.') and last_line is not None:
                    # The AFR summary articles all have agency values 25 chars long
                    if len(item['source']) == 25:
                        item['source'] = 'AAP'
                    # is it a numeric Agency
                    elif self._get_head_value(doc, 'Agency').isdigit():
                        sign_off = last_line.split(' ')
                        if len(sign_off) > 0:
                            item['source'] = sign_off[0].upper()
                        else:
                            item['source'] = sign_off.upper()
                # clean up what we have extracted
                if item['source'].startswith('AAP'):
                    item['source'] = 'AAP'
                else:
                    # make sure it is one of the known values
                    if item['source'] not in {'AAP', 'AP', 'REUT', 'Asia Pulse', 'DPA', 'AFP', 'RAW',
                                              'NZA', 'NZPA', 'KRT', 'PA', 'PAA', 'SNI', 'REUTERS'}:
                        print('Source : {}'.format(item['source']))
                        item['source'] = 'UNKNOWN'
            # self._addkeywords('AsiaPulseCodes', doc, item)
            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline
            # item['service'] = self._get_head_value(doc, 'Service')
            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]
            self._addkeywords('CompanyCodes', doc, item)
            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source
            item['headline'] = self._get_head_value(doc, 'Headline')
            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except:
                    pass
            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key
            self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)
            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}
            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)
            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception as ex:
            print('Exception parsing DC document {}'.format(id))
    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Post batch to Superdesk took {:.2f}'.format(time.time() - s))
    except Exception as ex:
        if getattr(ex, 'code', None) == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those id's
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception as ex:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')
def _process_bunch(self, x):
    # x.findall('dc_rest_docs/dc_rest_doc')[0].get('href')
    items = []
    for doc in x.findall('dc_rest_docs/dc_rest_doc'):
        try:
            # print(doc.get('href'))
            id = doc.find('dcdossier').get('id')
            if self._direction:
                if int(id) > self._id:
                    self._id = int(id)
            else:
                if int(id) < self._id:
                    self._id = int(id)
            item = {}
            item['guid'] = doc.find('dcdossier').get('guid')
            # if the item has been modified in the archive then it is due to a kill
            # there is an argument that this item should not be imported at all
            if doc.find('dcdossier').get('created') != doc.find('dcdossier').get('modified'):
                # item[ITEM_STATE] = CONTENT_STATE.KILLED
                continue
            else:
                item[ITEM_STATE] = CONTENT_STATE.PUBLISHED
            value = datetime.strptime(self._get_head_value(doc, 'PublicationDate'), '%Y%m%d%H%M%S')
            local_tz = pytz.timezone('Australia/Sydney')
            try:
                aus_dt = local_tz.localize(value, is_dst=None)
            except NonExistentTimeError as ex:
                aus_dt = local_tz.localize(value, is_dst=True)
            except AmbiguousTimeError:
                aus_dt = local_tz.localize(value, is_dst=False)
            item['firstcreated'] = aus_dt.astimezone(pytz.utc)
            item['versioncreated'] = item['firstcreated']
            generate_unique_id_and_name(item)
            item['ingest_id'] = id
            item['source'] = self._get_head_value(doc, 'Agency')
            # self._addkeywords('AsiaPulseCodes', doc, item)
            byline = self._get_head_value(doc, 'Byline')
            if byline:
                item['byline'] = byline
            # item['service'] = self._get_head_value(doc, 'Service')
            category = self._get_head_value(doc, 'Category')
            if not category:
                publication_name = self._get_head_value(doc, 'PublicationName')
                if publication_name in pubnames:
                    category = pubnames[publication_name]
            if category:
                anpacategory = {}
                anpacategory['qcode'] = category
                for anpa_category in self._anpa_categories['items']:
                    if anpacategory['qcode'].lower() == anpa_category['qcode'].lower():
                        anpacategory = {'qcode': anpacategory['qcode'], 'name': anpa_category['name']}
                        break
                item['anpa_category'] = [anpacategory]
            # self._addkeywords('CompanyCodes', doc, item)
            type = self._get_head_value(doc, 'Format')
            if type == 'x':
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            elif type == 't':
                item[ITEM_TYPE] = CONTENT_TYPE.PREFORMATTED
            else:
                item[ITEM_TYPE] = CONTENT_TYPE.TEXT
            item['keyword'] = self._get_head_value(doc, 'Keyword')
            item['ingest_provider_sequence'] = self._get_head_value(doc, 'Sequence')
            original_source = self._get_head_value(doc, 'Author')
            if original_source:
                item['original_source'] = original_source
            item['headline'] = self._get_head_value(doc, 'Headline')
            code = self._get_head_value(doc, 'SubjectRefNum')
            if code and len(code) == 7:
                code = '0' + code
            if code and code in subject_codes:
                item['subject'] = []
                item['subject'].append({'qcode': code, 'name': subject_codes[code]})
                try:
                    process_iptc_codes(item, None)
                except:
                    pass
            slug = self._get_head_value(doc, 'SLUG')
            if slug:
                item['slugline'] = slug
            else:
                item['slugline'] = self._get_head_value(doc, 'Keyword')
            take_key = self._get_head_value(doc, 'Takekey')
            if take_key:
                item['anpa_take_key'] = take_key
            # self._addkeywords('Topic', doc, item)
            # self._addkeywords('Selectors', doc, item)
            el = doc.find('dcdossier/document/body/BodyText')
            if el is not None:
                story = el.text
                if item[ITEM_TYPE] == CONTENT_TYPE.TEXT:
                    story = story.replace('\n ', '<br><br>')
                    story = story.replace('\n', '<br>')
                    item['body_html'] = story
                else:
                    item['body_html'] = story
                try:
                    item['word_count'] = get_text_word_count(item['body_html'])
                except:
                    pass
            item['pubstatus'] = 'usable'
            # this is required for the archived service additional lookup
            item['item_id'] = item['guid']
            item[config.VERSION] = 1
            item['flags'] = {'marked_archived_only': True}
            # item['_id'] = ObjectId(id.rjust(24, '0'))
            item['_id'] = ObjectId()
            items.append(item)
            if self._limit:
                self._limit -= 1
            # print(item)
        except Exception as ex:
            print('Exception parsing DC document {}'.format(id))
    try:
        res = superdesk.get_resource_service('archived')
        s = time.time()
        res.post(items)
        print('Post batch to Superdesk took {:.2f}'.format(time.time() - s))
    except Exception as ex:
        if getattr(ex, 'code', None) == 409:
            print('Key clash exception detected')
            # create a list of the guids we tried to post
            guids = [g['guid'] for g in items]
            # create a query for all those id's
            query = {
                'size': self.BATCH_SIZE,
                'query': {
                    'filtered': {
                        'filter': {
                            'terms': {
                                'guid': guids
                            }
                        }
                    }
                }
            }
            req = ParsedRequest()
            repos = 'archived'
            req.args = {'source': json.dumps(query), 'repo': repos}
            search_res = superdesk.get_resource_service('search')
            existing = search_res.get(req=req, lookup=None)
            existing_guids = [e['guid'] for e in existing]
            not_existing = [g for g in guids if g not in existing_guids]
            for missing_guid in not_existing:
                i = [m for m in items if m['guid'] == missing_guid]
                original = res.find_one(req=None, guid=i[0]['guid'])
                if not original:
                    try:
                        s = time.time()
                        res.post(i)
                        print('Post single item to Superdesk in {:.2f} seconds'.format(time.time() - s))
                    except Exception as ex:
                        print('Exception posting single item')
        else:
            print('Exception posting batch')