def create_object_slug(self, container, data, *args, **kwargs):
    """Create a content object slug for the given data
    """
    id = data.get("id")
    portal_type = data.get("portal_type")
    types_tool = api.get_tool("portal_types")
    fti = types_tool.getTypeInfo(portal_type)

    logger.info("Creating {} with ID {} in parent path {}".format(
        portal_type, id, api.get_path(container)))

    if fti.product:
        obj = _createObjectByType(portal_type, container, id)
    else:
        # newstyle factory
        factory = getUtility(IFactory, fti.factory)
        obj = factory(id, *args, **kwargs)
        if hasattr(obj, '_setPortalTypeName'):
            obj._setPortalTypeName(fti.getId())
        # notifies ObjectWillBeAddedEvent, ObjectAddedEvent and
        # ContainerModifiedEvent
        container._setObject(id, obj)
        # we get the object here with the current object id, as it might
        # already have been renamed by an event handler
        obj = container._getOb(obj.getId())

    return obj

def store(self, domain, key, value, overwrite=False):
    """Store a dictionary in the domain's storage
    """
    # Get the storage for the current URL
    storage = self.get_storage(domain=domain)
    datastore = storage["data"]
    indexstore = storage["index"]

    # already fetched
    if key in datastore and not overwrite:
        logger.info("Skipping existing key {}".format(key))
        return

    # Create some indexes
    for index in ["portal_type", "parent_id", "parent_path"]:
        index_key = "by_{}".format(index)
        if not indexstore.get(index_key):
            indexstore[index_key] = OOBTree()
        indexvalue = value.get(index)
        # Check if the index value, e.g. the portal_type="Sample", is
        # already known as a key in the index.
        if not indexstore[index_key].get(indexvalue):
            indexstore[index_key][indexvalue] = OOSet()
        indexstore[index_key][indexvalue].add(key)

    # store the data
    datastore[key] = value

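# A minimal read-back sketch, assuming a storage laid out exactly as built by
# store() above. "Sample" is only an illustrative portal_type, and the helper
# name iter_fetched_by_type is hypothetical, not part of the module.
def iter_fetched_by_type(storage, portal_type):
    """Yield (key, data) pairs for all fetched items of the given type."""
    index = storage["index"].get("by_portal_type", {})
    for key in index.get(portal_type, []):
        yield key, storage["data"][key]

# e.g. for key, data in iter_fetched_by_type(storage, "Sample"): ...
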
def _recover_failed_objects(self):
    """Checks for non-updated objects (by filtering null Title) and
    re-updates them.
    :return:
    """
    uc = api.get_tool('uid_catalog', self.portal)
    # Reference objects must be skipped
    query = Eq('Title', '') & ~ Eq('portal_type', 'Reference') & \
        ~ Eq('portal_type', 'ARReport')
    brains = uc.evalAdvancedQuery(query)
    total = len(brains)
    logger.info('*** Recovering {} objects ***'.format(total))
    for idx, brain in enumerate(brains):
        # Check if object has been created during migration
        uid = brain.UID
        existing = self.sh.find_unique(LOCAL_UID, uid)
        if existing is None:
            continue
        logger.info('Recovering {0}/{1} : {2} '.format(
            idx + 1, total, existing[REMOTE_PATH]))
        # Mark that update failed previously
        existing['updated'] = '0'
        self._handle_obj(existing, handle_dependencies=False)
        obj = brain.getObject()
        obj.reindexObject()
    return

def fetch_users(self, domain):
    """Fetch all users from the source URL
    """
    logger.info("*** FETCH USERS {} ***".format(domain))
    storage = self.get_storage(domain=domain)
    userstore = storage["users"]
    for user in self.yield_items("users"):
        username = user.get("username")
        userstore[username] = user

def run(self):
    """Run the fetch step: registry records, settings and data.
    :return:
    """
    logger.info("*** FETCH STARTED {} ***".format(self.domain_name))
    if self.import_registry:
        self._fetch_registry_records(keys=["bika", "senaite"])
    if self.import_settings:
        self._fetch_settings()
    self._fetch_data()
    logger.info("*** FETCH FINISHED {} ***".format(self.domain_name))
    return

def import_users(self, domain):
    """Import the users from the storage identified by domain
    """
    logger.info("*** IMPORT USERS {} ***".format(domain))

    storage = self.get_storage(domain=domain)
    userstore = storage["users"]

    for username, userdata in userstore.items():
        if ploneapi.user.get(username):
            logger.info("Skipping existing user {}".format(username))
            continue
        email = userdata.get("email", "")
        roles = userdata.get("roles", ())
        # TODO: handle groups
        # groups = userdata.get("groups", ())
        logger.info("Creating user {}".format(username))
        message = _("Created new user {} with password {}".format(
            username, username))
        # create new user with the same password as the username
        ploneapi.user.create(
            email=email,
            username=username,
            password=username,
            roles=roles,
        )
        self.add_status_message(message, "info")
        logger.info(message)

def import_registry_records(self, domain):
    """Import the registry records from the storage identified by domain
    """
    logger.info("*** IMPORT REGISTRY RECORDS {} ***".format(domain))
    storage = self.get_storage(domain=domain)
    registry_store = storage["registry"]
    current_registry = getUtility(IRegistry)

    # For each of the keywords used to retrieve registry data
    # import the records that were found
    for key in registry_store.keys():
        records = registry_store[key]
        for record in records.keys():
            logger.info("Updating record {} with value {}".format(
                record, records.get(record)))
            current_registry[record] = records.get(record)

def insert(self, data):
    """Inserts a row to the soup table.
    :param data: row dictionary
    :return: intid of created record
    """
    if self._already_exists(data):
        logger.debug("Trying to insert existing record... {}".format(data))
        return False
    record = Record()
    record.attrs[REMOTE_UID] = data[REMOTE_UID]
    record.attrs[LOCAL_UID] = data.get(LOCAL_UID, "")
    record.attrs[REMOTE_PATH] = data[REMOTE_PATH]
    record.attrs[LOCAL_PATH] = data.get(LOCAL_PATH, "")
    record.attrs[PORTAL_TYPE] = data[PORTAL_TYPE]
    record.attrs[UPDATED] = data.get(UPDATED, "0")
    r_id = self.soup.add(record)
    logger.info("Record {} inserted: {}".format(r_id, data))
    return r_id

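# A hedged usage sketch of the soup row format consumed by insert(). The key
# constants are the same names insert() reads above; the literal UID and path
# values are illustrative only, and a SoupHandler per domain is assumed.
#
#   sh = SoupHandler(domain_name)
#   row = {
#       REMOTE_UID: "a1b2c3",                          # UID on the source
#       REMOTE_PATH: "/senaitelims/clients/client-1",  # path on the source
#       PORTAL_TYPE: "Client",
#   }
#   rec_id = sh.insert(row)          # returns False if the row already exists
#   # later steps look the row up again, e.g.:
#   # row = sh.find_unique(REMOTE_UID, "a1b2c3")
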
def _fetch_settings(self, keys=None):
    """Fetch source instance settings by keyword
    """
    logger.info("*** Fetching Settings: {} ***".format(self.domain_name))
    storage = self.get_storage()
    settings_store = storage["settings"]
    if keys is None:
        retrieved_settings = self._get_settings_by_key()
    else:
        retrieved_settings = []
        for key in keys:
            retrieved_settings += self._get_settings_by_key(key)

    for setting_dict in retrieved_settings:
        for key in setting_dict.keys():
            if not setting_dict[key]:
                continue
            settings_store[key] = setting_dict[key]

def get_json(self, url_or_endpoint, **kw):
    """Fetch the given url or endpoint and return a parsed JSON object
    """
    api_url = self.get_api_url(url_or_endpoint, **kw)
    logger.info("get_json::url={}".format(api_url))
    try:
        response = self.session.get(api_url)
    except Exception as e:
        message = "Could not connect to {}. Please check.".format(api_url)
        logger.error(e)
        self.add_status_message(message, "error")
        return {}
    status = response.status_code
    if status != 200:
        message = "GET for {} ({}) returned Status Code {}. " \
                  "Please check.".format(url_or_endpoint, api_url, status)
        self.add_status_message(message, "warning")
        return {}
    return response.json()

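# A hedged usage sketch: this mirrors how _fetch_data() below probes the
# remote uid_catalog through the JSON API. The endpoint name "search" and the
# query keys come from that caller; limit=1 is just a cheap count probe.
#
#   probe = self.get_json("search", catalog="uid_catalog", limit=1)
#   if probe.get("count"):
#       logger.info("Remote catalog holds {} objects".format(probe["count"]))
#   else:
#       logger.error(probe.get("message") or "Empty response from JSON API")
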
def log_process(task_name, started, processed, total, frequency=1):
    """Logs the current status of the process

    :param task_name: name of the task
    :param started: datetime when the process started
    :param processed: number of processed items
    :param total: total number of items to be processed
    :param frequency: number of items to be processed before logging more
    :return:
    """
    if frequency <= 0 or processed % frequency > 0 or total <= 0:
        return
    percentage = "0.0"
    if processed > 0:
        percentage = "{0:.1f}".format(processed * 100.0 / total)
    estimated = get_estimated_end_date(started, processed, total)
    estimated = estimated and estimated.strftime("%Y-%m-%d %H:%M:%S") or "-"
    msg = "{}: {} / {} ({}%) - ETD: {}".format(task_name, processed, total,
                                               percentage, estimated)
    logger.info(msg)

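# A minimal usage sketch, matching how the import and fetch steps call
# log_process(); the workload below is a placeholder.
from datetime import datetime

start_time = datetime.now()
items = range(500)                       # hypothetical workload
for idx, item in enumerate(items):
    # ... process one item ...
    log_process(task_name="Data Import", started=start_time,
                processed=idx + 1, total=len(items), frequency=50)
# With frequency=50 a line is emitted every 50 items, e.g.:
#   Data Import: 150 / 500 (30.0%) - ETD: 2018-01-01 12:34:56
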
def fetch_registry_records(self, domain, keys=None):
    """Fetch configuration registry records of interest (those associated
    to the keywords passed) from source instance
    """
    logger.info("*** FETCH REGISTRY RECORDS {} ***".format(domain))
    storage = self.get_storage(domain=domain)
    registry_store = storage["registry"]
    retrieved_records = {}

    if keys is None:
        retrieved_records["all"] = self.get_registry_records_by_key()
    else:
        for key in keys:
            retrieved_records[key] = self.get_registry_records_by_key(key)

    for key in retrieved_records.keys():
        if not retrieved_records[key]:
            continue
        registry_store[key] = OOBTree()
        for record in retrieved_records[key][0].keys():
            registry_store[key][record] = retrieved_records[key][0][record]

def _fetch_registry_records(self, keys=None):
    """Fetch configuration registry records of interest (those associated
    to the keywords passed) from source instance
    """
    logger.info("*** Fetching Registry Records: {} ***".format(
        self.domain_name))
    storage = self.get_storage()
    registry_store = storage["registry"]
    retrieved_records = {}

    if keys is None:
        retrieved_records["all"] = self._get_registry_records_by_key()
    else:
        for key in keys:
            retrieved_records[key] = self._get_registry_records_by_key(key)

    for key in retrieved_records.keys():
        if not retrieved_records[key]:
            continue
        registry_store[key] = OOBTree()
        for record in retrieved_records[key][0].keys():
            registry_store[key][record] = retrieved_records[key][0][record]
    logger.info("*** Registry Records Fetched: {} ***".format(
        self.domain_name))

def reindex_updated_objects(self):
    """Reindexes updated objects.
    """
    total = len(self.uids_to_reindex)
    logger.info(
        'Reindexing {} objects which were updated...'.format(total))
    indexed = 0
    for uid in self.uids_to_reindex:
        obj = api.get_object_by_uid(uid[0], None)
        if obj is None:
            logger.error("Object not found: {} ".format(uid[1]))
            continue
        obj.reindexObject()
        indexed += 1
        if indexed % 100 == 0:
            logger.info('{} objects were reindexed, {} remaining'.format(
                indexed, total - indexed))
    logger.info('Reindexing finished...')

def __call__(self):
    protect.CheckAuthenticator(self.request.form)
    logger.info("**** AUTO SYNC STARTED ****")
    self.portal = api.get_portal()
    storage = u.get_annotation(self.portal)[SYNC_STORAGE]
    for domain_name, values in storage.iteritems():
        # Check if Auto-Sync is enabled for this Remote
        if not values["configuration"]["auto_sync"]:
            continue
        logger.info("Updating data with: '{}' ".format(domain_name))
        self.request.form["dataform"] = 1
        self.request.form["update"] = 1
        self.request.form["domain_name"] = domain_name
        response = Sync(self.context, self.request)
        response()
    logger.info("**** AUTO SYNC FINISHED ****")
    return "Done..."

def __call__(self):
    protect.CheckAuthenticator(self.request.form)

    self.portal = api.get_portal()
    self.request.set('disable_plone.rightcolumn', 1)
    self.request.set('disable_border', 1)

    # Handle form submit
    form = self.request.form
    fetchform = form.get("fetchform", False)
    dataform = form.get("dataform", False)
    if not any([fetchform, dataform]):
        return self.template()

    # remember the form field values
    url = form.get("url", "")
    if not url.startswith("http"):
        url = "http://{}".format(url)
    self.url = url
    self.username = form.get("ac_name", None)
    self.password = form.get("ac_password", None)

    # Handle "Import" action
    if form.get("import", False):
        domain = form.get("domain", None)
        self.import_registry_records(domain)
        self.import_users(domain)
        self.import_data(domain)
        logger.info("*** END OF DATA IMPORT {} ***".format(domain))
        return self.template()

    # Handle "Clear this Storage" action
    if form.get("clear_storage", False):
        domain = form.get("domain", None)
        del self.storage[domain]
        message = _("Cleared Storage {}".format(domain))
        self.add_status_message(message, "info")
        return self.template()

    # Handle "Clear all Storages" action
    if form.get("clear", False):
        self.flush_storage()
        message = _("Cleared Data Storage")
        self.add_status_message(message, "info")
        return self.template()

    # Handle "Fetch" action
    if form.get("fetch", False):
        # check if all mandatory fields have values
        if not all([self.url, self.username, self.password]):
            message = _("Please fill in all required fields")
            self.add_status_message(message, "error")
            return self.template()

        # initialize the session
        self.session = self.get_session(self.username, self.password)

        # remember the credentials in the storage
        storage = self.get_storage(self.url)
        storage["credentials"]["username"] = self.username
        storage["credentials"]["password"] = self.password

        # try to get the version of the remote JSON API
        version = self.get_version()
        if not version or not version.get('version'):
            message = _(
                "Please install senaite.jsonapi on the source system")
            self.add_status_message(message, "error")
            return self.template()

        # try to get the current logged in user
        user = self.get_authenticated_user()
        if not user or user.get("authenticated") is False:
            message = _("Wrong username/password")
            self.add_status_message(message, "error")
            return self.template()

        domain = self.url

        # Fetch all users from the source
        self.fetch_users(domain)

        # Start the fetch process beginning from the portal object
        self.fetch_data(domain, uid="0")

        # Fetch registry records that contain the word bika or senaite
        self.fetch_registry_records(domain, keys=["bika", "senaite"])

        logger.info("*** FETCHING DATA FINISHED {} ***".format(domain))

    # always render the template
    return self.template()

class ImportStep(SyncStep):
    """Class for the Import step of the Synchronization. It must create and
    update objects based on previously fetched data.
    """

    fields_to_skip = [
        'id',  # Overriding ID's can remove prefixes
        'excludeFromNav',
        'constrainTypesMode',
        'allowDiscussion',
    ]

    def __init__(self, credentials, config):
        SyncStep.__init__(self, credentials, config)
        # A list to keep UID's of an object chunk
        self.uids_to_reindex = []
        # An 'infinite recursion preventative' list of objects which are
        # being updated.
        self._queue = []
        # An Integer to count the number of non-committed objects.
        self._non_commited_objects = 0
        self.skipped = []

    def run(self):
        """Run the import step: registry records, settings, users and data.
        :return:
        """
        self.session = self.get_session()
        self._import_registry_records()
        self._import_settings()
        self._import_users()
        self._import_data()
        return

    def _import_settings(self):
        """Import the settings from the storage identified by domain
        """
        if not self.import_settings:
            return
        logger.info("*** Importing Settings: {} ***".format(self.domain_name))
        storage = self.get_storage()
        settings_store = storage["settings"]
        for key in settings_store:
            self._set_settings(key, settings_store[key])

    def _set_settings(self, key, data):
        """Set settings by key
        """
        # Get the Schema interface of the settings being imported
        ischemas = CONTROLPANEL_INTERFACE_MAPPING.get(key)
        if not ischemas:
            return
        for ischema_name in data.keys():
            ischema = None
            for candidate_schema in ischemas:
                if candidate_schema.getName() == ischema_name:
                    ischema = candidate_schema
            schema = getAdapter(api.get_portal(), ischema)
            # Once we have the schema set the data
            schema_import_data = data.get(ischema_name)
            for schema_field in schema_import_data:
                if schema_import_data[schema_field]:
                    self._set_attr_from_json(
                        schema, schema_field,
                        schema_import_data[schema_field])

    def _set_attr_from_json(self, schema, attribute, data):
        """Set schema attribute from JSON data. Since JSON converts tuples to
        lists, we have to perform a preventive check before setting the value
        to see whether the expected value is a tuple or a list. In case it is
        a tuple, we cast the list to a tuple.
        """
        if hasattr(schema, attribute) and data:
            current_value = getattr(schema, attribute)
            if type(current_value) == tuple:
                setattr(schema, attribute, tuple(data))
            else:
                setattr(schema, attribute, data)

    def _import_registry_records(self):
        """Import the registry records from the storage identified by domain
        """
        if not self.import_registry:
            return
        logger.info("*** Importing Registry Records: {} ***".format(
            self.domain_name))
        storage = self.get_storage()
        registry_store = storage["registry"]
        current_registry = getUtility(IRegistry)

        # For each of the keywords used to retrieve registry data
        # import the records that were found
        for key in registry_store.keys():
            records = registry_store[key]
            for record in records.keys():
                logger.debug("Updating record {} with value {}".format(
                    record, records.get(record)))
                if record not in current_registry.records:
                    logger.warn(
                        "Current Registry has no record named {}".format(
                            record))
                    continue
                current_registry[record] = records.get(record)
        logger.info("*** Registry Records Imported: {} ***".format(
            self.domain_name))

    def _import_users(self):
        """Import the users from the storage identified by domain
        """
        if not self.import_users:
            return
        logger.info("*** Importing Users: {} ***".format(self.domain_name))
        for user in self.yield_items("users"):
            username = user.get("username")
            if ploneapi.user.get(username):
                logger.debug("Skipping existing user {}".format(username))
                continue
            email = user.get("email", "")
            if not email:
                email = "{}@example.com".format(username)
            roles = user.get("roles", ())
            groups = user.get("groups", ())
            logger.debug("Creating user {}".format(username))
            message = _("Created new user {} with password {}".format(
                username, username))
            # create new user with the same password as the username
            ploneapi.user.create(
                email=email,
                username=username,
                password=username,
                roles=roles,
            )
            for group in groups:
                # Try to add the user to the group if group exists.
                try:
                    ploneapi.group.add_user(groupname=group,
                                            username=username)
                except KeyError:
                    continue
            logger.debug(message)
        logger.info("*** Users Were Imported: {} ***".format(
            self.domain_name))

    def _import_data(self):
        """For each UID from the fetched data, creates and updates objects
        step by step.
        :return:
        """
        logger.info("*** IMPORT DATA STARTED: {} ***".format(
            self.domain_name))

        self.sh = SoupHandler(self.domain_name)
        self.uids_to_reindex = []
        storage = self.get_storage()
        ordered_uids = storage["ordered_uids"]
        total_object_count = len(ordered_uids)
        start_time = datetime.now()

        for item_index, r_uid in enumerate(ordered_uids):
            row = self.sh.find_unique(REMOTE_UID, r_uid)
            logger.debug("Handling: {} ".format(row[REMOTE_PATH]))
            self._handle_obj(row)

            # Handling an object means there is a chunk containing several
            # objects which have been created and updated. Reindex them now.
            self.uids_to_reindex = list(set(self.uids_to_reindex))
            for uid in self.uids_to_reindex:
                # It is possible that the object has a method (not a Field
                # in its Schema) which is used as an index and it fails.
                # TODO: Make sure reindexing won't fail!
                try:
                    obj = api.get_object_by_uid(uid)
                    obj.reindexObject()
                except Exception as e:
                    rec = self.sh.find_unique(LOCAL_UID, uid)
                    logger.error("Error while reindexing {} - {}".format(
                        rec, e))
            self._non_commited_objects += len(self.uids_to_reindex)
            self.uids_to_reindex = []

            # Commit the transaction if necessary
            if self._non_commited_objects > COMMIT_INTERVAL:
                transaction.commit()
                logger.info("Committed: {} / {} ".format(
                    self._non_commited_objects, total_object_count))
                self._non_commited_objects = 0

            # Log.info every 50 objects imported
            utils.log_process(task_name="Data Import", started=start_time,
                              processed=item_index + 1,
                              total=total_object_count, frequency=50)

        # Delete the UID list from the storage.
        storage["ordered_uids"] = []
        self._recover_failed_objects()
        # Mark all objects as non-updated for the next import.
        self.sh.reset_updated_flags()
        logger.info("*** END OF DATA IMPORT: {} ***".format(self.domain_name))

def update_object_with_data(self, obj, data, domain):
    """Update an existing object with data
    """
    # get the storage and UID map
    storage = self.get_storage(domain=domain)
    uidmap = storage["uidmap"]

    # Proxy Fields must be set after their dependency objects are already
    # set. Thus, we store all the ProxyFields and set them at the end.
    proxy_fields = []

    for fieldname, field in api.get_fields(obj).items():

        fm = IFieldManager(field)
        value = data.get(fieldname)

        # handle JSON data reference fields
        if isinstance(value, dict) and value.get("uid"):
            # dereference the referenced object
            value = self.dereference_object(value.get("uid"), uidmap)
        elif isinstance(value, (list, tuple)):
            for item in value:
                # If it is a list of JSON data dicts of objects, add the
                # local uid to that dictionary. This local_uid can be used
                # in Field Managers.
                if isinstance(item, dict):
                    for k, v in item.iteritems():
                        if 'uid' in k:
                            local_uid = uidmap.get(v)
                            item[k] = local_uid

        # handle file fields
        if field.type in ("file", "image", "blob"):
            if data.get(fieldname) is not None:
                fileinfo = data.get(fieldname)
                url = fileinfo.get("download")
                filename = fileinfo.get("filename")
                data["filename"] = filename
                response = requests.get(url)
                value = response.content

        # Leave the Proxy Fields for later
        if isinstance(fm, ProxyFieldManager):
            proxy_fields.append({
                'field_name': fieldname,
                'fm': fm,
                'value': value,
            })
            continue

        logger.info("Setting value={} on field={} of object={}".format(
            repr(value), fieldname, api.get_id(obj)))
        try:
            fm.set(obj, value)
        except:
            logger.error("Could not set field '{}' with value '{}'".format(
                fieldname, value))

    # All reference fields are set. We can set the proxy fields now.
    for pf in proxy_fields:
        field_name = pf.get("field_name")
        fm = pf.get("fm")
        value = pf.get("value")
        logger.info("Setting value={} on field={} of object={}".format(
            repr(value), field_name, api.get_id(obj)))
        try:
            fm.set(obj, value)
        except:
            logger.error("Could not set field '{}' with value '{}'".format(
                field_name, value))

    # Set the workflow states
    wf_info = data.get("workflow_info", [])
    for wf_dict in wf_info:
        wf_id = wf_dict.get("workflow")
        review_history = wf_dict.get("review_history")
        self.import_review_history(obj, wf_id, review_history)

    # finally reindex the object
    self.uids_to_reindex.append([api.get_uid(obj), repr(obj)])

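# A hedged sketch of how reference values travel through uidmap. The UIDs and
# titles below are illustrative only; the shape of the JSON reference value
# (a dict carrying a "uid" key) matches what update_object_with_data() checks
# above, and uidmap is built during import_data() as remote UID -> local UID.
uidmap = {"remote-uid-1": "local-uid-9"}

value = {"uid": "remote-uid-1", "title": "Client-1"}
if isinstance(value, dict) and value.get("uid"):
    # the importer swaps the remote UID for the locally created object's UID
    # (via dereference_object) before handing the value to the field manager
    local_uid = uidmap.get(value["uid"])
    print(local_uid)  # -> "local-uid-9"
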
def import_data(self, domain):
    """Import the data from the storage identified by domain
    """
    logger.info("*** IMPORT DATA {} ***".format(domain))

    storage = self.get_storage(domain=domain)
    datastore = storage["data"]
    indexstore = storage["index"]
    uidmap = storage["uidmap"]
    credentials = storage["credentials"]

    # At some points the api cannot retrieve objects by UID right at the end
    # of the creation process. Thus we keep them in a dictionary for easy
    # access.
    objmap = {}
    # We will create objects from top to bottom, but will update them from
    # bottom to top.
    ordered_uids = []

    # initialize a new session with the stored credentials for later requests
    username = credentials.get("username")
    password = credentials.get("password")
    self.session = self.get_session(username, password)
    logger.info("Initialized a new session for user {}".format(username))

    # Get UIDs grouped by their parent path
    ppaths = indexstore.get("by_parent_path")
    if ppaths is None:
        message = _(
            "No parent path info found in the import data. "
            "Please install senaite.jsonapi>=1.1.1 on the source instance "
            "and clear&refetch this storage")
        self.add_status_message(message, "warning")
        return

    # Import by paths from top to bottom
    for ppath in sorted(ppaths):
        # nothing to do
        if not ppath:
            continue
        logger.info("Importing items for parent path {}".format(ppath))
        uids = ppaths[ppath]
        for uid in uids:
            ordered_uids.append(uid)
            # get the data for this uid
            data = datastore[uid]
            # check if the object exists in this instance
            remote_path = data.get("path")
            local_path = self.translate_path(remote_path)
            existing = self.portal.unrestrictedTraverse(
                str(local_path), None)

            if existing:
                # remember the remote UID -> local UID mapping for the
                # update step
                uidmap[uid] = api.get_uid(existing)
                objmap[uid] = existing
            else:
                # get the container object by path
                container_path = self.translate_path(ppath)
                container = self.portal.unrestrictedTraverse(
                    str(container_path), None)
                # create an object slug in this container
                obj = self.create_object_slug(container, data)
                # remember the remote UID -> local UID mapping for the
                # update step
                uidmap[uid] = api.get_uid(obj)
                objmap[uid] = obj

    # When the creation process is done, commit the transaction to avoid
    # ReferenceField relation problems.
    transaction.commit()

    # UIDs were added from top to bottom. Reverse the list to update objects
    # from bottom to top.
    ordered_uids.reverse()

    # Update all objects with the given data
    for uid in ordered_uids:
        obj = objmap.get(uid, None)
        if obj is None:
            logger.warn("Object not found: {} ".format(uid))
            continue
        logger.info("Update object {} with import data".format(
            api.get_path(obj)))
        self.update_object_with_data(obj, datastore[uid], domain)
    self.reindex_updated_objects()

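# A minimal, self-contained sketch of the ordering used above: lexicographic
# sorting of the parent paths puts containers before their children, so
# objects can be created top-down and then updated bottom-up by reversing the
# list. The paths and UIDs are illustrative only.
ppaths = {
    "/senaitelims/clients/client-1": ["uid-3"],
    "/senaitelims": ["uid-1"],
    "/senaitelims/clients": ["uid-2"],
}
creation_order = [uid for path in sorted(ppaths) for uid in ppaths[path]]
print(creation_order)             # ['uid-1', 'uid-2', 'uid-3']  (top-down)
update_order = list(reversed(creation_order))
print(update_order)               # ['uid-3', 'uid-2', 'uid-1']  (bottom-up)
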
def _fetch_data(self, window=1000, overlap=10):
    """Fetch data from the uid catalog in the source URL

    :param window: number of elements to be retrieved with each query to
                   the catalog
    :type window: int
    :param overlap: overlap between windows
    :type overlap: int
    :return:
    """
    logger.info("*** FETCHING DATA: {} ***".format(self.domain_name))
    start_time = datetime.now()
    storage = self.get_storage()
    storage["ordered_uids"] = []
    ordered_uids = storage["ordered_uids"]
    self.sh = SoupHandler(self.domain_name)

    # Dummy query to get overall number of items in the specified catalog
    query = {
        "url_or_endpoint": "search",
        "catalog": 'uid_catalog',
        "limit": 1,
    }
    if self.full_sync_types:
        types = list()
        types.extend(self.full_sync_types + self.prefixable_types +
                     self.update_only_types + self.read_only_types)
        query["portal_type"] = types
    cd = self.get_json(**query)

    # Knowing the catalog length, compute the number of pages we will need
    # with the desired window size and overlap
    effective_window = window - overlap

    # When we receive an error message in the JSON response, or we don't get
    # any response at all, the key 'count' doesn't exist.
    if not cd.get("count", None):
        error_message = "Error message: {}".format(
            cd.get('message', None) or '')
        logger.error(
            "A query to the JSON API returned an error. {}".format(
                error_message))
        return

    number_of_pages = (cd["count"] / effective_window) + 1

    # Retrieve data from the catalog in batches with size equal to window,
    # format it and insert it into the import soup
    for current_page in xrange(number_of_pages):
        start_from = (current_page * window) - overlap
        query["limit"] = window
        query["b_start"] = start_from
        items = self.get_items_with_retry(**query)
        if not items:
            logger.error("CAN NOT GET ITEMS FROM {} TO {}".format(
                start_from, start_from + window))

        for item in items:
            # skip object or extract the required data for the import
            if not self.is_item_allowed(item):
                continue

            data_dict = utils.get_soup_format(item)
            rec_id = self.sh.insert(data_dict)
            ordered_uids.insert(0, data_dict[REMOTE_UID])
            if not self._parents_fetched(item):
                logger.warning(
                    "Some parents are missing: {} ".format(item))

        utils.log_process(task_name="Pages fetched", started=start_time,
                          processed=current_page + 1, total=number_of_pages)

    logger.info("*** FETCHING DATA FINISHED: {} ***".format(
        self.domain_name))
    transaction.commit()

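# A self-contained sketch of the windowing above: with window=1000 and
# overlap=10, consecutive pages share a few items so that boundary objects
# are less likely to be missed. The catalog count below is illustrative.
count = 2500
window, overlap = 1000, 10
effective_window = window - overlap
number_of_pages = (count // effective_window) + 1

for current_page in range(number_of_pages):
    start_from = (current_page * window) - overlap
    print("page {}: b_start={} limit={}".format(
        current_page, start_from, window))
# page 0: b_start=-10 limit=1000
# page 1: b_start=990 limit=1000
# page 2: b_start=1990 limit=1000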