def import_review_history(self, content, wf_id, review_history, **kw):
    """Change the workflow state of an object

    @param content: Content obj whose state will be changed
    @param review_history: Review history of the object
    @param wf_id: workflow name
    @param kw: values overriding same-named keys of the state mapping
    @return: None
    """
    portal_workflow = api.get_tool('portal_workflow')

    # Might raise IndexError if no workflow is associated to this type
    for wf_def in portal_workflow.getWorkflowsFor(content):
        if wf_id == wf_def.getId():
            break
    else:
        logger.error("%s: Cannot find workflow id %s" % (content, wf_id))
        return

    for rh in sorted(review_history, key=lambda k: k['time']):
        if not self.review_history_imported(content, rh, wf_def):
            portal_workflow.setStatusOf(wf_id, content,
                                        self.to_review_history_format(rh))

    wf_def.updateRoleMappingsFor(content)
    return

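# Hedged usage sketch (not part of the module): `importer` and `sample` are
# hypothetical stand-ins for an importer instance and an already-created
# content object. The entry layout below assumes the usual Plone workflow
# history keys; only 'time' is guaranteed by the code above (it is the sort
# key), the other keys may differ per source.
#
#   review_history = [{
#       "action": "receive",
#       "actor": "admin",
#       "comments": "",
#       "review_state": "sample_received",
#       "time": "2018-01-15 10:21:00",
#   }]
#   importer.import_review_history(sample, "bika_ar_workflow", review_history)
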
def _parents_fetched(self, item):
    """ If data was fetched with a portal type filter, this method fills in
    the missing parents of the fetched objects.
    :return: True if ALL parents are fetched
    """
    # Never fetch parents of unnecessary objects
    if not utils.has_valid_portal_type(item):
        return False
    parent_path = item.get("parent_path")
    # Skip if the parent is the portal object
    if self.is_portal_path(parent_path):
        return True
    # Skip if the parent already exists
    if self.sh.find_unique(REMOTE_PATH, parent_path):
        return True

    logger.debug("Inserting missing parent: {}".format(parent_path))
    parent = self.get_first_item(item.get("parent_url"))
    if not parent:
        logger.error("Cannot fetch parent info: {} ".format(parent_path))
        return False
    par_dict = utils.get_soup_format(parent)
    self.sh.insert(par_dict)

    # Recursively import grandparents too
    return self._parents_fetched(parent)

def _handle_obj(self, row, handle_dependencies=True):
    """ With the given dictionary:
        1. Creates the object's slug
        2. Creates and updates the dependencies of the object (which means
           _handle_obj is called for each dependency that has not been
           updated yet)
        3. Updates the object

    :param row: A row dictionary from the souper
    :type row: dict
    """
    r_uid = row.get(REMOTE_UID)
    try:
        if row.get("updated", "0") == "1":
            return True
        self._queue.append(r_uid)
        obj = self._do_obj_creation(row)
        if obj is None:
            logger.error('Object creation failed: {}'.format(row))
            return
        obj_data = self.get_json(r_uid, complete=True, workflow=True)
        if handle_dependencies:
            self._create_dependencies(obj, obj_data)
        self._update_object_with_data(obj, obj_data)
        self._set_object_permission(obj)
        self.sh.mark_update(r_uid)
        self._queue.remove(r_uid)
    except Exception as e:
        self._queue.remove(r_uid)
        logger.error('Failed to handle {} : {} '.format(row, str(e)))

def mark_update(self, remote_uid):
    """ Marks that the record's object has been updated.
    """
    recs = [r for r in self.soup.query(Eq(REMOTE_UID, remote_uid))]
    if not recs:
        logger.error("Could not find any record with remote_uid: '{}'"
                     .format(remote_uid))
        return False
    recs[0].attrs[UPDATED] = "1"
    self.soup.reindex([recs[0]])
    return True

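# Hedged usage sketch: assumes `sh` is a SoupHandler bound to a domain and
# that a record with this remote_uid was inserted during the fetch step.
# The "remote_lab" domain name and the `remote_uid` value are illustrative.
#
#   sh = SoupHandler("remote_lab")
#   if sh.mark_update(remote_uid):
#       row = sh.find_unique(REMOTE_UID, remote_uid)
#       # the record is now flagged as updated
#       assert row.get("updated") == "1"
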
def _import_data(self):
    """
    For each UID from the fetched data, creates and updates objects
    step by step.
    :return:
    """
    logger.info("*** IMPORT DATA STARTED: {} ***".format(self.domain_name))
    self.sh = SoupHandler(self.domain_name)
    self.uids_to_reindex = []
    storage = self.get_storage()
    ordered_uids = storage["ordered_uids"]
    total_object_count = len(ordered_uids)
    start_time = datetime.now()

    for item_index, r_uid in enumerate(ordered_uids):
        row = self.sh.find_unique(REMOTE_UID, r_uid)
        logger.debug("Handling: {} ".format(row[REMOTE_PATH]))
        self._handle_obj(row)

        # Handling an object can leave behind a chunk of related objects
        # that were created or updated along the way. Reindex them now.
        self.uids_to_reindex = list(set(self.uids_to_reindex))
        for uid in self.uids_to_reindex:
            # It is possible that the object has a method (not a Field
            # in its Schema) which is used as an index and it fails.
            # TODO: Make sure reindexing won't fail!
            try:
                obj = api.get_object_by_uid(uid)
                obj.reindexObject()
            except Exception as e:
                rec = self.sh.find_unique(LOCAL_UID, uid)
                logger.error("Error while reindexing {} - {}".format(
                    rec, e))
        self._non_commited_objects += len(self.uids_to_reindex)
        self.uids_to_reindex = []

        # Commit the transaction if necessary
        if self._non_commited_objects > COMMIT_INTERVAL:
            transaction.commit()
            logger.info("Committed: {} / {} ".format(
                self._non_commited_objects, total_object_count))
            self._non_commited_objects = 0

        # Log progress every 50 objects imported
        utils.log_process(task_name="Data Import", started=start_time,
                          processed=item_index + 1,
                          total=total_object_count, frequency=50)

def update_by_remote_path(self, remote_path, **kwargs):
    """
    Update the row by the remote path column.
    :param remote_path: path of the record
    :param kwargs: columns and their values to be updated.
    """
    recs = [r for r in self.soup.query(Eq(REMOTE_PATH, remote_path))]
    if not recs:
        logger.error("Could not find any record with path: '{}'"
                     .format(remote_path))
        return False
    for k, v in kwargs.iteritems():
        recs[0].attrs[k] = v
    self.soup.reindex([recs[0]])
    return True

def update_by_remote_uid(self, remote_uid, **kwargs):
    """
    Update the row by the remote_uid column.
    :param remote_uid: UID of the object in the source
    :param kwargs: columns and their values to be updated.
    """
    recs = [r for r in self.soup.query(Eq(REMOTE_UID, remote_uid))]
    if not recs:
        logger.error("Could not find any record with remote_uid: '{}'"
                     .format(remote_uid))
        return False
    for k, v in kwargs.iteritems():
        recs[0].attrs[k] = v
    self.soup.reindex([recs[0]])
    return True

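# Hedged sketch of how a record might be refreshed once its local
# counterpart exists. It reuses the LOCAL_UID column constant referenced
# elsewhere in this package; `sh`, `obj` and `remote_uid` are hypothetical.
#
#   sh.update_by_remote_uid(remote_uid, **{LOCAL_UID: api.get_uid(obj)})
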
def get_json(self, url_or_endpoint, **kw):
    """Fetch the given url or endpoint and return a parsed JSON object
    """
    api_url = self.get_api_url(url_or_endpoint, **kw)
    logger.info("get_json::url={}".format(api_url))
    try:
        response = self.session.get(api_url)
    except Exception as e:
        message = "Could not connect to {}. Please check.".format(api_url)
        logger.error(e)
        self.add_status_message(message, "error")
        return {}
    status = response.status_code
    if status != 200:
        message = "GET for {} ({}) returned Status Code {}. " \
                  "Please check.".format(url_or_endpoint, api_url, status)
        self.add_status_message(message, "warning")
        return {}
    return response.json()

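# Hedged example call, mirroring the query dict built in _fetch_data below;
# the endpoint name and keyword parameters are simply forwarded to
# get_api_url, and the portal_type value here is purely illustrative.
#
#   payload = self.get_json("search", catalog="uid_catalog",
#                           limit=1, portal_type=["Client"])
#   count = payload.get("count", 0)
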
def reindex_updated_objects(self):
    """
    Reindexes updated objects.
    """
    total = len(self.uids_to_reindex)
    logger.info(
        'Reindexing {} objects which were updated...'.format(total))
    indexed = 0
    for uid in self.uids_to_reindex:
        obj = api.get_object_by_uid(uid[0], None)
        if obj is None:
            logger.error("Object not found: {} ".format(uid[1]))
            continue
        obj.reindexObject()
        indexed += 1
        if indexed % 100 == 0:
            logger.info('{} objects were reindexed, {} remaining'.format(
                indexed, total - indexed))
    logger.info('Reindexing finished...')

def _create_object_slug(self, container, data, *args, **kwargs):
    """Create a content object slug for the given data
    """
    id = data.get("id")
    remote_path = data.get("remote_path")
    portal_type = data.get("portal_type")
    types_tool = api.get_tool("portal_types")
    fti = types_tool.getTypeInfo(portal_type)
    if not fti:
        self.skipped.append(remote_path)
        logger.error("Type Info not found for {}".format(portal_type))
        return None
    logger.debug("Creating {} with ID {} in parent path {}".format(
        portal_type, id, api.get_path(container)))

    if fti.product:
        obj = _createObjectByType(portal_type, container, id)
    else:
        # new style factory
        factory = getUtility(IFactory, fti.factory)
        obj = factory(id, *args, **kwargs)
        if hasattr(obj, '_setPortalTypeName'):
            obj._setPortalTypeName(fti.getId())
        # notifies ObjectWillBeAddedEvent, ObjectAddedEvent and
        # ContainerModifiedEvent
        container._setObject(id, obj)
        # get the object back with its current id, as it might already
        # have been renamed by an event handler
        obj = container._getOb(obj.getId())

    # Be sure that the Creation Flag is cleared.
    if obj.checkCreationFlag():
        obj.unmarkCreationFlag()
    return obj

def update_object_with_data(self, obj, data, domain):
    """Update an existing object with data
    """
    # get the storage and UID map
    storage = self.get_storage(domain=domain)
    uidmap = storage["uidmap"]

    # Proxy Fields must be set after their dependency objects are set.
    # Thus, we store all the ProxyFields and set them at the end.
    proxy_fields = []

    for fieldname, field in api.get_fields(obj).items():

        fm = IFieldManager(field)
        value = data.get(fieldname)

        # handle JSON data reference fields
        if isinstance(value, dict) and value.get("uid"):
            # dereference the referenced object
            value = self.dereference_object(value.get("uid"), uidmap)
        elif isinstance(value, (list, tuple)):
            for item in value:
                # If it is a list of JSON data dicts of objects, add the
                # local uid to each dictionary. This local_uid can be
                # used in Field Managers.
                if isinstance(item, dict):
                    for k, v in item.iteritems():
                        if 'uid' in k:
                            local_uid = uidmap.get(v)
                            item[k] = local_uid

        # handle file fields
        if field.type in ("file", "image", "blob"):
            if data.get(fieldname) is not None:
                fileinfo = data.get(fieldname)
                url = fileinfo.get("download")
                filename = fileinfo.get("filename")
                data["filename"] = filename
                response = requests.get(url)
                value = response.content

        # Leave the Proxy Fields for later
        if isinstance(fm, ProxyFieldManager):
            proxy_fields.append({
                'field_name': fieldname,
                'fm': fm,
                'value': value
            })
            continue

        logger.info("Setting value={} on field={} of object={}".format(
            repr(value), fieldname, api.get_id(obj)))
        try:
            fm.set(obj, value)
        except Exception:
            logger.error("Could not set field '{}' with value '{}'".format(
                fieldname, value))

    # All reference fields are set. We can set the proxy fields now.
    for pf in proxy_fields:
        field_name = pf.get("field_name")
        fm = pf.get("fm")
        value = pf.get("value")
        logger.info("Setting value={} on field={} of object={}".format(
            repr(value), field_name, api.get_id(obj)))
        try:
            fm.set(obj, value)
        except Exception:
            logger.error("Could not set field '{}' with value '{}'".format(
                field_name, value))

    # Set the workflow states
    wf_info = data.get("workflow_info", [])
    for wf_dict in wf_info:
        wf_id = wf_dict.get("workflow")
        review_history = wf_dict.get("review_history")
        self.import_review_history(obj, wf_id, review_history)

    # finally reindex the object
    self.uids_to_reindex.append([api.get_uid(obj), repr(obj)])

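# Hedged illustration of the value shapes this method handles. Only the
# keys read above ("uid", "download", "filename") are meaningful; the field
# names and URLs are invented for the example. Reference values arrive as
# dicts carrying the remote "uid", which is dereferenced through uidmap;
# file/image/blob values arrive as dicts pointing at a download URL.
#
#   data = {
#       "Client": {"uid": "remote-client-uid"},       # reference field
#       "AttachmentFile": {                           # file/blob field
#           "download": "http://remote.example/att/file.pdf",
#           "filename": "file.pdf",
#       },
#   }
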
def _fetch_data(self, window=1000, overlap=10):
    """Fetch data from the uid catalog in the source URL
    :param window: number of elements to be retrieved with each query to
                   the catalog
    :type window: int
    :param overlap: overlap between windows
    :type overlap: int
    :return:
    """
    logger.info("*** FETCHING DATA: {} ***".format(self.domain_name))
    start_time = datetime.now()
    storage = self.get_storage()
    storage["ordered_uids"] = []
    ordered_uids = storage["ordered_uids"]
    self.sh = SoupHandler(self.domain_name)

    # Dummy query to get the overall number of items in the specified catalog
    query = {
        "url_or_endpoint": "search",
        "catalog": 'uid_catalog',
        "limit": 1
    }
    if self.full_sync_types:
        types = list()
        types.extend(self.full_sync_types + self.prefixable_types +
                     self.update_only_types + self.read_only_types)
        query["portal_type"] = types
    cd = self.get_json(**query)

    # Knowing the catalog length, compute the number of pages we will need
    # with the desired window size and overlap
    effective_window = window - overlap
    # When we receive an error message in the JSON response, or we don't
    # get any response at all, the key 'count' doesn't exist.
    if not cd.get("count", None):
        error_message = "Error message: {}".format(
            cd.get('message', None) or '')
        logger.error(
            "A query to the JSON API returned an error. {}".format(
                error_message))
        return
    number_of_pages = (cd["count"] / effective_window) + 1

    # Retrieve data from the catalog in batches with size equal to window,
    # format it and insert it into the import soup
    for current_page in xrange(number_of_pages):
        start_from = (current_page * window) - overlap
        query["limit"] = window
        query["b_start"] = start_from
        items = self.get_items_with_retry(**query)
        if not items:
            logger.error("CAN NOT GET ITEMS FROM {} TO {}".format(
                start_from, start_from + window))
            continue

        for item in items:
            # skip the object or extract the required data for the import
            if not self.is_item_allowed(item):
                continue

            data_dict = utils.get_soup_format(item)
            rec_id = self.sh.insert(data_dict)
            ordered_uids.insert(0, data_dict[REMOTE_UID])
            if not self._parents_fetched(item):
                logger.warning(
                    "Some parents are missing: {} ".format(item))

        utils.log_process(task_name="Pages fetched", started=start_time,
                          processed=current_page + 1,
                          total=number_of_pages)

    logger.info("*** FETCHING DATA FINISHED: {} ***".format(
        self.domain_name))
    transaction.commit()

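# Worked example of the paging arithmetic above (catalog count is
# illustrative): with window=1000, overlap=10 and count=2500,
# effective_window = 990 and number_of_pages = 2500 / 990 + 1 = 3
# (integer division). The three batches are requested with
# b_start = -10, 990 and 1990, so consecutive windows overlap by
# roughly `overlap` items.
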
def _create_dependencies(self, obj, data):
    """
    Creates and updates objects' dependencies if they are not in the queue.
    Dependencies are found as UIDs in the object data.
    :param obj: an object to get dependencies created
    :param data: object data
    """
    dependencies = []

    for fieldname, field in api.get_fields(obj).items():
        if fieldname in self.fields_to_skip:
            continue
        value = data.get(fieldname)
        if isinstance(value, dict) and value.get("uid"):
            dependencies.append(value.get("uid"))
        elif isinstance(value, (list, tuple)):
            for item in value:
                if isinstance(item, dict):
                    for k, v in item.iteritems():
                        if 'uid' in k:
                            dependencies.append(v)

    logger.debug("Dependencies of {} are : {} ".format(
        repr(obj), dependencies))
    dependencies = list(set(dependencies))

    for r_uid in dependencies:
        dep_row = self.sh.find_unique(REMOTE_UID, r_uid)
        if dep_row is None:
            # If the dependency doesn't exist in the fetched data table,
            # just try to create its object for the first time
            dep_item = self.get_json(r_uid)
            if not dep_item:
                logger.error(
                    "Remote UID not found in fetched data: {}".format(
                        r_uid))
                continue
            if not utils.has_valid_portal_type(dep_item):
                logger.error(
                    "Skipping dependency with unknown portal type:"
                    " {}".format(dep_item))
                continue
            data_dict = utils.get_soup_format(dep_item)
            rec_id = self.sh.insert(data_dict)
            dep_row = self.sh.get_record_by_id(rec_id, as_dict=True)
            if self._parents_fetched(dep_item):
                self._handle_obj(dep_row, handle_dependencies=False)
            continue

        # If the dependency is being processed, skip it.
        if r_uid in self._queue:
            continue

        # No need to handle already updated objects
        if dep_row.get("updated") == "0":
            self._handle_obj(dep_row)
        # Reindex the dependency just in case it has a field that uses
        # a BackReference to this object.
        else:
            self.uids_to_reindex.append(dep_row.get(LOCAL_UID))

    return True