def _parents_fetched(self, item):
    """If data was fetched with a portal type filter, this method fills
    in the missing parents of fetched objects.
    :return: True if ALL parents are fetched
    """
    # Never fetch parents of unnecessary objects
    if not utils.has_valid_portal_type(item):
        return False
    parent_path = item.get("parent_path")
    # Skip if the parent is the portal object
    if self.is_portal_path(parent_path):
        return True
    # Skip if it already exists
    if self.sh.find_unique(REMOTE_PATH, parent_path):
        return True
    logger.debug("Inserting missing parent: {}".format(parent_path))
    parent = self.get_first_item(item.get("parent_url"))
    if not parent:
        logger.error("Cannot fetch parent info: {}".format(parent_path))
        return False
    par_dict = utils.get_soup_format(parent)
    self.sh.insert(par_dict)
    # Recursively import grandparents too
    return self._parents_fetched(parent)
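# A minimal sketch of how the parent chain gets filled, assuming a fetched
# item whose "parent_path" is not yet in the soup table. The item keys
# ("parent_path", "parent_url", "uid", "portal_type") mirror what the remote
# JSON API is expected to return; the literal values are illustrative only.
#
#   item = {"uid": "b2c3d4...", "portal_type": "AnalysisRequest",
#           "parent_path": "/senaite/clients/client-1",
#           "parent_url": "http://remote/@@API/senaite/v1/clients/client-1"}
#   self._parents_fetched(item)
#   # -> fetches client-1, inserts its soup row, then recurses with the
#   #    parent item until is_portal_path() returns True.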
def _import_registry_records(self):
    """Import the registry records from the storage identified by domain
    """
    if not self.import_registry:
        return
    logger.info("*** Importing Registry Records: {} ***".format(
        self.domain_name))
    storage = self.get_storage()
    registry_store = storage["registry"]
    current_registry = getUtility(IRegistry)

    # For each of the keywords used to retrieve registry data,
    # import the records that were found
    for key in registry_store.keys():
        records = registry_store[key]
        for record in records.keys():
            logger.debug("Updating record {} with value {}".format(
                record, records.get(record)))
            if record not in current_registry.records:
                logger.warn(
                    "Current Registry has no record named {}".format(
                        record))
                continue
            current_registry[record] = records.get(record)

    logger.info("*** Registry Records Imported: {} ***".format(
        self.domain_name))
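# The registry update above in isolation: a hedged standalone sketch,
# assuming plone.registry is available. The record name passed in is
# hypothetical; real names come from the fetched "registry" storage.
from zope.component import getUtility
from plone.registry.interfaces import IRegistry

def update_registry_record(name, value):
    registry = getUtility(IRegistry)
    # Only records that already exist locally are updated; unknown
    # names are skipped, exactly as in _import_registry_records above.
    if name not in registry.records:
        return False
    registry[name] = value
    return True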
def _import_data(self):
    """For each UID from the fetched data, create and update objects
    step by step.
    """
    logger.info("*** IMPORT DATA STARTED: {} ***".format(self.domain_name))
    self.sh = SoupHandler(self.domain_name)
    self.uids_to_reindex = []
    storage = self.get_storage()
    ordered_uids = storage["ordered_uids"]
    total_object_count = len(ordered_uids)
    start_time = datetime.now()

    for item_index, r_uid in enumerate(ordered_uids):
        row = self.sh.find_unique(REMOTE_UID, r_uid)
        logger.debug("Handling: {}".format(row[REMOTE_PATH]))
        self._handle_obj(row)

        # Handling an object means a chunk containing several objects
        # has been created and updated. Reindex them now.
        self.uids_to_reindex = list(set(self.uids_to_reindex))
        for uid in self.uids_to_reindex:
            # It is possible that the object has a method (not a Field
            # in its Schema) which is used as an index and it fails.
            # TODO: Make sure reindexing won't fail!
            try:
                obj = api.get_object_by_uid(uid)
                obj.reindexObject()
            except Exception as e:
                rec = self.sh.find_unique(LOCAL_UID, uid)
                logger.error("Error while reindexing {} - {}".format(
                    rec, e))
        self._non_commited_objects += len(self.uids_to_reindex)
        self.uids_to_reindex = []

        # Commit the transaction if necessary
        if self._non_commited_objects > COMMIT_INTERVAL:
            transaction.commit()
            logger.info("Committed: {} / {}".format(
                self._non_commited_objects, total_object_count))
            self._non_commited_objects = 0

        # Log progress every 50 objects imported
        utils.log_process(task_name="Data Import", started=start_time,
                          processed=item_index + 1,
                          total=total_object_count, frequency=50)
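# The commit batching above in isolation: a hedged sketch of flushing the
# ZODB transaction every COMMIT_INTERVAL objects, assuming the standard
# `transaction` module from the ZODB stack. `items` and `handle` are
# placeholders for the ordered UIDs and _handle_obj respectively.
import transaction

COMMIT_INTERVAL = 1000  # assumed value; defined elsewhere in this module

def process_in_batches(items, handle):
    pending = 0
    for item in items:
        handle(item)
        pending += 1
        if pending > COMMIT_INTERVAL:
            transaction.commit()  # flush accumulated changes to the ZODB
            pending = 0
    transaction.commit()  # commit the final partial batch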
def insert(self, data):
    """Insert a row into the soup table.
    :param data: row dictionary
    :return: intid of the created record, or False if it already exists
    """
    if self._already_exists(data):
        logger.debug("Trying to insert an existing record... {}".format(data))
        return False
    record = Record()
    record.attrs[REMOTE_UID] = data[REMOTE_UID]
    record.attrs[LOCAL_UID] = data.get(LOCAL_UID, "")
    record.attrs[REMOTE_PATH] = data[REMOTE_PATH]
    record.attrs[LOCAL_PATH] = data.get(LOCAL_PATH, "")
    record.attrs[PORTAL_TYPE] = data[PORTAL_TYPE]
    record.attrs[UPDATED] = data.get(UPDATED, "0")
    r_id = self.soup.add(record)
    logger.info("Record {} inserted: {}".format(r_id, data))
    return r_id
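# Example usage, assuming the soup-format row produced by
# utils.get_soup_format(). The column constants (REMOTE_UID, REMOTE_PATH,
# PORTAL_TYPE, ...) are the module-level keys used throughout this file;
# the literal values below are illustrative only.
#
#   sh = SoupHandler("remote.example.com")
#   rec_id = sh.insert({
#       REMOTE_UID: "a1b2c3d4e5",
#       REMOTE_PATH: "/senaite/clients/client-1",
#       PORTAL_TYPE: "Client",
#   })
#   # rec_id is the soup intid, or False if the row was already there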
def get_json(self, url_or_endpoint, **kw):
    """Fetch the given URL or endpoint and return a parsed JSON object
    """
    api_url = self.get_api_url(url_or_endpoint, **kw)
    logger.debug("get_json::url={}".format(api_url))
    try:
        response = self.session.get(api_url)
    except Exception as e:
        message = "Could not connect to {}. Please check.".format(api_url)
        logger.error(message)
        logger.error(e)
        return {}
    status = response.status_code
    if status != 200:
        message = "GET for {} ({}) returned Status Code {}. Please check.".format(
            url_or_endpoint, api_url, status)
        logger.error(message)
        return {}
    return response.json()
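# Example usage: endpoints and full URLs are both accepted, since
# get_api_url() normalizes them. The endpoint and keyword below are
# illustrative assumptions, not a documented API.
#
#   data = self.get_json("users", limit=9999)
#   # -> {} on connection errors or non-200 responses, so callers can
#   #    safely do data.get("items", []) without extra error handling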
def _create_object_slug(self, container, data, *args, **kwargs):
    """Create a content object slug for the given data
    """
    id = data.get("id")
    remote_path = data.get("remote_path")
    portal_type = data.get("portal_type")
    types_tool = api.get_tool("portal_types")
    fti = types_tool.getTypeInfo(portal_type)
    if not fti:
        self.skipped.append(remote_path)
        logger.error("Type Info not found for {}".format(portal_type))
        return None
    logger.debug("Creating {} with ID {} in parent path {}".format(
        portal_type, id, api.get_path(container)))

    if fti.product:
        obj = _createObjectByType(portal_type, container, id)
    else:
        # new-style factory
        factory = getUtility(IFactory, fti.factory)
        obj = factory(id, *args, **kwargs)
        if hasattr(obj, '_setPortalTypeName'):
            obj._setPortalTypeName(fti.getId())
        # notifies ObjectWillBeAddedEvent, ObjectAddedEvent and
        # ContainerModifiedEvent
        container._setObject(id, obj)
        # get the object here by its current object id, as it might
        # already have been renamed by an event handler
        obj = container._getOb(obj.getId())

    # Be sure that the Creation Flag is cleared
    if obj.checkCreationFlag():
        obj.unmarkCreationFlag()
    return obj
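# The two construction paths above in isolation: a hedged sketch of how a
# CMF Factory Type Information (FTI) object selects between the old
# product-based constructor and a new-style zope.component factory.
# "Document" is only an example portal type.
#
#   fti = api.get_tool("portal_types").getTypeInfo("Document")
#   if fti.product:
#       pass  # old style: Products.CMFPlone.utils._createObjectByType
#   else:
#       factory = getUtility(IFactory, fti.factory)
#       # factory(id) returns a bare instance that still needs
#       # container._setObject(id, obj) to fire the add events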
def _import_users(self):
    """Import the users from the storage identified by domain
    """
    if not self.import_users:
        return
    logger.info("*** Importing Users: {} ***".format(self.domain_name))

    for user in self.yield_items("users"):
        username = user.get("username")
        if ploneapi.user.get(username):
            logger.debug("Skipping existing user {}".format(username))
            continue
        email = user.get("email", "")
        if not email:
            email = "{}@example.com".format(username)
        roles = user.get("roles", ())
        groups = user.get("groups", ())
        logger.debug("Creating user {}".format(username))
        message = _("Created new user {} with password {}".format(
            username, username))

        # create new user with the same password as the username
        ploneapi.user.create(
            email=email,
            username=username,
            password=username,
            roles=roles,
        )
        for group in groups:
            # Try to add the user to the group if the group exists.
            try:
                ploneapi.group.add_user(groupname=group, username=username)
            except KeyError:
                continue
        logger.debug(message)

    logger.info("*** Users Were Imported: {} ***".format(self.domain_name))
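# The user record shape this loop expects, as an illustration; the keys
# match what yield_items("users") is assumed to return from the remote,
# and the values below are made up:
#
#   {"username": "analyst1",
#    "email": "analyst1@example.com",
#    "roles": ["Analyst"],
#    "groups": ["Analysts"]}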
def _update_object_with_data(self, obj, data):
    """Update an existing object with data
    """
    # Proxy Fields must be set after their dependency objects are set.
    # Thus, we store all the ProxyFields and set them at the end.
    proxy_fields = []

    for fieldname, field in api.get_fields(obj).items():
        if fieldname in self.fields_to_skip:
            continue

        fm = IFieldManager(field)
        value = data.get(fieldname)
        kwargs = {}

        # Computed Fields don't have set methods.
        if isinstance(fm, ComputedFieldManager):
            continue

        # handle JSON data reference fields
        if isinstance(value, dict) and value.get("uid"):
            # dereference the referenced object
            local_uid = self.sh.get_local_uid(value.get("uid"))
            if local_uid:
                value = api.get_object_by_uid(local_uid)
            else:
                value = None
        elif isinstance(value, (list, tuple)):
            for item in value:
                # If it is a list of JSON data dicts of objects, add the
                # local UID to each dictionary. This local_uid can be
                # used in Field Managers.
                if isinstance(item, dict):
                    for k, v in item.iteritems():
                        if 'uid' in k:
                            local_uid = self.sh.get_local_uid(v)
                            item[k] = local_uid

        # handle file fields
        if field.type in ("file", "image", "blob"):
            if data.get(fieldname) is not None:
                fileinfo = data.get(fieldname)
                url = fileinfo.get("download")
                filename = fileinfo.get("filename")
                kwargs["filename"] = filename
                response = self.session.get(url)
                value = response.content

        # Leave the Proxy Fields for later
        if isinstance(fm, ProxyFieldManager):
            proxy_fields.append({
                'field_name': fieldname,
                'fm': fm,
                'value': value,
            })
            continue

        try:
            fm.set(obj, value, **kwargs)
        except Exception:
            logger.debug("Could not set field '{}' with value '{}'".format(
                fieldname, value))

    # All reference fields are set. We can set the proxy fields now.
    for pf in proxy_fields:
        field_name = pf.get("field_name")
        fm = pf.get("fm")
        value = pf.get("value")
        try:
            fm.set(obj, value)
        except Exception:
            logger.debug("Could not set field '{}' with value '{}'".format(
                field_name, value))

    # Set the workflow states
    wf_info = data.get("workflow_info", [])
    for wf_dict in wf_info:
        wf_id = wf_dict.get("workflow")
        review_history = wf_dict.get("review_history")
        self._import_review_history(obj, wf_id, review_history)

    # finally, reindex the object
    self.uids_to_reindex.append(api.get_uid(obj))
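# A minimal sketch of the reference dereferencing above, assuming a
# remote-to-local UID mapping like SoupHandler.get_local_uid(). The JSON
# shapes are the ones handled by _update_object_with_data: a single
# reference is a dict with a "uid" key, a multi-valued reference is a
# list of such dicts. This simplifies the key matching; the method above
# rewrites any key containing 'uid'.
def dereference(value, get_local_uid):
    if isinstance(value, dict) and value.get("uid"):
        # single reference: swap the remote dict for the local object
        local_uid = get_local_uid(value["uid"])
        return api.get_object_by_uid(local_uid) if local_uid else None
    if isinstance(value, (list, tuple)):
        # multi-valued reference: rewrite the remote UIDs in place
        for item in value:
            if isinstance(item, dict) and "uid" in item:
                item["uid"] = get_local_uid(item["uid"])
    return value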
def _create_dependencies(self, obj, data):
    """Creates and updates the objects' dependencies if they are not in
    the queue. Dependencies are found as UIDs in the object data.
    :param obj: an object to get dependencies created for
    :param data: object data
    """
    dependencies = []

    for fieldname, field in api.get_fields(obj).items():
        if fieldname in self.fields_to_skip:
            continue
        value = data.get(fieldname)
        if isinstance(value, dict) and value.get("uid"):
            dependencies.append(value.get("uid"))
        elif isinstance(value, (list, tuple)):
            for item in value:
                if isinstance(item, dict):
                    for k, v in item.iteritems():
                        if 'uid' in k:
                            dependencies.append(v)

    logger.debug("Dependencies of {} are: {}".format(
        repr(obj), dependencies))
    dependencies = list(set(dependencies))

    for r_uid in dependencies:
        dep_row = self.sh.find_unique(REMOTE_UID, r_uid)
        if dep_row is None:
            # If the dependency doesn't exist in the fetched data table,
            # just try to create its object for the first time
            dep_item = self.get_json(r_uid)
            if not dep_item:
                logger.error(
                    "Remote UID not found in fetched data: {}".format(
                        r_uid))
                continue
            if not utils.has_valid_portal_type(dep_item):
                logger.error(
                    "Skipping dependency with unknown portal type:"
                    " {}".format(dep_item))
                continue
            data_dict = utils.get_soup_format(dep_item)
            rec_id = self.sh.insert(data_dict)
            dep_row = self.sh.get_record_by_id(rec_id, as_dict=True)
            if self._parents_fetched(dep_item):
                self._handle_obj(dep_row, handle_dependencies=False)
            continue

        # If the dependency is being processed, skip it.
        if r_uid in self._queue:
            continue

        # No need to handle already updated objects
        if dep_row.get("updated") == "0":
            self._handle_obj(dep_row)
        # Reindex the dependency just in case it has a field that uses
        # a BackReference to this object.
        else:
            self.uids_to_reindex.append(dep_row.get(LOCAL_UID))

    return True
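# Worked example of the UID extraction above, using hypothetical field
# data: both a single reference dict and a list of reference dicts
# contribute their remote UIDs to the dependency list.
#
#   data = {"Client": {"uid": "c-123", "title": "Client 1"},
#           "Attachments": [{"uid": "a-001"}, {"uid": "a-002"}]}
#   # -> dependencies == ["c-123", "a-001", "a-002"]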