Example No. 1
    def _parents_fetched(self, item):
        """
        If data was fetched with a portal type filter, this method will be
        used to fill in the missing parents for the fetched objects.
        :return: True if ALL parents are fetched
        """
        # Never fetch parents of unnecessary objects
        if not utils.has_valid_portal_type(item):
            return False
        parent_path = item.get("parent_path")
        # Skip if the parent is the portal object
        if self.is_portal_path(parent_path):
            return True
        # Skip if it already exists
        if self.sh.find_unique(REMOTE_PATH, parent_path):
            return True
        logger.debug("Inserting missing parent: {}".format(parent_path))
        parent = self.get_first_item(item.get("parent_url"))
        if not parent:
            logger.error("Cannot fetch parent info: {} ".format(parent_path))
            return False
        par_dict = utils.get_soup_format(parent)
        self.sh.insert(par_dict)
        # Recursively import grandparents too
        return self._parents_fetched(parent)
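The method walks up the parent chain recursively until it reaches the portal root or a parent that is already registered in the import soup. For readers unfamiliar with the surrounding SoupHandler machinery, here is a minimal, framework-free sketch of the same pattern; the `fetch_remote` callable, the `seen` set and the `portal_path` argument are hypothetical stand-ins for `get_first_item`, the `find_unique(REMOTE_PATH, ...)` lookup and `is_portal_path` used above.

def ensure_parents_fetched(item, fetch_remote, seen, portal_path):
    """Recursively register the missing parents of `item`.

    A sketch only: `fetch_remote` and `seen` stand in for the remote
    JSON call and the import-soup lookup of the original method.
    """
    parent_path = item.get("parent_path")
    # Stop once the portal root is reached
    if parent_path == portal_path:
        return True
    # Stop if this parent was already registered
    if parent_path in seen:
        return True
    parent = fetch_remote(item.get("parent_url"))
    if not parent:
        # Parent could not be retrieved; the chain is incomplete
        return False
    seen.add(parent_path)
    # Walk up to the grandparent, great-grandparent, and so on
    return ensure_parents_fetched(parent, fetch_remote, seen, portal_path)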
Example No. 2
    def _fetch_data(self, window=1000, overlap=10):
        """Fetch data from the uid catalog in the source URL
        :param window: number of elements to be retrieved with each query to
                       the catalog
        :type window: int
        :param overlap: overlap between windows
        :type overlap: int
        :return:
        """
        logger.info("*** FETCHING DATA: {} ***".format(self.domain_name))
        start_time = datetime.now()
        storage = self.get_storage()
        storage["ordered_uids"] = []
        ordered_uids = storage["ordered_uids"]
        self.sh = SoupHandler(self.domain_name)
        # Dummy query to get the overall number of items in the specified catalog
        query = {
            "url_or_endpoint": "search",
            "catalog": 'uid_catalog',
            "limit": 1
        }
        if self.full_sync_types:
            types = list()
            types.extend(self.full_sync_types + self.prefixable_types +
                         self.update_only_types + self.read_only_types)
            query["portal_type"] = types
        cd = self.get_json(**query)
        # Knowing the catalog length, compute the number of pages we will need
        # with the desired window size and overlap
        effective_window = window - overlap
        # When we receive an error message in the JSON response, or no
        # response at all, the key 'count' doesn't exist.
        if not cd.get("count", None):
            error_message = "Error message: {}".format(
                cd.get('message', None) or '')
            logger.error(
                "A query to the JSON API returned and error. {}".format(
                    error_message))
            return

        number_of_pages = (cd["count"] / effective_window) + 1
        # Retrieve data from catalog in batches with size equal to window,
        # format it and insert it into the import soup
        for current_page in xrange(number_of_pages):
            start_from = (current_page * window) - overlap
            query["limit"] = window
            query["b_start"] = start_from
            items = self.get_items_with_retry(**query)
            if not items:
                logger.error("CAN NOT GET ITEMS FROM {} TO {}".format(
                    start_from, start_from + window))
            for item in items:
                # Skip the object or extract the required data for the import
                if not self.is_item_allowed(item):
                    continue
                data_dict = utils.get_soup_format(item)
                rec_id = self.sh.insert(data_dict)
                ordered_uids.insert(0, data_dict[REMOTE_UID])
                if not self._parents_fetched(item):
                    logger.warning(
                        "Some parents are missing: {} ".format(item))

            utils.log_process(task_name="Pages fetched",
                              started=start_time,
                              processed=current_page + 1,
                              total=number_of_pages)

        logger.info("*** FETCHING DATA FINISHED: {} ***".format(
            self.domain_name))

        transaction.commit()
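The window/overlap arithmetic above determines how many catalog pages are requested and where each batch starts. The sketch below reproduces exactly that calculation in isolation (integer division plus one extra page, as in the Python 2 original) so the batch boundaries can be previewed for a hypothetical catalog size; it assumes nothing beyond the `window` and `overlap` values shown in the method.

def batch_boundaries(count, window=1000, overlap=10):
    """Yield (b_start, limit) pairs the way _fetch_data computes them."""
    effective_window = window - overlap
    number_of_pages = (count // effective_window) + 1
    for current_page in range(number_of_pages):
        start_from = (current_page * window) - overlap
        yield start_from, window

# Example: a catalog with 2500 entries and the default window/overlap
for b_start, limit in batch_boundaries(2500):
    print("b_start={}, limit={}".format(b_start, limit))
# Prints b_start values of -10, 990 and 1990, each with limit=1000

Note that the first page starts at -overlap; whether such a negative b_start is clamped to zero is up to the remote JSON API.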
Example No. 3
    def _create_dependencies(self, obj, data):
        """
        Creates and updates objects' dependencies if they are not in the queue.
        Dependencies are found as UIDs in object data.
        :param obj: an object to get dependencies created
        :param data: object data
        """

        dependencies = []

        for fieldname, field in api.get_fields(obj).items():

            if fieldname in self.fields_to_skip:
                continue

            value = data.get(fieldname)

            if isinstance(value, dict) and value.get("uid"):
                dependencies.append(value.get("uid"))
            elif isinstance(value, (list, tuple)):
                for item in value:
                    if isinstance(item, dict):
                        for k, v in item.iteritems():
                            if 'uid' in k:
                                dependencies.append(v)

        logger.debug("Dependencies of {} are : {} ".format(
            repr(obj), dependencies))
        dependencies = list(set(dependencies))
        for r_uid in dependencies:
            dep_row = self.sh.find_unique(REMOTE_UID, r_uid)
            if dep_row is None:
                # If dependency doesn't exist in fetched data table,
                # just try to create its object for the first time
                dep_item = self.get_json(r_uid)
                if not dep_item:
                    logger.error(
                        "Remote UID not found in fetched data: {}".format(
                            r_uid))
                    continue
                if not utils.has_valid_portal_type(dep_item):
                    logger.error(
                        "Skipping dependency with unknown portal type:"
                        " {}".format(dep_item))
                    continue
                data_dict = utils.get_soup_format(dep_item)
                rec_id = self.sh.insert(data_dict)
                dep_row = self.sh.get_record_by_id(rec_id, as_dict=True)
                if self._parents_fetched(dep_item):
                    self._handle_obj(dep_row, handle_dependencies=False)
                continue

            # If the dependency is being processed, skip it.
            if r_uid in self._queue:
                continue

            # No need to handle already updated objects
            if dep_row.get("updated") == "0":
                self._handle_obj(dep_row)
            # Reindex the dependency just in case it has a field that uses
            # a BackReference to this object.
            else:
                self.uids_to_reindex.append(dep_row.get(LOCAL_UID))

        return True
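The loop at the top of `_create_dependencies` harvests remote UIDs from the object's field data: a single-reference field arrives as a dict with a "uid" key, a multi-valued reference field as a list of such dicts. Below is a minimal, standalone version of that harvesting step; the function name and the `fields`/`fields_to_skip` parameters are hypothetical stand-ins for `api.get_fields(obj)` and `self.fields_to_skip`.

def collect_dependency_uids(data, fields, fields_to_skip=()):
    """Collect the remote UIDs referenced by one object's field data."""
    dependencies = []
    for fieldname in fields:
        if fieldname in fields_to_skip:
            continue
        value = data.get(fieldname)
        if isinstance(value, dict) and value.get("uid"):
            # Single-reference field: {"uid": "...", ...}
            dependencies.append(value.get("uid"))
        elif isinstance(value, (list, tuple)):
            # Multi-valued reference field: a list of dicts
            for item in value:
                if isinstance(item, dict):
                    for key, val in item.items():
                        if "uid" in key:
                            dependencies.append(val)
    # De-duplicate before the dependencies are processed
    return list(set(dependencies))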