def get_items(self, path, depth=0):
    if path and path[-1] == '/':
        path = path[:-1]
    if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:
        item, subitems = self.get_remote_item(path)
        if item is None:
            logger.warn(':: Skipping -> %s. No remote data.' % path)
            return
        if item.startswith('ERROR'):
            logger.error(
                "Could not get item '%s' from remote. Got %s."
                % (path, item))
            return
        try:
            item = json.loads(item)
        except ValueError:
            # json raises plain ValueError on Python 2;
            # json.JSONDecodeError (Python 3) subclasses ValueError
            logger.error(
                "Could not decode item from path '%s' as JSON." % path)
            return
        logger.info(':: Crawling %s' % item['_path'])
        # item['_path'] is relative to domain root. we need relative to
        # plone root
        remote_url = self.remote_url
        _, _, remote_path, _, _, _ = urlparse.urlparse(remote_url)
        item['_path'] = item['_path'][len(remote_path):]
        if item['_path'].startswith('/'):
            item['_path'] = item['_path'][1:]
        if item['_type'] != "Plone Site":
            # the site root itself is never yielded
            yield item
        if subitems.startswith('ERROR'):
            logger.error(
                "Could not get subitems for '%s'. Got %s."
                % (path, subitems))
            return
        for subitem_id in json.loads(subitems):
            subitem_path = path + '/' + subitem_id
            if subitem_path[len(self.remote_path):] \
                    in self.remote_skip_path:
                logger.info(':: Skipping -> ' + subitem_path)
                continue
            for subitem in self.get_items(subitem_path, depth + 1):
                yield subitem
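# The urlparse step above is easiest to see with concrete values. A
# minimal sketch, assuming a hypothetical remote_url of
# 'http://localhost:8080/plone' (any netloc/path will do):
#
#     >>> import urlparse  # urllib.parse on Python 3
#     >>> _, _, remote_path, _, _, _ = urlparse.urlparse(
#     ...     'http://localhost:8080/plone')
#     >>> remote_path
#     '/plone'
#     >>> p = '/plone/news/item-1'[len(remote_path):]
#     >>> p[1:] if p.startswith('/') else p
#     'news/item-1'
#
# i.e. an item path rooted at the domain becomes relative to the Plone
# site root.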
def get_items(self, path, depth=0):
    if path and path[-1] == '/':
        path = path[:-1]
    if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:
        item, subitems = self.get_remote_item(path)
        if item is None:
            logger.warn(':: Skipping -> %s. No remote data.' % path)
            return
        if item.startswith('ERROR'):
            logger.error("Could not get item '%s' from remote. Got %s."
                         % (path, item))
            return
        item = simplejson.loads(item)
        logger.info(':: Crawling %s' % item['_path'])
        if self.local_path:
            item['_path'] = (self.local_path +
                             item['_path'][len(self.remote_path):])
        # item['_path'] is relative to domain root. we need relative to
        # plone root
        # remote_url = self.remote_url
        # _, _, remote_path, _, _, _ = urlparse.urlparse(remote_url)
        # item['_path'] = item['_path'][len(remote_path):]
        # if item['_path'].startswith('/'):
        #     item['_path'] = item['_path'][1:]
        if item['_type'] != "Plone Site":
            yield item
        if subitems.startswith('ERROR'):
            logger.error("Could not get subitems for '%s'. Got %s."
                         % (path, subitems))
            return
        for subitem_id in simplejson.loads(subitems):
            subitem_path = path + '/' + subitem_id
            if subitem_path[len(self.remote_path):] in self.remote_skip_path:
                logger.info(':: Skipping -> ' + subitem_path)
                continue
            if self.remote_catalog_query:
                if subitem_path not in self.remote_ok_path:
                    logger.info(':: Skipping (2) -> ' + subitem_path)
                    continue
            for subitem in self.get_items(subitem_path, depth + 1):
                yield subitem
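# This variant re-roots paths under local_path instead of stripping the
# remote URL prefix. A minimal sketch with made-up values, assuming
# remote_path '/plone' and local_path '/newsite':
#
#     >>> remote_path, local_path = '/plone', '/newsite'
#     >>> local_path + '/plone/news/item-1'[len(remote_path):]
#     '/newsite/news/item-1'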
def get_items(self, path, depth=0):
    if path and path[-1] == '/':
        path = path[:-1]
    if self.remote_crawl_depth == -1 or depth <= self.remote_crawl_depth:
        item, subitems = self.get_remote_item(path)
        if item is None:
            logger.warn(':: Skipping -> %s. No remote data.' % path)
            return
        if item.startswith(b'ERROR'):
            logger.error("Could not get item '%s' from remote. Got %s."
                         % (path, item))
            return
        try:
            item = json.loads(item)
        except json.JSONDecodeError:
            logger.error("Could not decode item from path '%s' as JSON."
                         % path)
            return
        logger.info(':: Crawling %s' % item['_path'])
        # item['_path'] is relative to domain root. we need relative to
        # plone root
        remote_url = self.remote_url
        _, _, remote_path, _, _, _ = urllib.parse.urlparse(remote_url)
        item['_path'] = item['_path'][len(remote_path):]
        if item['_path'].startswith('/'):
            item['_path'] = item['_path'][1:]
        if item['_type'] != "Plone Site":
            yield item
        if subitems.startswith(b'ERROR'):
            logger.error("Could not get subitems for '%s'. Got %s."
                         % (path, subitems))
            return
        for subitem_id in json.loads(subitems):
            subitem_path = path + '/' + subitem_id
            if subitem_path[len(self.remote_path):] \
                    in self.remote_skip_path:
                logger.info(':: Skipping -> ' + subitem_path)
                continue
            for subitem in self.get_items(subitem_path, depth + 1):
                yield subitem
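# All three variants above share the same crawl pattern: depth-limited
# recursion over a remote tree with a skip list. The sketch below (not
# part of the source; `tree` and `crawl` are made-up names) isolates
# that pattern with a plain dict standing in for get_remote_item.

def crawl(tree, path, max_depth, skip, depth=0):
    # Stop descending once the configured depth is exceeded;
    # -1 means unlimited, mirroring remote_crawl_depth.
    if max_depth != -1 and depth > max_depth:
        return
    yield path
    for child in tree.get(path, ()):
        child_path = path + '/' + child
        if child_path in skip:
            continue
        for sub in crawl(tree, child_path, max_depth, skip, depth + 1):
            yield sub

# Example (hypothetical data): skip '/plone/private', cut off below depth 1.
#
#     >>> tree = {'/plone': ['news', 'private'], '/plone/news': ['item-1']}
#     >>> list(crawl(tree, '/plone', 1, {'/plone/private'}))
#     ['/plone', '/plone/news']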
def __iter__(self):
    for item in self.previous:
        yield item
    offset = int(self.options.get("offset", "0"))
    limit = int(self.options.get("limit", "0"))
    counter = 0
    if hasattr(self.transmogrifier, "jsonmigrator_offset"):
        # truncate results when live importing: inject the parameters
        # below into the transmogrifier object if you are doing an
        # interactive import of small chunks at the python prompt
        print("total results: ", len(self.item_paths))
        hard_limit = (self.transmogrifier.jsonmigrator_offset +
                      self.transmogrifier.jsonmigrator_limit)
        self.item_paths = self.item_paths[
            self.transmogrifier.jsonmigrator_offset:hard_limit]
        logger.warn("Migrating %d items from position %s"
                    % (self.transmogrifier.jsonmigrator_limit,
                       self.transmogrifier.jsonmigrator_offset))
    for path in self.item_paths:
        skip = False
        if not counter % 100:
            # lightweight progress indicator every 100 items
            print(counter)
        counter += 1
        if counter < offset:
            logger.debug("Skipping item n.# %d at %s " % (counter, path))
            continue
        if limit and counter > (offset + limit):
            logger.debug("Post skipping item n.# %d at %s "
                         % (counter, path))
            continue
        for skip_path in self.remote_skip_paths:
            if path.startswith(skip_path):
                skip = True
        if not skip:
            item = self.get_remote_item(path)
            if item:
                item['_path'] = item['_path'][self.site_path_length:]
                yield item
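# Hedged usage sketch for the chunking hook above: set
# jsonmigrator_offset / jsonmigrator_limit on the transmogrifier object
# before running it, and only that slice of item_paths is migrated. How
# the pipeline is constructed and invoked below is an assumption, not
# part of this source:
#
#     >>> transmogrifier = Transmogrifier(portal)    # hypothetical setup
#     >>> transmogrifier.jsonmigrator_offset = 200   # start position
#     >>> transmogrifier.jsonmigrator_limit = 100    # chunk size
#     >>> transmogrifier(u'my_pipeline')             # migrates items 200-299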