def testCrawlerJson(self):
    """
    Test that you can convert a crawler to json and back.
    """
    crawler = Crawler.create(PathHolder(self.__turntableFile))
    jsonResult = crawler.toJson()
    crawlerResult = Crawler.createFromJson(jsonResult)
    self.assertCountEqual(crawler.varNames(), crawlerResult.varNames())
    self.assertCountEqual(crawler.contextVarNames(), crawlerResult.contextVarNames())
    self.assertCountEqual(crawler.tagNames(), crawlerResult.tagNames())
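# A follow-up sketch, not part of the original suite: it shows the same round
# trip going through a json file on disk, which is how serialized crawlers are
# shipped to the farm (see __runCollapsed below). The method name is
# hypothetical and the json, os and tempfile modules are assumed to be
# available at module scope.
def testCrawlerJsonFile(self):
    """
    Sketch: write a serialized crawler to a json file and load it back.
    """
    crawler = Crawler.create(PathHolder(self.__turntableFile))

    with tempfile.TemporaryDirectory() as tempDirectory:
        jsonFilePath = os.path.join(tempDirectory, "crawlers.json")

        # the farm auxiliary files store a list of serialized crawlers
        with open(jsonFilePath, 'w') as jsonFile:
            json.dump([crawler.toJson()], jsonFile)

        with open(jsonFilePath) as jsonFile:
            restored = list(map(Crawler.createFromJson, json.load(jsonFile)))

    self.assertEqual(len(restored), 1)
    self.assertCountEqual(crawler.varNames(), restored[0].varNames())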
def __runCollapsed(data, taskHolder, dataJsonFile):
    """
    Execute a collapsed job.
    """
    taskInputFilePaths = []

    # we use the base name of dataJsonFile to find auxiliary files used by
    # the dispatcher
    name, ext = os.path.splitext(dataJsonFile)

    # checking if the job has already been processed. This happens when a
    # collapsed job dispatches expanded jobs on the farm: the dispatched jobs
    # get added as dependencies of the collapsed job itself, so when those
    # dependencies are completed the job gets resumed (restarted). We want to
    # avoid executing the job again, therefore we simply return right away.
    jobProcessedFilePath = "{}_jobProcessed".format(name)
    if os.path.exists(jobProcessedFilePath):
        sys.stdout.write("Job has been already processed, skipping it.\n")
        return
    # otherwise we "touch" a file used later on to tell that the job has
    # already been processed
    else:
        open(jobProcessedFilePath, 'a').close()

    # looking for its own job id on the farm; this information is used to
    # add the expanded jobs as dependencies of the job itself.
    jobIdFilePath = "{}_jobId.{ext}".format(
        name,
        ext=ext[1:]
    )

    mainJobId = None
    if os.path.exists(jobIdFilePath):
        with open(jobIdFilePath) as jsonFile:
            mainJobId = json.load(jsonFile)["id"]

    # looking for a task input that has been split into chunks on the farm
    if len(data['taskInputFilePaths']) == 1 and not os.path.exists(data['taskInputFilePaths'][0]):
        nameParts = os.path.splitext(data['taskInputFilePaths'][0])
        taskInputFilePaths = glob(
            "{}_range_*_*.{ext}".format(nameParts[0], ext=nameParts[1][1:])
        )

        # since the range is zero padded, sorting provides the proper order
        # in which the crawlers should be loaded
        taskInputFilePaths.sort()
    else:
        taskInputFilePaths = data['taskInputFilePaths']

    # loading input crawlers
    crawlers = []
    for taskInputFilePath in taskInputFilePaths:
        with open(taskInputFilePath) as jsonFile:
            serializedCrawlers = json.load(jsonFile)
            crawlers += list(map(lambda x: Crawler.createFromJson(x), serializedCrawlers))

    dispatcher = Dispatcher.createFromJson(data['dispatcher'])
    dispatchedIds = dispatcher.dispatch(
        taskHolder,
        crawlers
    )

    # since this job can be used as a dependency of other jobs we need to
    # include the dispatched jobs as dependencies of itself. Also, the
    # implementation of "extendDependencyIds" may need to mark the mainJobId
    # as pending again in case your render farm manager does not do that
    # automatically. In case your render farm manager executes the main job
    # again (when all the new dependencies are completed), the dispatcher is
    # going to ignore the second execution automatically.
    if mainJobId is not None:
        dispatcher.extendDependencyIds(
            mainJobId,
            dispatchedIds
        )
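# A small standalone sketch of the chunk-resolution rule used above; the helper
# name is hypothetical and not part of this module, and it assumes the same
# module level imports already relied on here (os and "from glob import glob").
# When a task input file was split into chunks on the farm, the pieces follow
# the "<name>_range_*_*.<ext>" pattern and, since the range is zero padded, a
# plain sort restores the loading order.
def __resolveChunkedInputFilePaths(taskInputFilePath):
    """
    Sketch: return the chunked pieces of a task input file, or the file itself.
    """
    # the file exists, therefore it was not split into chunks
    if os.path.exists(taskInputFilePath):
        return [taskInputFilePath]

    # otherwise collect the chunk files and sort them back into range order
    name, ext = os.path.splitext(taskInputFilePath)
    chunkFilePaths = glob("{}_range_*_*.{ext}".format(name, ext=ext[1:]))
    chunkFilePaths.sort()

    return chunkFilePaths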