Example #1
    def testCrawlerJson(self):
        """
        Test that you can convert a crawler to JSON and back.
        """
        crawler = Crawler.create(PathHolder(self.__turntableFile))
        jsonResult = crawler.toJson()
        crawlerResult = Crawler.createFromJson(jsonResult)
        self.assertCountEqual(crawler.varNames(), crawlerResult.varNames())
        self.assertCountEqual(crawler.contextVarNames(), crawlerResult.contextVarNames())
        self.assertCountEqual(crawler.tagNames(), crawlerResult.tagNames())
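The same round trip can be sketched outside of a test case. The snippet below is a minimal sketch: the media file path is hypothetical, while Crawler.create, PathHolder, toJson and createFromJson are exactly the calls exercised by the test above.

# Crawler and PathHolder are assumed to be imported from the framework's modules
# hypothetical input path; any file the crawler recognizes would do
crawler = Crawler.create(PathHolder("/tmp/turntable.mov"))

# serialize the crawler to JSON and rebuild it from that data
jsonResult = crawler.toJson()
restoredCrawler = Crawler.createFromJson(jsonResult)

# the variable names survive the round trip (order may differ)
assert sorted(crawler.varNames()) == sorted(restoredCrawler.varNames())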
Example #2
import json
import os
import sys
from glob import glob

# Crawler and Dispatcher come from the framework's own modules; the exact
# import paths depend on how the package is laid out.

def __runCollapsed(data, taskHolder, dataJsonFile):
    """
    Execute a collapsed job.
    """
    taskInputFilePaths = []

    # we use the base dataJsonFile to find auxiliary files used by
    # the dispatcher
    name, ext = os.path.splitext(dataJsonFile)

    # checking if the job has already been processed. This happens
    # when a collapsed job dispatches expanded jobs on the farm. The
    # dispatched jobs get added as dependencies of the collapsed
    # job itself. Therefore, when the dependencies are completed
    # and the job gets resumed (restarted), we want to avoid
    # executing it again by returning right away.
    jobProcessedFilePath = "{}_jobProcessed".format(name)
    if os.path.exists(jobProcessedFilePath):
        sys.stdout.write("Job has already been processed, skipping it.\n")
        return
    # otherwise we "touch" a marker file that later tells us the job has
    # already been processed
    else:
        open(jobProcessedFilePath, 'a').close()

    # looking for the job's own id on the farm; this information
    # is going to be used to add the expanded jobs as dependencies
    # of the job itself.
    jobIdFilePath = "{}_jobId.{ext}".format(
        name,
        ext=ext[1:]
    )

    mainJobId = None
    if os.path.exists(jobIdFilePath):
        with open(jobIdFilePath) as jsonFile:
            mainJobId = json.load(jsonFile)["id"]

    # looking for a task that has been chunkified on the farm
    if len(data['taskInputFilePaths']) == 1 and not os.path.exists(data['taskInputFilePaths'][0]):
        nameParts = os.path.splitext(data['taskInputFilePaths'][0])

        taskInputFilePaths = glob(
            "{}_range_*_*.{ext}".format(nameParts[0], ext=nameParts[1][1:])
        )

        # since the range tokens are zero-padded, sorting the paths yields
        # the proper order in which the crawlers should be loaded
        taskInputFilePaths.sort()
    else:
        taskInputFilePaths = data['taskInputFilePaths']

    # loading input crawlers
    crawlers = []
    for taskInputFilePath in taskInputFilePaths:
        with open(taskInputFilePath) as jsonFile:
            serializedCrawlers = json.load(jsonFile)
            crawlers += list(map(Crawler.createFromJson, serializedCrawlers))

    dispatcher = Dispatcher.createFromJson(data['dispatcher'])
    dispatchedIds = dispatcher.dispatch(
        taskHolder,
        crawlers
    )

    # since this job can be used as a dependency of other jobs,
    # we need to include the dispatched jobs as dependencies
    # of itself. Also, the implementation of "extendDependencyIds"
    # may need to mark the mainJobId as pending again in case
    # your render farm manager does not do that automatically.
    # In case your render farm manager executes the main job again
    # (when all the new dependencies are completed), the dispatcher
    # is going to ignore the second execution automatically.
    if mainJobId is not None:
        dispatcher.extendDependencyIds(
            mainJobId,
            dispatchedIds
        )
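The marker-file guard at the top of __runCollapsed is what keeps a resumed collapsed job from dispatching everything twice. Below is a minimal, self-contained sketch of that pattern using only the standard library; the helper name runOnce and the "_jobProcessed" suffix are illustrative, not part of the framework.

import os
import sys

def runOnce(name, work):
    """
    Illustrative helper: run "work" once, skipping it on later resumes.
    """
    jobProcessedFilePath = "{}_jobProcessed".format(name)

    # a previous run already touched the marker file, so return right away
    if os.path.exists(jobProcessedFilePath):
        sys.stdout.write("Job has already been processed, skipping it.\n")
        return

    # touch the marker first so a resumed (restarted) job bails out above
    open(jobProcessedFilePath, 'a').close()
    work()

# example usage: only the first call does the work
runOnce("/tmp/collapsedJob", lambda: sys.stdout.write("dispatching...\n"))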