Example 1
    def run(self, job_name, sequence_num, time_started, namespace, output):
        results = []

        # TODO(mgainer): Notice errors earlier in pipeline, and mark job
        # as failed in that case as well.
        try:
            iterator = input_readers.RecordsReader(output, 0)
            for item in iterator:
                # Map/reduce puts reducer output into blobstore files as a
                # string obtained via "str(result)".  Use AST as a safe
                # alternative to eval() to get the Python object back.
                results.append(ast.literal_eval(item))
            time_completed = time.time()
            with Namespace(namespace):
                db.run_in_transaction(
                    DurableJobEntity._complete_job, job_name, sequence_num,
                    MapReduceJob.build_output(self.root_pipeline_id, results),
                    long(time_completed - time_started))
        # Don't know what exceptions are currently, or will be in future,
        # thrown from Map/Reduce or Pipeline libraries; these are under
        # active development.
        #
        # pylint: disable=broad-except
        except Exception as ex:
            time_completed = time.time()
            with Namespace(namespace):
                db.run_in_transaction(
                    DurableJobEntity._fail_job, job_name, sequence_num,
                    MapReduceJob.build_output(self.root_pipeline_id, results,
                                              str(ex)),
                    long(time_completed - time_started))
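A note on the comment above: ast.literal_eval() parses only Python literals (strings, numbers, tuples, lists, dicts, booleans, None), so it can round-trip reducer output written as "str(result)" without the arbitrary-code-execution risk of eval(). A minimal, self-contained sketch with an illustrative value:

    import ast

    # A reducer result serialized the way the map/reduce library stores it.
    result = {'url': 'http://example.com', 'count': 42}
    serialized = str(result)

    # literal_eval parses the literal text back into an equivalent object...
    assert ast.literal_eval(serialized) == result

    # ...but rejects anything that is not a pure literal, unlike eval().
    try:
        ast.literal_eval("__import__('os').getcwd()")
    except ValueError:
        pass  # raised instead of executing the call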
Example 2
    def testSuccessfulRun(self):
        file_name1 = self.createMockData((
            "http://hoge_0.com",
            "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        ))
        file_name2 = self.createMockData((
            "http://hoge_1.com",
            "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"
        ))
        createMockCrawlDbDatum(2, 6, True)
        p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
                                               [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchSetsBufferPipeline.from_id(
            p.pipeline_id)

        # The pipeline's default output slot should list readable blobstore files.
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertIsNotNone(key)
            self.assertIsNotNone(value)
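Each binary_record above is an encoded KeyValue protocol buffer emitted by the map/reduce machinery. A hedged sketch of the producer side, assuming the proto1-style setters and Encode() method that the generated file_service_pb.KeyValue class exposes:

    from google.appengine.api.files import file_service_pb

    # Build the same kind of record the pipeline under test writes out.
    proto = file_service_pb.KeyValue()
    proto.set_key("http://hoge_0.com")
    proto.set_value("User-agent: test\nDisallow: /content_0")
    binary_record = proto.Encode()  # the bytes that ParseFromString() reverses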
Example 3
    def testFetchError(self):
        blob_keys = self.createInvalidMockData()
        static_content = "User-agent: *\nDisallow: /search\nDisallow: /sdch\nDisallow: /groups"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
        p.start()

        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

        # The pipeline's default output slot should list readable blobstore files.
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        reader = input_readers.RecordsReader(file_list, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertEquals("invalidScheme://test_url.com", key)
            self.assertEquals("User-agent: *\nDisallow: /", value)
Example 4
    def testSuccessfulRun(self):
        file_name1 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "http://k.yimg.jp/images/top/sp/logo.gif"))
        file_name2 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "/appengine/images/slide1.png"))
        datum = CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum,
                           "https://developers.google.com/appengine/"),
            url="https://developers.google.com/appengine/",
            extract_domain_url="https://developers.google.com",
            last_status=pipelines.UNFETCHED)
        datum.put()
        resource = self.getResource("slide1.png")
        static_content = resource.read()
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "image/png"
                            })
        p = pipelines._FetchContentPipeline("FetchContentPipeline",
                                            [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchContentPipeline.from_id(
            p.pipeline_id)

        # The pipeline's default output slot should list readable blobstore files.
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertIsNotNone(key)
            self.assertIsNotNone(value)

        query = CrawlDbDatum.query(
            CrawlDbDatum.url == "https://developers.google.com/appengine/")
        crawl_db_datums = query.fetch()
        self.assertTrue(len(crawl_db_datums) > 0)
        key = crawl_db_datums[0].key
        content_datums = ContentDbDatum.query(ancestor=key).fetch()
        self.assertEqual(2, len(content_datums))
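The closing assertions lean on ndb entity groups: the pipeline stores each ContentDbDatum as a child of the CrawlDbDatum key, so the ancestor query returns exactly the content rows for that URL. A minimal sketch of the parent/child write, assuming ContentDbDatum accepts a parent key like any ndb model:

    from google.appengine.ext import ndb

    parent_key = ndb.Key(CrawlDbDatum, "https://developers.google.com/appengine/")

    # Writing the child under the crawl datum's key puts both entities in one
    # entity group, which is what makes the ancestor query above possible.
    content = ContentDbDatum(parent=parent_key)  # fields omitted for brevity
    content.put()

    children = ContentDbDatum.query(ancestor=parent_key).fetch()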
Example 5
    def run(self, job_id, job_class_str, output):
        job_class = mapreduce_util.for_name(job_class_str)

        try:
            iterator = input_readers.RecordsReader(output, 0)
            results_list = []
            for item in iterator:
                # Map/reduce puts reducer output into blobstore files as a
                # string obtained via "str(result)".  Use AST as a safe
                # alternative to eval() to get the Python object back.
                results_list.append(ast.literal_eval(item))
            job_class.register_completion(job_id, results_list)
        except Exception as e:
            logging.error(traceback.format_exc())
            logging.error('Job %s failed at %s',
                          job_id, utils.get_current_time_in_millisecs())
            job_class.register_failure(
                job_id, '%s\n%s' % (unicode(e), traceback.format_exc()))
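mapreduce_util.for_name(job_class_str) resolves a dotted class path back into a class object, which is what lets the pipeline persist its completion callback as a plain string. A rough stdlib equivalent, assuming the usual "package.module.ClassName" convention (the path below is hypothetical):

    import importlib

    def for_name(dotted_name):
        # Split "pkg.module.ClassName" into module path and attribute name.
        module_name, _, attr = dotted_name.rpartition('.')
        return getattr(importlib.import_module(module_name), attr)

    job_class = for_name('core.jobs.SampleMapReduceJob')  # hypothetical path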