def testSortFile(self):
  """Test sorting a file."""
  input_file = files.blobstore.create()

  input_data = [(str(i), "_" + str(i)) for i in range(100)]

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler._SortChunksPipeline("testjob", [input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

  input_data.sort()
  output_files = p.outputs.default.value[0]
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for binary_record in records.RecordsReader(f):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        output_data.append((proto.key(), proto.value()))

  self.assertEquals(input_data, output_data)
def write(self, data, ctx):
  """Write data.

  Args:
    data: actual data yielded from handler. Type is writer-specific.
    ctx: an instance of context.Context.
  """
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)

  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  file_index = key.__hash__() % len(self._filenames)
  pool_name = "kv_pool%d" % file_index
  filename = self._filenames[file_index]

  if ctx.get_pool(pool_name) is None:
    ctx.register_pool(pool_name,
                      output_writers.RecordsPool(filename=filename, ctx=ctx))
  proto = file_service_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  ctx.get_pool(pool_name).append(proto.Encode())
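# Standalone sketch (not part of the writer above) of the routing invariant it
# relies on: a given key always hashes to the same file index, so every value
# for that key lands in the same output file. File names are hypothetical.
filenames = ["shard-0", "shard-1", "shard-2"]
for key in ["apple", "banana", "apple"]:
  file_index = key.__hash__() % len(filenames)
  print("%s -> %s" % (key, filenames[file_index]))  # "apple" maps to the same shard both times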
def testSuccessfulRun(self):
  file_name1 = self.createMockData((
      "http://hoge_0.com",
      "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"))
  file_name2 = self.createMockData((
      "http://hoge_1.com",
      "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"))
  createMockCrawlDbDatum(2, 6, True)

  p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
                                         [file_name1, file_name2])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchSetsBufferPipeline.from_id(p.pipeline_id)

  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))

  reader = input_readers.RecordsReader(file_paths, 0)
  for binary_record in reader:
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    key = proto.key()
    value = proto.value()
    self.assertTrue(key is not None)
    self.assertTrue(value is not None)
def testFetchError(self):
  blob_keys = self.createInvalidMockData()
  static_content = ("User-agent: *\nDisallow: /search\nDisallow: /sdch\n"
                    "Disallow: /groups")
  self.setReturnValue(content=static_content,
                      headers={"Content-Length": len(static_content),
                               "Content-Type": "text/html"})
  p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

  # Can open files
  file_list = finished_map.outputs.default.value
  self.assertTrue(len(file_list) > 0)

  reader = input_readers.RecordsReader(file_list, 0)
  for binary_record in reader:
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    key = proto.key()
    value = proto.value()
    self.assertEquals("invalidScheme://test_url.com", key)
    self.assertEquals("User-agent: *\nDisallow: /", value)
def testMergeFiles(self):
  """Test merging multiple files."""
  input_data = [(str(i), "_" + str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = TestMergePipeline([input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = TestMergePipeline.from_id(p.pipeline_id)

  output_file = p.outputs.default.value[0]
  output_data = []
  with files.open(output_file, "r") as f:
    for record in records.RecordsReader(f):
      output_data.append(record)

  expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
  self.assertEquals(expected_data, output_data)
def _hashing_map(binary_record):
  """A map function used in hash phase.

  Reads KeyValue from binary record and yields (key, value).
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  yield (proto.key(), proto.value())
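# Minimal round-trip sketch for _hashing_map, using only the KeyValue calls
# already shown in this module (set_key/set_value/Encode): encode a pair, then
# confirm the map yields it back. Assumes file_service_pb is importable here.
def _example_hashing_map_roundtrip():
  proto = file_service_pb.KeyValue()
  proto.set_key("k1")
  proto.set_value("v1")
  assert list(_hashing_map(proto.Encode())) == [("k1", "v1")]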
def _fetchContentMap(binary_record):
  """Map function of fetch content.

  Fetched content is stored to the blobstore.

  Args:
    binary_record: key-value data, whose key is the URL of the target page
      and whose value is the URL of the fetch target.

  Yields:
    The fetch target and the URL it was stored under, as
    "target_url:stored_url".
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  page_url = proto.key()
  target_url = proto.value()

  # Fetch the CrawlDbDatum for the page.
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Failed to create key, caused by invalid url:" +
                    page_url + ":" + e.message)

  # Start fetch.
  fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
  stored_url = None
  if re.match("^/", target_url):
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

  try:
    fetch_result = fetcher.get(target_url)
    if fetch_result:
      # Store the content to blobstore.
      blob_io = files.blobstore.create(
          mime_type=fetch_result.get("mime_type"),
          _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
      with files.open(blob_io, 'a') as f:
        f.write(fetch_result.get("content"))
      files.finalize(blob_io)
      blob_key = files.blobstore.get_blob_key(blob_io)
      stored_url = images.get_serving_url(str(blob_key))
  except Exception as e:
    logging.warning("Fetch Error Occurs:" + e.message)

  # Put the content to datastore.
  crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
  if crawl_db_datum and stored_url is not None:
    entity = ContentDbDatum(
        parent=crawl_db_datum.key,
        fetched_url=fetch_result.get("fetched_url"),
        stored_url=stored_url,
        content_type=fetch_result.get("mime_type"),
        content_size=fetch_result.get("content_length"),
        http_headers=str(fetch_result.get("headers")))
    entity.put()

  yield "%s:%s" % (target_url, stored_url)
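# _getCrawlDatum is referenced above but not defined in this section. A minimal
# sketch, assuming it resolves the async query and returns the first matching
# CrawlDbDatum (or None); the real helper may differ.
def _getCrawlDatum(crawl_db_datum_future):
  """Returns the first CrawlDbDatum from an async query, or None."""
  crawl_db_datums = crawl_db_datum_future.get_result()
  if crawl_db_datums:
    return crawl_db_datums[0]
  return None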
def testSuccessfulRun(self):
  file_name1 = self.createMockData(
      ("https://developers.google.com/appengine/",
       "http://k.yimg.jp/images/top/sp/logo.gif"))
  file_name2 = self.createMockData(
      ("https://developers.google.com/appengine/",
       "/appengine/images/slide1.png"))
  datum = CrawlDbDatum(
      parent=ndb.Key(CrawlDbDatum, "https://developers.google.com/appengine/"),
      url="https://developers.google.com/appengine/",
      extract_domain_url="https://developers.google.com",
      last_status=pipelines.UNFETCHED)
  datum.put()

  resource = self.getResource("slide1.png")
  static_content = resource.read()
  self.setReturnValue(content=static_content,
                      headers={"Content-Length": len(static_content),
                               "Content-Type": "image/png"})

  p = pipelines._FetchContentPipeline("FetchContentPipeline",
                                      [file_name1, file_name2])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  finished_map = pipelines._FetchContentPipeline.from_id(p.pipeline_id)

  # Can open files
  file_paths = finished_map.outputs.default.value
  self.assertTrue(len(file_paths) > 0)
  self.assertTrue(file_paths[0].startswith("/blobstore/"))

  reader = input_readers.RecordsReader(file_paths, 0)
  for binary_record in reader:
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    key = proto.key()
    value = proto.value()
    self.assertTrue(key is not None)
    self.assertTrue(value is not None)

  query = CrawlDbDatum.query(
      CrawlDbDatum.url == "https://developers.google.com/appengine/")
  crawl_db_datums = query.fetch()
  self.assertTrue(len(crawl_db_datums) > 0)
  key = crawl_db_datums[0].key
  content_datums = ContentDbDatum.query(ancestor=key).fetch()
  self.assertEqual(2, len(content_datums))
def testPartialRecords(self):
  """Test merging into partial key values."""
  try:
    self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
    # force max values count to extremely low value.
    shuffler._MergePipeline._MAX_VALUES_COUNT = 1

    input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
    input_data.sort()

    input_file = files.blobstore.create()
    with files.open(input_file, "a") as f:
      with records.RecordsWriter(f) as w:
        for (k, v) in input_data:
          proto = file_service_pb.KeyValue()
          proto.set_key(k)
          proto.set_value(v)
          w.write(proto.Encode())
    files.finalize(input_file)
    input_file = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(input_file))

    p = TestMergePipeline([input_file, input_file, input_file])
    p.start()
    test_support.execute_until_empty(self.taskqueue)
    p = TestMergePipeline.from_id(p.pipeline_id)

    output_file = p.outputs.default.value[0]
    output_data = []
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        output_data.append(record)

    expected_data = [
        ('1', ['a'], True),
        ('1', ['a'], True),
        ('1', ['a'], False),
        ('2', ['b'], True),
        ('2', ['b'], True),
        ('2', ['b'], False),
        ('3', ['c'], True),
        ('3', ['c'], True),
        ('3', ['c'], False),
    ]
    self.assertEquals([str(e) for e in expected_data], output_data)
  finally:
    shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
def write(self, data, ctx):
  if len(data) != 2:
    logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                  len(data), data)

  try:
    key = str(data[0])
    value = str(data[1])
  except TypeError:
    logging.error("Expecting a tuple, but got %s: %s",
                  data.__class__.__name__, data)

  proto = file_service_pb.KeyValue()
  proto.set_key(key)
  proto.set_value(value)
  output_writers.BlobstoreRecordsOutputWriter.write(self, proto.Encode(), ctx)
def createMockData(self, data):
  """Create mock data for FetchContentPipeline"""
  input_file = files.blobstore.create()
  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      key = str(data[0])
      value = str(data[1])
      proto = file_service_pb.KeyValue()
      proto.set_key(key)
      proto.set_value(value)
      w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))
  return input_file
def _makeFetchSetBufferMap(binary_record):
  """Map function that creates fetch buffers.

  The output is one or more URLs, each marked as fetch or skip.

  Args:
    binary_record: key-value data, whose key is the extracted domain URL and
      whose value is the content of robots.txt.

  Yields:
    url: the URL to fetch.
    can_fetch: True if the URL may be fetched, False if it should be skipped.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  extract_domain_url = proto.key()
  content = proto.value()

  # Extract URLs from CrawlDbDatum.
  try:
    query = CrawlDbDatum.query(
        CrawlDbDatum.extract_domain_url == extract_domain_url)
    crawl_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Fetch error occurs from CrawlDbDatum" + e.message)

  can_fetch = False
  # Get the fetcher policy from resource.
  user_agent = fetcher_policy_yaml.fetcher_policy.agent_name
  rp = robotparser.RobotFileParser()
  try:
    rp.parse(content.split("\n").__iter__())
  except Exception as e:
    logging.warning("RobotFileParser raises exception:" + e.message)

  for crawl_datum in crawl_datum_future.get_result():
    url = crawl_datum.url
    try:
      can_fetch = rp.can_fetch(user_agent, url)
    except Exception as e:
      logging.warning("RobotFileParser raises exception:" + e.message)
      url = ""
    yield (url, can_fetch)
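# Standalone illustration of the robotparser calls used above (Python 2
# standard library module; urllib.robotparser in Python 3). The rules and
# URLs below are hypothetical.
import robotparser

_rp = robotparser.RobotFileParser()
_rp.parse("User-agent: *\nDisallow: /search".split("\n"))
print(_rp.can_fetch("test", "http://example.com/search"))  # False
print(_rp.can_fetch("test", "http://example.com/index"))   # True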
def _sort_records(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  proto_records = [None] * l

  # TODO(user): demote these log statements.
  logging.info("parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    proto_records[i] = proto

  logging.info("sorting")
  proto_records.sort(cmp=_compare_keys)

  logging.info("writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for proto in proto_records:
      pool.append(proto.Encode())

  logging.info("finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
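# _compare_keys is referenced above but not defined in this section. A minimal
# sketch of what it could look like for the (key, serialized_record) tuples
# sorted by _sort_records_map; this is an assumption, the real comparator (and
# the proto-based variant used by _sort_records) may differ.
def _compare_keys(key_record1, key_record2):
  """Orders two (key, record) tuples by key."""
  return cmp(key_record1[0], key_record2[0])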
def testShuffleFiles(self):
  """Test shuffling multiple files."""
  input_data = [(str(i), str(i)) for i in range(100)]
  input_data.sort()

  input_file = files.blobstore.create()

  with files.open(input_file, "a") as f:
    with records.RecordsWriter(f) as w:
      for (k, v) in input_data:
        proto = file_service_pb.KeyValue()
        proto.set_key(k)
        proto.set_value(v)
        w.write(proto.Encode())
  files.finalize(input_file)
  input_file = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(input_file))

  p = shuffler.ShufflePipeline("testjob",
                               [input_file, input_file, input_file])
  p.start()
  test_support.execute_until_empty(self.taskqueue)
  p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

  output_files = p.outputs.default.value
  output_data = []
  for output_file in output_files:
    with files.open(output_file, "r") as f:
      for record in records.RecordsReader(f):
        proto = file_service_pb.KeyValues()
        proto.ParseFromString(record)
        output_data.append((proto.key(), proto.value_list()))
  output_data.sort()

  expected_data = sorted([
      (str(k), [str(v), str(v), str(v)]) for (k, v) in input_data])
  self.assertEquals(expected_data, output_data)
def __iter__(self):
  """Iterate over records in input files.

  self._offsets is always correctly updated so that stopping iterations
  doesn't skip records and doesn't read the same record twice.
  """
  ctx = context.get()
  mapper_spec = ctx.mapreduce_spec.mapper
  shard_number = ctx.shard_state.shard_number
  filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

  if len(filenames) != len(self._offsets):
    raise Exception("Files list and offsets do not match.")

  # Heap with (Key, Value, Index, reader) tuples.
  readers = []

  # Initialize heap
  for (i, filename) in enumerate(filenames):
    offset = self._offsets[i]
    reader = records.RecordsReader(files.BufferedFile(filename))
    reader.seek(offset)
    readers.append((None, None, i, reader))

  # Read records from heap and merge values with the same key.
  current_result = None
  while readers:
    (key, value, index, reader) = readers[0]

    if key is not None:
      if current_result and key != current_result[0]:
        # New key encountered. Yield the current key's result.
        yield current_result
      if not current_result or key != current_result[0]:
        current_result = (key, [])
      current_result[1].append(value)

    # Read next key/value from reader.
    try:
      self._offsets[index] = reader.tell()
      start_time = time.time()
      binary_record = reader.read()
      # update counters
      if context.get():
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_BYTES,
            len(binary_record))(context.get())
        operation.counters.Increment(
            input_readers.COUNTER_IO_READ_MSEC,
            int((time.time() - start_time) * 1000))(context.get())
      proto = file_service_pb.KeyValue()
      proto.ParseFromString(binary_record)
      # Put read data back into heap.
      heapq.heapreplace(readers,
                        (proto.key(), proto.value(), index, reader))
    except EOFError:
      heapq.heappop(readers)

  # Yield leftovers.
  if current_result:
    yield current_result
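# Tiny standalone sketch of the heap invariant relied on above: heapreplace
# pops the smallest tuple and pushes a new one in a single operation, so
# readers[0] is always the reader whose next key sorts first. Values below
# are hypothetical.
import heapq

_heap = [("b", 1), ("a", 0), ("c", 2)]
heapq.heapify(_heap)
print(_heap[0])                      # ('a', 0) - smallest key first
heapq.heapreplace(_heap, ("d", 0))   # replace the smallest entry with a new one
print(_heap[0])                      # ('b', 1)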
def _fetchMap(binary_record):
  """Map function that creates fetch results.

  Fetches the URL and stores the result to the datastore as a
  FetchedDbDatum entity.

  Args:
    binary_record: key-value data, whose key is the URL to fetch and whose
      value is a boolean indicating whether it may be fetched.

  Yields:
    The fetched URL (followed by a newline) on success, or an empty string.
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  url = proto.key()
  could_fetch = _str2bool(proto.value())
  result = UNFETCHED
  fetched_url = ""
  fetch_date = None

  # Fetch the CrawlDbDatum for the URL.
  try:
    query = CrawlDbDatum.query(CrawlDbDatum.url == url)
    crawl_db_datum_future = query.fetch_async()
  except Exception as e:
    logging.warning("Failed to create key, caused by invalid url:" +
                    url + ":" + e.message)
    could_fetch = False

  if could_fetch:
    # Start fetch.
    fetcher = fetchers.SimpleHttpFetcher(
        1, fetcher_policy_yaml.fetcher_policy)
    try:
      fetch_result = fetcher.get(url)
      if fetch_result:
        # Store the result to datastore.
        crawl_db_datums = crawl_db_datum_future.get_result()
        fetched_datum = FetchedDbDatum(
            parent=crawl_db_datums[0].key,
            url=url,
            fetched_url=fetch_result.get("fetched_url"),
            fetch_time=fetch_result.get("time"),
            fetched_content=fetch_result.get("content"),
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("read_rate"),
            response_rate=fetch_result.get("read_rate"),
            http_headers=str(fetch_result.get("headers")))
        fetched_datum.put()
        # Update time of last fetch.
        result = FETCHED
        fetch_date = datetime.datetime.now()
        fetched_url = ("%s\n" % url)
    except Exception as e:
      logging.warning("Fetch Page Error Occurs:" + e.message)
      result = FAILED
  else:
    result = FAILED

  # Update status on all datums.
  crawl_db_datums = crawl_db_datum_future.get_result()
  for datum in crawl_db_datums:
    datum.last_status = result
    datum.last_fetched = fetch_date
  ndb.put_multi(crawl_db_datums)

  yield fetched_url
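# _str2bool is referenced above but not defined in this section. A minimal
# sketch, assuming the value is the stringified boolean ("True"/"False")
# emitted by _makeFetchSetBufferMap and serialized through the KeyValue
# writers above; the real helper may differ.
def _str2bool(value):
  """Interprets a serialized boolean string as a Python bool."""
  return value.strip().lower() in ("true", "1", "yes")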