Example #1
    def testSortFile(self):
        """Test sorting a file."""
        input_file = files.blobstore.create()

        input_data = [(str(i), "_" + str(i)) for i in range(100)]

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler._SortChunksPipeline("testjob", [input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler._SortChunksPipeline.from_id(p.pipeline_id)

        input_data.sort()
        output_files = p.outputs.default.value[0]
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for binary_record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValue()
                    proto.ParseFromString(binary_record)
                    output_data.append((proto.key(), proto.value()))

        self.assertEquals(input_data, output_data)
Example #2
  def write(self, data, ctx):
    """Write data.

    Args:
      data: actual data yielded from handler. Type is writer-specific.
      ctx: an instance of context.Context.
    """
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    file_index = key.__hash__() % len(self._filenames)
    pool_name = "kv_pool%d" % file_index
    filename = self._filenames[file_index]

    if ctx.get_pool(pool_name) is None:
      ctx.register_pool(pool_name,
                        output_writers.RecordsPool(filename=filename, ctx=ctx))
    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    ctx.get_pool(pool_name).append(proto.Encode())
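The writer above partitions records across a fixed set of output files by hashing the key. A minimal standalone sketch of that routing step, with made-up file names (only the index arithmetic mirrors the code above):

filenames = ["kv_file_0", "kv_file_1", "kv_file_2", "kv_file_3"]
routed = {}

for key in ("http://a.example.com/", "http://b.example.com/"):
    # Same partitioning as above: key hash modulo the number of files.
    file_index = hash(key) % len(filenames)
    pool_name = "kv_pool%d" % file_index
    routed.setdefault(pool_name, []).append((key, filenames[file_index]))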
Example #3
    def testSuccessfulRun(self):
        file_name1 = self.createMockData((
            "http://hoge_0.com",
            "User-agent: test\nDisallow: /content_0\nDisallow: /content_1\nDisallow: /content_3"
        ))
        file_name2 = self.createMockData((
            "http://hoge_1.com",
            "User-agent: test\nAllow: /content_0\nAllow: /content_1\nDisallow: /content_3"
        ))
        createMockCrawlDbDatum(2, 6, True)
        p = pipelines._FetchSetsBufferPipeline("FetchSetsBufferPipeline",
                                               [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchSetsBufferPipeline.from_id(
            p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertTrue(key is not None)
            self.assertTrue(value is not None)
Example #4
    def testFetchError(self):
        blob_keys = self.createInvalidMockData()
        static_content = "User-agent: *\nDisallow: /search\nDisallow: /sdch\nDisallow: /groups"
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "text/html"
                            })
        p = pipelines._RobotsFetchPipeline("RobotsFetchPipeline", blob_keys, 2)
        p.start()

        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._RobotsFetchPipeline.from_id(p.pipeline_id)

        # Can open files
        file_list = finished_map.outputs.default.value
        self.assertTrue(len(file_list) > 0)
        reader = input_readers.RecordsReader(file_list, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertEquals("invalidScheme://test_url.com", key)
            self.assertEquals("User-agent: *\nDisallow: /", value)
Example #5
    def testMergeFiles(self):
        """Test merging multiple files."""
        input_data = [(str(i), "_" + str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = TestMergePipeline([input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = TestMergePipeline.from_id(p.pipeline_id)

        output_file = p.outputs.default.value[0]
        output_data = []
        with files.open(output_file, "r") as f:
            for record in records.RecordsReader(f):
                output_data.append(record)

        expected_data = [str((k, [v, v, v], False)) for (k, v) in input_data]
        self.assertEquals(expected_data, output_data)
Example #6
def _hashing_map(binary_record):
  """A map function used in hash phase.

  Reads KeyValue from binary record and yields (key, value).
  """
  proto = file_service_pb.KeyValue()
  proto.ParseFromString(binary_record)
  yield (proto.key(), proto.value())
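As a quick illustration, the round trip that _hashing_map expects can be sketched as follows (assuming the KeyValue proto comes from the old Files API module, google.appengine.api.files.file_service_pb, as in these examples):

from google.appengine.api.files import file_service_pb

proto = file_service_pb.KeyValue()
proto.set_key("http://example.com/")
proto.set_value("page content")
binary_record = proto.Encode()

for key, value in _hashing_map(binary_record):
    assert (key, value) == ("http://example.com/", "page content")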
Example #7
def _fetchContentMap(binary_record):
    """Map function of fetch content.
  Fetched content will store to blobstore.

  Arg:
    binary_record: key value data, that key is url of target page,
      value is url of target of fetch.

  Returns:
    url: fetched url.
  """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    page_url = proto.key()
    target_url = proto.value()
    # Query CrawlDbDatum for the page URL.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == page_url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:" +
                        page_url + ":" + e.message)

    #start fetch
    fetcher = fetchers.SimpleHttpFetcher(1, fetcher_policy_yaml.fetcher_policy)
    stored_url = None
    if re.match("^/", target_url):
        crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
        target_url = "%s%s" % (crawl_db_datum.extract_domain_url, target_url)

    try:
        fetch_result = fetcher.get(target_url)
        if fetch_result:
            #Storing to blobstore
            blob_io = files.blobstore.create(
                mime_type=fetch_result.get("mime_type"),
                _blobinfo_uploaded_filename=fetch_result.get("fetched_url"))
            with files.open(blob_io, 'a') as f:
                f.write(fetch_result.get("content"))
            files.finalize(blob_io)
            blob_key = files.blobstore.get_blob_key(blob_io)
            stored_url = images.get_serving_url(str(blob_key))
    except Exception as e:
        logging.warning("Fetch Error Occurs:" + e.message)

    #Put content to datastore.
    crawl_db_datum = _getCrawlDatum(crawl_db_datum_future)
    if crawl_db_datum and stored_url is not None:
        entity = ContentDbDatum(
            parent=crawl_db_datum.key,
            fetched_url=fetch_result.get("fetched_url"),
            stored_url=stored_url,
            content_type=fetch_result.get("mime_type"),
            content_size=fetch_result.get("content_length"),
            http_headers=str(fetch_result.get("headers")))
        entity.put()

    yield "%s:%s" % (target_url, stored_url)
Example #8
    def testSuccessfulRun(self):
        file_name1 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "http://k.yimg.jp/images/top/sp/logo.gif"))
        file_name2 = self.createMockData(
            ("https://developers.google.com/appengine/",
             "/appengine/images/slide1.png"))
        datum = CrawlDbDatum(
            parent=ndb.Key(CrawlDbDatum,
                           "https://developers.google.com/appengine/"),
            url="https://developers.google.com/appengine/",
            extract_domain_url="https://developers.google.com",
            last_status=pipelines.UNFETCHED)
        datum.put()
        resource = self.getResource("slide1.png")
        static_content = resource.read()
        self.setReturnValue(content=static_content,
                            headers={
                                "Content-Length": len(static_content),
                                "Content-Type": "image/png"
                            })
        p = pipelines._FetchContentPipeline("FetchContentPipeline",
                                            [file_name1, file_name2])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        finished_map = pipelines._FetchSetsBufferPipeline.from_id(
            p.pipeline_id)

        # Can open files
        file_paths = finished_map.outputs.default.value
        self.assertTrue(len(file_paths) > 0)
        self.assertTrue(file_paths[0].startswith("/blobstore/"))

        reader = input_readers.RecordsReader(file_paths, 0)
        for binary_record in reader:
            proto = file_service_pb.KeyValue()
            proto.ParseFromString(binary_record)
            key = proto.key()
            value = proto.value()
            self.assertTrue(key is not None)
            self.assertTrue(value is not None)

        query = CrawlDbDatum.query(
            CrawlDbDatum.url == "https://developers.google.com/appengine/")
        crawl_db_datums = query.fetch()
        self.assertTrue(len(crawl_db_datums) > 0)
        key = crawl_db_datums[0].key
        content_datums = ContentDbDatum.query(ancestor=key).fetch()
        self.assertEqual(2, len(content_datums))
Example #9
    def testPartialRecords(self):
        """Test merging into partial key values."""
        try:
            self._prev_max_values_count = shuffler._MergePipeline._MAX_VALUES_COUNT
            # force max values count to extremely low value.
            shuffler._MergePipeline._MAX_VALUES_COUNT = 1

            input_data = [('1', 'a'), ('2', 'b'), ('3', 'c')]
            input_data.sort()

            input_file = files.blobstore.create()

            with files.open(input_file, "a") as f:
                with records.RecordsWriter(f) as w:
                    for (k, v) in input_data:
                        proto = file_service_pb.KeyValue()
                        proto.set_key(k)
                        proto.set_value(v)
                        w.write(proto.Encode())
            files.finalize(input_file)
            input_file = files.blobstore.get_file_name(
                files.blobstore.get_blob_key(input_file))

            p = TestMergePipeline([input_file, input_file, input_file])
            p.start()
            test_support.execute_until_empty(self.taskqueue)
            p = TestMergePipeline.from_id(p.pipeline_id)

            output_file = p.outputs.default.value[0]
            output_data = []
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    output_data.append(record)

            expected_data = [
                ('1', ['a'], True),
                ('1', ['a'], True),
                ('1', ['a'], False),
                ('2', ['b'], True),
                ('2', ['b'], True),
                ('2', ['b'], False),
                ('3', ['c'], True),
                ('3', ['c'], True),
                ('3', ['c'], False),
            ]
            self.assertEquals([str(e) for e in expected_data], output_data)
        finally:
            shuffler._MergePipeline._MAX_VALUES_COUNT = self._prev_max_values_count
Example #10
  def write(self, data, ctx):
    if len(data) != 2:
      logging.error("Got bad tuple of length %d (2-tuple expected): %s",
                    len(data), data)

    try:
      key = str(data[0])
      value = str(data[1])
    except TypeError:
      logging.error("Expecting a tuple, but got %s: %s",
                    data.__class__.__name__, data)

    proto = file_service_pb.KeyValue()
    proto.set_key(key)
    proto.set_value(value)
    output_writers.BlobstoreRecordsOutputWriter.write(self, proto.Encode(), ctx)
Example #11
    def createMockData(self, data):
        """Create mock data for FetchContentPipeline"""
        input_file = files.blobstore.create()
        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                key = str(data[0])
                value = str(data[1])
                proto = file_service_pb.KeyValue()
                proto.set_key(key)
                proto.set_value(value)
                w.write(proto.Encode())

        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        return input_file
Example #12
def _makeFetchSetBufferMap(binary_record):
    """Map function of create fetch buffers,
  that output thus is one or more fetch url to fetch or skip.
  
  Arg:
    binary_record: key value data, that key is extract domain url,
      value is content from robots.txt.

  Returns:
    url: to fetch url.
    fetch_or_unfetch: the boolean value of fetch or unfetch,
      if sets true is fetch, false is skip.
  """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    extract_domain_url = proto.key()
    content = proto.value()
    #Extract urls from CrawlDbDatum.
    try:
        query = CrawlDbDatum.query(
            CrawlDbDatum.extract_domain_url == extract_domain_url)
        crawl_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Fetch error occurs from CrawlDbDatum" + e.message)

    can_fetch = False
    #Get the fetcher policy from resource.
    user_agent = fetcher_policy_yaml.fetcher_policy.agent_name
    rp = robotparser.RobotFileParser()
    try:
        rp.parse(content.split("\n").__iter__())
    except Exception as e:
        logging.warning("RobotFileParser raises exception:" + e.message)

    for crawl_datum in crawl_datum_future.get_result():
        url = crawl_datum.url
        try:
            can_fetch = rp.can_fetch(user_agent, url)
        except Exception as e:
            logging.warning("RobotFileParser raises exception:" + e.message)
            url = ""

        yield (url, can_fetch)
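The robots.txt check above uses only the standard library robotparser module (urllib.robotparser on Python 3). A self-contained sketch of the same parse/can_fetch pattern, with illustrative rules and user agent:

import robotparser

robots_txt = "User-agent: *\nDisallow: /search\nDisallow: /groups"
rp = robotparser.RobotFileParser()
rp.parse(robots_txt.split("\n"))

assert rp.can_fetch("test-agent", "http://example.com/index.html")
assert not rp.can_fetch("test-agent", "http://example.com/search?q=x")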
Example #13
def _sort_records(records):
    """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
    ctx = context.get()
    l = len(records)
    proto_records = [None] * l

    # TODO(user): demote these log statements.
    logging.info("parsing")
    for i in range(l):
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(records[i])
        proto_records[i] = proto

    logging.info("sorting")
    proto_records.sort(cmp=_compare_keys)

    logging.info("writing")
    blob_file_name = (ctx.mapreduce_spec.name + "-" + ctx.mapreduce_id +
                      "-output")
    output_path = files.blobstore.create(
        _blobinfo_uploaded_filename=blob_file_name)
    with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
        for proto in proto_records:
            pool.append(proto.Encode())

    logging.info("finalizing")
    files.finalize(output_path)
    output_path = files.blobstore.get_file_name(
        files.blobstore.get_blob_key(output_path))

    entity = _OutputFile(key_name=output_path,
                         parent=_OutputFile.get_root_key(ctx.mapreduce_id))
    entity.put()
Example #14
def _sort_records_map(records):
  """Map function sorting records.

  Converts records to KeyValue protos, sorts them by key and writes them
  into new blobstore file. Creates _OutputFile entity to record resulting
  file name.

  Args:
    records: list of records which are serialized KeyValue protos.
  """
  ctx = context.get()
  l = len(records)
  key_records = [None] * l

  logging.debug("Parsing")
  for i in range(l):
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(records[i])
    key_records[i] = (proto.key(), records[i])

  logging.debug("Sorting")
  key_records.sort(cmp=_compare_keys)

  logging.debug("Writing")
  blob_file_name = (ctx.mapreduce_spec.name + "-" +
                    ctx.mapreduce_id + "-output")
  output_path = files.blobstore.create(
      _blobinfo_uploaded_filename=blob_file_name)
  with output_writers.RecordsPool(output_path, ctx=ctx) as pool:
    for key_record in key_records:
      pool.append(key_record[1])

  logging.debug("Finalizing")
  files.finalize(output_path)
  output_path = files.blobstore.get_file_name(
      files.blobstore.get_blob_key(output_path))

  entity = _OutputFile(key_name=output_path,
                       parent=_OutputFile.get_root_key(ctx.mapreduce_id))
  entity.put()
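_compare_keys is referenced but not shown in these examples. A hypothetical cmp-style comparator consistent with the (key, record) tuples built above could be:

def _compare_keys(key_record1, key_record2):
    # Hypothetical: order (key, record) tuples by key only.
    return cmp(key_record1[0], key_record2[0])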
Example #15
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEquals(expected_data, output_data)
Example #16
  def __iter__(self):
    """Iterate over records in input files.

    self._offsets is always correctly updated so that stopping iterations
    doesn't skip records and doesn't read the same record twice.
    """
    ctx = context.get()
    mapper_spec = ctx.mapreduce_spec.mapper
    shard_number = ctx.shard_state.shard_number
    filenames = mapper_spec.params[self.FILES_PARAM][shard_number]

    if len(filenames) != len(self._offsets):
      raise Exception("Files list and offsets do not match.")

    # Heap of (key, value, index, reader) tuples.
    readers = []

    # Initialize heap
    for (i, filename) in enumerate(filenames):
      offset = self._offsets[i]
      reader = records.RecordsReader(files.BufferedFile(filename))
      reader.seek(offset)
      readers.append((None, None, i, reader))

    # Read records from heap and merge values with the same key.
    current_result = None
    while readers:
      (key, value, index, reader) = readers[0]

      if key is not None:
        if current_result and key != current_result[0]:
          # New key encountered. Yield the completed result for the previous key.
          yield current_result
        if not current_result or key != current_result[0]:
          current_result = (key, [])
        current_result[1].append(value)

      # Read next key/value from reader.
      try:
        self._offsets[index] = reader.tell()
        start_time = time.time()
        binary_record = reader.read()
        # update counters
        if context.get():
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_BYTES,
              len(binary_record))(context.get())
          operation.counters.Increment(
              input_readers.COUNTER_IO_READ_MSEC,
              int((time.time() - start_time) * 1000))(context.get())
        proto = file_service_pb.KeyValue()
        proto.ParseFromString(binary_record)
        # Put read data back into heap.
        heapq.heapreplace(readers,
                          (proto.key(), proto.value(), index, reader))
      except EOFError:
        heapq.heappop(readers)

    # Yield leftovers.
    if current_result:
      yield current_result
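The reader above performs a heap-based k-way merge over sorted record files, grouping consecutive values that share a key. The same control flow, reduced to in-memory sorted lists of (key, value) pairs (a simplified sketch, not the reader itself):

import heapq

def merge_key_values(streams):
    """Simplified k-way merge: yields (key, [values]) from sorted streams."""
    iters = [iter(s) for s in streams]
    heap = []
    for i, it in enumerate(iters):
        try:
            key, value = next(it)
            heap.append((key, value, i))
        except StopIteration:
            pass
    heapq.heapify(heap)

    current = None
    while heap:
        key, value, i = heap[0]
        if current and key != current[0]:
            yield current  # new key encountered, emit the previous group
        if not current or key != current[0]:
            current = (key, [])
        current[1].append(value)
        try:
            next_key, next_value = next(iters[i])
            heapq.heapreplace(heap, (next_key, next_value, i))
        except StopIteration:
            heapq.heappop(heap)

    if current:
        yield current  # leftovers

# list(merge_key_values([[("1", "a"), ("2", "b")], [("1", "c")]]))
# -> [("1", ["a", "c"]), ("2", ["b"])]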
Example #17
def _fetchMap(binary_record):
    """Map function of create fetch result,
  that create FetchResulDatum entity, will be store to datastore. 

  Arg:
    binary_record: key value data, that key is url to fetch,
      value is boolean value of can be fetch.

  Returns:
    url: to fetch url.
    fetch_result: the result of fetch.
  """
    proto = file_service_pb.KeyValue()
    proto.ParseFromString(binary_record)
    url = proto.key()
    could_fetch = _str2bool(proto.value())
    result = UNFETCHED
    fetched_url = ""
    fetch_date = None
    # Query CrawlDbDatum for the URL.
    try:
        query = CrawlDbDatum.query(CrawlDbDatum.url == url)
        crawl_db_datum_future = query.fetch_async()
    except Exception as e:
        logging.warning("Failed create key, caused by invalid url:" + url +
                        ":" + e.message)
        could_fetch = False

    if could_fetch:
        #start fetch
        fetcher = fetchers.SimpleHttpFetcher(
            1, fetcher_policy_yaml.fetcher_policy)
        try:
            fetch_result = fetcher.get(url)
            if fetch_result:
                #Storing to datastore
                crawl_db_datums = crawl_db_datum_future.get_result()
                fetche_datum = FetchedDbDatum(
                    parent=crawl_db_datums[0].key,
                    url=url,
                    fetched_url=fetch_result.get("fetched_url"),
                    fetch_time=fetch_result.get("time"),
                    fetched_content=fetch_result.get("content"),
                    content_type=fetch_result.get("mime_type"),
                    content_size=fetch_result.get("read_rate"),
                    response_rate=fetch_result.get("read_rate"),
                    http_headers=str(fetch_result.get("headers")))
                fetche_datum.put()
                #update time of last fetched
                result = FETCHED
                fetch_date = datetime.datetime.now()
                fetched_url = ("%s\n" % url)
        except Exception as e:
            logging.warning("Fetch Page Error Occurs:" + e.message)
            result = FAILED
    else:
        result = FAILED

    #Update status to all datums.
    crawl_db_datums = crawl_db_datum_future.get_result()
    for datum in crawl_db_datums:
        datum.last_status = result
        datum.last_fetched = fetch_date
    ndb.put_multi(crawl_db_datums)

    yield fetched_url
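_str2bool is referenced but not defined here; a minimal, hypothetical version consistent with how the serialized boolean value is used above:

def _str2bool(value):
    # Hypothetical helper: parse the can_fetch flag serialized as a string.
    return value.lower() in ("true", "1", "yes")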