Example #1
    def testShuffleNoFile(self):
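        """Shuffling an empty list of input files should produce empty output files."""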
        p = shuffler.ShufflePipeline("testjob", [])
        p.start()
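        # Drain the task queue so the pipeline runs to completion.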
        test_support.execute_until_empty(self.taskqueue)

        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
        for filename in p.outputs.default.value:
            self.assertEqual(0, files.stat(filename).st_size)
Example #2
  def testShuffleNoFile(self):
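    """Shuffle with no input files over a Cloud Storage bucket; outputs should be empty."""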
    bucket_name = "testbucket"
    p = shuffler.ShufflePipeline("testjob", {"bucket_name": bucket_name}, [])
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
    for filename in p.outputs.default.value:
      self.assertEqual(0, cloudstorage.stat(filename).st_size)
    self.assertEqual(1, len(self.emails))
Example #3
    def testShuffleNoData(self):
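        """Shuffling finalized but empty blobstore files should produce empty outputs."""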
        input_file = files.blobstore.create()
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
        p.start()
        test_support.execute_until_empty(self.taskqueue)

        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
        for filename in p.outputs.default.value:
            self.assertEqual(0, files.stat(filename).st_size)
Example #4
  def testShuffleNoData(self):
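    """Shuffling an empty Cloud Storage file should produce empty output files."""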
    bucket_name = "testbucket"
    test_filename = "testfile"
    full_filename = "/%s/%s" % (bucket_name, test_filename)

    gcs_file = cloudstorage.open(full_filename, mode="w")
    gcs_file.close()

    p = shuffler.ShufflePipeline("testjob", {"bucket_name": bucket_name},
                                 [full_filename, full_filename, full_filename])
    p.start()
    test_support.execute_until_empty(self.taskqueue)

    p = shuffler.ShufflePipeline.from_id(p.pipeline_id)
    for filename in p.outputs.default.value:
      self.assertEqual(0, cloudstorage.stat(filename).st_size)
    self.assertEqual(1, len(self.emails))
Example #5
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        bucket_name = "testbucket"
        test_filename = "testfile"
        full_filename = "/%s/%s" % (bucket_name, test_filename)

        with cloudstorage.open(full_filename, mode="w") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = kv_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())

        p = shuffler.ShufflePipeline(
            "testjob", {"bucket_name": bucket_name},
            [full_filename, full_filename, full_filename])
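        # The same input file is passed three times, so each key in the shuffled
        # output should carry three copies of its value.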
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

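        # Decode each output record as a KeyValues proto and collect (key, values) pairs.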
        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with cloudstorage.open(output_file) as f:
                for record in records.RecordsReader(f):
                    proto = kv_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEqual(expected_data, output_data)
        self.assertEqual(1, len(self.emails))
Example #6
    def testShuffleFiles(self):
        """Test shuffling multiple files."""
        input_data = [(str(i), str(i)) for i in range(100)]
        input_data.sort()

        input_file = files.blobstore.create()

        with files.open(input_file, "a") as f:
            with records.RecordsWriter(f) as w:
                for (k, v) in input_data:
                    proto = file_service_pb.KeyValue()
                    proto.set_key(k)
                    proto.set_value(v)
                    w.write(proto.Encode())
        files.finalize(input_file)
        input_file = files.blobstore.get_file_name(
            files.blobstore.get_blob_key(input_file))

        p = shuffler.ShufflePipeline("testjob",
                                     [input_file, input_file, input_file])
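        # Three copies of one blobstore input file: the shuffle should emit each key with three values.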
        p.start()
        test_support.execute_until_empty(self.taskqueue)
        p = shuffler.ShufflePipeline.from_id(p.pipeline_id)

        output_files = p.outputs.default.value
        output_data = []
        for output_file in output_files:
            with files.open(output_file, "r") as f:
                for record in records.RecordsReader(f):
                    proto = file_service_pb.KeyValues()
                    proto.ParseFromString(record)
                    output_data.append((proto.key(), proto.value_list()))
        output_data.sort()

        expected_data = sorted([(str(k), [str(v), str(v),
                                          str(v)]) for (k, v) in input_data])
        self.assertEqual(expected_data, output_data)