# Shared imports for the snippets below. CarbonReader, CarbonWriter,
# CarbonSchemaReader, Configuration and CarbonDatasetPiece come from the
# pycarbon SDK (exact module paths depend on the installation); the tests
# still use the pre-5.0 pytest.config API for command-line options.
import base64
import os
import shutil
import time
from multiprocessing.pool import ThreadPool
from urllib.parse import urlparse

import pytest
from obs import ObsClient  # Huawei OBS SDK (esdk-obs-python)


def test_read_carbon_from_local_by_folder_concurrently():
  reader = CarbonReader() \
    .builder() \
    .withFolder(LOCAL_DATA_PATH) \
    .withBatch(1000) \
    .build()

  # split the reader into 3 independent readers that can be consumed in parallel
  readers = reader.splitAsArray(3)
  pool = ThreadPool(len(readers))

  def readLogic(carbonReader):
    i = 0
    while carbonReader.hasNext():
      rows = carbonReader.readNextBatchRow()
      i += len(rows)

    carbonReader.close()

  pool.map(readLogic, readers)
  pool.close()
  pool.join()
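
# Helper that reads only the "name" column from a CarbonData folder on S3/OBS,
# keeping the rows whose name equals the given label.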
def readCarbon(key, secret, end_point, path, label):
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")

  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(path) \
    .withHadoopConf("fs.s3a.access.key", key) \
    .withHadoopConf("fs.s3a.secret.key", secret) \
    .withHadoopConf("fs.s3a.endpoint", end_point) \
    .projection(projection_list) \
    .filterEqual("name", label) \
    .build()

  data_list = []
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    for row in rows:
      data_list.append(row)

  reader.close()
  return data_list
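
# Read two specific .carbondata files with a five-column projection; the test
# expects 20 rows in total.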
def test_run_read_carbon_by_file_lists():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')

  java_list = java_list_class()
  java_list.add(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata")
  java_list.add(LOCAL_DATA_PATH + "/sub2/part-0-1196034758543568_batchno0-0-null-1196034721553227.carbondata")

  projection_list = java_list_class()
  projection_list.add("name")
  projection_list.add("age")
  projection_list.add("image1")
  projection_list.add("image2")
  projection_list.add("image3")

  reader = CarbonReader() \
    .builder() \
    .withFileLists(java_list) \
    .withBatch(1000) \
    .projection(projection_list) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 20 == num
  reader.close()
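
    # Method of a dataset-piece class (the enclosing class is not shown in
    # this snippet): read the given columns from one input split, locally or
    # over S3.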
    def read_all(self, columns):
        # rebuild the reader, since it must be built with the specific columns to read
        carbon_reader_builder = CarbonReader().builder(self.path)
        carbon_schema_reader = CarbonSchemaReader()
        if columns is not None:
            carbon_reader_builder = carbon_reader_builder.projection(columns)
            updatedSchema = carbon_schema_reader.reorderSchemaBasedOnProjection(
                columns, self.carbon_schema)
        else:
            # TODO: when no projection is set, carbon returns records in
            # dimensions+measures order, but the actual schema order is needed
            # here, so add projection columns based on the schema
            updatedSchema = self.carbon_schema
            projection = carbon_schema_reader.getProjectionBasedOnSchema(
                updatedSchema)
            carbon_reader_builder = carbon_reader_builder.projection(
                projection)

        if self.use_s3:
            carbon_reader_builder = carbon_reader_builder \
              .withHadoopConf("fs.s3a.access.key", self.key) \
              .withHadoopConf("fs.s3a.secret.key", self.secret) \
              .withHadoopConf("fs.s3a.endpoint", self.endpoint)
            if self.proxy is not None or self.proxy_port is not None:
                carbon_reader_builder = carbon_reader_builder \
                  .withHadoopConf("fs.s3a.proxy.host", self.proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", self.proxy_port)

        carbon_reader = carbon_reader_builder.build_with_split(self.input_split)

        data = carbon_reader.read(updatedSchema)
        carbon_reader.close()
        return data
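
# Write 10 rows whose binary column holds a base64-encoded JPEG, then read
# them back and decode the image of the first row to disk.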
def test_run_write_carbon_binary_base64_encode():
    jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_path = IMAGE_DATA_PATH + "/carbondatalogo.jpg"

    writer = CarbonWriter() \
      .builder() \
      .outputPath(path) \
      .withCsvInput(jsonSchema) \
      .writtenBy("pycarbon") \
      .build()

    with open(jpg_path, mode='rb+') as file_object:
        content = file_object.read()

    from jnius import autoclass
    arrayListClass = autoclass("java.util.ArrayList")

    for i in range(0, 10):
        data_list = arrayListClass()
        data_list.add("pycarbon")
        data_list.add(str(i))
        data_list.add(str(i * 10))
        data_list.add(base64.b64encode(content))
        writer.write(data_list.toArray())

    writer.close()

    reader = CarbonReader() \
      .builder() \
      .withFolder(path) \
      .withBatch(1000) \
      .build()

    from jnius.jnius import ByteArray

    i = 0
    while reader.hasNext():
        rows = reader.readNextBatchRow()
        for row in rows:
            i += 1
            for column in row:
                # decode the binary column of the first row back into a .jpg file
                if i == 1 and isinstance(column, ByteArray) and len(column) > 1000:
                    with open(path + "/image.jpg", 'wb+') as file_object:
                        file_object.write(base64.b64decode(column.tostring()))

    assert 10 == i
    reader.close()

    shutil.rmtree(path)
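
# Read every row from the local test folder in batches of 780; the test
# expects 30 rows.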
def test_run_read_carbon_from_local():
  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(LOCAL_DATA_PATH) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 30 == num
  reader.close()
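
# Read a single .carbondata file; the test expects 10 rows.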
def test_run_read_carbon_by_file():
  reader = CarbonReader() \
    .builder() \
    .withFile(LOCAL_DATA_PATH + "/sub1/part-0-1196034485149392_batchno0-0-null-1196033673787967.carbondata") \
    .withBatch(1000) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 10 == num
  reader.close()
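
# Read the local folder with an equality filter on "name"; 3 rows match.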
def test_run_read_carbon_from_local_for_filter():
  reader = CarbonReader() \
    .builder() \
    .withBatch(10) \
    .withFolder(LOCAL_DATA_PATH) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 3 == num
  reader.close()
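
# Read from OBS/S3, passing credentials through pytest command-line options.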
def test_run_read_carbon_from_obs():
  reader = CarbonReader() \
    .builder() \
    .withBatch(1000) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key")) \
    .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key")) \
    .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point")) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 30 == num
  reader.close()
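
# Write 10 plain rows (string, short, int), then read them back and check the
# count.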
def test_run_write_carbon():
    jsonSchema = "[{stringField:string},{shortField:short},{intField:int}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    writer = CarbonWriter() \
      .builder() \
      .outputPath(path) \
      .withCsvInput(jsonSchema) \
      .writtenBy("pycarbon") \
      .build()

    from jnius import autoclass
    arrayListClass = autoclass("java.util.ArrayList")

    for i in range(0, 10):
        data_list = arrayListClass()
        data_list.add("pycarbon")
        data_list.add(str(i))
        data_list.add(str(i * 10))
        writer.write(data_list.toArray())

    writer.close()

    reader = CarbonReader() \
      .builder() \
      .withFolder(path) \
      .withBatch(1000) \
      .build()

    i = 0
    while reader.hasNext():
        rows = reader.readNextBatchRow()
        i += len(rows)

    assert 10 == i
    reader.close()

    shutil.rmtree(path)
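
# Read the local folder with a five-column projection; the test expects 30 rows.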
def test_run_read_carbon_from_local_for_projection():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")
  projection_list.add("age")
  projection_list.add("image1")
  projection_list.add("image2")
  projection_list.add("image3")

  reader = CarbonReader() \
    .builder() \
    .withBatch(100) \
    .withFolder(LOCAL_DATA_PATH) \
    .projection(projection_list) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 30 == num
  reader.close()
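
# Read from OBS/S3 with both a projection and an equality filter on "name".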
def test_run_read_carbon_from_obs_for_filter():
  from jnius import autoclass
  java_list_class = autoclass('java.util.ArrayList')
  projection_list = java_list_class()
  projection_list.add("name")

  reader = CarbonReader() \
    .builder() \
    .withBatch(780) \
    .withFolder(S3_DATA_PATH) \
    .withHadoopConf("fs.s3a.access.key", pytest.config.getoption("--access_key")) \
    .withHadoopConf("fs.s3a.secret.key", pytest.config.getoption("--secret_key")) \
    .withHadoopConf("fs.s3a.endpoint", pytest.config.getoption("--end_point")) \
    .projection(projection_list) \
    .filterEqual("name", "robot0") \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 3 == num
  reader.close()
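
# Write one row per flower image, storing the JPEG bytes base64-encoded and
# letting carbon decode them on load (binary_decoder=base64), then read back
# and dump a few images to disk.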
def test_run_write_carbon_binary_base64_encode_decodeInJava_many_files():
    jsonSchema = "[{stringField:string},{shortField:short},{intField:int},{binaryField:binary},{txtField:string}]"
    path = "/tmp/data/writeCarbon" + str(time.time())

    if os.path.exists(path):
        shutil.rmtree(path)

    jpg_path = IMAGE_DATA_PATH + "/flowers"

    from jnius import autoclass

    sdkUtilClass = autoclass("org.apache.carbondata.sdk.file.utils.SDKUtil")
    jpg_files = sdkUtilClass.listFiles(jpg_path, '.jpg')

    writer = CarbonWriter() \
      .builder() \
      .outputPath(path) \
      .withCsvInput(jsonSchema) \
      .writtenBy("pycarbon") \
      .withLoadOption("binary_decoder", "base64") \
      .withPageSizeInMb(1) \
      .build()

    arrayListClass = autoclass("java.util.ArrayList")

    for i in range(0, jpg_files.size()):
        jpg_path = jpg_files.get(i)
        with open(jpg_path, mode='rb+') as file_object:
            content = file_object.read()

        with open(str(jpg_path).replace('.jpg', '.txt'),
                  mode='r+') as file_object:
            txt = file_object.read()

        data_list = arrayListClass()
        data_list.add("pycarbon")
        data_list.add(str(i))
        data_list.add(str(i * 10))
        data_list.add(base64.b64encode(content))
        data_list.add(txt)
        writer.write(data_list.toArray())

    writer.close()

    reader = CarbonReader() \
      .builder() \
      .withFolder(path) \
      .withBatch(1000) \
      .build()

    from jnius.jnius import ByteArray

    i = 0
    while reader.hasNext():
        rows = reader.readNextBatchRow()
        for row in rows:
            i += 1
            for column in row:
                # dump the first few decoded images back to disk
                if isinstance(column, ByteArray) and len(column) > 1000 and i < 20:
                    with open(path + "/image" + str(i) + ".jpg",
                              'wb+') as file_object:
                        file_object.write(column.tostring())

    assert 3 == i
    reader.close()

    shutil.rmtree(path)
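
# Download a CarbonData folder from OBS with the Huawei OBS SDK (ObsClient),
# then read the downloaded copy locally.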
def test_download_carbon_from_obs_and_read():
  key = pytest.config.getoption("--access_key")
  secret = pytest.config.getoption("--secret_key")
  end_point = pytest.config.getoption("--end_point")

  def list_obs_files(obs_client, bucket_name, prefix):
    files = []

    pageSize = 1000
    nextMarker = None
    while True:
      resp = obs_client.listObjects(bucket_name, prefix=prefix, max_keys=pageSize, marker=nextMarker)
      for content in resp.body.contents:
        files.append(content.key)
      if not resp.body.is_truncated:
        break
      nextMarker = resp.body.next_marker

    return files

  def read_obs_files(key, secret, end_point, bucket_name, prefix, downloadPath):
    obsClient = ObsClient(
      access_key_id=key,
      secret_access_key=secret,
      server=end_point,
      long_conn_mode=True
    )
    files = list_obs_files(obsClient, bucket_name, prefix)
    numOfFiles = len(files)
    print("number of files: " + str(numOfFiles))
    num = 0
    for file in files:
      num = num + 1
      obsClient.getObject(bucket_name, file, downloadPath=downloadPath + file)
      # print progress roughly every 10% of the files
      if numOfFiles >= 10 and num % (numOfFiles // 10) == 0:
        print(str(num) + ":" + file)

    obsClient.close()

  downloadPath = '/tmp/carbonbinary/'

  if os.path.exists(downloadPath):
    shutil.rmtree(downloadPath)

  read_obs_files(key, secret, end_point, 'sdk', 'binary', downloadPath)

  reader = CarbonReader() \
    .builder() \
    .withBatch(1000) \
    .withFolder(downloadPath) \
    .build()

  num = 0
  while reader.hasNext():
    rows = reader.readNextBatchRow()
    num += len(rows)

  assert 30 == num
  reader.close()

  shutil.rmtree(downloadPath)
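
    # Constructor of a CarbonDataset-style class (the class statement is not
    # shown in this snippet): compute input splits, locally or on S3, read the
    # carbon schema, and wrap each split in a CarbonDatasetPiece.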
    def __init__(self,
                 path,
                 key=None,
                 secret=None,
                 endpoint=None,
                 proxy=None,
                 proxy_port=None,
                 filesystem=None):
        self.path = path
        self.url_path = urlparse(path)

        if filesystem is None:
            a_path = self.path
            if isinstance(a_path, list):
                a_path = a_path[0]
            self.fs = _get_fs_from_path(a_path)
        else:
            self.fs = _ensure_filesystem(filesystem)

        self.pieces = list()

        if self.url_path.scheme == 's3a':
            if key is None or secret is None or endpoint is None:
                raise ValueError('key, secret, endpoint should not be None')

            if (proxy is None) != (proxy_port is None):
                raise ValueError('proxy and proxy_port should be set together')

            builder = CarbonReader().builder(self.path) \
              .withHadoopConf("fs.s3a.access.key", key) \
              .withHadoopConf("fs.s3a.secret.key", secret) \
              .withHadoopConf("fs.s3a.endpoint", endpoint)

            configuration = Configuration()
            configuration.set("fs.s3a.access.key", key)
            configuration.set("fs.s3a.secret.key", secret)
            configuration.set("fs.s3a.endpoint", endpoint)

            if proxy is not None:
                builder = builder \
                  .withHadoopConf("fs.s3a.proxy.host", proxy) \
                  .withHadoopConf("fs.s3a.proxy.port", proxy_port)
                configuration.set("fs.s3a.proxy.host", proxy)
                configuration.set("fs.s3a.proxy.port", proxy_port)

            carbon_splits = builder.getSplits()
            self.configuration = configuration

            carbon_schema = CarbonSchemaReader().readSchema(
                self.path, self.configuration.conf)

            for split in carbon_splits:
                self.pieces.append(
                    CarbonDatasetPiece(path,
                                       carbon_schema,
                                       split,
                                       key=key,
                                       secret=secret,
                                       endpoint=endpoint,
                                       proxy=proxy,
                                       proxy_port=proxy_port))

        else:
            carbon_splits = CarbonReader().builder(self.path) \
              .getSplits()

            carbon_schema = CarbonSchemaReader().readSchema(self.path)

            for split in carbon_splits:
                self.pieces.append(
                    CarbonDatasetPiece(path, carbon_schema, split))

        self.number_of_splits = len(self.pieces)
        self.schema = self.getArrowSchema()
        # TODO add mechanism to get the file path based on file filter
        self.common_metadata_path = self.url_path.path + '/_common_metadata'
        self.common_metadata = None
        try:
            if self.fs.exists(self.common_metadata_path):
                with self.fs.open(self.common_metadata_path) as f:
                    self.common_metadata = ParquetFile(f).metadata
        except Exception:
            self.common_metadata = None