Exemple #1
0
    def _register_reader(self):
        """Register a table reader for this split with the subprocess service.

        Sends a ``RegisterTableReaderRequest`` built from ``self._handle`` and
        ``self.split_proto``, then decodes the JSON column descriptions carried
        in the response.

        Returns:
            tuple: ``(resp.readIterator, schema)`` where ``schema`` is built by
            ``Schema.from_lists`` from the column names/types and, when the
            response carries a ``partitionSchema`` field, the partition
            names/types.

        Raises:
            CupidError: if the RPC controller reports a failure.
        """
        channel = SandboxRpcChannel()
        stub = subprocess_pb.CupidSubProcessService_Stub(channel)

        req = subprocess_pb.RegisterTableReaderRequest(
            inputTableHandle=self._handle, inputSplit=self.split_proto)
        controller = CupidRpcController()
        resp = stub.RegisterTableReader(controller, req, None)
        if controller.Failed():
            raise CupidError(controller.ErrorText())

        logger.info("RegisterTableReader response: %s", resp)
        logger.info("RegisterTableReaderResponse protobuf field size = %d",
                    len(resp.ListFields()))

        schema_json = json.loads(resp.schema)
        schema_names = [d['name'] for d in schema_json]
        schema_types = [d['type'] for d in schema_json]

        # The partition schema is optional in the response.  Iterating over a
        # missing (None) schema would raise TypeError, so fall back to None
        # for both partition lists when the field is absent.
        if resp.HasField('partitionSchema'):
            partition_schema_json = json.loads(resp.partitionSchema)
            pt_schema_names = [d['name'] for d in partition_schema_json]
            pt_schema_types = [d['type'] for d in partition_schema_json]
        else:
            pt_schema_names = None
            pt_schema_types = None

        schema = Schema.from_lists(schema_names, schema_types, pt_schema_names,
                                   pt_schema_types)

        return resp.readIterator, schema
Exemple #2
0
    def commit(self):
        """Commit the attempt file written for this block to the output table.

        Builds a single ``CommitActionInfo`` for ``self._block_id`` (attempt
        file name is ``ATTEMPT_FILE_PREFIX + self._block_id``) and sends a
        ``CommitTableFilesRequest``.  The call is attempted up to
        ``options.retry_times`` times with a 0.1 second pause between attempts.

        Raises:
            CupidError: if the last attempt still reports a failure.
        """
        channel = SandboxRpcChannel()
        stub = subprocess_pb.CupidSubProcessService_Stub(channel)

        commit_actions = [
            subprocess_pb.CommitActionInfo(
                commitFileName=self._block_id,
                attemptFileName=ATTEMPT_FILE_PREFIX + self._block_id,
                partSpec=self._partition_spec,
            )
        ]

        req = subprocess_pb.CommitTableFilesRequest(
            outputTableHandle=self._handle,
            projectName=self._project_name,
            tableName=self._table_name,
            commitActionInfos=commit_actions,
        )

        controller = CupidRpcController()
        for attempt in range(options.retry_times):
            if attempt > 0:
                # Back off briefly and use a fresh controller for the retry.
                time.sleep(0.1)
                controller = CupidRpcController()
            stub.CommitTableFiles(controller, req, None)
            if not controller.Failed():
                break
        # ``controller`` is the one used by the last attempt (it is only
        # replaced right before a retry), so a final failure is reported
        # instead of being checked against an unused controller.
        if controller.Failed():
            raise CupidError(controller.ErrorText())
Exemple #3
0
    def _register_writer(self, partition=None):
        """Register a table writer with the subprocess service.

        Args:
            partition: optional partition to write to; either a
                ``TablePartition`` (its ``spec`` is converted with ``str``) or
                a value supporting ``.replace`` (e.g. a partition spec
                string).  Single quotes are stripped before it is sent.

        Returns:
            The ``subprocessWriteTableLabel`` field of the service response.

        Raises:
            CupidError: if the RPC controller reports a failure.
        """
        if isinstance(partition, TablePartition):
            partition = str(partition.spec)

        controller = CupidRpcController()
        channel = SandboxRpcChannel()
        stub = subprocess_pb.CupidSubProcessService_Stub(channel)

        table_schema = self._table_schema
        # Column types joined by '|' with a leading '|', e.g. "|bigint|string".
        schema_str = '|' + '|'.join(str(col.type) for col in table_schema.simple_columns)
        req = subprocess_pb.RegisterTableWriterRequest(
            outputTableHandle=self._handle,
            projectName=self._project_name,
            tableName=self._table_name,
            attemptFileName=ATTEMPT_FILE_PREFIX + self._block_id,
            partSpec=partition.replace("'", '') if partition else None,
            schema=schema_str,
        )
        resp = stub.RegisterTableWriter(controller, req, None)
        # Surface RPC failures the same way the other service calls in this
        # file do, rather than reading a label from a failed response.
        if controller.Failed():
            raise CupidError(controller.ErrorText())
        return resp.subprocessWriteTableLabel
Exemple #4
0
def create_download_session(session,
                            table_or_parts,
                            split_size=None,
                            split_count=None,
                            columns=None,
                            with_split_meta=False):
    """Split the given tables/partitions and build a download session.

    Calls ``SplitTables`` on the Cupid task service, then ``GetSplits`` (and
    optionally ``GetSplitsMeta``) on the subprocess service, and wraps every
    returned split in a ``TableSplit``.

    Args:
        session: Cupid session; its ``lookup_name`` is sent with the request
            and used in log messages.
        table_or_parts: a ``Table`` / ``TablePartition`` or a list, tuple, set
            or generator of them.
        split_size: optional split size; integer-divided by ``1024**2``
            before being sent (presumably bytes converted to MB -- confirm
            against the service definition).
        split_count: optional number of splits.  Defaults to 1 when neither
            ``split_size`` nor ``split_count`` is given, otherwise to 0.
        columns: optional column names to read.  When empty, the schema names
            of the first table/partition encountered are used.
        with_split_meta: when true, also request split metadata.

    Returns:
        CupidTableDownloadSession: holding the session, the input table
        handle and the list of ``TableSplit`` objects.

    Raises:
        CupidError: if ``SplitTables`` or ``GetSplits`` fails.
        NotImplementedError: if an item is neither a ``Table`` nor a
            ``TablePartition``.
    """
    channel = CupidTaskServiceRpcChannel(session)
    stub = task_service_pb.CupidTaskService_Stub(channel)

    if not isinstance(table_or_parts, (list, tuple, set, GeneratorType)):
        table_or_parts = [table_or_parts]

    if split_size is None and split_count is None:
        split_count = 1
    split_count = split_count or 0
    split_size = (split_size or 1024**2) // 1024**2

    table_pbs = []
    for t in table_or_parts:
        # NOTE(review): once ``columns`` is filled from the first item it is
        # reused for all following items -- confirm this is intended when
        # several tables with different schemas are passed.
        if isinstance(t, Table):
            if not columns:
                columns = t.schema.names
            table_kw = dict(
                projectName=t.project.name,
                tableName=t.name,
                columns=','.join(columns),
            )
        elif isinstance(t, TablePartition):
            if not columns:
                columns = t.table.schema.names
            table_kw = dict(
                projectName=t.table.project.name,
                tableName=t.table.name,
                columns=','.join(columns),
                partSpec=str(t.partition_spec).replace("'", '').strip(),
            )
        else:
            raise NotImplementedError(
                'Unsupported input type: %s' % type(t).__name__)
        table_pbs.append(task_service_pb.TableInputInfo(**table_kw))

    request = task_service_pb.SplitTablesRequest(
        lookupName=session.lookup_name,
        splitSize=split_size,
        splitCount=split_count,
        tableInputInfos=table_pbs,
        allowNoColumns=True,
        requireSplitMeta=with_split_meta,
    )

    controller = CupidRpcController()
    resp = stub.SplitTables(controller, request, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())
    logger.info("[CupidTask] splitTables call, CurrentInstanceId: %s, "
                "request: %s, response: %s",
                session.lookup_name, request, resp)
    handle = resp.inputTableHandle

    # The remaining calls go to the subprocess service.
    channel = SandboxRpcChannel()
    stub = subprocess_pb.CupidSubProcessService_Stub(channel)

    if not with_split_meta:
        # No metadata requested: pair every split with None below.
        split_meta = itertools.repeat(None)
    else:
        req = subprocess_pb.GetSplitsMetaRequest(inputTableHandle=handle, )
        controller = CupidRpcController()
        resp = stub.GetSplitsMeta(controller, req, None)
        # Log the GetSplitsMeta request that was actually sent (``req``),
        # not the earlier SplitTables request.
        logger.info("[CupidTask] getSplitsMeta call, CurrentInstanceId: %s, "
                    "request: %s, response: %s",
                    session.lookup_name, req, resp)
        if controller.Failed():
            # Best effort: continue without metadata.
            split_meta = itertools.repeat(None)
            logger.warning('Failed to get results of getSplitsMeta, '
                           'may running on an old service')
        else:
            split_meta = resp.inputSplitsMeta

    req = subprocess_pb.GetSplitsRequest(inputTableHandle=handle)
    controller = CupidRpcController()
    resp = stub.GetSplits(controller, req, None)
    if controller.Failed():
        raise CupidError(controller.ErrorText())

    input_splits = [
        TableSplit(split_proto=info,
                   meta_proto=meta,
                   handle=handle,
                   columns=columns)
        for info, meta in zip(resp.inputSplits, split_meta)
    ]
    logger.info("[SubProcess] getSplits call, CurrentInstanceId: %s, "
                "request: %s, response: %s",
                session.lookup_name, req, resp)
    return CupidTableDownloadSession(session=session,
                                     handle=handle,
                                     splits=input_splits)