Example #1
class DataBlockVisitor(object):
    def __init__(self, data_source_name, kvstore_type, use_mock_etcd=False):
        self._kvstore = DBClient(kvstore_type, use_mock_etcd)
        self._data_source = retrieve_data_source(self._kvstore,
                                                 data_source_name)

    def LoadDataBlockRepByTimeFrame(self, start_time=None, end_time=None):
        partition_num = self._data_source.data_source_meta.partition_num
        data_block_fnames = {}
        for partition_id in range(0, partition_num):
            data_block_fnames[partition_id] = \
                self._list_data_block(partition_id)
        data_block_reps = {}
        for partition_id, fnames in data_block_fnames.items():
            manifest = self._sync_raw_data_manifest(partition_id)
            for idx, fname in enumerate(fnames):
                # flag only the newest (last-listed) file for an existence
                # check when building its rep
                check_existed = (idx == len(fnames) - 1)
                rep = self._make_data_block_rep(partition_id, fname,
                                                check_existed)
                filtered = True
                reason = ''
                if rep is None:
                    reason = 'failed to create data block rep'
                elif end_time is not None and rep.end_time > end_time:
                    reason = 'data block ends after the time frame'
                elif start_time is not None and rep.end_time <= start_time:
                    reason = 'data block ends before the time frame'
                elif self._filter_by_visible(rep.data_block_index, manifest):
                    reason = 'data block is not visible yet'
                else:
                    data_block_reps[rep.block_id] = rep
                    filtered = False
                if filtered:
                    logging.debug('skip %s since %s', fname, reason)
        return data_block_reps

    def LoadDataBlockRepByIndex(self, partition_id, data_block_index):
        partition_num = self._data_source.data_source_meta.partition_num
        if partition_id < 0 or partition_id >= partition_num:
            raise IndexError("partition {} out of range".format(partition_id))
        dirpath = self._partition_data_block_dir(partition_id)
        meta_fname = encode_data_block_meta_fname(self._data_source_name(),
                                                  partition_id,
                                                  data_block_index)
        meta_fpath = os.path.join(dirpath, meta_fname)
        meta = load_data_block_meta(meta_fpath)
        manifest = self._sync_raw_data_manifest(partition_id)
        if meta is not None and \
                not self._filter_by_visible(meta.data_block_index, manifest):
            fname = encode_data_block_fname(self._data_source_name(), meta)
            return DataBlockRep(self._data_source_name(), fname, partition_id,
                                dirpath)
        return None

    def LoadDataBlockRepByBlockId(self, block_id):
        block_info = decode_block_id(block_id)
        dbr = self.LoadDataBlockRepByIndex(block_info['partition_id'],
                                           block_info['data_block_index'])
        if dbr:
            assert dbr.block_id == block_id, \
                    "Invalid data block, expected %s but got %s, please " \
                    "check the data source!" % (block_id, dbr.block_id)
        return dbr

    def _list_data_block(self, partition_id):
        dirpath = self._partition_data_block_dir(partition_id)
        if gfile.Exists(dirpath) and gfile.IsDirectory(dirpath):
            return [
                f for f in gfile.ListDirectory(dirpath)
                if f.endswith(DataBlockSuffix)
            ]
        return []

    def _partition_data_block_dir(self, partition_id):
        return os.path.join(data_source_data_block_dir(self._data_source),
                            partition_repr(partition_id))

    def _make_data_block_rep(self, partition_id, data_block_fname,
                             check_existed):
        try:
            rep = DataBlockRep(self._data_source.data_source_meta.name,
                               data_block_fname, partition_id,
                               self._partition_data_block_dir(partition_id),
                               check_existed)
        except Exception as e:  # pylint: disable=broad-except
            logging.warning("Failed to create data block rep for %s in"\
                            "partition %d reason %s", data_block_fname,
                            partition_id, e)
            return None
        return rep

    def _data_source_name(self):
        return self._data_source.data_source_meta.name

    def _sync_raw_data_manifest(self, partition_id):
        kvstore_key = partition_manifest_kvstore_key(self._data_source_name(),
                                                     partition_id)
        data = self._kvstore.get_data(kvstore_key)
        assert data is not None, "raw data manifest of partition " \
                                 "{} must exist".format(partition_id)
        return text_format.Parse(data, dj_pb.RawDataManifest())

    def _filter_by_visible(self, index, manifest):
        # on the Follower, while example joining is unfinished, data blocks
        # beyond the peer's dumped index are not yet visible and are filtered
        join_state = manifest.join_example_rep.state
        if self._data_source.role == common_pb.FLRole.Follower and \
                join_state != dj_pb.JoinExampleState.Joined:
            return index > manifest.peer_dumped_index
        return False
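
A minimal usage sketch for the visitor above (not from the source; the data source name 'test_ds' and kvstore type 'etcd' are hypothetical placeholders). It lists the visible data block reps within a time window, then fetches one back by its block id:

visitor = DataBlockVisitor('test_ds', 'etcd', use_mock_etcd=False)
reps = visitor.LoadDataBlockRepByTimeFrame(start_time=0, end_time=1 << 62)
for block_id, rep in reps.items():
    logging.info('visible block %s ends at %d', block_id, rep.end_time)
    # a rep found by time frame can be fetched back by its block id
    assert visitor.LoadDataBlockRepByBlockId(block_id).block_id == block_id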
Example #2
    parser.add_argument('--raw_data_sub_dir',
                        type=str,
                        required=True,
                        help='the kvstore base dir to subscribe to new raw data')
    args = parser.parse_args()
    data_source = common_pb.DataSource()
    data_source.data_source_meta.name = args.data_source_name
    data_source.data_source_meta.partition_num = args.partition_num
    data_source.data_source_meta.start_time = args.start_time
    data_source.data_source_meta.end_time = args.end_time
    data_source.data_source_meta.negative_sampling_rate = \
            args.negative_sampling_rate
    if args.role.upper() == 'LEADER':
        data_source.role = common_pb.FLRole.Leader
    else:
        assert args.role.upper() == 'FOLLOWER'
        data_source.role = common_pb.FLRole.Follower
    data_source.output_base_dir = args.output_base_dir
    data_source.raw_data_sub_dir = args.raw_data_sub_dir
    data_source.state = common_pb.DataSourceState.Init
    kvstore = DBClient(args.kvstore_type)
    master_kvstore_key = common.data_source_kvstore_base_dir(
        data_source.data_source_meta.name)
    raw_data = kvstore.get_data(master_kvstore_key)
    if raw_data is None:
        logging.info("data source %s is not existed", args.data_source_name)
        common.commit_data_source(kvstore, data_source)
        logging.info("apply new data source %s", args.data_source_name)
    else:
        logging.info("data source %s has been existed", args.data_source_name)
Example #3
                        help='Max number of files in a job')
    parser.add_argument('--start_date',
                        type=str,
                        default=None,
                        help='Start date of input data, format %Y%m%d')
    parser.add_argument('--end_date',
                        type=str,
                        default=None,
                        help='End date of input data, format %Y%m%d')
    args = parser.parse_args()
    set_logger()

    use_mock_etcd = (args.kvstore_type == 'mock')
    kvstore = DBClient(args.kvstore_type, use_mock_etcd)
    kvstore_key = common.portal_kvstore_base_dir(args.data_portal_name)
    portal_manifest = kvstore.get_data(kvstore_key)
    data_portal_type = dp_pb.DataPortalType.PSI if \
        args.data_portal_type == 'PSI' else dp_pb.DataPortalType.Streaming
    if portal_manifest is None:
        portal_manifest = dp_pb.DataPortalManifest(
            name=args.data_portal_name,
            data_portal_type=data_portal_type,
            output_partition_num=args.output_partition_num,
            input_file_wildcard=args.input_file_wildcard,
            input_base_dir=args.input_base_dir,
            output_base_dir=args.output_base_dir,
            raw_data_publish_dir=args.raw_data_publish_dir,
            processing_job_id=-1)
        kvstore.set_data(kvstore_key,
                         text_format.MessageToString(portal_manifest))
    else:  # validate parameter consistency with the existing manifest
        pass  # the validation logic is truncated in this excerpt
Example #4
    parser.add_argument('--output_base_dir', type=str, required=True,
                        help='the base dir of output directory')
    parser.add_argument('--raw_data_publish_dir', type=str, required=True,
                        help='the raw data publish dir in the kvstore')
    parser.add_argument('--long_running', action='store_true',
                        help='make the data portal long running')
    parser.add_argument('--check_success_tag', action='store_true',
                        help='Check that a _SUCCESS file exists before '
                             'processing files in a subfolder')
    args = parser.parse_args()
    set_logger()

    use_mock_etcd = (args.kvstore_type == 'mock')
    kvstore = DBClient(args.kvstore_type, use_mock_etcd)
    kvstore_key = common.portal_kvstore_base_dir(args.data_portal_name)
    if kvstore.get_data(kvstore_key) is None:
        portal_manifest = dp_pb.DataPortalManifest(
                name=args.data_portal_name,
                data_portal_type=(dp_pb.DataPortalType.PSI if
                                  args.data_portal_type == 'PSI' else
                                  dp_pb.DataPortalType.Streaming),
                output_partition_num=args.output_partition_num,
                input_file_wildcard=args.input_file_wildcard,
                input_base_dir=args.input_base_dir,
                output_base_dir=args.output_base_dir,
                raw_data_publish_dir=args.raw_data_publish_dir,
                processing_job_id=-1
            )
        kvstore.set_data(kvstore_key,
                         text_format.MessageToString(portal_manifest))
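
Because the manifest is written with text_format.MessageToString, it can be read back the same way Example #1 parses its raw data manifest; a minimal sketch:

raw = kvstore.get_data(kvstore_key)
if raw is not None:
    manifest = text_format.Parse(raw, dp_pb.DataPortalManifest())
    # the stored manifest round-trips the fields set at creation
    assert manifest.name == args.data_portal_name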