Example #1
0
 def setUp(self):
     """Build a fresh attribution-joiner fixture before each test.

     Creates a single-partition data source named "milestone-f", wipes
     the output and raw-data directories, clears the data source's
     kvstore prefix, and initialises the manifest manager plus the
     per-test counters.
     """
     ds = common_pb.DataSource()
     ds.data_source_meta.name = "milestone-f"
     ds.data_source_meta.partition_num = 1
     ds.output_base_dir = "./ds_output"
     self.data_source = ds
     self.raw_data_dir = "./raw_data"
     self.raw_data_options = dj_pb.RawDataOptions(
         raw_data_iter='TF_RECORD', compressed_type='')
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1,
         example_id_dump_threshold=1024,
     )
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner='ATTRIBUTION_JOINER',
         min_matching_window=32,
         max_matching_window=51200,
         max_conversion_delay=interval_to_timestamp("124"),
         enable_negative_example_generator=True,
         data_block_dump_interval=32,
         data_block_dump_threshold=128,
         negative_sampling_rate=0.8,
     )
     # Start from clean directories so artifacts of a previous run
     # cannot leak into this test.
     for stale_dir in (ds.output_base_dir, self.raw_data_dir):
         if gfile.Exists(stale_dir):
             gfile.DeleteRecursively(stale_dir)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(ds.data_source_meta.name))
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, ds)
     self.g_data_block_index = 0
    def test_universal_join_key_mapper_error(self):
        mapper_code = """
from fedlearner.data_join.key_mapper.key_mapping import BaseKeyMapper
class KeyMapperMock(BaseKeyMapper):
    def leader_mapping(self, item) -> dict:
        res = item.click_id.decode().split("_")
        raise ValueError
        return dict({"req_id":res[0], "cid":res[1]})

    def follower_mapping(self, item) -> dict:
        return dict()

    @classmethod
    def name(cls):
        return "TEST_MAPPER"
"""
        abspath = os.path.dirname(os.path.abspath(__file__))
        fname = os.path.realpath(
            os.path.join(
                abspath,
                "../../fedlearner/data_join/key_mapper/impl/keymapper_mock.py")
        )
        with open(fname, "w") as f:
            f.write(mapper_code)
        reload(key_mapper)

        self.example_joiner_options = dj_pb.ExampleJoinerOptions(
            example_joiner='UNIVERSAL_JOINER',
            min_matching_window=32,
            max_matching_window=51200,
            max_conversion_delay=interval_to_timestamp("258"),
            enable_negative_example_generator=True,
            data_block_dump_interval=32,
            data_block_dump_threshold=1024,
            negative_sampling_rate=0.8,
            join_expr="(cid,req_id) or (example_id)",
            join_key_mapper="TEST_MAPPER",
            negative_sampling_filter_expr='',
        )
        self.version = dsp.Version.V2

        sei = joiner_impl.create_example_joiner(
            self.example_joiner_options,
            self.raw_data_options,
            #dj_pb.WriterOptions(output_writer='TF_RECORD'),
            dj_pb.WriterOptions(output_writer='CSV_DICT'),
            self.kvstore,
            self.data_source,
            0)
        self.run_join(sei, 0)
        os.remove(fname)
Example #3
0
 def init(self,
          dsname,
          joiner_name,
          version=Version.V1,
          cache_type="memory"):
     """Initialise a joiner test fixture for the given data source.

     Args:
         dsname: data source name; also prefixes the output and
             raw-data directory names.
         joiner_name: example joiner implementation to configure.
         version: data source protocol version (defaults to V1).
         cache_type: raw-data cache backend (defaults to "memory").

     Wipes the output/raw-data directories, clears the kvstore prefix
     for the data source, and resets the manifest manager and counters.
     """
     ds = common_pb.DataSource()
     ds.data_source_meta.name = dsname
     ds.data_source_meta.partition_num = 1
     ds.output_base_dir = "%s_ds_output" % dsname
     self.data_source = ds
     self.raw_data_dir = "%s_raw_data" % dsname
     self.raw_data_options = dj_pb.RawDataOptions(
         raw_data_iter='TF_RECORD',
         compressed_type='',
         raw_data_cache_type=cache_type,
     )
     self.example_id_dump_options = dj_pb.ExampleIdDumpOptions(
         example_id_dump_interval=1,
         example_id_dump_threshold=1024,
     )
     self.example_joiner_options = dj_pb.ExampleJoinerOptions(
         example_joiner=joiner_name,
         min_matching_window=32,
         max_matching_window=51200,
         max_conversion_delay=interval_to_timestamp("124"),
         enable_negative_example_generator=True,
         data_block_dump_interval=32,
         data_block_dump_threshold=128,
         negative_sampling_rate=0.8,
         join_expr="example_id",
         join_key_mapper="DEFAULT",
         negative_sampling_filter_expr='',
     )
     # Remove leftovers of any earlier run with the same dsname.
     for stale_dir in (ds.output_base_dir, self.raw_data_dir):
         if gfile.Exists(stale_dir):
             gfile.DeleteRecursively(stale_dir)
     self.kvstore = db_client.DBClient('etcd', True)
     self.kvstore.delete_prefix(
         common.data_source_kvstore_base_dir(ds.data_source_meta.name))
     self.total_raw_data_count = 0
     self.total_example_id_count = 0
     self.manifest_manager = raw_data_manifest_manager.RawDataManifestManager(
         self.kvstore, ds)
     self.g_data_block_index = 0
     self.version = version
    def test_universal_join_small_follower(self):
        """Universal join where the follower side is much smaller.

        Reconfigures the fixture for a UNIVERSAL_JOINER with the default
        key mapper and a composite join expression, then runs the
        small-follower join with a 0.15 follower ratio.
        """
        joiner_options = dj_pb.ExampleJoinerOptions(
            example_joiner='UNIVERSAL_JOINER',
            min_matching_window=32,
            max_matching_window=20240,
            max_conversion_delay=interval_to_timestamp("128"),
            enable_negative_example_generator=False,
            data_block_dump_interval=32,
            data_block_dump_threshold=1024,
            negative_sampling_rate=0.8,
            join_expr="(id_type, example_id, trunc(event_time,1))",
            join_key_mapper="DEFAULT",
            negative_sampling_filter_expr='',
        )
        self.example_joiner_options = joiner_options
        self.version = dsp.Version.V2

        writer_options = dj_pb.WriterOptions(output_writer='TF_RECORD')
        joiner = joiner_impl.create_example_joiner(
            joiner_options,
            self.raw_data_options,
            writer_options,
            self.kvstore,
            self.data_source,
            0)
        self.run_join_small_follower(joiner, 0.15)
Example #5
0
 args = parser.parse_args()
 worker_options = dj_pb.DataJoinWorkerOptions(
         use_mock_etcd=(args.kvstore_type == 'mock'),
         raw_data_options=dj_pb.RawDataOptions(
                 raw_data_iter=args.raw_data_iter,
                 compressed_type=args.compressed_type,
                 read_ahead_size=args.read_ahead_size,
                 read_batch_size=args.read_batch_size
             ),
         example_joiner_options=dj_pb.ExampleJoinerOptions(
                 example_joiner=args.example_joiner,
                 min_matching_window=args.min_matching_window,
                 max_matching_window=args.max_matching_window,
                 data_block_dump_interval=args.data_block_dump_interval,
                 data_block_dump_threshold=args.data_block_dump_threshold,
                 max_conversion_delay=interval_to_timestamp(\
                                         args.max_conversion_delay),
                 enable_negative_example_generator=\
                     args.enable_negative_example_generator,
             ),
         example_id_dump_options=dj_pb.ExampleIdDumpOptions(
                 example_id_dump_interval=args.example_id_dump_interval,
                 example_id_dump_threshold=args.example_id_dump_threshold
             ),
         batch_processor_options=dj_pb.BatchProcessorOptions(
                 batch_size=4096,
                 max_flying_item=-1
             ),
         data_block_builder_options=dj_pb.WriterOptions(
                 output_writer=args.data_block_builder,
                 compressed_type=args.data_block_compressed_type
             )