def _generate_input_tf_record(self, cands, base_dir):
    if not gfile.Exists(base_dir):
        gfile.MakeDirs(base_dir)
    fpaths = []
    random.shuffle(cands)
    tfr_writers = []
    partition_num = self._data_source_l.data_source_meta.partition_num
    for partition_id in range(partition_num):
        fpath = os.path.join(base_dir,
                             str(partition_id) + common.RawDataFileSuffix)
        fpaths.append(fpath)
        tfr_writers.append(tf.io.TFRecordWriter(fpath))
    for item in cands:
        partition_id = CityHash32(item) % partition_num
        feat = {}
        feat['raw_id'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[item.encode()]))
        f0 = 'follower' + str((partition_id << 30) + 0) + item
        f1 = 'follower' + str((partition_id << 30) + 1) + item
        f2 = 'follower' + str((partition_id << 30) + 2) + item
        feat['feat_0'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f0.encode()]))
        feat['feat_1'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f1.encode()]))
        feat['feat_2'] = tf.train.Feature(
            bytes_list=tf.train.BytesList(value=[f2.encode()]))
        example = tf.train.Example(features=tf.train.Features(feature=feat))
        tfr_writers[partition_id].write(example.SerializeToString())
    for tfr_writer in tfr_writers:
        tfr_writer.close()
    return fpaths
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    for partition_id in range(partition_num):
        dirpath = os.path.join(base_dir, common.partition_repr(partition_id))
        if not gfile.Exists(dirpath):
            gfile.MakeDirs(dirpath)
        assert gfile.IsDirectory(dirpath)
    csv_writers = [SortRunMergerWriter(base_dir, 0, partition_id, 'CSV_DICT')
                   for partition_id in range(partition_num)]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        partition_id = CityHash32(str(idx)) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = str(idx)
        raw['feat_0'] = str((partition_id << 30) + 0) + str(idx)
        raw['feat_1'] = str((partition_id << 30) + 1) + str(idx)
        raw['feat_2'] = str((partition_id << 30) + 2) + str(idx)
        csv_writers[partition_id].append(raw)
    for partition_id, csv_writer in enumerate(csv_writers):
        fpaths = csv_writer.finish()
        logging.info("partition %d dump %d files", partition_id, len(fpaths))
        for seq_id, fpath in enumerate(fpaths):
            logging.info("  %d. %s", seq_id, fpath)
        logging.info("---------------")
def _check_partitioner(self, map_task):
    output_partitions = gfile.ListDirectory(map_task.output_base_dir)
    output_partitions = [x for x in output_partitions if "SUCCESS" not in x]
    self.assertEqual(len(output_partitions), map_task.output_partition_num)
    total_cnt = 0
    for partition in output_partitions:
        dpath = "{}/{}".format(map_task.output_base_dir, partition)
        partition_id = int(partition.split("_")[-1])
        segments = gfile.ListDirectory(dpath)
        for segment in segments:
            fpath = "{}/{}".format(dpath, segment)
            event_time = 0
            for record in tf.python_io.tf_record_iterator(fpath):
                tf_item = TfExampleItem(record)
                self.assertTrue(
                    tf_item.event_time >= event_time,
                    "{}, {}".format(tf_item.event_time, event_time))
                event_time = tf_item.event_time  # records must stay ordered by event_time
                self.assertEqual(partition_id,
                                 CityHash32(tf_item.raw_id) %
                                 map_task.output_partition_num)
                total_cnt += 1
    self.assertEqual(total_cnt,
                     self._partition_item_num * self._input_partition_num)
def map_data(self, tf_item):
    assert isinstance(tf_item, TfExampleItem), \
        "the input tf_item should be TfExampleItem"
    example_id = tf_item.example_id
    assert example_id != common.InvalidExampleId
    partition_id = CityHash32(example_id) % self.output_partition_num
    assert partition_id < len(self._writers)
    self._writers[partition_id].append(tf_item)
def _raw_data_part_fn(self):
    if self._check_finished_tag():
        logging.warning("raw data has been partitioned for rank id of "
                        "partitioner %d", self._options.partitioner_rank_id)
        self._notify_part_finished()
        return
    self._sync_partitioner_state()
    assert self._dumped_process_index is not None
    assert len(self._flying_writers) == 0
    fetcher = self._raw_data_batch_fetcher
    fetch_finished = False
    next_index = self._get_next_part_index()
    hint_index = None
    round_dumped_item = 0
    while not fetch_finished:
        fetch_finished, batch, hint_index = \
            fetcher.fetch_item_batch_by_index(next_index, hint_index)
        if batch is not None:
            for index, item in enumerate(batch):
                raw_id = getattr(item, self._part_field)
                partition_id = CityHash32(raw_id) % \
                    self._options.output_partition_num
                writer = self._get_file_writer(partition_id)
                writer.append_item(batch.begin_index + index, item)
            next_index += len(batch)
            round_dumped_item += len(batch)
            fly_item_cnt = fetcher.get_flying_item_count()
            if round_dumped_item // self._options.output_partition_num \
                    > (1 << 21) or \
                    common.get_heap_mem_stats(None).CheckOomRisk(
                        fly_item_cnt,
                        self._options.memory_limit_ratio - 0.05):
                self._finish_file_writers()
                self._set_next_part_index(next_index)
                hint_index = self._evict_staless_batch(hint_index,
                                                       next_index - 1)
                logging.info("consumed %d items", next_index - 1)
                gc_cnt = gc.collect()
                logging.warning("finished file writers for this round, "
                                "actively triggered gc collecting %d "
                                "objects", gc_cnt)
                round_dumped_item = 0
                self._wakeup_raw_data_fetcher()
        elif not fetch_finished:
            with self._cond:
                self._cond.wait(1)
    self._finish_file_writers()
    self._dump_finished_tag()
    for partition_id, metas in self._dumped_file_metas.items():
        logging.info("part %d output %d files by partitioner",
                     partition_id, len(metas))
        for meta in metas:
            logging.info("%s", meta.encode_meta_to_fname())
        logging.info("-----------------------------------")
    self._notify_part_finished()
def _raw_data_part_fn(self):
    if self._check_finished_tag():
        logging.warning("raw data has been partitioned for rank id of "
                        "partitioner %d", self._options.partitioner_rank_id)
        self._notify_part_finished()
        return
    self._sync_partitioner_state()
    assert self._dumped_process_index is not None
    assert len(self._flying_writers) == 0
    fetcher = self._raw_data_batch_fetcher
    fetch_finished = False
    iter_round = 0
    next_index = self._get_next_part_index()
    hint_index = None
    bp_options = self._options.batch_processor_options
    signal_round_threshold = bp_options.max_flying_item * 3 // \
        5 // bp_options.batch_size + 1
    while not fetch_finished:
        fetch_finished, batch, hint_index = \
            fetcher.fetch_item_batch_by_index(next_index, hint_index)
        if batch is not None:
            for index, item in enumerate(batch):
                raw_id = getattr(item, self._part_field)
                partition_id = CityHash32(raw_id) % \
                    self._options.output_partition_num
                writer = self._get_file_writer(partition_id)
                writer.append_item(batch.begin_index + index, item)
            next_index += len(batch)
            iter_round += 1
            oom_risk = common.get_oom_risk_checker().check_oom_risk(0.70)
            if iter_round % signal_round_threshold == 0 or oom_risk:
                self._finish_file_writers()
                self._set_next_part_index(next_index)
                hint_index = self._evict_staless_batch(hint_index,
                                                       next_index - 1)
                logging.info("consumed %d items", next_index - 1)
                if oom_risk:
                    gc_cnt = gc.collect()
                    logging.warning("early finish of partition writers "
                                    "due to oom risk, actively triggered "
                                    "gc collecting %d objects", gc_cnt)
                self._wakeup_raw_data_fetcher()
        elif not fetch_finished:
            with self._cond:
                self._cond.wait(1)
    self._finish_file_writers()
    self._dump_finished_tag()
    for partition_id, metas in self._dumped_file_metas.items():
        logging.info("part %d output %d files by partitioner",
                     partition_id, len(metas))
        for meta in metas:
            logging.info("%s", meta.encode_meta_to_fname())
        logging.info("-----------------------------------")
    self._notify_part_finished()
async def get_response(query: Query):
    name = query.name.name
    if name.endswith(b'.'):
        name = name[:-1]
    if not name.endswith(b'.lo0.wtf'):
        raise DomainError()
    results = []
    if query.type in (AAAA, ALL_RECORDS):
        name = name[:-8]
        vhost, username = name.rsplit(b'.', 1)
        vhost_hash = CityHash32(vhost)
        username_hash = CityHash32(username)
        ip = NETWORK[(username_hash << 32) + vhost_hash]
        results.append(
            RRHeader(
                name=query.name.name,
                type=AAAA,
                ttl=60,
                payload=Record_AAAA(address=str(ip).encode('ascii')),
            ))
    if query.type in (SOA, ALL_RECORDS) and name == b'':
        results.append(SOA_OBJ)
    return results
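The lookup above packs two 32-bit CityHash values into a single 64-bit index into NETWORK. A minimal standalone sketch of that addressing idea, assuming NETWORK is an ipaddress.IPv6Network covering a /64; the network value and the address_for name below are illustrative, not from the source.

from ipaddress import IPv6Network
from cityhash import CityHash32

NETWORK = IPv6Network("fd00::/64")  # assumed; any /64 offers 2**64 addresses

def address_for(vhost: bytes, username: bytes):
    # Same packing as above: high 32 bits from the username hash,
    # low 32 bits from the vhost hash, used as an offset into the /64.
    offset = (CityHash32(username) << 32) + CityHash32(vhost)
    return NETWORK[offset]

print(address_for(b"blog", b"alice"))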
def _raw_data_part_fn(self):
    if self._check_finished_tag():
        logging.warning("raw data has been partitioned for rank id of "
                        "partitioner %d", self._options.partitioner_rank_id)
        self._notify_part_finished()
        return
    self._sync_partitioner_state()
    assert self._dumped_process_index is not None
    assert len(self._flying_writers) == 0
    fetcher = self._raw_data_batch_fetcher
    fetch_finished = False
    iter_round = 0
    next_index = self._get_next_part_index()
    hint_index = None
    bp_options = self._options.batch_processor_options
    signal_round_threshold = bp_options.max_flying_item / \
        bp_options.batch_size // 3
    while not fetch_finished:
        fetch_finished, batch, hint_index = \
            fetcher.fetch_item_batch_by_index(next_index, hint_index)
        if batch is not None:
            for index, item in enumerate(batch):
                raw_id = item.raw_id
                partition_id = CityHash32(raw_id) % \
                    self._options.output_partition_num
                writer = self._get_file_writer(partition_id)
                writer.append_item(batch.begin_index + index, item)
            next_index += len(batch)
            iter_round += 1
            if iter_round % signal_round_threshold == 0:
                self._finish_file_writers()
                self._set_next_part_index(next_index)
                hint_index = self._evict_staless_batch(hint_index,
                                                       next_index - 1)
                logging.info("consumed %d items", next_index - 1)
                self._wakeup_raw_data_fetcher()
        elif not fetch_finished:
            hint_index = self._evict_staless_batch(hint_index, next_index - 1)
            with self._cond:
                self._cond.wait(1)
    self._finish_file_writers()
    self._dump_finished_tag()
    for partition_id, metas in self._dumped_file_metas.items():
        logging.info("part %d output %d files by partitioner",
                     partition_id, len(metas))
        for meta in metas:
            logging.info("%s", meta.encode_meta_to_fname())
        logging.info("-----------------------------------")
    self._notify_part_finished()
def get_name(view, action):
    """
    Return the name of a permission according to view and action.

    :param view: view as 'module_path.view_name'
    :param action: action code or action method name
    """
    if action is None:
        action = ""
    elif isinstance(action, int):
        action = str(action)
    elif isinstance(action, str):
        action_map_dict = dict((x, y) for y, x in Permission.action_map)
        action = str(action_map_dict[action])
    return CityHash32(view + action)
def partition(self):
    if self._check_finished_tag():
        logging.warning("partition has finished for rank id of "
                        "partitioner %d", self._options.partitioner_rank_id)
        return
    next_index = 0
    hint_index = 0
    fetch_finished = False
    fetcher = self._raw_data_batch_fetcher
    writers = [RawDataPartitioner.OutputFileWriter(self._options, pid)
               for pid in range(self._options.output_partition_num)]
    iter_round = 0
    bp_options = self._options.batch_processor_options
    signal_round_threshold = bp_options.max_flying_item / \
        bp_options.batch_size // 8
    while not fetch_finished:
        fetch_finished, batch, hint_index = \
            fetcher.fetch_item_batch_by_index(next_index, hint_index)
        iter_round += 1
        if batch is not None:
            for index, item in enumerate(batch):
                raw_id = item.raw_id
                partition_id = CityHash32(raw_id) % \
                    self._options.output_partition_num
                writer = writers[partition_id]
                writer.append_item(batch.begin_index + index, item)
            next_index = batch.begin_index + len(batch)
            if iter_round % signal_round_threshold == 0:
                hint_index = self._evict_staless_batch(hint_index,
                                                       next_index - 1)
                logging.info("consumed %d items", next_index - 1)
                self._set_next_part_index(next_index)
                self._wakeup_raw_data_fetcher()
        elif not fetch_finished:
            hint_index = self._evict_staless_batch(hint_index, next_index - 1)
            with self._cond:
                self._cond.wait(1)
    for partition_id, writer in enumerate(writers):
        writer.finish()
        fpaths = writer.get_output_files()
        logging.info("part %d output %d files by partitioner",
                     partition_id, len(fpaths))
        for fpath in fpaths:
            logging.info("%s", fpath)
        logging.info("-----------------------------------")
    self._dump_finished_tag()
    self._fetch_worker.stop_routine()
def test_potral_hourly_input_reducer_mapper(self):
    self._prepare_test()
    reducer = PotralHourlyInputReducer(self._portal_manifest,
                                       self._portal_options,
                                       self._date_time)
    mapper = PotralHourlyOutputMapper(self._portal_manifest,
                                      self._portal_options,
                                      self._date_time)
    expected_example_idx = 0
    for tf_item in reducer.make_reducer():
        example_id = '{}'.format(expected_example_idx).encode()
        mapper.map_data(tf_item)
        self.assertEqual(tf_item.example_id, example_id)
        expected_example_idx += 1
        if expected_example_idx % 7 == 0:
            expected_example_idx += 1
    mapper.finish_map()
    for partition_id in range(self._portal_manifest.output_partition_num):
        fpath = common.encode_portal_hourly_fpath(
            self._portal_manifest.output_data_base_dir,
            self._date_time, partition_id)
        freader = PotralHourlyInputReducer.InputFileReader(
            partition_id, fpath, self._portal_options)
        for example_idx in range(self._total_item_num):
            example_id = '{}'.format(example_idx).encode()
            if example_idx != 0 and (example_idx % 7) == 0:
                continue
            if partition_id != CityHash32(example_id) % \
                    self._portal_manifest.output_partition_num:
                continue
            for item in freader:
                self.assertEqual(example_id, item.tf_example_item.example_id)
                break
            self.assertFalse(freader.finished)
        try:
            next(freader)
        except StopIteration:
            self.assertTrue(True)
        else:
            self.assertTrue(False)
        self.assertTrue(freader.finished)
    if gfile.Exists(self._portal_manifest.input_data_base_dir):
        gfile.DeleteRecursively(self._portal_manifest.input_data_base_dir)
    if gfile.Exists(self._portal_manifest.output_data_base_dir):
        gfile.DeleteRecursively(self._portal_manifest.output_data_base_dir)
def get_name(obj=None, length=2, retry=0, separator="_", capitalize=False):
    """Generate a deterministic name for obj (random when obj is None)."""
    if obj is None:
        return get_random_name()
    obj = str(obj)
    hashed = CityHash32(obj)
    adjectives_joined = None
    for i in range(length - 1):  # pylint: disable=unused-variable
        adj = adjectives[hashed % adjectives_length]
        if not adjectives_joined:
            adjectives_joined = adj
            if ((hashed % scientists_length) == wozniak
                    and (hashed % adjectives_length) == boring):
                # Steve Wozniak is not boring
                adjectives_joined = "honored"
            continue
        adjectives_joined = separator.join(
            [adjectives[randint(0, adjectives_length - 1)], adjectives_joined]
        )
    name = "%s%s%s" % (
        adjectives_joined if adjectives_joined else "",
        separator if adjectives_joined else "",
        scientists[hashed % scientists_length],
    )
    if bool(re.match(r".*boring.*wozniak.*", name)):
        # Steve Wozniak is not boring
        name = get_name(obj, length, retry)
    if retry > 0:
        name = "%s%s%d" % (name, separator, randint(0, 10))
    if capitalize:
        name = name.upper()
    return name
def _generate_input_csv(self, cands, base_dir):
    if not gfile.Exists(base_dir):
        gfile.MakeDirs(base_dir)
    fpaths = []
    random.shuffle(cands)
    csv_writers = []
    partition_num = self._data_source_l.data_source_meta.partition_num
    for partition_id in range(partition_num):
        fpath = os.path.join(base_dir, str(partition_id) + '.rd')
        fpaths.append(fpath)
        csv_writers.append(csv_dict_writer.CsvDictWriter(fpath))
    for item in cands:
        partition_id = CityHash32(item) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = item
        raw['feat_0'] = str((partition_id << 30) + 0) + item
        raw['feat_1'] = str((partition_id << 30) + 1) + item
        raw['feat_2'] = str((partition_id << 30) + 2) + item
        csv_writers[partition_id].write(raw)
    for csv_writer in csv_writers:
        csv_writer.close()
    return fpaths
all_fpaths += [os.path.join(args.input_dir, f)
               for f in gfile.ListDirectory(args.input_dir)]
if args.input_file_wildcard is not None and \
        len(args.input_file_wildcard) > 0:
    all_fpaths = [fpath for fpath in all_fpaths
                  if fnmatch(fpath, args.input_file_wildcard)]
if len(all_fpaths) == 0:
    raise RuntimeError("no input files for partitioner")
all_fpaths = list(set(all_fpaths))
all_fpaths.sort()
partitioner_num = args.total_partitioner_num
if partitioner_num > 1:
    origin_file_num = len(all_fpaths)
    all_fpaths = [fpath for fpath in all_fpaths
                  if CityHash32(os.path.basename(fpath)) %
                  partitioner_num == args.partitioner_rank_id]
    logging.info("Partitioner of rank id %d will process %d/%d "
                 "input files", args.partitioner_rank_id,
                 len(all_fpaths), origin_file_num)
partitioner_options = dj_pb.RawDataPartitionerOptions(
    partitioner_name=args.partitioner_name,
    input_file_paths=all_fpaths,
    output_dir=args.output_dir,
    output_partition_num=args.output_partition_num,
    raw_data_options=dj_pb.RawDataOptions(
        raw_data_iter=args.raw_data_iter,
        compressed_type=args.compressed_type,
        read_ahead_size=args.read_ahead_size,
        read_batch_size=args.read_batch_size
    ),
def test_string_unicode_32(self):
    """Empty Python string has same hash value as empty Unicode string"""
    self.assertEqual(CityHash32(""), CityHash32(u""))
def test_consistent_encoding_32(self):
    """ASCII-range Unicode strings have the same hash values as ASCII strings"""
    text = u"abracadabra"
    self.assertEqual(CityHash32(text), CityHash32(text.encode("utf-8")))
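A small, self-contained sketch of the property this test pins down, assuming only that the cityhash package is installed: for ASCII text, hashing the str and hashing its UTF-8 bytes agree, so encoding explicitly before hashing is a cheap way to keep behaviour predictable when inputs may stray outside ASCII.

from cityhash import CityHash32

key = "abracadabra"
# ASCII str and its UTF-8 encoding hash to the same 32-bit value.
assert CityHash32(key) == CityHash32(key.encode("utf-8"))

# For arbitrary text, hash the encoded bytes to make the encoding explicit.
digest = CityHash32("naïve-key".encode("utf-8"))
print(digest)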
for i in range(num):
    key = b"myval-%d" % i
    int(sha256(b"%s" % key).hexdigest(), 16)
print("sha256 took {} s".format(time() - pt))

# CityHash
try:
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print("CityHash not installed, pip install cityhash")
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash32(b"%s" % key)
    print("CityHash32 took {} s".format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash64(b"%s" % key)
    print("CityHash64 took {} s".format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash128(b"%s" % key)
    print("CityHash128 took {} s".format(time() - pt))
def test_unicode_2_32(self):
    """Accepts Unicode input outside of ASCII range"""
    test_case = u'\u2661'
    self.assertTrue(isinstance(CityHash32(test_case), int))
def test_unicode_1_32(self):
    """Accepts Unicode input"""
    test_case = u"abc"
    self.assertTrue(isinstance(CityHash32(test_case), int))
def __hash__(self):
    return CityHash32(self.data)
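The method above is shown without its class; a minimal hypothetical wrapper around it might look like the sketch below (the Record name and the assumption that data is str or bytes are mine, not from the source). Pairing __hash__ with a matching __eq__ preserves the usual hash/equality contract, and CityHash32 gives a value that is stable across processes, unlike the built-in hash() on str under hash randomization.

from cityhash import CityHash32

class Record:
    """Hypothetical wrapper whose hash is stable across runs."""

    def __init__(self, data):
        self.data = data  # assumed to be str or bytes

    def __eq__(self, other):
        return isinstance(other, Record) and self.data == other.data

    def __hash__(self):
        # CityHash32 returns a 32-bit unsigned int, a valid __hash__ value.
        return CityHash32(self.data)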
for i in range(num):
    key = b'myval-%d' % i
    int(sha256(b'%s' % key).hexdigest(), 16)
print('sha256 took {} s'.format(time() - pt))

# CityHash
try:
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print('CityHash not installed, pip install cityhash')
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash32(b'%s' % key)
    print('CityHash32 took {} s'.format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash64(b'%s' % key)
    print('CityHash64 took {} s'.format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash128(b'%s' % key)
    print('CityHash128 took {} s'.format(time() - pt))