Example #1
 def _generate_input_tf_record(self, cands, base_dir):
     if not gfile.Exists(base_dir):
         gfile.MakeDirs(base_dir)
     fpaths = []
     random.shuffle(cands)
     tfr_writers = []
     partition_num = self._data_source_l.data_source_meta.partition_num
     for partition_id in range(partition_num):
         fpath = os.path.join(base_dir,
                              str(partition_id) + common.RawDataFileSuffix)
         fpaths.append(fpath)
         tfr_writers.append(tf.io.TFRecordWriter(fpath))
     for item in cands:
         partition_id = CityHash32(item) % partition_num
         feat = {}
         feat['raw_id'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[item.encode()]))
         f0 = 'follower' + str((partition_id << 30) + 0) + item
         f1 = 'follower' + str((partition_id << 30) + 1) + item
         f2 = 'follower' + str((partition_id << 30) + 2) + item
         feat['feat_0'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f0.encode()]))
         feat['feat_1'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f1.encode()]))
         feat['feat_2'] = tf.train.Feature(bytes_list=tf.train.BytesList(
             value=[f2.encode()]))
         example = tf.train.Example(features=tf.train.Features(
             feature=feat))
         tfr_writers[partition_id].write(example.SerializeToString())
     for tfr_writer in tfr_writers:
         tfr_writer.close()
     return fpaths
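These partitioner examples all share one pattern: hash a stable key with CityHash32 and take the result modulo the partition count, so the same record always lands in the same partition. A minimal, self-contained sketch of that pattern (the function name below is illustrative and only assumes the cityhash package):

from cityhash import CityHash32

def assign_partition(raw_id, partition_num):
    # CityHash32 is deterministic across runs and processes, so the same
    # raw_id always maps to the same partition.
    if isinstance(raw_id, str):
        raw_id = raw_id.encode()
    return CityHash32(raw_id) % partition_num

assert assign_partition("item-42", 8) == assign_partition("item-42", 8)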
Example #2
def generate_input_csv(base_dir, start_id, end_id, partition_num):
    for partition_id in range(partition_num):
        dirpath = os.path.join(base_dir, common.partition_repr(partition_id))
        if not gfile.Exists(dirpath):
            gfile.MakeDirs(dirpath)
        assert gfile.IsDirectory(dirpath)
    csv_writers = [
        SortRunMergerWriter(base_dir, 0, partition_id, 'CSV_DICT')
        for partition_id in range(partition_num)
    ]
    for idx in range(start_id, end_id):
        if idx % 262144 == 0:
            logging.info("Process at index %d", idx)
        partition_id = CityHash32(str(idx)) % partition_num
        raw = OrderedDict()
        raw['raw_id'] = str(idx)
        raw['feat_0'] = str((partition_id << 30) + 0) + str(idx)
        raw['feat_1'] = str((partition_id << 30) + 1) + str(idx)
        raw['feat_2'] = str((partition_id << 30) + 2) + str(idx)
        csv_writers[partition_id].append(raw)
    for partition_id, csv_writer in enumerate(csv_writers):
        fpaths = csv_writer.finish()
        logging.info("partition %d dump %d files", partition_id, len(fpaths))
        for seq_id, fpath in enumerate(fpaths):
            logging.info("  %d. %s", seq_id, fpath)
        logging.info("---------------")
Example #3
    def _check_partitioner(self, map_task):
        output_partitions = gfile.ListDirectory(map_task.output_base_dir)
        output_partitions = [
            x for x in output_partitions if "SUCCESS" not in x
        ]
        self.assertEqual(len(output_partitions), map_task.output_partition_num)
        partition_dirs = ["{}/{}".format(map_task.output_base_dir, x) \
            for x in output_partitions]

        total_cnt = 0
        for partition in output_partitions:
            dpath = "{}/{}".format(map_task.output_base_dir, partition)
            partition_id = partition.split("_")[-1]
            partition_id = int(partition_id)
            segments = gfile.ListDirectory(dpath)
            for segment in segments:
                fpath = "{}/{}".format(dpath, segment)
                event_time = 0
                for record in tf.python_io.tf_record_iterator(fpath):
                    tf_item = TfExampleItem(record)
                    self.assertTrue(
                        tf_item.event_time >= event_time,
                        "{}, {}".format(tf_item.event_time, event_time))
                    event_time = tf_item.event_time  ## assert order
                    self.assertEqual(partition_id, CityHash32(tf_item.raw_id) \
                        % map_task.output_partition_num)
                    total_cnt += 1
        self.assertEqual(total_cnt,
                         self._partition_item_num * self._input_partition_num)
Example #4
 def map_data(self, tf_item):
     assert isinstance(tf_item, TfExampleItem), \
         "the input tf_item should be TfExampleItem"
     example_id = tf_item.example_id
     assert example_id != common.InvalidExampleId
     partition_id = CityHash32(example_id) % self.output_partition_num
     assert partition_id < len(self._writers)
     self._writers[partition_id].append(tf_item)
Example #5
 def _raw_data_part_fn(self):
     if self._check_finished_tag():
         logging.warning("raw data has been partitioned for rank id "\
                         "of partitioner %d", self._options.partitioner_rank_id)
         self._notify_part_finished()
         return
     self._sync_partitioner_state()
     assert self._dumped_process_index is not None
     assert len(self._flying_writers) == 0
     fetcher = self._raw_data_batch_fetcher
     fetch_finished = False
     next_index = self._get_next_part_index()
     hint_index = None
     bp_options = self._options.batch_processor_options
     round_dumped_item = 0
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         if batch is not None:
             for index, item in enumerate(batch):
                 raw_id = getattr(item, self._part_field)
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = self._get_file_writer(partition_id)
                 writer.append_item(batch.begin_index+index, item)
             next_index += len(batch)
             round_dumped_item += len(batch)
             fly_item_cnt = fetcher.get_flying_item_count()
             if round_dumped_item // self._options.output_partition_num \
                     > (1<<21) or \
                     common.get_heap_mem_stats(None).CheckOomRisk(
                         fly_item_cnt,
                         self._options.memory_limit_ratio-0.05):
                 self._finish_file_writers()
                 self._set_next_part_index(next_index)
                 hint_index = self._evict_staless_batch(hint_index,
                                                        next_index-1)
                 logging.info("consumed %d items", next_index-1)
                 gc_cnt = gc.collect()
                  logging.warning("finished partition file writers; "\
                                  "actively triggered gc of %d objects", gc_cnt)
                 round_dumped_item = 0
                 self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             with self._cond:
                 self._cond.wait(1)
     self._finish_file_writers()
     self._dump_finished_tag()
     for partition_id, metas in self._dumped_file_metas.items():
         logging.info("part %d output %d files by partitioner",
                       partition_id, len(metas))
         for meta in metas:
             logging.info("%s", meta.encode_meta_to_fname())
         logging.info("-----------------------------------")
     self._notify_part_finished()
Example #6
 def _raw_data_part_fn(self):
     if self._check_finished_tag():
         logging.warning("raw data has been partitioned for rank id "\
                         "of partitioner %d", self._options.partitioner_rank_id)
         self._notify_part_finished()
         return
     self._sync_partitioner_state()
     assert self._dumped_process_index is not None
     assert len(self._flying_writers) == 0
     fetcher = self._raw_data_batch_fetcher
     fetch_finished = False
     iter_round = 0
     next_index = self._get_next_part_index()
     hint_index = None
     bp_options = self._options.batch_processor_options
     signal_round_threhold = bp_options.max_flying_item * 3 // \
             5 // bp_options.batch_size + 1
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         if batch is not None:
             for index, item in enumerate(batch):
                 raw_id = getattr(item, self._part_field)
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = self._get_file_writer(partition_id)
                 writer.append_item(batch.begin_index + index, item)
             next_index += len(batch)
             iter_round += 1
             oom_risk = common.get_oom_risk_checker().check_oom_risk(0.70)
             if iter_round % signal_round_threhold == 0 or oom_risk:
                 self._finish_file_writers()
                 self._set_next_part_index(next_index)
                 hint_index = self._evict_staless_batch(
                     hint_index, next_index - 1)
                 logging.info("consumed %d items", next_index - 1)
                 if oom_risk:
                     gc_cnt = gc.collect()
                     logging.warning("early finish of partition writers "\
                                     "due to oom risk; actively triggered "\
                                     "gc of %d objects", gc_cnt)
                 self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             with self._cond:
                 self._cond.wait(1)
     self._finish_file_writers()
     self._dump_finished_tag()
     for partition_id, metas in self._dumped_file_metas.items():
         logging.info("part %d output %d files by partitioner",
                      partition_id, len(metas))
         for meta in metas:
             logging.info("%s", meta.encode_meta_to_fname())
         logging.info("-----------------------------------")
     self._notify_part_finished()
Example #7
async def get_response(query: Query):
    name = query.name.name
    if name.endswith(b'.'): name = name[:-1]
    if not name.endswith(b'.lo0.wtf'): raise DomainError()
    results = []
    if query.type in (AAAA, ALL_RECORDS):
        name = name[:-8]
        vhost, username = name.rsplit(b'.', 1)
        vhost_hash = CityHash32(vhost)
        username_hash = CityHash32(username)
        ip = NETWORK[(username_hash << 32) + vhost_hash]
        results.append(
            RRHeader(
                name=query.name.name,
                type=AAAA,
                ttl=60,
                payload=Record_AAAA(address=str(ip).encode('ascii')),
            ))
    if query.type in (SOA, ALL_RECORDS) and name == b'':
        results.append(SOA_OBJ)
    return results
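Here two 32-bit CityHash values are packed into a single 64-bit key used to index the address map. A hedged sketch of that composition (the function and parameter names are illustrative, not part of the resolver's API):

from cityhash import CityHash32

def network_key(username: bytes, vhost: bytes) -> int:
    # Username hash in the high 32 bits, vhost hash in the low 32 bits,
    # yielding one integer key per (username, vhost) pair.
    return (CityHash32(username) << 32) + CityHash32(vhost)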
Example #8
 def _raw_data_part_fn(self):
     if self._check_finished_tag():
         logging.warning("raw data has been partitioned for rank id "\
                         "of partitioner %d", self._options.partitioner_rank_id)
         self._notify_part_finished()
         return
     self._sync_partitioner_state()
     assert self._dumped_process_index is not None
     assert len(self._flying_writers) == 0
     fetcher = self._raw_data_batch_fetcher
     fetch_finished = False
     iter_round = 0
     next_index = self._get_next_part_index()
     hint_index = None
     bp_options = self._options.batch_processor_options
     signal_round_threhold = bp_options.max_flying_item / \
             bp_options.batch_size // 3
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         if batch is not None:
             for index, item in enumerate(batch):
                 raw_id = item.raw_id
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = self._get_file_writer(partition_id)
                 writer.append_item(batch.begin_index + index, item)
             next_index += len(batch)
             iter_round += 1
             if iter_round % signal_round_threhold == 0:
                 self._finish_file_writers()
                 self._set_next_part_index(next_index)
                 hint_index = self._evict_staless_batch(
                     hint_index, next_index - 1)
                 logging.info("consumed %d items", next_index - 1)
                 self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             hint_index = self._evict_staless_batch(hint_index,
                                                    next_index - 1)
             with self._cond:
                 self._cond.wait(1)
     self._finish_file_writers()
     self._dump_finished_tag()
     for partition_id, metas in self._dumped_file_metas.items():
         logging.info("part %d output %d files by partitioner",
                      partition_id, len(metas))
         for meta in metas:
             logging.info("%s", meta.encode_meta_to_fname())
         logging.info("-----------------------------------")
     self._notify_part_finished()
Example #9
    def get_name(view, action):
        """Return the name of the permission according to view and action

        :param view: view as 'module_path.view_name'
        :param action: action code or action method name
        """
        if action is None:
            action = ""
        elif isinstance(action, int):
            action = str(action)
        elif isinstance(action, str):
            action_map_dict = dict((x,y) for y,x in Permission.action_map)
            print(action_map_dict)
            action = str(action_map_dict[action])
        return CityHash32(view+action)
Example #10
 def partition(self):
     if self._check_finished_tag():
         logging.warning("partition has finished for rank id of parti"\
                         "tioner %d", self._options.partitioner_rank_id)
         return
     next_index = 0
     hint_index = 0
     fetch_finished = False
     fetcher = self._raw_data_batch_fetcher
     writers = [RawDataPartitioner.OutputFileWriter(self._options, pid)
                for pid in range(self._options.output_partition_num)]
     iter_round = 0
     bp_options = self._options.batch_processor_options
     signal_round_threhold = bp_options.max_flying_item / \
             bp_options.batch_size // 8
     while not fetch_finished:
         fetch_finished, batch, hint_index = \
                 fetcher.fetch_item_batch_by_index(next_index, hint_index)
         iter_round += 1
         if batch is not None:
             for index, item in enumerate(batch):
                 raw_id = item.raw_id
                 partition_id = CityHash32(raw_id) % \
                         self._options.output_partition_num
                 writer = writers[partition_id]
                 writer.append_item(batch.begin_index+index, item)
             next_index = batch.begin_index + len(batch)
             if iter_round % signal_round_threhold == 0:
                 hint_index = self._evict_staless_batch(hint_index,
                                                        next_index-1)
                 logging.info("consumed %d items", next_index-1)
             self._set_next_part_index(next_index)
             self._wakeup_raw_data_fetcher()
         elif not fetch_finished:
             hint_index = self._evict_staless_batch(hint_index,
                                                    next_index-1)
             with self._cond:
                 self._cond.wait(1)
     for partition_id, writer in enumerate(writers):
         writer.finish()
         fpaths = writer.get_output_files()
         logging.info("part %d output %d files by partitioner",
                       partition_id, len(fpaths))
         for fpath in fpaths:
             logging.info("%s", fpath)
         logging.info("-----------------------------------")
     self._dump_finished_tag()
     self._fetch_worker.stop_routine()
Example #11
    def test_potral_hourly_input_reducer_mapper(self):
        self._prepare_test()
        reducer = PotralHourlyInputReducer(self._portal_manifest,
                                           self._portal_options,
                                           self._date_time)
        mapper = PotralHourlyOutputMapper(self._portal_manifest,
                                          self._portal_options,
                                          self._date_time)
        expected_example_idx = 0
        for tf_item in reducer.make_reducer():
            example_id = '{}'.format(expected_example_idx).encode()
            mapper.map_data(tf_item)
            self.assertEqual(tf_item.example_id, example_id)
            expected_example_idx += 1
            if expected_example_idx % 7 == 0:
                expected_example_idx += 1
        mapper.finish_map()
        for partition_id in range(self._portal_manifest.output_partition_num):
            fpath = common.encode_portal_hourly_fpath(
                self._portal_manifest.output_data_base_dir, self._date_time,
                partition_id)
            freader = PotralHourlyInputReducer.InputFileReader(
                partition_id, fpath, self._portal_options)
            for example_idx in range(self._total_item_num):
                example_id = '{}'.format(example_idx).encode()
                if example_idx != 0 and (example_idx % 7) == 0:
                    continue
                if partition_id != CityHash32(example_id) % \
                        self._portal_manifest.output_partition_num:
                    continue
                for item in freader:
                    self.assertEqual(example_id,
                                     item.tf_example_item.example_id)
                    break
                self.assertFalse(freader.finished)
            with self.assertRaises(StopIteration):
                next(freader)
            self.assertTrue(freader.finished)

        if gfile.Exists(self._portal_manifest.input_data_base_dir):
            gfile.DeleteRecursively(self._portal_manifest.input_data_base_dir)
        if gfile.Exists(self._portal_manifest.output_data_base_dir):
            gfile.DeleteRecursively(self._portal_manifest.output_data_base_dir)
Example #12
def get_name(obj=None, length=2, retry=0, separator="_", capitalize=False):
    """Test"""
    if obj is None:
        return get_random_name()

    obj = str(obj)

    hashed = CityHash32(obj)

    adjectives_joined = None
    for i in range(length - 1):  # pylint: disable=unused-variable

        adj = adjectives[hashed % adjectives_length]

        if not adjectives_joined:
            adjectives_joined = adj
            if (
                (hashed % scientists_length) == wozniak
                and (hashed % adjectives_length) == boring
            ):  # Steve Wozniak is not boring
                adjectives_joined = "honored"
            continue

        adjectives_joined = separator.join(
            [adjectives[randint(0, adjectives_length - 1)], adjectives_joined]
        )

    name = "%s%s%s" % (
        adjectives_joined if adjectives_joined else "",
        separator if adjectives_joined else "",
        scientists[hashed % scientists_length],
    )

    if bool(re.match(r".*boring.*wozniak.*", name)):  # Steve Wozniak is not boring
        name = get_name(obj, length, retry)

    if retry > 0:
        name = "%s%s%d" % (name, separator, randint(0, 10))

    if capitalize:
        name = name.upper()

    return name
Example #13
 def _generate_input_csv(self, cands, base_dir):
     if not gfile.Exists(base_dir):
         gfile.MakeDirs(base_dir)
     fpaths = []
     random.shuffle(cands)
     csv_writers = []
     partition_num = self._data_source_l.data_source_meta.partition_num
     for partition_id in range(partition_num):
         fpath = os.path.join(base_dir, str(partition_id) + '.rd')
         fpaths.append(fpath)
         csv_writers.append(csv_dict_writer.CsvDictWriter(fpath))
     for item in cands:
         partition_id = CityHash32(item) % partition_num
         raw = OrderedDict()
         raw['raw_id'] = item
         raw['feat_0'] = str((partition_id << 30) + 0) + item
         raw['feat_1'] = str((partition_id << 30) + 1) + item
         raw['feat_2'] = str((partition_id << 30) + 2) + item
         csv_writers[partition_id].write(raw)
     for csv_writer in csv_writers:
         csv_writer.close()
     return fpaths
Example #14
     all_fpaths += [os.path.join(args.input_dir, f)
                    for f in gfile.ListDirectory(args.input_dir)]
 if args.input_file_wildcard is not None and \
         len(args.input_file_wildcard) > 0:
     all_fpaths = [fpath for fpath in all_fpaths
                   if fnmatch(fpath, args.input_file_wildcard)]
 if len(all_fpaths) == 0:
     raise RuntimeError("no input files for partitioner")
 all_fpaths = list(set(all_fpaths))
 all_fpaths.sort()
 partitioner_num = args.total_partitioner_num
 if partitioner_num > 1:
     origin_file_num = len(all_fpaths)
     all_fpaths = \
         [fpath for fpath in all_fpaths
          if CityHash32(os.path.basename(fpath)) % partitioner_num == \
                  args.partitioner_rank_id]
     logging.info("Partitioner of rank id %d will process %d/%d "\
                  "input files", args.partitioner_rank_id,
                  len(all_fpaths), origin_file_num)
 partitioner_options = dj_pb.RawDataPartitionerOptions(
         partitioner_name=args.partitioner_name,
         input_file_paths=all_fpaths,
         output_dir=args.output_dir,
         output_partition_num=args.output_partition_num,
         raw_data_options=dj_pb.RawDataOptions(
             raw_data_iter=args.raw_data_iter,
             compressed_type=args.compressed_type,
             read_ahead_size=args.read_ahead_size,
             read_batch_size=args.read_batch_size
         ),
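The snippet above shards input files across partitioner ranks by hashing each file's basename, so every file is handled by exactly one rank. A small illustrative sketch of that rule (the helper name is hypothetical):

import os
from cityhash import CityHash32

def files_for_rank(all_fpaths, rank_id, total_ranks):
    # Deterministic split: each file hashes to exactly one rank, and the
    # ranks together cover the whole (deduplicated, sorted) file list.
    return [fpath for fpath in sorted(set(all_fpaths))
            if CityHash32(os.path.basename(fpath)) % total_ranks == rank_id]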
Example #15
 def test_string_unicode_32(self):
     """Empty Python string has same hash value as empty Unicode string
     """
     self.assertEqual(CityHash32(""), CityHash32(u""))
Example #16
 def test_consistent_encoding_32(self):
     """ASCII-range Unicode strings have the same hash values as ASCII strings
     """
     text = u"abracadabra"
     self.assertEqual(CityHash32(text), CityHash32(text.encode("utf-8")))
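The same property as a quick standalone check, together with the output range (CityHash32 returns an unsigned 32-bit integer):

from cityhash import CityHash32

h = CityHash32(u"abracadabra")
assert h == CityHash32(u"abracadabra".encode("utf-8"))  # str and UTF-8 bytes agree
assert 0 <= h < 2 ** 32                                 # unsigned 32-bit range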
Example #17
from hashlib import sha256
from time import time

# `num` (how many keys to hash per loop) is assumed to be defined earlier
# in the original benchmark script.
pt = time()
for i in range(num):
    key = b"myval-%d" % i
    int(sha256(b"%s" % key).hexdigest(), 16)
print("sha256 took {} s".format(time() - pt))

# CityHash
try:
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print("CityHash not installed, pip install cityhash")
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash32(b"%s" % key)
    print("CityHash32 took {} s".format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash64(b"%s" % key)
    print("CityHash64 took {} s".format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b"myval-%d" % i
        CityHash128(b"%s" % key)
    print("CityHash128 took {} s".format(time() - pt))
Example #18
 def test_unicode_2_32(self):
     """Accepts Unicode input outside of ASCII range"""
     test_case = u'\u2661'
     self.assertTrue(isinstance(CityHash32(test_case), int))
Example #19
 def test_unicode_1_32(self):
     """Accepts Unicode input"""
     test_case = u"abc"
     self.assertTrue(isinstance(CityHash32(test_case), int))
Example #20
 def __hash__(self):
     return CityHash32(self.data)
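A __hash__ based on CityHash32 is usually paired with an __eq__ over the same data so instances behave consistently in sets and dicts. A minimal sketch (the class name and field are illustrative; self.data is assumed to be bytes):

from cityhash import CityHash32

class Blob:
    def __init__(self, data):
        self.data = data

    def __eq__(self, other):
        return isinstance(other, Blob) and self.data == other.data

    def __hash__(self):
        # Deterministic across processes, unlike built-in hash() on bytes
        # under PYTHONHASHSEED randomization.
        return CityHash32(self.data)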
Example #21
from hashlib import sha256
from time import time

# `num` (how many keys to hash per loop) is assumed to be defined earlier
# in the original benchmark script.
pt = time()
for i in range(num):
    key = b'myval-%d' % i
    int(sha256(b'%s' % key).hexdigest(), 16)
print('sha256 took {} s'.format(time() - pt))

# CityHash
try:
    from cityhash import CityHash32, CityHash64, CityHash128
except ImportError:
    print('CityHash not installed, pip install cityhash')
else:
    # cityhash32
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash32(b'%s' % key)
    print('CityHash32 took {} s'.format(time() - pt))

    # cityhash64
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash64(b'%s' % key)
    print('CityHash64 took {} s'.format(time() - pt))

    # cityhash128
    pt = time()
    for i in range(num):
        key = b'myval-%d' % i
        CityHash128(b'%s' % key)
    print('CityHash128 took {} s'.format(time() - pt))