def publish_raw_data(self, partition_id, fpaths, timestamps=None):
     """Publish raw data files for a partition as consecutive pub entries.

     Every path in `fpaths` is checked for existence and wrapped in a
     `RawDatePub` message before anything is written, so a missing file
     aborts the call with nothing published. Each message is then written
     with `self._etcd.cas(key, None, data)`; while that call returns a
     falsy value the pub index is re-resolved through
     `_forward_pub_index` and the write is retried.

     Args:
         partition_id: id of the partition the files belong to.
         fpaths: paths of the raw data files. An empty (or None) value
             logs a warning and returns without publishing.
         timestamps: optional per-file timestamp messages merged into the
             matching `raw_data_meta.timestamp`; when given it must have
             the same length as `fpaths`.

     Raises:
         RuntimeError: `timestamps` is given but its length differs from
             that of `fpaths`.
         ValueError: one of the paths does not exist.
     """
     if not fpaths:
         logging.warning("no raw data will be published")
         return
     if timestamps is not None and len(fpaths) != len(timestamps):
         raise RuntimeError("the number of raw data file "
                            "and timestamp mismatch")
     # Build (and thereby validate) every record up front so that a bad
     # path cannot leave a partially published batch behind.
     new_raw_data_pubs = []
     for index, fpath in enumerate(fpaths):
         if not gfile.Exists(fpath):
             raise ValueError('{} is not existed'.format(fpath))
         raw_data_pub = dj_pb.RawDatePub(raw_data_meta=dj_pb.RawDataMeta(
             file_path=fpath, start_index=-1))
         if timestamps is not None:
             raw_data_pub.raw_data_meta.timestamp.MergeFrom(
                 timestamps[index])
         new_raw_data_pubs.append(raw_data_pub)
     next_pub_index = None
     for raw_data_pub in new_raw_data_pubs:
         data = text_format.MessageToString(raw_data_pub)
         # Retry until this record is accepted at some index.
         # NOTE(review): presumably cas(key, None, data) only succeeds
         # when the key is still absent -- confirm against the etcd client.
         while True:
             next_pub_index = self._forward_pub_index(partition_id,
                                                      next_pub_index)
             etcd_key = common.raw_data_pub_etcd_key(
                 self._raw_data_pub_dir, partition_id, next_pub_index)
             if self._etcd.cas(etcd_key, None, data):
                 logging.info("Success publish %s at index %d for "
                              "partition %d",
                              data, next_pub_index, partition_id)
                 next_pub_index += 1
                 break
 def _try_to_sub_raw_data(self, partition_id):
     """Pull newly published raw data records for a partition.

     Under `self._lock`, reads pub entries starting at the manifest's
     `next_raw_data_sub_index` until a key has no data, a finish tag is
     met, or an unrecognized record is found. The collected raw data
     metas are stored, and the manifest is re-synced and updated with the
     new subscribe index and finished flag.

     Does nothing when the manifest is already marked finished.

     Args:
         partition_id: id of the partition to subscribe for.
     """
     with self._lock:
         manifest = self._sync_manifest(partition_id)
         if manifest.finished:
             return
         next_sub_index = manifest.next_raw_data_sub_index
         add_candidates = []
         raw_data_finished = False
         while True:
             etcd_key = common.raw_data_pub_etcd_key(
                 self._raw_data_sub_dir, partition_id, next_sub_index)
             pub_data = self._etcd.get_data(etcd_key)
             if pub_data is None:
                 # Nothing published at this index yet.
                 break
             raw_data_pub = text_format.Parse(pub_data, dj_pb.RawDatePub())
             if raw_data_pub.HasField('raw_data_meta'):
                 add_candidates.append(raw_data_pub.raw_data_meta)
                 next_sub_index += 1
             elif raw_data_pub.HasField('raw_data_finished'):
                 logging.warning("meet finish pub at pub index %d for "
                                 "partition %d",
                                 next_sub_index, partition_id)
                 raw_data_finished = True
                 break
             else:
                 # A record with neither field set would make this loop
                 # fetch the same key forever while holding the lock.
                 # Stop here; the index is left pointing at the record so
                 # it is looked at again on the next call.
                 logging.error("unrecognized pub item at pub index %d "
                               "for partition %d, stop subscribing",
                               next_sub_index, partition_id)
                 break
         self._store_raw_data_metas(partition_id, add_candidates)
         new_manifest = self._sync_manifest(partition_id)
         new_manifest.next_raw_data_sub_index = next_sub_index
         new_manifest.finished = raw_data_finished
         self._update_manifest(new_manifest)
 def _try_to_sub_raw_data(self, partition_id):
     """Pull newly published raw data records for a partition.

     Reads pub entries starting at the manifest's
     `next_raw_data_sub_index` until a key has no data, a finish tag is
     met, or an unrecognized record is found. The collected raw data
     metas are stored, and the manifest is re-synced and updated with the
     new subscribe index and finished flag.

     Args:
         partition_id: id of the partition to subscribe for.

     Returns:
         The number of pub entries consumed by this call; 0 when the
         manifest is already finished or no subscribe directory is set.
     """
     manifest = self._sync_manifest(partition_id)
     if manifest.finished or not self._raw_data_sub_dir:
         return 0
     prev_next_sub_index = manifest.next_raw_data_sub_index
     next_sub_index = prev_next_sub_index
     add_candidates = []
     raw_data_finished = False
     while True:
         kvstore_key = common.raw_data_pub_kvstore_key(
             self._raw_data_sub_dir, partition_id, next_sub_index)
         pub_data = self._kvstore.get_data(kvstore_key)
         if pub_data is None:
             # Nothing published at this index yet.
             break
         raw_data_pub = text_format.Parse(pub_data, dj_pb.RawDatePub(),
                                          allow_unknown_field=True)
         if raw_data_pub.HasField('raw_data_meta'):
             add_candidates.append(raw_data_pub.raw_data_meta)
             next_sub_index += 1
         elif raw_data_pub.HasField('raw_data_finished'):
             logging.warning("meet finish pub at pub index %d for "
                             "partition %d",
                             next_sub_index, partition_id)
             raw_data_finished = True
             break
         else:
             # Unknown fields are tolerated by the parser, so a record can
             # come back with neither known field set. Without this branch
             # the loop would fetch the same key forever. Stop here; the
             # index is left pointing at the record for the next call.
             logging.error("unrecognized pub item at pub index %d "
                           "for partition %d, stop subscribing",
                           next_sub_index, partition_id)
             break
     self._store_raw_data_metas(partition_id, add_candidates)
     new_manifest = self._sync_manifest(partition_id)
     new_manifest.next_raw_data_sub_index = next_sub_index
     new_manifest.finished = raw_data_finished
     self._update_manifest(new_manifest)
     return next_sub_index - prev_next_sub_index
Exemple #4
0
 def _check_finish_tag(self, partition_id, last_index):
     """Tell whether the pub entry at `last_index` is a finish tag.

     Args:
         partition_id: id of the partition whose pub entries are checked.
         last_index: pub index to inspect; a negative value means there
             is no entry to look at.

     Returns:
         True when data exists at that index and the parsed `RawDatePub`
         has `raw_data_finished` set; False otherwise.
     """
     if last_index < 0:
         return False
     key = common.raw_data_pub_etcd_key(self._raw_data_pub_dir,
                                        partition_id, last_index)
     raw = self._etcd.get_data(key)
     if raw is None:
         return False
     parsed = text_format.Parse(raw, dj_pb.RawDatePub())
     return parsed.HasField('raw_data_finished')
 def _check_finish_tag(self, partition_id, last_index):
     """Tell whether the pub entry at `last_index` is a finish tag.

     Args:
         partition_id: id of the partition whose pub entries are checked.
         last_index: pub index to inspect; a negative value means there
             is no entry to look at.

     Returns:
         True when data exists at that index and the parsed `RawDatePub`
         (unknown fields tolerated) has `raw_data_finished` set; False
         otherwise.
     """
     if last_index < 0:
         return False
     key = common.raw_data_pub_kvstore_key(
         self._raw_data_pub_dir, partition_id, last_index)
     raw = self._kvstore.get_data(key)
     if raw is None:
         return False
     parsed = text_format.Parse(raw, dj_pb.RawDatePub(),
                                allow_unknown_field=True)
     return parsed.HasField('raw_data_finished')
 def finish_raw_data(self, partition_id):
     """Publish a finish tag for a partition.

     Serializes a `RawDatePub` whose `raw_data_finished` field is set and
     writes it with `self._etcd.cas(key, None, data)` at the index given
     by `_forward_pub_index`, retrying with a freshly resolved index until
     the write is accepted.

     Args:
         partition_id: id of the partition to mark as finished.
     """
     data = text_format.MessageToString(
         dj_pb.RawDatePub(raw_data_finished=empty_pb2.Empty()))
     next_pub_index = None
     while True:
         next_pub_index = self._forward_pub_index(partition_id,
                                                  next_pub_index)
         etcd_key = common.raw_data_pub_etcd_key(self._raw_data_pub_dir,
                                                 partition_id,
                                                 next_pub_index)
         if self._etcd.cas(etcd_key, None, data):
             # The trailing space matters: adjacent literals are joined
             # verbatim, so without it the log reads "partition3".
             logging.info("Success finish raw data for partition "
                          "%d", partition_id)
             break
 def finish_raw_data(self, partition_id):
     """Publish a finish tag for a partition unless one already exists.

     Serializes a `RawDatePub` whose `raw_data_finished` field is set and
     tries to write it with `self._kvstore.cas(key, None, data)` at the
     index given by `_forward_pub_index`. Before each attempt the entry
     just before that index is checked; if it is already a finish tag the
     call logs a warning and returns without writing. A rejected write is
     retried with a freshly resolved index.

     Args:
         partition_id: id of the partition to mark as finished.
     """
     data = text_format.MessageToString(
         dj_pb.RawDatePub(raw_data_finished=empty_pb2.Empty()))
     next_pub_index = None
     while True:
         next_pub_index = self._forward_pub_index(partition_id,
                                                  next_pub_index)
         if self._check_finish_tag(partition_id, next_pub_index - 1):
             # Trailing spaces matter: adjacent literals are joined
             # verbatim ("finish tagat index" without them).
             logging.warning("partition %d has been published finish tag "
                             "at index %d", partition_id,
                             next_pub_index - 1)
             break
         kvstore_key = common.raw_data_pub_kvstore_key(
             self._raw_data_pub_dir, partition_id, next_pub_index)
         if self._kvstore.cas(kvstore_key, None, data):
             logging.info("Success finish raw data for partition "
                          "%d", partition_id)
             break
 def publish_raw_data(self, partition_id, fpaths, timestamps=None):
     """Publish raw data files for a partition as consecutive pub entries.

     Every path in `fpaths` is checked for existence and wrapped in a
     `RawDatePub` message before anything is written, so a missing file
     aborts the call with nothing published. Each message is then written
     with `self._kvstore.cas(key, None, data)`; while that call returns a
     falsy value the pub index is re-resolved through
     `_forward_pub_index` and the write is retried. Publishing stops as
     soon as the entry before the next index is a finish tag, and every
     file that was not published is then listed in warning logs.

     Args:
         partition_id: id of the partition the files belong to.
         fpaths: paths of the raw data files. An empty (or None) value
             logs a warning and returns without publishing.
         timestamps: optional per-file timestamp messages merged into the
             matching `raw_data_meta.timestamp`; when given it must have
             the same length as `fpaths`.

     Raises:
         RuntimeError: `timestamps` is given but its length differs from
             that of `fpaths`.
         ValueError: one of the paths does not exist.
     """
     if not fpaths:
         logging.warning("no raw data will be published")
         return
     if timestamps is not None and len(fpaths) != len(timestamps):
         raise RuntimeError("the number of raw data file "
                            "and timestamp mismatch")
     # Build (and thereby validate) every record up front so that a bad
     # path cannot leave a partially published batch behind.
     new_raw_data_pubs = []
     for index, fpath in enumerate(fpaths):
         if not gfile.Exists(fpath):
             raise ValueError('{} is not existed'.format(fpath))
         raw_data_pub = dj_pb.RawDatePub(raw_data_meta=dj_pb.RawDataMeta(
             file_path=fpath, start_index=-1))
         if timestamps is not None:
             raw_data_pub.raw_data_meta.timestamp.MergeFrom(
                 timestamps[index])
         new_raw_data_pubs.append(raw_data_pub)
     pub_count = len(new_raw_data_pubs)
     next_pub_index = None
     item_index = 0
     data = text_format.MessageToString(new_raw_data_pubs[item_index])
     while item_index < pub_count:
         next_pub_index = self._forward_pub_index(partition_id,
                                                  next_pub_index)
         if self._check_finish_tag(partition_id, next_pub_index - 1):
             logging.warning("partition %d has been published finish tag "
                             "at index %d", partition_id,
                             next_pub_index - 1)
             break
         kvstore_key = common.raw_data_pub_kvstore_key(
             self._raw_data_pub_dir, partition_id, next_pub_index)
         if self._kvstore.cas(kvstore_key, None, data):
             logging.info("Success publish %s at index %d for "
                          "partition %d",
                          data, next_pub_index, partition_id)
             next_pub_index += 1
             item_index += 1
             if item_index < pub_count:
                 data = text_format.MessageToString(
                     new_raw_data_pubs[item_index])
     # item_index counts the records published so far; anything short of
     # pub_count means the loop was cut off by a finish tag. Comparing
     # against pub_count - 1 would hide the case of exactly one leftover.
     if item_index < pub_count:
         logging.warning("%d files are not published since meet finish "
                         "tag for partition %d. list following",
                         pub_count - item_index, partition_id)
         for idx, pub in enumerate(new_raw_data_pubs[item_index:]):
             logging.warning("%d. %s", idx, pub.raw_data_meta.file_path)