Example #1
0
 def _init_raw_data_reps(self):
     """Load the raw data reps and build the sorted processing sequence.

     Steps, in order:

     1. Load the reps returned by ``_get_etcd_raw_data_rep`` and use
        their sorted file names as the initial processing sequence.
     2. For each path from ``_list_raw_data_dir``, register its base
        name as an ``unindexed`` rep via ``_update_raw_data_rep``,
        unless the name is already known or sorts after the last name
        that was loaded in step 1.
     3. Re-sort the sequence and verify that no rep lacking the
        ``unindexed`` field follows one that has it.

     If the check in step 3 fails, the error is logged and the process
     is terminated with ``os._exit(-1)``.
     """
     # The return value is not used here; the call is kept because it
     # may have side effects -- TODO confirm and drop if it has none.
     self._get_manifest()
     self._raw_data_reps = self._get_etcd_raw_data_rep()
     self._process_sequence = sorted(self._raw_data_reps)
     last_fname = (self._process_sequence[-1]
                   if self._process_sequence else None)
     for fpath in self._list_raw_data_dir():
         fname = ntpath.basename(fpath)
         # Membership is re-checked on every iteration on purpose:
         # _update_raw_data_rep may add entries while we iterate.
         if fname in self._raw_data_reps:
             continue
         if last_fname is not None and fname > last_fname:
             continue
         raw_data_rep = dj_pb.RawDataRep(unindexed=empty_pb2.Empty(),
                                         raw_data_path=fpath)
         self._update_raw_data_rep(fname, raw_data_rep)
         self._process_sequence.append(fname)
     self._process_sequence.sort()
     meet_unindexed = False
     for fname in self._process_sequence:
         rep = self._raw_data_reps[fname]
         assert rep is not None
         if rep.HasField('unindexed'):
             meet_unindexed = True
         elif meet_unindexed:
             # A rep without 'unindexed' after an unindexed one breaks
             # the required ordering; abort the whole process.
             logging.fatal('indexed raw data should be consecutive')
             os._exit(-1)  # pylint: disable=protected-access
Example #2
0
 def index_raw_data_rep(self, index, start_index):
     """Mark the file at position ``index`` as indexed.

     While holding ``self._lock``, every file in the processing
     sequence is synced with ``_sync_raw_data_rep`` and validated;
     afterwards the target file's rep is replaced, through
     ``_update_raw_data_rep``, by one carrying ``index.start_index``.

     Args:
         index: position of the file in ``self._process_sequence``;
             must satisfy ``0 <= index < len(self._process_sequence)``.
         start_index: value stored in the new rep's
             ``index.start_index``.

     Raises:
         IndexError: if ``index`` is negative or past the end of the
             processing sequence.
         RuntimeError: if a file before ``index`` has no ``index``
             field, if the target file or any file after it already
             has one, or if the previous file's ``start_index`` is
             greater than ``start_index``.
     """
     with self._lock:
         # A negative index would never match any position in the loop
         # below and would leave no rep selected, so reject it up front.
         if index < 0 or index >= len(self._process_sequence):
             raise IndexError("index {} is out of range".format(index))
         unindexed_rep = None
         for (idx, fname) in enumerate(self._process_sequence):
             self._sync_raw_data_rep(fname)
             rep = self._raw_data_reps[fname]
             if idx < index:
                 if not rep.HasField('index'):
                     raise RuntimeError(
                         "file process before has not been indexed")
             elif idx == index:
                 unindexed_rep = rep
                 if unindexed_rep.HasField('index'):
                     raise RuntimeError("{} has been indexed".format(fname))
                 if idx > 0:
                     prev_fname = self._process_sequence[idx - 1]
                     prev_rep = self._raw_data_reps[prev_fname]
                     if prev_rep.index.start_index > start_index:
                         raise RuntimeError(
                             "the start index is not incremental")
             elif rep.HasField('index'):
                 raise RuntimeError("file process after has been indexed")
         indexed_rep = dj_pb.RawDataRep()
         indexed_rep.raw_data_path = unindexed_rep.raw_data_path
         indexed_rep.index.start_index = start_index
         fname = self._process_sequence[index]
         self._update_raw_data_rep(fname, indexed_rep)
Example #3
0
 def _get_etcd_raw_data_rep(self):
     """Return a dict of parsed raw data reps keyed by file base name.

     Every (key, data) pair returned by ``self._etcd.get_prefix_kvs``
     for the prefix given by ``_get_manifest_etcd_key`` is parsed from
     protobuf text format into a ``dj_pb.RawDataRep`` and stored under
     the base name of its key.
     """
     prefix = self._get_manifest_etcd_key()
     return {
         ntpath.basename(key): text_format.Parse(payload, dj_pb.RawDataRep())
         for (key, payload) in self._etcd.get_prefix_kvs(prefix)
     }
Example #4
0
 def _sync_raw_data_rep(self, fname):
     """Return the rep for ``fname``, fetching it when the cache is empty.

     ``fname`` must already be a key of ``self._raw_data_reps``. When
     the stored value is ``None``, the rep is read with
     ``self._etcd.get_data`` from the key formed by joining
     ``_get_parition_etcd_key()`` and ``fname`` with '/', parsed from
     protobuf text format, and stored back before being returned.
     """
     assert fname in self._raw_data_reps
     cached = self._raw_data_reps[fname]
     if cached is not None:
         return cached
     etcd_key = '/'.join([self._get_parition_etcd_key(), fname])
     payload = self._etcd.get_data(etcd_key)
     self._raw_data_reps[fname] = text_format.Parse(
         payload, dj_pb.RawDataRep())
     return self._raw_data_reps[fname]