def _init_raw_data_reps(self):
    """Build the in-memory raw data reps and their processing sequence.

    Loads the reps returned by ``_get_etcd_raw_data_rep``, registers an
    unindexed rep (via ``_update_raw_data_rep``) for each file reported by
    ``_list_raw_data_dir`` that is not loaded yet, and finally checks the
    sorted processing sequence: once a rep with the ``unindexed`` field has
    been seen, every later rep must also have it.  A violation is logged
    and the process is terminated with ``os._exit(-1)``.
    """
    # The result is not used here; the call itself is kept because it may
    # have side effects this method relies on.
    self._get_manifest()
    self._raw_data_reps = self._get_etcd_raw_data_rep()
    self._process_sequence = sorted(self._raw_data_reps)
    last_fname = (self._process_sequence[-1]
                  if self._process_sequence else None)
    for fpath in self._list_raw_data_dir():
        fname = ntpath.basename(fpath)
        # NOTE(review): files whose name sorts after the last already-known
        # name are skipped, so only missing names up to that one get
        # registered once any rep exists -- confirm this is intended.
        if (fname in self._raw_data_reps or
                (last_fname is not None and fname > last_fname)):
            continue
        raw_data_rep = dj_pb.RawDataRep(unindexed=empty_pb2.Empty(),
                                        raw_data_path=fpath)
        # Presumably this also stores the rep in self._raw_data_reps; the
        # lookup in the consistency check below depends on that.
        self._update_raw_data_rep(fname, raw_data_rep)
        self._process_sequence.append(fname)
    # Sorting once after all appends yields the same final order as
    # re-sorting after every append.
    self._process_sequence.sort()
    meet_unindexed = False
    for fname in self._process_sequence:
        rep = self._raw_data_reps[fname]
        assert rep is not None
        if rep.HasField('unindexed'):
            meet_unindexed = True
        elif meet_unindexed:
            # A rep without the 'unindexed' field follows an unindexed one.
            logging.fatal('indexed raw data should be consecutive')
            os._exit(-1)  # pylint: disable=protected-access
def index_raw_data_rep(self, index, start_index):
    """Mark the raw data file at position ``index`` as indexed.

    While holding ``self._lock``, every file in the processing sequence is
    synced through ``_sync_raw_data_rep`` and validated: files before
    ``index`` must already carry the ``index`` field, the file at ``index``
    and all files after it must not.  On success a new rep with the same
    ``raw_data_path`` and ``index.start_index = start_index`` is passed to
    ``_update_raw_data_rep``.

    Args:
        index: position of the file in ``self._process_sequence``.
        start_index: start index to record for that file.

    Raises:
        IndexError: if ``index`` is negative or not smaller than the length
            of the processing sequence.
        RuntimeError: if an earlier file is not indexed, the target or a
            later file is already indexed, or the previous file's start
            index is greater than ``start_index``.
    """
    with self._lock:
        # A negative index would match no position in the loop below and
        # surface later as an unrelated error, so reject it up front.
        if not 0 <= index < len(self._process_sequence):
            raise IndexError("index {} is out of range".format(index))
        unindexed_rep = None
        for (idx, fname) in enumerate(self._process_sequence):
            self._sync_raw_data_rep(fname)
            if idx < index:
                if not self._raw_data_reps[fname].HasField('index'):
                    raise RuntimeError(
                        "file process before has not been indexed")
            elif idx == index:
                unindexed_rep = self._raw_data_reps[fname]
                if unindexed_rep.HasField('index'):
                    raise RuntimeError("{} has been indexed".format(fname))
                if idx > 0:
                    # The previous file passed the idx < index check above,
                    # so it carries the 'index' field at this point.
                    prev_fname = self._process_sequence[idx - 1]
                    prev_rep = self._raw_data_reps[prev_fname]
                    if prev_rep.index.start_index > start_index:
                        raise RuntimeError(
                            "the start index is not incremental")
            elif self._raw_data_reps[fname].HasField('index'):
                raise RuntimeError("file process after has been indexed")
        indexed_rep = dj_pb.RawDataRep()
        indexed_rep.raw_data_path = unindexed_rep.raw_data_path
        indexed_rep.index.start_index = start_index
        fname = self._process_sequence[index]
        self._update_raw_data_rep(fname, indexed_rep)
def _get_etcd_raw_data_rep(self):
    """Return the raw data reps found in etcd, keyed by file base name.

    Fetches every key/value pair under the prefix given by
    ``_get_manifest_etcd_key`` and parses each value as a text-format
    ``dj_pb.RawDataRep``.  The dictionary key is the base name of the
    etcd key.
    """
    prefix = self._get_manifest_etcd_key()
    reps_by_name = {}
    for etcd_key, payload in self._etcd.get_prefix_kvs(prefix):
        base_name = ntpath.basename(etcd_key)
        reps_by_name[base_name] = text_format.Parse(payload,
                                                    dj_pb.RawDataRep())
    return reps_by_name
def _sync_raw_data_rep(self, fname):
    """Return the rep for ``fname``, loading it from etcd when not cached.

    ``fname`` must already be a key of ``self._raw_data_reps``.  When the
    stored value is ``None`` the rep is read from etcd under
    ``<_get_parition_etcd_key()>/<fname>``, parsed as a text-format
    ``dj_pb.RawDataRep`` and stored back before being returned.
    """
    assert fname in self._raw_data_reps
    cached = self._raw_data_reps[fname]
    if cached is not None:
        return cached
    # Cache miss: fetch the serialized rep from etcd and remember it.
    etcd_key = '/'.join((self._get_parition_etcd_key(), fname))
    payload = self._etcd.get_data(etcd_key)
    self._raw_data_reps[fname] = text_format.Parse(payload,
                                                   dj_pb.RawDataRep())
    return self._raw_data_reps[fname]