def __init__(self, partition_id, fpath):
    """Create a per-partition CSV writer that buffers into a temp file.

    Args:
        partition_id: id of the partition this writer serves.
        fpath: intended final path for the output file.
    """
    self._partition_id = partition_id
    self._fpath = fpath
    # Output goes to a temp path first; presumably it is renamed to
    # self._fpath when the file is finished — TODO confirm against the
    # rest of the class (not visible in this chunk).
    self._tmp_fpath = self._get_tmp_fpath()
    self._csv_dict_writer = csv_dict_writer.CsvDictWriter(
        self._tmp_fpath
    )
def _get_csv_dict_writer(self):
    """Return the writer for this merged sort run, creating it on first use.

    On the first call, derives the output file name from the partition id
    and process index, opens a CsvDictWriter on it, and records the path
    in self._merged_fpaths so callers can enumerate every merged output.
    Subsequent calls return the cached writer.
    """
    if self._csv_dict_writer is not None:
        return self._csv_dict_writer
    merged_fname = common.encode_merged_sort_run_fname(
        self._partition_id, self._process_index)
    merged_fpath = os.path.join(self._merged_dir, merged_fname)
    writer = csv_dict_writer.CsvDictWriter(merged_fpath)
    self._merged_fpaths.append(merged_fpath)
    self._csv_dict_writer = writer
    return writer
def __init__(self, process_index, output_dir):
    """Create a writer for one process, buffering output in a temp file.

    Args:
        process_index: index of the worker process this writer belongs to.
        output_dir: directory the finished file will eventually live in.
    """
    self._process_index = process_index
    self._output_dir = output_dir
    self._tmp_fpath = self._gen_tmp_fpath()
    # Final path is unknown until the index range is fixed — TODO confirm
    # where _fpath is assigned (not visible in this chunk).
    self._fpath = None
    self._csv_writer = csv_dict_writer.CsvDictWriter(self._tmp_fpath)
    # Range of item indexes covered by this file; presumably set as rows
    # are appended — verify against the rest of the class.
    self._start_index = None
    self._end_index = None
def _generate_input_csv(self, cands, base_dir):
    """Shard *cands* into per-partition CSV files under *base_dir*.

    Each candidate is routed to a partition by ``CityHash32(item) %
    partition_num`` and written as a row containing its raw_id plus three
    derived feature columns.

    Args:
        cands: iterable of candidate id strings. Fixed: the original
            shuffled the caller's list in place; we now shuffle a private
            copy so the argument is never mutated.
        base_dir: directory in which the per-partition '<id>.rd' files
            are created (made if it does not exist).

    Returns:
        List of the written file paths, indexed by partition id.
    """
    if not gfile.Exists(base_dir):
        gfile.MakeDirs(base_dir)
    # Bug fix: copy before shuffling so the caller's list is untouched.
    cands = list(cands)
    random.shuffle(cands)
    partition_num = self._data_source_l.data_source_meta.partition_num
    fpaths = []
    csv_writers = []
    for partition_id in range(partition_num):
        fpath = os.path.join(base_dir, str(partition_id) + '.rd')
        fpaths.append(fpath)
        csv_writers.append(csv_dict_writer.CsvDictWriter(fpath))
    try:
        for item in cands:
            partition_id = CityHash32(item) % partition_num
            raw = OrderedDict()
            raw['raw_id'] = item
            raw['feat_0'] = str((partition_id << 30) + 0) + item
            raw['feat_1'] = str((partition_id << 30) + 1) + item
            raw['feat_2'] = str((partition_id << 30) + 2) + item
            csv_writers[partition_id].write(raw)
    finally:
        # Robustness: close every writer even if a write raises, so
        # partially written files are flushed and handles released.
        for csv_writer in csv_writers:
            csv_writer.close()
    return fpaths
def _get_csv_dict_writer(self):
    """Lazily open and return a CsvDictWriter backed by a fresh temp file.

    The first call asks ``common.gen_tmp_fpath`` for a temp path inside
    the merged directory, records it on the instance, and opens the
    writer; later calls return the cached writer unchanged.
    """
    if self._csv_dict_writer is not None:
        return self._csv_dict_writer
    tmp_fpath = common.gen_tmp_fpath(self._merged_dir)
    self._tmp_fpath = tmp_fpath
    self._csv_dict_writer = csv_dict_writer.CsvDictWriter(tmp_fpath)
    return self._csv_dict_writer