def check_index_recon(embeds_path, index_or_index_path, embeds_format='labeled_numpy', sort=True, **kwargs):
    index = faiss.read_index(index_or_index_path) if isinstance(index_or_index_path, str) else index_or_index_path
    faiss.downcast_index(index).make_direct_map()
    embeds_list, _ = load_embeds(embeds_path=embeds_path, format=embeds_format, sort=sort, **kwargs)

    tic("Checking embedding reconstruction difference ...")
    all_embeds = np.concatenate(embeds_list)
    all_embeds_recon = index.reconstruct_n(0, len(all_embeds))
    embeds_diff = np.linalg.norm(all_embeds - all_embeds_recon)
    if embeds_diff == 0:
        toc("Passed embedding reconstruction difference check.")
    else:
        toc(f"Embedding reconstruction difference: {embeds_diff}.")
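
# Usage sketch (not part of the original module; the paths below are hypothetical). It assumes an
# index has already been written for the embedding batches stored under `embeds_dir`, with the
# batches in the same sorted order used at build time:
#
#     check_index_recon(embeds_path='embeds_dir', index_or_index_path='embeds.idx')
#
# For an IVF,Flat index the stored vectors are exact, so the reported difference should be 0;
# compressed index types (e.g. IVF,PQ) would normally report a small non-zero difference.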
def parallel_process_by_queue(num_p, data_iter, target, args, ctx: BaseContext = None, task_unit_size=5000, print_out=__debug__):
    if isinstance(target, MPTarget):
        target.use_queue = True
    if ctx is None:
        ctx = get_context('spawn')
    iq = Queue(ctx=ctx)
    oq = ctx.Manager().Queue()  # a managed queue proxy for collecting worker output

    tic(f"Creating input queue with task unit size {task_unit_size}", verbose=print_out)
    cnt_task_unit = 0
    for item in tqdm(slices__(data_iter, task_unit_size)):
        iq.put(item)
        cnt_task_unit += 1
    jobs = [None] * num_p
    for i in range(num_p):
        jobs[i] = ctx.Process(target=target, args=(i, iq, oq) + args)
    toc()

    tic(f"Working on {cnt_task_unit} task units with {num_p} processes", verbose=print_out)
    start_and_wait_jobs(jobs)
    out = []
    while not oq.empty():
        out.append(oq.get_nowait())
    toc()
    return out
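
# Usage sketch (not from the original source; `square_worker` is a hypothetical worker that
# follows the (pid, input_queue, output_queue, *args) calling convention used above and drains
# the input queue until it is empty). With the 'spawn' start method the worker must be defined
# at module level so it can be pickled:
#
#     import queue
#
#     def square_worker(pid, iq, oq, offset):
#         while True:
#             try:
#                 chunk = iq.get_nowait()
#             except queue.Empty:
#                 break
#             oq.put([x * x + offset for x in chunk])
#
#     results = parallel_process_by_queue(num_p=4, data_iter=range(100000), target=square_worker, args=(1,))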
def dispatch_data(num_p: int, data_iter: Union[Iterator, Iterable, List], args: Tuple, print_out=__debug__):
    if num_p <= 0:
        raise ValueError(f"The number of processes specified in `num_p` must be positive, but it is {num_p}.")
    tic("Splitting task", verbose=print_out)
    splits = split_iter(it=data_iter, num_splits=num_p, use_tqdm=print_out)
    toc(print_out=print_out)
    num_p = len(splits)
    if num_p == 0:
        raise ValueError("The number of data splits is zero. Possibly no data was read from the provided iterator.")
    job_args = [None] * num_p
    for pidx in range(num_p):
        if print_out:
            hprint_pairs(('pid', pidx), ('workload', len(splits[pidx])))
        job_args[pidx] = (pidx, splits[pidx]) + args
    return job_args
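
# Usage sketch (not from the original source; 'extra_arg' is a hypothetical shared argument).
# Each returned tuple has the form (pid, data_split, *args) and can be handed to a per-process
# worker, e.g. via `ctx.Process(target=worker, args=job_args[pid])`:
#
#     job_args = dispatch_data(num_p=4, data_iter=range(1000), args=('extra_arg',))
#     for pid, split, extra in job_args:
#         print(pid, len(split), extra)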
def iter_feature_data(csv_file_path, num_meta_data_fields=0, num_label_fields=1, use_tqdm=True, disp_msg=None, verbose=__debug__, fields_as_list=True, parse_labels_as_ints=False, parse_feats_as_floats=False, parse=False, replace_nan=None, num_p=1):
    """
    NOTE: this is a multi-processing wrapper around the private `_iter_feature_data` function,
    which performs the actual CSV-based feature data reading.
    """
    if num_p <= 1:
        return _iter_feature_data(csv_file_path=csv_file_path,
                                  num_meta_data_fields=num_meta_data_fields,
                                  num_label_fields=num_label_fields,
                                  use_tqdm=use_tqdm,
                                  disp_msg=disp_msg,
                                  verbose=verbose,
                                  fields_as_list=fields_as_list,
                                  parse_labels_as_ints=parse_labels_as_ints,
                                  parse_feats_as_floats=parse_feats_as_floats,
                                  parse=parse,
                                  replace_nan=replace_nan)
    else:
        import utix.mpex as mpex
        timex.tic(f"Loading L1 feature file at {csv_file_path} with multi-processing")
        rst = mpex.mp_read_from_files(
            num_p=num_p,
            input_path=csv_file_path,
            target=mpex.MPTarget(target=partial(_iter_feature_data,
                                                num_meta_data_fields=num_meta_data_fields,
                                                num_label_fields=num_label_fields,
                                                use_tqdm=use_tqdm,
                                                disp_msg=disp_msg,
                                                verbose=verbose,
                                                fields_as_list=fields_as_list,
                                                parse_labels_as_ints=parse_labels_as_ints,
                                                parse_feats_as_floats=parse_feats_as_floats,
                                                parse=parse,
                                                replace_nan=replace_nan),
                                 pass_pid=False,
                                 pass_each=True,
                                 is_target_iter=True),
            result_merge='chain')
        timex.toc()
        return rst
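
# Usage sketch (not from the original source; 'features.csv' is a hypothetical path). With
# num_p=1 the call simply delegates to `_iter_feature_data`; with num_p>1 it returns the
# chained results gathered from the worker processes:
#
#     rows = iter_feature_data('features.csv', num_meta_data_fields=1,
#                              parse_labels_as_ints=True, parse_feats_as_floats=True, num_p=4)
#     for row in rows:
#         ...  # the per-row layout is determined by `_iter_feature_data`, which is not shown here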
def load_embeds(embeds_path, format='labeled_numpy', read_embeds=True, read_labels=True, use_tqdm: bool = True, tqdm_msg: str = None, sort=True, **kwargs):
    if tqdm_msg is None:
        if read_embeds and read_labels:
            tqdm_msg = f'loading embeds with labels at {embeds_path}'
        elif read_embeds:
            tqdm_msg = f'loading embeds at {embeds_path}'
        elif read_labels:
            tqdm_msg = f'loading labels at {embeds_path}'
        else:
            return
    embeds_it = iter_embeds(embeds_path=embeds_path, format=format, read_embeds=read_embeds, read_labels=read_labels, use_tqdm=use_tqdm, tqdm_msg=tqdm_msg, sort=sort, **kwargs)

    tic('Load embeddings ...')
    if format == 'labeled_numpy':
        output = list(embeds_it)
        if read_embeds and read_labels:
            embeds_list, labels_list = gx.unzip(output)
            gx.hprint_message(f"Total number of embedding batches at {embeds_path} to index", len(embeds_list))
            output = (embeds_list, labels_list)
        elif read_embeds or read_labels:
            gx.hprint_message(f"Total number of embedding batches at {embeds_path} to index", len(output))
    else:
        raise NotImplementedError('the embedding file format is not supported')
    toc(msg='Done!')
    return output
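
# Usage sketch (not from the original source; 'embeds_dir' is a hypothetical path). For the
# 'labeled_numpy' format with both read flags enabled, the function returns a pair of lists:
# embedding batches (numpy arrays) and the corresponding label batches:
#
#     embeds_list, labels_list = load_embeds(embeds_path='embeds_dir', format='labeled_numpy')
#     total_embeds = sum(batch.shape[0] for batch in embeds_list)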
from functools import partial

import utix.mpex as mpex
import utix.ioex as ioex
import utix.pathex as paex
import utix.strex as strex
import utix.timex as timex

if __name__ == '__main__':
    src = r'E:\Data\dfsv1f\source_data\main_data\features_2020223\slot_value_features.json'
    trg = r'./tmp1.txt'

    timex.tic('with mp iter')
    it = mpex.mp_read(data_iter=[src],
                      provider=mpex.MPProvider(create_iterator=partial(ioex.iter_all_lines_from_all_files, use_tqdm=True), chunk_size=1000),
                      producer=mpex.MPTarget(target=strex.hash_str, pass_each=True, pass_pid=False))
    hashes1 = list(it)
    timex.toc()

    timex.tic('no mp iter')
    hashes2 = [strex.hash_str(x) for x in ioex.iter_all_lines_from_all_files(src)]
    timex.toc()

    # `list.sort()` returns None, so comparing its return values would always print True;
    # compare sorted copies instead.
    print(sorted(hashes1) == sorted(hashes2))
def build_index(embeds_path, output_path, num_clusters=65536, use_gpu=False, train_ratio=1.0, embeds_format='labeled_numpy', sort=True, **kwargs):
    embeds_list, _ = load_embeds(embeds_path=embeds_path, format=embeds_format, sort=sort, **kwargs)

    tic('Initializing index ...')
    if not num_clusters:
        num_clusters = len(embeds_list) // 100
    index = faiss.index_factory(embeds_list[0].shape[-1], f"IVF{num_clusters},Flat", faiss.METRIC_INNER_PRODUCT)
    if use_gpu:
        index = faiss.index_cpu_to_all_gpus(index)

    tic('Concatenating embeddings ...')
    if 0 < train_ratio < 1:
        gx.hprint_message(f"will sample subset for training with ratio {train_ratio}...")
    all_embeds = np.concatenate(embeds_list if train_ratio == 1 else list(gx.sampled_iter(embeds_list, train_ratio)))
    toc(msg='Initialization done!')

    tic(f'Training embeddings of shape {all_embeds.shape} ...')
    index.train(all_embeds)
    if use_gpu:
        index = faiss.index_gpu_to_cpu(index)
    toc(msg='Index training done!')

    tic('Adding embeddings to index ...')
    del all_embeds
    embed_index_start = 0
    for embeds in tqdm(embeds_list):
        embed_count = embeds.shape[0]
        index.add_with_ids(embeds, np.arange(embed_index_start, embed_index_start + embed_count))
        embed_index_start += embed_count

    if path.exists(output_path):
        os.remove(output_path)
    gx.hprint_message('saving indexed embeddings to', output_path)
    faiss.write_index(index, output_path)
    toc(msg='Indexing done!')
    return index
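
# Usage sketch (not from the original source; paths, cluster count and `query_embeds` are
# hypothetical). The IDs added to the index are the running positions of the embeddings across
# the sorted batches, which is what `check_index_recon` above relies on:
#
#     index = build_index(embeds_path='embeds_dir', output_path='embeds.idx',
#                         num_clusters=4096, train_ratio=0.25)
#     scores, ids = index.search(query_embeds, 10)  # `query_embeds`: float32 array of query vectors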
def train_test_val_split_for_files(file_paths: List, train_test_val_ratios: Tuple[float, float, float], output_path: Union[str, Tuple[str, str, str]], copy_files=True, overwrite=False, sort=False, shuffle=True, rnd_seed=-1, verbose=__debug__, num_p=1):
    if verbose:
        tic(f"Splitting {len(file_paths)} files into train/test/val sets with split ratios {train_test_val_ratios}", newline=True)
    if len(train_test_val_ratios) != 3:
        raise ValueError(f"must specify three ratios for the train/test/validation set splits; got {len(train_test_val_ratios)} ratios '{','.join(str(x) for x in train_test_val_ratios)}'")
    if sort:
        file_paths.sort()
    elif shuffle:
        with numpy_local_seed(rnd_seed) as _:
            if rnd_seed >= 0:
                file_paths.sort()  # NOTE: reproducibility requires this sort
            np.random.shuffle(file_paths)

    if isinstance(output_path, str):
        train_dir = path.join(output_path, 'train')
        test_dir = path.join(output_path, 'test')
        val_dir = path.join(output_path, 'val')
    elif len(output_path) == 3:
        train_dir, test_dir, val_dir = output_path
    else:
        raise ValueError(msg_invalid_arg_value(arg_val=output_path, arg_name='output_path'))

    ensure_sum_to_one_arg(arg_val=train_test_val_ratios, arg_name='train_test_val_ratios', warning=True)
    paex.ensure_dir_existence(train_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(test_dir, clear_dir=overwrite, verbose=verbose)
    paex.ensure_dir_existence(val_dir, clear_dir=overwrite, verbose=verbose)

    splits = split_list_by_ratios(list_to_split=file_paths, split_ratios=train_test_val_ratios, check_ratio_sum_to_one=False)
    for cur_path_list, cur_output_dir in zip(splits, (train_dir, test_dir, val_dir)):
        if copy_files:
            batch_copy(src_paths=cur_path_list,
                       dst_dir=cur_output_dir,
                       solve_conflict=True,
                       use_tqdm=verbose,
                       tqdm_msg=f"copy files to {path.basename(cur_output_dir)}" if verbose else None,
                       num_p=num_p)
        else:
            batch_move(src_paths=cur_path_list,
                       dst_dir=cur_output_dir,
                       solve_conflict=True,
                       undo_move_on_failure=verbose,
                       use_tqdm=True,
                       tqdm_msg=f"move files to {path.basename(cur_output_dir)}" if verbose else None)
    if verbose:
        toc()
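
# Usage sketch (not from the original source; the directory layout is hypothetical). An 80/10/10
# split that shuffles reproducibly (a non-negative `rnd_seed` first sorts the paths, then shuffles
# under a local numpy seed) and copies the files into <output_path>/train, /test and /val:
#
#     files = paex.get_files_by_pattern(dir_or_dirs='data/shards', pattern='*.tsv', full_path=True)
#     train_test_val_split_for_files(files, (0.8, 0.1, 0.1), output_path='data/split',
#                                    shuffle=True, rnd_seed=0, num_p=4)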
def get_mp_cache_files(num_p, file_paths, sort=True, verbose=__debug__, cache_dir_path=None, chunk_size=100000, sort_use_basename=False, rebuild_on_change=True):
    if isinstance(file_paths, str):
        file_paths = [file_paths]
    else:
        file_paths = paex.sort_paths(file_paths, sort=sort, sort_by_basename=sort_use_basename)
    num_file_paths = len(file_paths)
    if verbose:
        hprint_pairs(('number of files', num_file_paths), ('num_p', num_p))

    if num_file_paths < num_p:
        if cache_dir_path is None:
            if len(file_paths) == 1:
                cache_dir_path = paex.add_to_main_name(file_paths[0], prefix='.mp.')
            else:
                cache_dir_path = path.join(path.dirname(file_paths[0]), '.mp')
        cache_file_ext_name = paex.get_ext_name(file_paths[0])
        tic('Constructing multi-processing cache files at path ' + path.join(cache_dir_path, '*' + cache_file_ext_name))
        mp_cache_file_paths = None
        files_id_path = cache_dir_path + '.id'
        if path.exists(cache_dir_path):
            if path.exists(files_id_path):
                old_files_id = ioex.read_all_text(files_id_path).strip()
                # the file paths are already sorted above, so the files id stays the same for the same set of unchanged files
                new_files_id = ioex.get_files_id(file_paths)
                # NOTE: `rebuild_on_change` is currently not consulted; a changed files id always triggers a rebuild
                if new_files_id != old_files_id:
                    hprint_message('Files are changed; rebuilding cache at', cache_dir_path)
                    import shutil, os
                    shutil.rmtree(cache_dir_path)  # removes the cached files
                    os.remove(files_id_path)  # removes the id file
                else:
                    mp_cache_file_paths = paex.get_files_by_pattern(dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename)
                    if not mp_cache_file_paths:
                        wprint_message('Cache directory exists, but nothing there', cache_dir_path)
            else:
                hprint_message('Files id does not exist; rebuilding cache at', cache_dir_path)
                import shutil
                shutil.rmtree(cache_dir_path)  # removes the cached files
        if not mp_cache_file_paths:
            ioex.write_all_text(ioex.get_files_id(file_paths), files_id_path)
            ioex.write_all_lines(iterable=ioex.iter_all_lines_from_all_files(file_paths), output_path=cache_dir_path, create_dir=True, chunk_size=chunk_size, chunked_file_ext_name=cache_file_ext_name)
            mp_cache_file_paths = paex.get_files_by_pattern(dir_or_dirs=cache_dir_path, pattern='*' + cache_file_ext_name, full_path=True, recursive=False, sort=sort, sort_use_basename=sort_use_basename)
        if mp_cache_file_paths:
            hprint_message(title='number of multi-processing cache files', content=len(mp_cache_file_paths))
        else:
            raise IOError('multi-processing cache files are not found')
        file_paths = mp_cache_file_paths
        num_p = min(num_p, len(file_paths))
        toc('Done!')
    return num_p, file_paths
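
# Usage sketch (not from the original source; the input path is hypothetical). When a single
# large file is passed with more workers than files, its lines are re-chunked into a cache
# directory whose name is derived from the file name, and the (possibly reduced) worker count
# plus the cache file paths are returned; those paths can then be split across processes:
#
#     num_p, cache_paths = get_mp_cache_files(num_p=8, file_paths='data/big_corpus.tsv')
#     job_args = dispatch_data(num_p=num_p, data_iter=cache_paths, args=())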