def merge_arrays(array1, array2, result_fields='union'):
    """Merge two structured arrays keyed on their 'id' field.

    Rows/fields present in both arrays are taken from *array2* (data in
    array2 overrides data in array1).

    Parameters
    ----------
    array1, array2 : structured ndarray
        Both must contain an 'id' field.
    result_fields : {'union', 'array1'}
        'union' keeps all fields from both arrays (array1's fields first),
        'array1' keeps only array1's fields.

    Returns
    -------
    (output_array, id_to_rownum) : tuple
        output_array is the merged structured array (one row per distinct
        id); id_to_rownum maps an id to its row index in output_array
        (-1 for absent ids).

    Raises
    ------
    ValueError
        If result_fields is not one of the accepted values.
    """
    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    #TODO: check that common fields have the same type
    if result_fields == 'union':
        names1 = set(array1.dtype.names)
        fields_notin1 = [(name, type_) for name, type_ in fields2
                         if name not in names1]
        output_fields = fields1 + fields_notin1
    elif result_fields == 'array1':
        output_fields = fields1
    else:
        raise ValueError('%s is not a valid value for result_fields argument'
                         % result_fields)

    output_dtype = np.dtype(output_fields)
    ids1 = array1['id']
    ids2 = array2['id']
    all_ids = np.union1d(ids1, ids2)
    # union1d returns a sorted array, so the last element is the maximum
    max_id = all_ids[-1]

    # compute new id_to_rownum: id -> row index in the output, -1 if absent
    id_to_rownum = np.empty(max_id + 1, dtype=int)
    id_to_rownum.fill(-1)
    # vectorized equivalent of:
    #   for rownum, rowid in enumerate(all_ids): id_to_rownum[rowid] = rownum
    id_to_rownum[all_ids] = np.arange(len(all_ids))

    # 1) create resulting array
    ids1_complete = len(ids1) == len(all_ids)
    ids2_complete = len(ids2) == len(all_ids)
    # can we reuse an input array as the output directly?
    output_is_arr1 = array1.dtype == output_dtype and ids1_complete
    output_is_arr2 = array2.dtype == output_dtype and ids2_complete
    # will one input's data cover every output cell (so no fill needed)?
    arr1_complete = set(fields1) >= set(output_fields) and ids1_complete
    arr2_complete = set(fields2) >= set(output_fields) and ids2_complete
    if output_is_arr2:
        output_array = array2
    elif output_is_arr1:
        #TODO: modifying array1 in-place suits our particular needs for now
        # but it should really be a (non-default) option
        output_array = array1
    elif arr1_complete or arr2_complete:
        # every cell will be overwritten below, so no need to pre-fill
        output_array = np.empty(len(all_ids), dtype=output_dtype)
    else:
        output_array = np.empty(len(all_ids), dtype=output_dtype)
        output_array[:] = get_missing_record(output_array)

    # 2) copy data from array1 (if it will not be overridden)
    if not arr2_complete:
        output_array = merge_subset_in_array(output_array, id_to_rownum,
                                             array1, first=True)

    # 3) copy data from array2
    if not output_is_arr2:
        output_array = merge_subset_in_array(output_array, id_to_rownum,
                                             array2)

    return output_array, id_to_rownum
def append_table(input_table, output_table, chunksize=10000, condition=None,
                 stop=None, show_progress=False):
    """Append the rows of *input_table* to *output_table* in chunks.

    If the two tables have different dtypes, each chunk is converted to
    output_table's fields (missing fields filled with "missing" values)
    via add_and_drop_fields before being appended.

    Parameters
    ----------
    input_table, output_table : table-like objects (PyTables-style API:
        .dtype, .read/.readWhere, .append, .flush) -- assumed, confirm
        against callers.
    chunksize : int
        Number of rows copied per iteration; falsy values (0/None) mean
        "copy everything in one chunk".
    condition : str or None
        Optional selection expression passed to input_table.readWhere;
        the chunk boundaries still advance by *chunksize* input rows.
    stop : int or None
        If given, only the first *stop* rows of input_table are copied.
    show_progress : bool
        If True, iterate via loop_wh_progress (progress display).

    Returns
    -------
    output_table (after all chunks have been appended and flushed).
    """
    # only convert records when the layouts actually differ
    if input_table.dtype != output_table.dtype:
        output_fields = get_fields(output_table)
    else:
        output_fields = None

    if stop is None:
        numrows = len(input_table)
    else:
        numrows = stop
    if not chunksize:
        chunksize = numrows

    num_chunks, remainder = divmod(numrows, chunksize)
    if remainder > 0:
        # one extra (shorter) chunk for the leftover rows
        num_chunks += 1

    if output_fields is not None:
        # pre-allocate a conversion buffer, pre-filled with missing values,
        # reused by every full-size chunk below
        expanded_data = np.empty(chunksize, dtype=np.dtype(output_fields))
        expanded_data[:] = get_missing_record(expanded_data)

    #noinspection PyUnusedLocal
    def copy_chunk(chunk_idx, chunk_num):
        # read rows [chunk_start, chunk_stop) from the input
        chunk_start = chunk_num * chunksize
        chunk_stop = min(chunk_start + chunksize, numrows)
        if condition is not None:
            input_data = input_table.readWhere(condition, start=chunk_start,
                                               stop=chunk_stop)
        else:
            input_data = input_table.read(chunk_start, chunk_stop)

        if output_fields is not None:
            # use our pre-allocated buffer (except for the last chunk)
            if len(input_data) == len(expanded_data):
                default_values = {}
                output_data = add_and_drop_fields(input_data, output_fields,
                                                  default_values,
                                                  expanded_data)
            else:
                # size mismatch (short last chunk, or a filtered read):
                # let add_and_drop_fields allocate its own buffer
                default_values = {}
                output_data = add_and_drop_fields(input_data, output_fields,
                                                  default_values)
        else:
            output_data = input_data

        output_table.append(output_data)
        output_table.flush()

    if show_progress:
        loop_wh_progress(copy_chunk, range(num_chunks))
    else:
        for chunk in range(num_chunks):
            copy_chunk(chunk, chunk)

    return output_table
def mergeArrays(array1, array2, result_fields='union'):
    """Merge two structured arrays keyed on their 'id' field; data in
    array2 overrides data in array1.

    NOTE(review): legacy camelCase duplicate of merge_arrays (it calls
    mergeSubsetInArray instead of merge_subset_in_array); kept as-is for
    existing callers.

    Parameters
    ----------
    array1, array2 : structured ndarray
        Both must contain an 'id' field.
    result_fields : {'union', 'array1'}
        'union' keeps all fields from both arrays (array1's fields first),
        'array1' keeps only array1's fields.

    Returns
    -------
    (output_array, id_to_rownum) : tuple
        output_array is the merged structured array; id_to_rownum maps an
        id to its row index in output_array (-1 for absent ids).

    Raises
    ------
    ValueError
        If result_fields is not one of the accepted values.
    """
    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    #TODO: check that common fields have the same type
    if result_fields == 'union':
        names1 = set(array1.dtype.names)
        fields_notin1 = [(name, type_) for name, type_ in fields2
                         if name not in names1]
        output_fields = fields1 + fields_notin1
    elif result_fields == 'array1':
        output_fields = fields1
    else:
        raise ValueError('%s is not a valid value for result_fields argument'
                         % result_fields)

    output_dtype = np.dtype(output_fields)
    ids1 = array1['id']
    ids2 = array2['id']
    all_ids = np.union1d(ids1, ids2)
    # union1d returns a sorted array, so the last element is the maximum
    max_id = all_ids[-1]

    # compute new id_to_rownum: id -> row index in the output, -1 if absent
    id_to_rownum = np.empty(max_id + 1, dtype=int)
    id_to_rownum.fill(-1)
    # vectorized equivalent of:
    #   for rownum, rowid in enumerate(all_ids): id_to_rownum[rowid] = rownum
    id_to_rownum[all_ids] = np.arange(len(all_ids))

    # 1) create resulting array
    ids1_complete = len(ids1) == len(all_ids)
    ids2_complete = len(ids2) == len(all_ids)
    # can we reuse an input array as the output directly?
    output_is_arr1 = array1.dtype == output_dtype and ids1_complete
    output_is_arr2 = array2.dtype == output_dtype and ids2_complete
    # will one input's data cover every output cell (so no fill needed)?
    arr1_complete = set(fields1) >= set(output_fields) and ids1_complete
    arr2_complete = set(fields2) >= set(output_fields) and ids2_complete
    if output_is_arr2:
        output_array = array2
    elif output_is_arr1:
        # NOTE: array1 is modified in place in this case
        output_array = array1
    elif arr1_complete or arr2_complete:
        # every cell will be overwritten below, so no need to pre-fill
        output_array = np.empty(len(all_ids), dtype=output_dtype)
    else:
        output_array = np.empty(len(all_ids), dtype=output_dtype)
        output_array[:] = get_missing_record(output_array)

    # 2) copy data from array1 (if it will not be overridden)
    if not arr2_complete:
        output_array = mergeSubsetInArray(output_array, id_to_rownum,
                                          array1, first=True)

    # 3) copy data from array2
    if not output_is_arr2:
        output_array = mergeSubsetInArray(output_array, id_to_rownum,
                                          array2)

    return output_array, id_to_rownum
def _initial_values(self, array, to_give_birth, num_birth):
    """Return a fresh record array of *num_birth* rows, with the same
    dtype as *array*, where every row is initialized to the "missing"
    record for that dtype.
    """
    #TODO: use default values for fields which have one
    missing_row = get_missing_record(array)
    newborns = np.empty(num_birth, dtype=array.dtype)
    newborns[:] = missing_row
    return newborns