Ejemplo n.º 1
0
def merge_arrays(array1, array2, result_fields='union'):
    """Merge two structured arrays on their 'id' field.

    Data in array2 overrides data in array1.

    Parameters
    ----------
    array1, array2 : structured arrays
        Both must have an 'id' field. Ids are used directly as indices into
        the returned id_to_rownum array (assumes non-negative integer ids --
        TODO confirm against callers).
    result_fields : {'union', 'array1'}
        'union': the output has the fields of array1 followed by the fields
        of array2 which are not in array1.
        'array1': the output only has the fields of array1.

    Returns
    -------
    (output_array, id_to_rownum)
        id_to_rownum[id] is the rank of that id within the sorted union of
        the ids of both arrays, or -1 for ids present in neither array.
        output_array may be array1 or array2 itself (not a copy) when that
        array already has the output dtype and contains all the ids.

    Raises
    ------
    ValueError
        If result_fields is neither 'union' nor 'array1'.
    """

    fields1 = get_fields(array1)
    fields2 = get_fields(array2)

    #TODO: check that common fields have the same type
    if result_fields == 'union':
        names1 = set(array1.dtype.names)
        fields_notin1 = [(name, type_) for name, type_ in fields2
                         if name not in names1]
        output_fields = fields1 + fields_notin1
    elif result_fields == 'array1':
        output_fields = fields1
    else:
        raise ValueError('%s in not a valid value for result_fields argument' %
                         result_fields)

    output_dtype = np.dtype(output_fields)

    ids1 = array1['id']
    ids2 = array2['id']
    # union1d returns the sorted unique ids, so the last one is the largest
    all_ids = np.union1d(ids1, ids2)
    num_ids = len(all_ids)
    # when both arrays are empty there is no last element: use -1 so that
    # id_to_rownum ends up empty instead of raising an IndexError
    max_id = all_ids[-1] if num_ids else -1

    # compute new id_to_rownum
    id_to_rownum = np.empty(max_id + 1, dtype=int)
    id_to_rownum.fill(-1)
    # all_ids has no duplicates, so a single indexed assignment is equivalent
    # to assigning the row numbers one id at a time
    id_to_rownum[all_ids] = np.arange(num_ids)

    # 1) create resulting array
    ids1_complete = len(ids1) == num_ids
    ids2_complete = len(ids2) == num_ids
    output_is_arr1 = array1.dtype == output_dtype and ids1_complete
    output_is_arr2 = array2.dtype == output_dtype and ids2_complete
    arr1_complete = set(fields1) >= set(output_fields) and ids1_complete
    arr2_complete = set(fields2) >= set(output_fields) and ids2_complete
    if output_is_arr2:
        output_array = array2
    elif output_is_arr1:
        #TODO: modifying array1 in-place suits our particular needs for now
        # but it should really be a (non-default) option
        output_array = array1
    elif arr1_complete or arr2_complete:
        # every cell will be overwritten by one of the inputs, so there is
        # no need to initialize the output
        output_array = np.empty(num_ids, dtype=output_dtype)
    else:
        # neither input covers all ids and fields: start from missing values
        output_array = np.empty(num_ids, dtype=output_dtype)
        output_array[:] = get_missing_record(output_array)

    # 2) copy data from array1 (if it will not be overridden)
    if not arr2_complete:
        output_array = merge_subset_in_array(output_array, id_to_rownum,
                                             array1, first=True)

    # 3) copy data from array2
    if not output_is_arr2:
        output_array = merge_subset_in_array(output_array, id_to_rownum, array2)

    return output_array, id_to_rownum
Ejemplo n.º 2
0
def append_table(input_table, output_table, chunksize=10000, condition=None,
                 stop=None, show_progress=False):
    """Append the rows of input_table to output_table, chunk by chunk.

    Parameters
    ----------
    input_table : table-like
        Must provide dtype, read(start, stop) and, when condition is given,
        readWhere(condition, start=..., stop=...). len() is only used when
        stop is None.
    output_table : table-like
        Must provide dtype, append(data) and flush(). It is flushed after
        each chunk.
    chunksize : int, optional
        Number of input rows handled per chunk. A false value (0 or None)
        means all rows are handled in a single chunk.
    condition : optional
        If not None, rows of each chunk are read through
        input_table.readWhere with this condition instead of
        input_table.read.
    stop : int, optional
        Only input rows [0, stop) are considered. Defaults to
        len(input_table).
    show_progress : bool, optional
        If true, chunks are iterated through loop_wh_progress instead of a
        plain loop.

    Returns
    -------
    output_table

    Notes
    -----
    When the dtypes of the two tables differ, each chunk is converted with
    add_and_drop_fields to the fields of output_table before being appended.
    """
    if input_table.dtype != output_table.dtype:
        output_fields = get_fields(output_table)
    else:
        output_fields = None

    numrows = len(input_table) if stop is None else stop

    if not chunksize:
        chunksize = numrows

    if chunksize:
        num_chunks, remainder = divmod(numrows, chunksize)
        if remainder > 0:
            num_chunks += 1
    else:
        # chunksize can only still be false here when numrows is 0 too:
        # there is nothing to copy (and divmod would divide by zero)
        num_chunks = 0

    if output_fields is not None:
        # buffer reused for all full-size chunks, pre-filled with missing
        # values
        expanded_data = np.empty(chunksize, dtype=np.dtype(output_fields))
        expanded_data[:] = get_missing_record(expanded_data)

    #noinspection PyUnusedLocal
    def copy_chunk(chunk_idx, chunk_num):
        chunk_start = chunk_num * chunksize
        chunk_stop = min(chunk_start + chunksize, numrows)
        if condition is not None:
            input_data = input_table.readWhere(condition, start=chunk_start,
                                               stop=chunk_stop)
        else:
            input_data = input_table.read(chunk_start, chunk_stop)

        if output_fields is not None:
            default_values = {}
            # use our pre-allocated buffer, unless this chunk has a
            # different length than the buffer (e.g. the last chunk)
            if len(input_data) == len(expanded_data):
                output_data = add_and_drop_fields(input_data, output_fields,
                                                  default_values, expanded_data)
            else:
                output_data = add_and_drop_fields(input_data, output_fields,
                                                  default_values)
        else:
            output_data = input_data

        output_table.append(output_data)
        output_table.flush()

    if show_progress:
        loop_wh_progress(copy_chunk, range(num_chunks))
    else:
        for chunk in range(num_chunks):
            copy_chunk(chunk, chunk)

    return output_table
Ejemplo n.º 3
0
def mergeArrays(array1, array2, result_fields='union'):
    """Merge two structured arrays on their 'id' field.

    Rows of array1 are copied first, then rows of array2, so data from
    array2 is applied last.

    Parameters
    ----------
    array1, array2 : structured arrays
        Both must have an 'id' field. Ids are used directly as indices into
        the returned id_to_rownum array (assumes non-negative integer ids --
        TODO confirm against callers).
    result_fields : {'union', 'array1'}
        'union': the output has the fields of array1 followed by the fields
        of array2 which are not in array1.
        'array1': the output only has the fields of array1.

    Returns
    -------
    (output_array, id_to_rownum)
        id_to_rownum[id] is the rank of that id within the sorted union of
        the ids of both arrays, or -1 for ids present in neither array.
        output_array may be array1 or array2 itself (not a copy) when that
        array already has the output dtype and contains all the ids.

    Raises
    ------
    ValueError
        If result_fields is neither 'union' nor 'array1'.
    """
    fields1 = get_fields(array1)
    fields2 = get_fields(array2)
    #TODO: check that common fields have the same type
    if result_fields == 'union':
        names1 = set(array1.dtype.names)
        fields_notin1 = [(name, type_) for name, type_ in fields2
                         if name not in names1]
        output_fields = fields1 + fields_notin1
    elif result_fields == 'array1':
        output_fields = fields1
    else:
        raise ValueError('%s in not a valid value for result_fields argument' %
                         result_fields)

    output_dtype = np.dtype(output_fields)

    ids1 = array1['id']
    ids2 = array2['id']
    # union1d returns the sorted unique ids, so the last one is the largest
    all_ids = np.union1d(ids1, ids2)
    num_ids = len(all_ids)
    # when both arrays are empty there is no last element: use -1 so that
    # id_to_rownum ends up empty instead of raising an IndexError
    max_id = all_ids[-1] if num_ids else -1

    # compute new id_to_rownum
    id_to_rownum = np.empty(max_id + 1, dtype=int)
    id_to_rownum.fill(-1)
    # all_ids has no duplicates, so a single indexed assignment is equivalent
    # to assigning the row numbers one id at a time
    id_to_rownum[all_ids] = np.arange(num_ids)

    # 1) create resulting array
    ids1_complete = len(ids1) == num_ids
    ids2_complete = len(ids2) == num_ids
    output_is_arr1 = array1.dtype == output_dtype and ids1_complete
    output_is_arr2 = array2.dtype == output_dtype and ids2_complete
    arr1_complete = set(fields1) >= set(output_fields) and ids1_complete
    arr2_complete = set(fields2) >= set(output_fields) and ids2_complete
    if output_is_arr2:
        output_array = array2
    elif output_is_arr1:
        output_array = array1
    elif arr1_complete or arr2_complete:
        # every cell will be overwritten by one of the inputs, so there is
        # no need to initialize the output
        output_array = np.empty(num_ids, dtype=output_dtype)
    else:
        # neither input covers all ids and fields: start from missing values
        output_array = np.empty(num_ids, dtype=output_dtype)
        output_array[:] = get_missing_record(output_array)

    # 2) copy data from array1
    if not arr2_complete:
        output_array = mergeSubsetInArray(output_array, id_to_rownum,
                                          array1, first=True)

    # 3) copy data from array2
    if not output_is_arr2:
        output_array = mergeSubsetInArray(output_array, id_to_rownum, array2)

    return output_array, id_to_rownum
Ejemplo n.º 4
0
 def _initial_values(self, array, to_give_birth, num_birth):
     """Return a new array of num_birth records with the dtype of array.

     Every record is set to the value returned by
     get_missing_record(array). The to_give_birth argument is not used
     by this implementation.
     """
     #TODO: use default values for fields which have one
     missing_record = get_missing_record(array)
     newborns = np.empty(num_birth, dtype=array.dtype)
     newborns[:] = missing_record
     return newborns