def compute(self, context, link, target_expr, missing_value=None):
    """Evaluate `target_expr` on the rows linked to each source row.

    link must be a Link instance.
    target_expr can be any expression (it will be evaluated on the
    target rows).
    missing_value is used for rows with a broken/missing link; when
    None, a type-appropriate missing value is derived from the target
    values.
    """
    assert isinstance(link, Link)
    assert isinstance(target_expr, Expr), str(type(target_expr))
    #noinspection PyProtectedMember
    target_ids = context[link._link_field]
    target_context = self.target_context(context)
    id_to_rownum = target_context.id_to_rownum
    missing_int = missing_values[int]
    # NOTE(review): ids equal to missing_int (presumably -1) index
    # id_to_rownum from the end, yielding bogus rows — those are masked
    # out by the `where` below, so no correction is needed here.
    target_rows = id_to_rownum[target_ids]
    target_values = expr_eval(target_expr, target_context)
    if missing_value is None:
        missing_value = get_missing_value(target_values)
    result_values = target_values[target_rows]
    # it is a bit faster with numexpr (mixed_links: 0.22s -> 0.17s)
    return ne.evaluate("where((ids != mi) & (rows != mi), values, mv)",
                       {'ids': target_ids, 'rows': target_rows,
                        'values': result_values, 'mi': missing_int,
                        'mv': missing_value})
def compute(self, func, args, kwargs, filter_value=None):
    """Call func(*args, **kwargs) and optionally mask the result.

    Without a filter the raw result is returned unchanged; with one,
    entries where filter_value is False are replaced by the missing
    value matching the result's type.
    """
    result = func(*args, **kwargs)
    if filter_value is not None:
        return np.where(filter_value, result, get_missing_value(result))
    return result
def compute(self, context, *args, **kwargs):
    """Run the wrapped compute function, honouring an optional
    'filter' keyword argument.

    When a filter is given, entries where it is False are replaced by
    the missing value matching the result's type; otherwise the raw
    result is returned. `context` is accepted but not used here.
    """
    mask = kwargs.pop('filter', None)
    result = self.get_compute_func()(*args, **kwargs)
    if mask is not None:
        return np.where(mask, result, get_missing_value(result))
    return result
def eval_rows(self, source_rows, expr_value, context):
    """Aggregate expr_value per source row.

    Values are grouped by their source row number and each group is
    reduced with self.aggregate_func. Rows numbered -1 (no link) are
    skipped; rows receiving no value keep the missing value for
    expr_value's type.
    """
    out = np.empty(context_length(context), dtype=expr_value.dtype)
    out.fill(get_missing_value(expr_value))
    # sort so that values belonging to the same row are contiguous,
    # as groupby requires
    order = np.argsort(source_rows)
    pairs = izip(source_rows[order], expr_value[order])
    reduce_group = self.aggregate_func
    for rownum, group in groupby(pairs, key=itemgetter(0)):
        if rownum != -1:
            # pair[1] is faster than an itemgetter, even with map
            out[rownum] = reduce_group(pair[1] for pair in group)
    return out
def add_and_drop_fields(array, output_fields, output_array=None):
    """Convert a structured array to the dtype described by output_fields.

    Fields present in both dtypes are copied; fields only in the output
    dtype are initialised to their missing value (when a fresh output
    array is allocated here). Input fields absent from output_fields
    are dropped. Returns output_array (newly allocated when None).
    """
    out_dtype = np.dtype(output_fields)
    wanted = set(out_dtype.names)
    present = set(array.dtype.names)
    if output_array is None:
        output_array = np.empty(len(array), dtype=out_dtype)
        # fields with no source column start out "missing"
        for name in wanted - present:
            output_array[name] = get_missing_value(output_array[name])
    else:
        assert output_array.dtype == out_dtype
    for name in wanted & present:
        output_array[name] = array[name]
    return output_array
def evaluate(self, context):
    """Evaluate self.target_expression on the linked target rows.

    Rows whose link id or target row equals the missing int marker get
    self.missing_value (or, when that is None, the missing value
    matching the target values' type).
    """
    link = self.get_link(context)
    ids = expr_eval(Variable(link._link_field), context)
    target_context = self.target_context(context)
    rows = target_context.id_to_rownum[ids]
    values = expr_eval(self.target_expression, target_context)
    fill = self.missing_value
    if fill is None:
        fill = get_missing_value(values)
    bad = missing_values[int]
    ok = (ids != bad) & (rows != bad)
    return np.where(ok, values[rows], fill)
def fill_missing_values(self, ids, values, context, filler='auto'):
    """Spread `values` (aligned with `ids`, from a past period) over
    the rows of the current period context.

    ids: ids present in past period
    values: one value per id
    context: current period context
    filler: value for current rows without a matching past id; 'auto'
            derives the missing value from values' dtype.
    """
    # use ==, not `is`: identity comparison against the 'auto' string
    # sentinel only worked by accident of CPython string interning
    if filler == 'auto':
        filler = get_missing_value(values)
    result = np.empty(context_length(context), dtype=values.dtype)
    result.fill(filler)
    if len(ids):
        id_to_rownum = context.id_to_rownum
        # if there was more objects in the past than in the current
        # period. Currently, remove() keeps old ids, so this never
        # happens, but if we ever change remove(), we'll need to add
        # such a check everywhere we use id_to_rownum
        # invalid_ids = ids > len(id_to_rownum)
        # if np.any(invalid_ids):
        #     fix ids
        rows = id_to_rownum[ids]
        safe_put(result, rows, values)
    return result
def eval_rows(self, source_rows, target_filter, context):
    """Evaluate self.target_expr on the target side and aggregate it
    per source row with self.aggregate_func.

    target_filter, when given, restricts the target values considered
    (it must leave one value per entry of source_rows). Source rows
    numbered -1 are skipped; rows receiving no value keep the missing
    value for the column's type.
    """
    target_context = self.target_context(context)
    column = expr_eval(self.target_expr, target_context)
    if target_filter is not None:
        column = column[target_filter]
    assert len(source_rows) == len(column)
    out = np.empty(context_length(context), dtype=column.dtype)
    out.fill(get_missing_value(column))
    # sort so that values of the same row are contiguous for groupby
    order = np.argsort(source_rows)
    pairs = izip(source_rows[order], column[order])
    reduce_group = self.aggregate_func
    for rownum, group in groupby(pairs, key=itemgetter(0)):
        if rownum != -1:
            out[rownum] = reduce_group(pair[1] for pair in group)
    return out
def evaluate(self, context):
    """Evaluate self.target_expression on the linked target rows,
    building the result with numexpr.

    Rows whose link id or target row equals the missing int marker get
    self.missing_value (or the type-appropriate missing value when
    that is None).
    """
    link = self.get_link(context)
    ids = expr_eval(Variable(link._link_field), context)
    target_context = self.target_context(context)
    rows = target_context.id_to_rownum[ids]
    values = expr_eval(self.target_expression, target_context)
    fill = self.missing_value
    if fill is None:
        fill = get_missing_value(values)
    # it is a bit faster with numexpr (mixed_links: 0.22s -> 0.17s)
    return ne.evaluate("where((ids != mi) & (rows != mi), values, mv)",
                       {'ids': ids, 'rows': rows,
                        'values': values[rows],
                        'mi': missing_values[int], 'mv': fill})
def optimize_processes(self):
    """Common subexpression elimination.

    Not implemented: the previous body was merge-conflict debris (a
    stray '<<<<<<< HEAD' marker followed by a copy of
    fill_missing_values' body referencing names undefined here), so
    any call raised NameError. Fail explicitly until the real
    optimization pass is (re)written.
    """
    raise NotImplementedError("optimize_processes: implementation was "
                              "lost in a bad merge")
def merge_subset_in_array(output, id_to_rownum, subset, first=False):
    """Merge the rows/fields of `subset` (a structured array) into
    `output`, matching rows via id_to_rownum[subset['id']].

    first: when True, output rows not covered by subset have no
    previous value worth keeping, so full-width rows are initialised
    to their missing values instead of being read back from output.
    Returns either `subset` itself (full replacement) or `output`
    updated in place.
    """
    # fast paths: identical dtype means whole rows can be moved at once
    if subset.dtype == output.dtype and len(subset) == len(output):
        return subset
    elif subset.dtype == output.dtype:
        safe_put(output, id_to_rownum[subset['id']], subset)
        return output
    output_names = output.dtype.names
    subset_names = subset.dtype.names
    names_to_copy = set(subset_names) & set(output_names)
    if len(subset) == len(output):
        # same length but different dtype: rows are already aligned,
        # copy field by field
        for fname in names_to_copy:
            output[fname] = subset[fname]
        return output
    else:
        rownums = id_to_rownum[subset['id']]
        #TODO: this is a gross approximation, more research is needed to get
        # a better threshold. It might also depend on "first".
        if len(names_to_copy) > len(output_names) / 2:
            # most fields are touched: build full-width rows, then put
            # them back into output in one shot
            if first:
                subset_all_cols = np.empty(len(subset), dtype=output.dtype)
                for fname in set(output_names) - set(subset_names):
                    subset_all_cols[fname] = \
                        get_missing_value(subset_all_cols[fname])
            else:
                subset_all_cols = output[rownums]
                # Note that all rows which correspond to rownums == -1 have
                # wrong values (they have the value of the last row) but it is
                # not necessary to correct them since they will not be copied
                # back into output_array.
                # np.putmask(subset_all_cols, rownums == -1, missing_row)
            for fname in names_to_copy:
                subset_all_cols[fname] = subset[fname]
            safe_put(output, rownums, subset_all_cols)
        else:
            # few common fields: cheaper to scatter each field separately
            for fname in names_to_copy:
                safe_put(output[fname], rownums, subset[fname])
        return output