def translate_parent(self, root_df):
    """
    Attach properties gathered along every configured parent chain to the
    root records.

    For each entry of ``self.parser.parent_nodes`` the chain is walked from
    ``f.head`` through ``n.child``: the running dataset is joined with the
    edge table of the current node, re-keyed by the id on the other side of
    that edge, and merged with the node's own properties. The result is
    re-keyed by the root id and left-outer-joined onto ``root_df``.

    :param root_df: keyed dataset of root records (presumably a PySpark pair
        RDD of ``(root_key, props_dict)`` -- confirm against caller)
    :return: ``root_df`` itself when no parent nodes are configured,
        otherwise the dataset produced by the successive left outer joins
    """
    if len(self.parser.parent_nodes) == 0:
        return root_df
    root_tbl = get_node_table_name(self.parser.model, self.parser.root)
    root_id = self.parser.get_key_prop().id
    for f in self.parser.parent_nodes:
        df = self.translate_table(root_tbl, props=[])
        n = f.head
        first = True
        while n is not None:
            df = df.join(self.translate_edge(n.edge_up_tbl, reversed=False))
            if first:
                # First hop: remember the root key inside the value so the
                # rows can be re-keyed by root once the chain is exhausted.
                df = df.map(
                    lambda x: (x[1][1], ({root_id: x[0]}, x[1][0]))
                ).mapValues(lambda x: merge_dictionary(x[0], x[1]))
                first = False
            else:
                df = df.map(lambda x: (x[1][1], x[1][0]))
            cur_props = n.props
            n_df = self.translate_table(n.tbl, props=cur_props)
            # Bind cur_props as a default argument: the transformation is
            # evaluated lazily while cur_props is reassigned on every hop,
            # so a plain closure would depend on when it gets serialized.
            df = n_df.join(df).mapValues(
                lambda x, fill_props=cur_props: merge_and_fill_empty_props(
                    x, fill_props
                )
            )
            n = n.child
        df = df.map(lambda x: make_key_from_property(x[1], root_id))
        root_df = root_df.leftOuterJoin(df).mapValues(
            lambda x: merge_dictionary(x[0], x[1])
        )
    return root_df
def get_direct_children(self, root_df):
    """
    Get data of all directly connected nodes and attach it to the root node.

    For every entry of ``self.parser.flatten_props`` the node table is joined
    with its edge table and left-outer-joined onto ``root_df``. When the
    entry defines ``sorted_by``, the rows of each root are grouped, ordered
    with ``sort_by_field`` and only the first one is kept; the helper sort
    column is removed again before merging.

    :param root_df: keyed dataset of root records (presumably a PySpark pair
        RDD -- confirm against caller)
    :return: ``root_df`` enriched with the properties of each direct child;
        ``root_df`` unchanged when ``flatten_props`` is empty
    """
    for n in self.parser.flatten_props:
        # if n is a child of root node, we don't need to swap order of the pair ids
        edge_df = self.translate_edge(n.edge, not n.props_from_child)
        node_props = n.props
        table_props = node_props
        sort_id = None
        if n.sorted_by is not None:
            sorting_prop = PropFactory.adding_prop(
                self.parser.doc_type, n.sorted_by, n.sorted_by, []
            )
            sort_id = sorting_prop.id
            # Work on a copy: appending to n.props directly would alter the
            # shared node definition (growing on every call and leaking the
            # temporary sort column into the fill-in list used below).
            table_props = list(node_props) + [sorting_prop]
        child_df = self.translate_table(n.tbl_name, props=table_props)
        child_by_root = edge_df.join(child_df).map(
            lambda x: (x[1][0], x[1][1])
        )
        if n.sorted_by is not None:
            desc_order = n.desc_order
            # Values are bound as lambda defaults so each lazily evaluated
            # step keeps the settings of the node it was created for.
            child_by_root = child_by_root.groupByKey().mapValues(
                lambda it, sort_id=sort_id, desc_order=desc_order:
                sort_by_field(it, sort_id, desc_order)[0]
            )
            child_by_root = child_by_root.mapValues(
                lambda x, sort_id=sort_id:
                {k: v for (k, v) in x.items() if k != sort_id}
            )
        # Binding node_props here is essential: the loop variable n has
        # already moved on by the time a later join forces this step.
        root_df = root_df.leftOuterJoin(child_by_root).mapValues(
            lambda x, fill_props=node_props: merge_and_fill_empty_props(
                x, fill_props
            )
        )
        child_df.unpersist()
        child_by_root.unpersist()
    return root_df
def join_no_aggregate(self, df, joining_df, dual_props):
    """
    Left-outer-join the selected properties of ``joining_df`` onto ``df``
    without aggregating them.

    :param df: keyed dataset that receives the properties
    :param joining_df: keyed dataset the properties are taken from
    :param dual_props: iterable of mappings; the ``"dst"`` entry of each is
        passed to ``merge_and_fill_empty_props`` as the property list
    :return: the joined dataset
    """
    projected = self.get_props_from_df(joining_df, dual_props)
    merged = df.leftOuterJoin(projected).mapValues(
        lambda pair: merge_and_fill_empty_props(
            pair, [p.get("dst") for p in dual_props]
        )
    )
    projected.unpersist()
    return merged
def join_and_aggregate(self, df, joining_df, dual_props, joining_node):
    """
    Aggregate the selected properties of ``joining_df`` per key and
    left-outer-join the aggregated values onto ``df``.

    :param df: keyed dataset that receives the aggregated properties
    :param joining_df: keyed dataset the properties are taken from
    :param dual_props: iterable of mappings; the ``'dst'`` entry of each is
        passed to ``merge_and_fill_empty_props`` as the property list
    :param joining_node: object whose ``getting_fields`` configures the
        aggregation frames
    :return: the joined dataset
    """
    zero_frame = get_frame_zero(joining_node.getting_fields)
    projected = self.get_props_from_df(joining_df, dual_props)
    framed = projected.mapValues(get_normal_frame(joining_node.getting_fields))
    reduced = framed.aggregateByKey(
        zero_frame, seq_aggregate_with_reducer, merge_aggregate_with_reducer
    )
    # Each aggregated frame is a 3-tuple; its second and third items become
    # the key and the value of the resulting dictionary.
    aggregated = reduced.mapValues(
        lambda frames: {name: value for (_, name, value) in frames}
    )
    merged = df.leftOuterJoin(aggregated).mapValues(
        lambda pair: merge_and_fill_empty_props(
            pair, [p.get('dst') for p in dual_props]
        )
    )
    aggregated.unpersist()
    return merged
def collect_leaf(self, child, edge_df, collected_leaf_dfs, root_props=None):
    """
    Join a leaf node's table with the data collected for its parent and
    accumulate the result under ``collected_leaf_dfs['final']``.

    Nothing happens when ``child`` is not a ``LeafNode``. When the leaf table
    is empty the method returns early and ``child.done`` is left untouched.

    :param child: node to process
    :param edge_df: keyed dataset joined with the leaf table
    :param collected_leaf_dfs: dict updated in place; ``'final'`` holds the
        distinct union of all leaf datasets collected so far
    :param root_props: property list handed to ``merge_and_fill_empty_props``;
        defaults to ``self.root_props``
    :return: None
    """
    root_props = self.root_props if root_props is None else root_props
    # isinstance (rather than an exact type comparison) also accepts
    # subclasses of LeafNode.
    if not isinstance(child, LeafNode):
        return
    child_df = self.translate_table(child.tbl_name, props=self.parser.props)
    if child_df.isEmpty():
        return
    child_df = child_df.join(edge_df).mapValues(
        lambda x: merge_and_fill_empty_props(x, root_props, to_tuple=True)
    )
    if 'final' in collected_leaf_dfs:
        collected_leaf_dfs['final'] = (
            collected_leaf_dfs['final'].union(child_df).distinct()
        )
    else:
        collected_leaf_dfs['final'] = child_df
    if child.name in collected_leaf_dfs:
        collected_leaf_dfs[child.name].unpersist()
    child.done = True
def translate_special(self, root_df):
    """
    Run the special functions declared for this document type and attach
    their results to the root records.

    Only entries of ``self.parser.special_nodes`` whose ``fn[0]`` is
    ``"sliding"`` are handled; other entries are skipped. For a sliding
    entry the node chain is walked from ``f.head`` through ``n.child``,
    collecting each node's properties, the values are ordered by property
    key, passed to ``sliding`` and stored under the id of the property named
    ``f.name``.

    :param root_df: keyed dataset of root records; the special function
        shares its root with the hosted document (case or subject)
    :return: ``root_df`` itself when no special nodes are configured,
        otherwise the dataset with the special-function results merged in
    """
    if len(self.parser.special_nodes) == 0:
        return root_df
    root_tbl = get_node_table_name(self.parser.model, self.parser.root)
    root_id = self.parser.get_key_prop().id
    for f in self.parser.special_nodes:
        if f.fn[0] != "sliding":
            continue
        df = self.translate_table(root_tbl, props=[])
        n = f.head
        first = True
        while n is not None:
            df = df.join(self.translate_edge(n.edge_up_tbl))
            if first:
                # First hop: carry the root key inside the value so rows can
                # be re-keyed by root after the chain has been walked.
                df = df.map(
                    lambda x: (x[1][1], ({root_id: x[0]}, x[1][0]))
                ).mapValues(lambda x: merge_dictionary(x[0], x[1]))
                first = False
            else:
                df = df.map(lambda x: (x[1][1], x[1][0]))
            cur_props = n.props
            n_df = self.translate_table(n.tbl, props=cur_props)
            # cur_props changes on every hop while evaluation is lazy, so it
            # is bound at lambda creation instead of being looked up later.
            df = n_df.join(df).mapValues(
                lambda x, fill_props=cur_props: merge_and_fill_empty_props(
                    x, fill_props
                )
            )
            n = n.child
        df = df.map(lambda x: make_key_from_property(x[1], root_id))
        # f.fn[1:] must hold exactly three items: window size and two
        # function names, all given as strings.
        window, fn1, fn2 = f.fn[1:]
        fid = self.parser.get_prop_by_name(f.name).id
        # Values ordered by their property key.
        df = df.mapValues(lambda x: tuple(v for _, v in sorted(x.items())))
        df = sliding(
            df, int(window.strip()), fn1.strip(), fn2.strip()
        ).mapValues(lambda x, fid=fid: {fid: x})
        root_df = root_df.leftOuterJoin(df).mapValues(
            lambda x: merge_dictionary(x[0], x[1])
        )
    return root_df
def merge_auth_root(self, root):
    """
    Build the dataset of a root node that has a chain of ``root_child``
    nodes and derive the project id from the merged properties.

    Starting from the root table, each ``root_child`` is reached through its
    ``edge_to_parent`` table, the rows are re-keyed by the child's id and
    merged with the child's own properties. The property lists of all
    visited nodes are accumulated and passed, together with the id of the
    ``project_id`` property, to ``construct_project_id``.

    :param root: root node; ``tbl_name``, ``props`` and ``root_child`` are read
    :return: dataset keyed by the id of the last node in the chain, with
        values produced by ``construct_project_id``
    """
    df = self.translate_table(root.tbl_name, props=root.props)
    child = root.root_child
    # Copy so that extending the list below does not alter root.props.
    props = copy(root.props)
    while child is not None:
        child_props = child.props
        df = df.join(self.translate_edge(child.edge_to_parent)).map(
            lambda x: (x[1][1], x[1][0])
        )
        # child_props is reassigned at the top of the next iteration, before
        # the following join forces this lazy step, so bind it right here.
        df = df.join(
            self.translate_table(child.tbl_name, props=child_props)
        ).mapValues(
            lambda x, fill_props=child_props: merge_and_fill_empty_props(
                x, fill_props
            )
        )
        props.extend(child_props)
        child = child.root_child
    project_id_prop = self.parser.get_prop_by_name('project_id')
    if project_id_prop is None:
        project_id_prop = PropFactory.adding_prop(
            self.parser.doc_type, 'project_id', None, []
        )
    root_id = project_id_prop.id
    return df.mapValues(lambda x: construct_project_id(x, props, root_id))
def merge_roots_to_children(self):
    """
    Push the data of every root node down to its direct children.

    For each root in ``self.parser.roots`` the root dataset is joined with
    the edge table of every child and re-keyed by the id on the other side
    of that edge. Roots without ``root_child`` carry their own id (stored
    under the id of the ``<root.name>_id`` property) plus their properties;
    roots with ``root_child`` are first expanded by ``merge_auth_root``.
    Each resulting dataset is handed to ``collect_collecting_child``.

    :return: tuple ``(collected_collecting_dfs, collected_leaf_dfs)``; the
        second dict is created here and returned without being filled by
        this method
    """
    collected_leaf_dfs = {}
    collected_collecting_dfs = {}
    for root in self.parser.roots:
        is_plain_root = root.root_child is None
        if is_plain_root:
            df = self.translate_table(root.tbl_name, props=root.props)
            root_id = self.parser.get_prop_by_name(
                '{}_id'.format(root.name)
            ).id
        else:
            df = self.merge_auth_root(root)
        props = root.props
        for child in root.children:
            edge_tbl = child.parents[root.name]
            tmp_df = df.join(self.translate_edge(edge_tbl))
            if is_plain_root:
                # root_id and props change from root to root while tmp_df is
                # evaluated lazily (possibly after this loop has moved on),
                # so both are bound when the lambdas are created.
                tmp_df = tmp_df.map(
                    lambda x, root_id=root_id:
                    (x[1][1], ({root_id: x[0]}, x[1][0]))
                ).mapValues(
                    lambda x, props=props: merge_and_fill_empty_props(
                        x, props, to_tuple=True
                    )
                )
            else:
                tmp_df = tmp_df.map(
                    lambda x: (x[1][1], x[1][0])
                ).mapValues(lambda x: tuple(x.items()))
            self.collect_collecting_child(
                child, tmp_df, collected_collecting_dfs
            )
    return collected_collecting_dfs, collected_leaf_dfs
def collect_leaf(self, child, edge_df, collected_leaf_dfs, root_props=None):
    """
    Tag a leaf node's rows with their source node, join them with the data
    collected for the parent and accumulate the result under
    ``collected_leaf_dfs["final"]``.

    Nothing happens when ``child`` is not a ``LeafNode``. When the leaf table
    is empty the method returns early and ``child.done`` is left untouched.

    :param child: node to process
    :param edge_df: keyed dataset joined with the leaf table
    :param collected_leaf_dfs: dict updated in place; ``"final"`` holds the
        distinct union of all leaf datasets collected so far
    :param root_props: property list handed to ``merge_and_fill_empty_props``;
        defaults to ``self.root_props``
    :return: None
    """
    if root_props is None:
        root_props = self.root_props
    if not isinstance(child, LeafNode):
        return

    leaf_df = self.translate_table(child.tbl_name, props=self.parser.props)
    # Merge a "source_node" entry holding the node name into every row.
    leaf_df = leaf_df.mapValues(
        lambda row: merge_dictionary({"source_node": child.name}, row)
    )
    if leaf_df.isEmpty():
        return

    leaf_df = leaf_df.join(edge_df).mapValues(
        lambda pair: merge_and_fill_empty_props(
            pair, root_props, to_tuple=True
        )
    )
    if "final" in collected_leaf_dfs:
        collected_leaf_dfs["final"] = (
            collected_leaf_dfs["final"].union(leaf_df).distinct()
        )
    else:
        collected_leaf_dfs["final"] = leaf_df

    if child.name in collected_leaf_dfs:
        collected_leaf_dfs[child.name].unpersist()
    child.done = True