Beispiel #1
0
    def construct_aggregation_tree(self, flat_paths):
        """Build the aggregation tree described by ``flat_paths``.

        Every path is walked starting at ``self.mapping["root"]``. A node is
        identified by the pair (node name, edge table leading to it), so paths
        sharing a prefix reuse the same ``AggregatedNode`` objects. The
        reducers of a path are attached to the node reached by the path's last
        segment.

        :param flat_paths: iterable of objects exposing ``path`` (the edge
            names to follow) and ``reducers`` (reducer definitions, each with
            an ``"fn"`` entry)
        :return: tuple ``(nodes, leaves)`` where ``nodes`` lists every node in
            creation order and ``leaves`` is ``Parser.get_leaves(nodes)``
        """
        nodes = []
        # (node name, incoming edge table) -> index of that node in ``nodes``
        index_of = {}
        for flat_path in flat_paths:
            cur_name = self.mapping["root"]
            cur_edge = None
            depth = 0
            for pos, segment in enumerate(flat_path.path):
                cur_key = (cur_name, cur_edge)
                if cur_key not in index_of:
                    parent_node = AggregatedNode(
                        cur_name,
                        get_node_table_name(self.model, cur_name),
                        cur_edge,
                        depth,
                    )
                    nodes.append(parent_node)
                    index_of[cur_key] = len(nodes) - 1
                else:
                    parent_node = nodes[index_of[cur_key]]

                child_name, edge_tbl = get_edge_table(self.model, cur_name,
                                                      segment)
                child_key = (child_name, edge_tbl)
                is_new_child = child_key not in index_of
                if is_new_child:
                    child_node = AggregatedNode(
                        child_name,
                        get_node_table_name(self.model, child_name),
                        edge_tbl,
                        depth + 1,
                    )
                else:
                    child_node = nodes[index_of[child_key]]
                # The parent link is (re)assigned even for a reused child.
                child_node.parent = parent_node

                if pos == len(flat_path.path) - 1:
                    for reducer in flat_path.reducers:
                        prop = self.create_prop_from_json(
                            self.doc_type, reducer, None)
                        child_node.reducers.append(
                            Reducer(prop, reducer["fn"]))

                parent_node.add_child(child_node)
                # A freshly created child is registered only after it has
                # been attached to its parent.
                if is_new_child:
                    nodes.append(child_node)
                    index_of[child_key] = len(nodes) - 1

                cur_name = child_name
                cur_edge = edge_tbl
                depth += 1

        return nodes, Parser.get_leaves(nodes)
Beispiel #2
0
    def get_table_list_from_path(self, p, root, path):
        """List the tables touched when following ``path`` from ``root``.

        :param p: object exposing the ``model`` used for table lookups
        :param root: label of the node the path starts from
        :param path: dot-separated edge names; empty or ``None`` means the
            path has no edges
        :return: list starting with the root's node table, followed by an
            (edge table, node table) pair for each path segment
        """
        tables = [get_node_table_name(p.model, root)]
        current = root
        for step in (path.split(".") if path else []):
            current, edge_table = get_edge_table(p.model, current, step)
            tables.append(edge_table)
            tables.append(get_node_table_name(p.model, current))
        return tables
Beispiel #3
0
    def translate_parent(self, root_df):
        """Merge properties collected along parent chains into ``root_df``.

        For each chain in ``self.parser.parent_nodes`` the root table is
        joined edge by edge with the chain's node tables, re-keyed by the
        root id and left-outer-joined back onto ``root_df``.

        :param root_df: keyed RDD of the root documents
        :return: ``root_df`` unchanged when there are no parent nodes,
            otherwise the RDD with the parent properties merged in
        """
        if len(self.parser.parent_nodes) == 0:
            return root_df
        root_tbl = get_node_table_name(self.parser.model, self.parser.root)
        root_id = self.parser.get_key_prop().id
        for f in self.parser.parent_nodes:
            df = self.translate_table(root_tbl, props=[])
            n = f.head
            first = True
            while n is not None:
                df = df.join(self.translate_edge(n.edge_up_tbl, reversed=False))
                if first:
                    # Re-key by the far end of the edge and remember which
                    # root record the row came from.
                    df = df.map(lambda x: (x[1][1], ({root_id: x[0]}, x[1][0]))) \
                        .mapValues(lambda x: merge_dictionary(x[0], x[1]))
                    first = False
                else:
                    df = df.map(lambda x: (x[1][1], x[1][0]))
                cur_props = n.props
                n_df = self.translate_table(n.tbl, props=cur_props)

                # RDD transformations are lazy, so the lambda may run after
                # ``cur_props`` has been rebound by a later iteration; bind
                # the current value as a default argument.
                df = n_df.join(df).mapValues(
                    lambda x, cur_props=cur_props: merge_and_fill_empty_props(
                        x, cur_props))
                n = n.child
            df = df.map(lambda x: make_key_from_property(x[1], root_id))
            root_df = root_df.leftOuterJoin(df).mapValues(
                lambda x: merge_dictionary(x[0], x[1]))
        return root_df
Beispiel #4
0
 def json_to_parent_node(self, path):
     """
     Build the chain of ``ParentNode`` objects described by ``path``.

     ``path`` is a dot-separated list of segments. Each segment is an edge
     name, optionally followed by a comma-separated property list in square
     brackets, e.g. ``edge_a[prop_1,prop_2].edge_b``. Edges are resolved
     with ``get_edge_table`` starting from ``self.root``.

     :param path: dot-separated path string
     :return: the first ``ParentNode`` of the chain; the following nodes are
         linked through the ``child`` attribute
     """
     words = path.split(".")
     # Raw string: "\[" and "\]" are invalid escape sequences in a normal
     # string literal and trigger warnings on current Python versions.
     nodes = [
         tuple([_f for _f in re.split(r"[\[\]]", w) if _f]) for w in words
     ]
     first = None
     prev = None
     prev_label = self.root
     for nd in nodes:
         n = nd[0]
         p = nd[1] if len(nd) > 1 else None
         parent_name, edge_tbl = get_edge_table(self.model, prev_label, n)
         parent_tbl = get_node_table_name(self.model, parent_name)
         if p is not None:
             # ``get_src_name`` yields indexable pairs: [0] is used as the
             # property name and [1] as its source.
             json_props = [{
                 "name": pair[0],
                 "src": pair[1]
             } for pair in self.get_src_name(p.split(","))]
             props = self.create_props_from_json(self.doc_type,
                                                 json_props,
                                                 node_label=parent_name)
         else:
             props = []
         cur = ParentNode(parent_name, parent_tbl, edge_tbl, props)
         if prev is not None:
             prev.child = cur
         else:
             first = cur
         prev_label = parent_name
         prev = cur
     return first
Beispiel #5
0
 def json_to_parent_node(self, path):
     """
     Build the chain of ``ParentNode`` objects described by ``path``.

     ``path`` is a dot-separated list of segments. Each segment is an edge
     name, optionally followed by a comma-separated property list in square
     brackets, e.g. ``edge_a[prop_1,prop_2].edge_b``. Edges are resolved
     with ``get_edge_table`` starting from ``self.root``.

     :param path: dot-separated path string
     :return: the first ``ParentNode`` of the chain; the following nodes are
         linked through the ``child`` attribute
     """
     words = path.split('.')
     # Raw string: '\[' and '\]' are invalid escape sequences in a normal
     # string literal and trigger warnings on current Python versions.
     nodes = [tuple(filter(None, re.split(r'[\[\]]', w))) for w in words]
     first = None
     prev = None
     prev_label = self.root
     for nd in nodes:
         n = nd[0]
         p = nd[1] if len(nd) > 1 else None
         parent_name, edge_tbl = get_edge_table(self.model, prev_label, n)
         parent_tbl = get_node_table_name(self.model, parent_name)
         if p is not None:
             # ``get_src_name`` yields indexable pairs: [0] is used as the
             # property name and [1] as its source.
             json_props = [{
                 'name': pair[0],
                 'src': pair[1]
             } for pair in self.get_src_name(p.split(','))]
             props = self.create_props_from_json(self.doc_type,
                                                 json_props,
                                                 node_label=parent_name)
         else:
             props = []
         cur = ParentNode(parent_name, parent_tbl, edge_tbl, props)
         if prev is not None:
             prev.child = cur
         else:
             first = cur
         prev_label = parent_name
         prev = cur
     return first
Beispiel #6
0
 def json_to_special_node(self, path):
     """
     Create the chain of nodes on the path of a special aggregation.

     ``path`` is a dot-separated list of segments of the form
     ``edge_name[prop_1,prop_2]``. Every segment must carry a bracketed
     property list: a segment that does not split into exactly an edge name
     and a property list raises ``ValueError`` when it is unpacked.

     :param path: path defining the nodes and the props to be aggregated
     :return: the first ``SpecialNode`` of the chain; the following nodes
         are linked through the ``child`` attribute
     """
     words = path.split(".")
     # Raw string: "\[" and "\]" are invalid escape sequences in a normal
     # string literal and trigger warnings on current Python versions.
     nodes = [
         tuple([_f for _f in re.split(r"[\[\]]", w) if _f]) for w in words
     ]
     first = None
     prev = None
     prev_label = self.root
     for (n, str_p) in nodes:
         child_name, edge_tbl = get_edge_table(self.model, prev_label, n)
         child_tbl = get_node_table_name(self.model, child_name)
         # Each listed property keeps its own name as its source.
         json_props = [{"name": p, "src": p} for p in str_p.split(",")]
         props = self.create_props_from_json(self.doc_type,
                                             json_props,
                                             node_label=child_name)
         cur = SpecialNode(child_name, child_tbl, edge_tbl, props)
         if prev is not None:
             prev.child = cur
         else:
             first = cur
         prev_label = child_name
         prev = cur
     return first
Beispiel #7
0
 def translate(self):
     """
     Run the translation steps for the root table in order.

     The root table is translated with the parser's props, then passed
     through ``translate_special``, ``translate_parent`` and
     ``get_direct_children``. When the parser has aggregated nodes, the
     nested aggregation result is joined in and merged per key.

     :return: the resulting keyed RDD
     """
     table_name = get_node_table_name(self.parser.model, self.parser.root)
     df = self.translate_table(table_name, props=self.parser.props)
     df = self.translate_special(df)
     df = self.translate_parent(df)
     df = self.get_direct_children(df)
     if len(self.parser.aggregated_nodes) != 0:
         return df.join(self.aggregate_nested_properties()).mapValues(
             lambda pair: merge_dictionary(pair[0], pair[1]))
     return df
Beispiel #8
0
 def get_props_for_nodes(self):
     """
     Create the nodes that carry the props listed under ``injecting_props``.

     Every entry of ``self.mapping["injecting_props"]`` becomes a
     ``CollectingNode``, except ``"program"``, which becomes a ``RootNode``.
     A ``"project"`` collecting node with the ``PROJECT_CODE`` prop is added
     when the mapping does not declare one.

     Note: the extra project/program props are appended to the ``props``
     lists inside ``self.mapping`` itself, so the mapping is modified in
     place.

     :return: tuple ``(prop_nodes, roots)`` of dicts keyed by node label
     """
     prop_nodes = {}
     roots = {}
     for (k, v) in self.mapping.get("injecting_props", {}).items():
         # NOTE(review): the check uses the literal "project_code" while the
         # appended entry uses PROJECT_CODE - presumably the same value;
         # confirm against the constant's definition.
         if k == "project" and "project_code" not in [
                 p.get("name") for p in v.get("props")
         ]:
             v.get("props").append({"name": PROJECT_CODE, "src": "code"})
         if k != "program":
             prop_nodes[k] = CollectingNode(
                 k,
                 get_node_table_name(self.model, k),
                 props=self.create_props_from_json(self.doc_type,
                                                   v.get("props"),
                                                   node_label=k),
             )
         else:
             node_props = v.get("props")
             # The list belongs to the shared mapping: only append when the
             # entry is missing so repeated calls do not pile up duplicates.
             if PROGRAM_NAME not in [p.get("name") for p in node_props]:
                 node_props.append({"name": PROGRAM_NAME, "src": "name"})
             roots[k] = RootNode(
                 k,
                 get_node_table_name(self.model, k),
                 self.create_props_from_json(self.doc_type,
                                             node_props,
                                             node_label=k,
                                             is_additional=True),
             )
     if "project" not in prop_nodes:
         prop_nodes["project"] = CollectingNode(
             "project",
             get_node_table_name(self.model, "project"),
             props=self.create_props_from_json(
                 self.doc_type,
                 [{
                     "name": PROJECT_CODE,
                     "src": "code"
                 }],
                 node_label="project",
                 is_additional=True,
             ),
         )
     return prop_nodes, roots
Beispiel #9
0
    def create_auth_path_root(self):
        """Create the program -> project root pair used for the auth path.

        The program root is named ``auth_path_root`` and carries the
        ``program_name`` prop; the project root carries ``project_code`` and
        the table of the edge looked up for ``project`` / ``programs``.

        :return: the program ``RootNode`` with the project node set as its
            ``root_child``
        """
        program_tbl = get_node_table_name(self.model, 'program')
        project_tbl = get_node_table_name(self.model, 'project')
        _, project_edge_tbl = get_edge_table(self.model, 'project', 'programs')

        program_props = self.create_props_from_json(
            self.doc_type,
            [{'name': 'program_name', 'src': 'name'}],
            node_label='program')
        program_root = RootNode('auth_path_root', program_tbl, program_props)

        project_props = self.create_props_from_json(
            self.doc_type,
            [{'name': 'project_code', 'src': 'code'}],
            node_label='project')
        project_root = RootNode('project', project_tbl, project_props,
                                project_edge_tbl)

        program_root.root_child = project_root
        return program_root
Beispiel #10
0
 def add_collecting_node(self, child, collectors, fst):
     """
     Attach ``child`` to the collecting node reached through edge ``fst``.

     The collecting node is taken from ``collectors`` when one is already
     registered under the parent's name, otherwise a new ``CollectingNode``
     is created. Either way it is stored back into ``collectors``.

     :param child: node to attach; its ``name`` is used for the lookups
     :param collectors: dict of collecting nodes keyed by name (updated in
         place)
     :param fst: name of the edge going up from ``child``
     :return: the collecting node that now holds ``child``
     """
     parent_name = get_node_label(
         self.model, get_parent_name(self.model, child.name, fst))
     _, edge_up_tbl = get_edge_table(self.model, child.name, fst)
     parent_tbl = get_node_table_name(
         self.model, get_parent_label(self.model, child.name, fst))

     if parent_name in collectors:
         parent = collectors[parent_name]
     else:
         parent = CollectingNode(parent_name, parent_tbl)

     parent.add_child(child)
     child.add_parent(parent.name, edge_up_tbl)
     collectors[parent_name] = parent
     return parent
Beispiel #11
0
    def add_root_node(self, child, roots, segment):
        """Attach ``child`` to the root node reached through ``segment``.

        The root is reused from ``roots`` when already present; otherwise a
        ``RootNode`` is created with the props the mapping declares under
        ``injecting_props`` for that root. ``roots`` is updated in place.

        :param child: node to attach; its ``name`` is used for the lookups
        :param roots: dict of root nodes keyed by name
        :param segment: name of the edge going up from ``child``
        """
        root_name = get_node_label(
            self.model, get_parent_name(self.model, child.name, segment))
        _, edge_up_tbl = get_edge_table(self.model, child.name, segment)
        root_tbl_name = get_node_table_name(
            self.model, get_parent_label(self.model, child.name, segment))

        if root_name in roots:
            root_node = roots[root_name]
        else:
            declared_props = self.mapping['injecting_props'][root_name]['props']
            root_node = RootNode(
                root_name,
                root_tbl_name,
                self.create_props_from_json(self.doc_type,
                                            declared_props,
                                            node_label=root_name))

        child.add_parent(root_node.name, edge_up_tbl)
        root_node.add_child(child)
        roots[root_name] = root_node
Beispiel #12
0
    def get_orphan_paths(self, selected_category, leaves):
        """Find paths for nodes of a category that are not yet leaves.

        Every schema entry whose ``category`` equals ``selected_category`` is
        added to ``self.leaves`` as a ``LeafNode``. Those whose name is not in
        ``leaves`` are treated as orphans.

        :param selected_category: category value to match in the schema
        :param leaves: collection of leaf names that are already covered
        :return: the result of ``get_shortest_path_from_root`` for the
            orphans, or an empty set when there are none
        """
        matching_names = [
            name for (name, spec) in self.dictionary.schema.items()
            if spec.get('category', None) == selected_category
        ]
        orphans = set()
        for name in matching_names:
            self.leaves.add(
                LeafNode(name, get_node_table_name(self.model, name)))
            if name in leaves:
                continue
            orphans.add(name)

        if len(orphans) == 0:
            return set()
        return self.get_shortest_path_from_root(['program', 'project'],
                                                orphans)
Beispiel #13
0
    def get_collecting_nodes(self):
        """Populate ``self.leaves``, ``self.collectors`` and ``self.roots``.

        Paths are collected from ``"program"`` for nodes whose category
        matches the mapping's ``category`` (default ``"data_file"``). The
        source of each path becomes a ``LeafNode``; the collecting tree is
        then built from the generated edges, levels are updated and the
        collectors are sorted in place.
        """
        selected_category = self.mapping.get("category", "data_file")

        def has_selected_category(label):
            return get_node_category(self.dictionary,
                                     label) == selected_category

        flat_paths = self.create_collecting_paths_from_root(
            "program", has_selected_category)

        leaf_names = {flat_path.src for flat_path in flat_paths}
        for leaf_name in leaf_names:
            self.leaves.add(
                LeafNode(leaf_name, get_node_table_name(self.model, leaf_name)))

        nodes_with_props, roots = self.get_props_for_nodes()
        tree = self.create_tree_from_generated_edges(flat_paths,
                                                     nodes_with_props, roots)
        self.collectors, self.roots = tree

        self.update_level()
        self.collectors.sort()
Beispiel #14
0
 def translate_special(self, root_df):
     """
     Run the special-function translation declared in the ETL mapping.

     Only special nodes whose function name (``fn[0]``) is ``"sliding"``
     are handled. For each of them the root table is joined along the
     node chain, re-keyed by the root id, reduced with ``sliding`` and
     left-outer-joined onto ``root_df`` under the id of the prop named
     after the special node.

     :param root_df: RDD of the hosted document's root (the special
         function shares that root)
     :return: ``root_df`` unchanged when there are no special nodes,
         otherwise the RDD with the special function's result merged in
     """
     if len(self.parser.special_nodes) == 0:
         return root_df
     root_tbl = get_node_table_name(self.parser.model, self.parser.root)
     root_id = self.parser.get_key_prop().id
     for f in self.parser.special_nodes:
         if f.fn[0] != "sliding":
             continue
         df = self.translate_table(root_tbl, props=[])
         n = f.head
         first = True
         while n is not None:
             df = df.join(self.translate_edge(n.edge_up_tbl))
             if first:
                 # Re-key by the far end of the edge and remember which
                 # root record the row came from.
                 df = df.map(
                     lambda x: (x[1][1], ({root_id: x[0]}, x[1][0]))
                 ).mapValues(lambda x: merge_dictionary(x[0], x[1]))
                 first = False
             else:
                 df = df.map(lambda x: (x[1][1], x[1][0]))
             cur_props = n.props
             n_df = self.translate_table(n.tbl, props=cur_props)
             # RDD transformations are lazy, so the lambda may run after
             # ``cur_props`` has been rebound by a later iteration; bind
             # the current value as a default argument.
             df = n_df.join(df).mapValues(
                 lambda x, cur_props=cur_props: merge_and_fill_empty_props(
                     x, cur_props))
             n = n.child
         df = df.map(lambda x: make_key_from_property(x[1], root_id))
         # Remaining entries of ``fn`` are the window size and two
         # function names, all given as strings.
         (window, fn1, fn2) = tuple(f.fn[1:])
         fid = self.parser.get_prop_by_name(f.name).id
         # Turn each property dict into a tuple of values ordered by key.
         df = df.mapValues(
             lambda x: tuple(v for (_, v) in sorted(x.items())))
         # ``fid`` changes on every outer iteration; bind it as a default.
         df = sliding(df, int(window.strip()), fn1.strip(),
                      fn2.strip()).mapValues(lambda x, fid=fid: {fid: x})
         root_df = root_df.leftOuterJoin(df).mapValues(
             lambda x: merge_dictionary(x[0], x[1]))
     return root_df
Beispiel #15
0
 def create_tree_from_generated_edges(self, flat_paths, nodes_with_props,
                                      roots):
     """
     Build the collecting tree for the paths whose first edge was generated.

     A path is skipped when the table of its first edge is not in
     ``self.generated_edges``. Otherwise its source becomes a collecting
     node, every segment but the last adds a collecting parent, and the
     last segment attaches the chain to a root node.

     :param flat_paths: iterable of objects exposing ``src`` (source node
         name) and ``path`` (edge names going up from the source)
     :param nodes_with_props: dict of pre-built collecting nodes keyed by
         name; it is extended in place
     :param roots: dict of root nodes keyed by name, handed to
         ``add_root_node``
     :return: tuple ``(collectors, roots)`` as lists of the dict values
     """
     collectors = nodes_with_props
     checking_set = set(self.generated_edges)
     for p in flat_paths:
         segments = list(p.path)
         _, edge_up_tbl = get_edge_table(self.model, p.src, segments[0])
         if edge_up_tbl not in checking_set:
             continue
         if p.src not in collectors:
             tbl_name = get_node_table_name(self.model, p.src)
             collectors[p.src] = CollectingNode(p.src, tbl_name)
         child = collectors[p.src]
         # NOTE(review): the original loop re-ran the first-edge check
         # above on every iteration (same arguments, so its ``break`` could
         # never trigger). Possibly a per-segment check on
         # (child.name, fst) was intended - confirm before adding one.
         for fst in segments[:-1]:
             child = self.add_collecting_node(child, collectors, fst)
         self.add_root_node(child, roots, segments[-1])
     return list(collectors.values()), list(roots.values())
Beispiel #16
0
 def add_root_node(self, child, roots, segment):
     """
     Attach ``child`` to the root node reached through ``segment``.

     The root is reused from ``roots`` when already present; otherwise a
     ``RootNode`` carrying the additional ``program_name`` prop is created.
     ``roots`` is updated in place.

     :param child: node to attach; its ``name`` is used for the lookups
     :param roots: dict of root nodes keyed by name
     :param segment: name of the edge going up from ``child``
     """
     root_name = get_node_label(
         self.model, get_parent_name(self.model, child.name, segment))
     _, edge_up_tbl = get_edge_table(self.model, child.name, segment)
     root_tbl_name = get_node_table_name(
         self.model, get_parent_label(self.model, child.name, segment))

     if root_name in roots:
         root_node = roots[root_name]
     else:
         root_props = self.create_props_from_json(
             self.doc_type,
             [{"name": "program_name", "src": "name"}],
             node_label=root_name,
             is_additional=True,
         )
         root_node = RootNode(root_name, root_tbl_name, root_props)

     child.add_parent(root_node.name, edge_up_tbl)
     root_node.add_child(child)
     roots[root_name] = root_node