Ejemplo n.º 1
0
Archivo: base.py Proyecto: paultag/pupa
    def import_directory(self, datadir):
        """Import every ``<type>_*.json`` file found in ``datadir``.

        Each file is loaded, its ``_id`` is popped off, and the remaining
        payload is hashed with ``omnihash``. A payload whose hash was already
        seen is recorded in ``self.duplicates`` (duplicate id -> first id
        seen with that hash) and is not imported.

        The surviving payloads are fed into a ``Network`` so that an object
        with a ``parent_id`` is imported after its parent, then handed to
        ``self.import_json`` one at a time in that order.

        Returns a one-entry dict mapping ``self._type`` to ``self.results``.
        Raises ``Exception`` if the ordered plan does not cover exactly the
        set of unique payloads.
        """
        unique_by_id = {}       # json id -> payload (without '_id')
        first_id_by_hash = {}   # omnihash(payload) -> first json id seen

        # Load and de-duplicate every matching JSON file.
        pattern = os.path.join(datadir, self._type + '_*.json')
        for path in glob.glob(pattern):
            with open(path) as handle:
                payload = json.load(handle)
                scraped_id = payload.pop('_id')
                digest = omnihash(payload)
                if digest in first_id_by_hash:
                    # Same content as an earlier file: remember the alias.
                    self.duplicates[scraped_id] = first_id_by_hash[digest]
                    continue
                first_id_by_hash[digest] = scraped_id
                unique_by_id[scraped_id] = payload

        # Build the dependency graph: an edge parent -> child means the
        # parent has to be imported before the child.
        graph = Network()
        for scraped_id, payload in unique_by_id.items():
            parent = payload.get('parent_id', None)
            if not parent:
                # No dependency; it can be imported at any point.
                graph.add_node(scraped_id)
                continue
            graph.add_edge(parent, scraped_id)

        # Ask the graph for the import order and pair each id with its data.
        plan = [(node, unique_by_id[node]) for node in graph.sort()]
        planned_ids = {node for node, _ in plan}

        # Sanity check: the plan must cover every unique payload.
        if planned_ids != set(unique_by_id.keys()):
            raise Exception("import is missing nodes in network set")

        # Perform the import in dependency order.
        for scraped_id, payload in plan:
            parent = payload.get('parent_id', None)
            if parent:
                # Swap the scrape-time parent id for whatever
                # resolve_json_id returns; the ordering above is relied on
                # to have imported the parent already.
                payload['parent_id'] = self.resolve_json_id(parent)
            imported, outcome = self.import_json(payload)
            self.json_to_db_id[scraped_id] = imported.id
            self.results[outcome] += 1

        return {self._type: self.results}
Ejemplo n.º 2
0
def test_cycles_simple():
    """Two nodes linked in both directions yield the cycle A -> B -> A."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    for source, target in (("A", "B"), ("B", "A")):
        graph.add_edge(source, target)

    expected = [("A", "B", "A")]
    assert chash(graph.cycles()) == chash(expected)
Ejemplo n.º 3
0
Archivo: base.py Proyecto: paultag/pupa
    def import_directory(self, datadir):
        """Import every ``<type>_*.json`` file found in ``datadir``.

        Each file is loaded, its ``_id`` is popped off, and the remaining
        payload is hashed with ``omnihash``. A payload whose hash was already
        seen is recorded in ``self.duplicates`` (duplicate id -> first id
        seen with that hash) and is not imported.

        The surviving payloads are fed into a ``Network`` so that an object
        with a ``parent_id`` is imported after its parent, then handed to
        ``self.import_json`` one at a time in that order.

        Returns a one-entry dict mapping ``self._type`` to ``self.results``.
        Raises ``Exception`` if the ordered plan does not cover exactly the
        set of unique payloads.
        """
        unique_by_id = {}       # json id -> payload (without '_id')
        first_id_by_hash = {}   # omnihash(payload) -> first json id seen

        # Load and de-duplicate every matching JSON file.
        pattern = os.path.join(datadir, self._type + '_*.json')
        for path in glob.glob(pattern):
            with open(path) as handle:
                payload = json.load(handle)
                scraped_id = payload.pop('_id')
                digest = omnihash(payload)
                if digest in first_id_by_hash:
                    # Same content as an earlier file: remember the alias.
                    self.duplicates[scraped_id] = first_id_by_hash[digest]
                    continue
                first_id_by_hash[digest] = scraped_id
                unique_by_id[scraped_id] = payload

        # Build the dependency graph: an edge parent -> child means the
        # parent has to be imported before the child.
        graph = Network()
        for scraped_id, payload in unique_by_id.items():
            parent = payload.get('parent_id', None)
            if not parent:
                # No dependency; it can be imported at any point.
                graph.add_node(scraped_id)
                continue
            graph.add_edge(parent, scraped_id)

        # Ask the graph for the import order and pair each id with its data.
        plan = [(node, unique_by_id[node]) for node in graph.sort()]
        planned_ids = {node for node, _ in plan}

        # Sanity check: the plan must cover every unique payload.
        if planned_ids != set(unique_by_id.keys()):
            raise Exception("import is missing nodes in network set")

        # Perform the import in dependency order.
        for scraped_id, payload in plan:
            parent = payload.get('parent_id', None)
            if parent:
                # Swap the scrape-time parent id for whatever
                # resolve_json_id returns; the ordering above is relied on
                # to have imported the parent already.
                payload['parent_id'] = self.resolve_json_id(parent)
            imported, outcome = self.import_json(payload)
            self.json_to_db_id[scraped_id] = imported.id
            self.results[outcome] += 1

        return {self._type: self.results}
Ejemplo n.º 4
0
def test_cyclic_graph_error_simple():
    """Sorting a two-node graph with A <-> B raises CyclicGraphError."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    for source, target in (("A", "B"), ("B", "A")):
        graph.add_edge(source, target)

    # sort() is consumed inside the block so the error is caught whether it
    # is raised eagerly or only once iteration starts.
    with pytest.raises(CyclicGraphError):
        list(graph.sort())
Ejemplo n.º 5
0
def test_dot_debug():
    """dot() renders a single A -> B edge as a one-line digraph string."""
    graph = Network()
    for name in ("A", "B"):
        graph.add_node(name)
    graph.add_edge("A", "B")

    rendered = graph.dot()
    expected = "digraph graphname {A -> B;}"
    assert rendered == expected
Ejemplo n.º 6
0
def test_sort_order_basic():
    """A linear chain A -> B -> C sorts to exactly that order."""
    graph = Network()
    for name in ("A", "B", "C"):
        graph.add_node(name)
    for source, target in (("A", "B"), ("B", "C")):
        graph.add_edge(source, target)

    ordering = list(graph.sort())
    assert ordering == ["A", "B", "C"]
Ejemplo n.º 7
0
def test_cyclic_graph_error_massive():
    """A twelve-node ring (A -> B -> ... -> L -> A) cannot be sorted."""
    graph = Network()

    # The trailing "A" closes the ring back onto the first node.
    ring = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "A"]
    for current, following in zip(ring, ring[1:]):
        graph.add_node(current)
        graph.add_edge(current, following)

    with pytest.raises(CyclicGraphError):
        list(graph.sort())
Ejemplo n.º 8
0
def test_link_before_nodes():
    """Edges may be declared before their nodes are explicitly added."""
    graph = Network()

    # Declare the chain A -> B -> C -> D first ...
    for source, target in (("A", "B"), ("B", "C"), ("C", "D")):
        graph.add_edge(source, target)

    # ... and only afterwards register the nodes themselves.
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)

    ordering = list(graph.sort())
    assert ordering == ["A", "B", "C", "D"]
Ejemplo n.º 9
0
def test_sort_order_double():
    """B depends on both A and C, so the expected order is A, C, B."""
    graph = Network()
    for name in ("A", "B", "C"):
        graph.add_node(name)

    # Edges:  A -> B,  A -> C,  C -> B
    for source, target in (("A", "B"), ("A", "C"), ("C", "B")):
        graph.add_edge(source, target)

    ordering = list(graph.sort())
    assert ordering == ["A", "C", "B"]
Ejemplo n.º 10
0
Archivo: base.py Proyecto: Vanuan/pupa
    def _order_imports(self, dicts):
        """Return ``(json_id, data)`` pairs in the order they should be imported.

        ``_id`` is popped from every dict in ``dicts`` (the input dicts are
        modified in place). Dicts whose remaining content has an ``omnihash``
        already seen are recorded in ``self.duplicates`` (duplicate id ->
        first id seen with that hash) and left out of the result.

        The remaining dicts are placed in a ``Network`` with an edge from
        each ``parent_id`` to its child, and the list follows the order
        yielded by ``Network.sort()``.
        """
        unique_by_id = {}       # json id -> data (without '_id')
        first_id_by_hash = {}   # omnihash(data) -> first json id seen

        # Index the incoming dicts by id, dropping content duplicates.
        for payload in dicts:
            scraped_id = payload.pop('_id')
            digest = omnihash(payload)
            if digest in first_id_by_hash:
                self.duplicates[scraped_id] = first_id_by_hash[digest]
            else:
                first_id_by_hash[digest] = scraped_id
                unique_by_id[scraped_id] = payload

        # Every object is a node; an object with a parent additionally gets
        # an edge parent -> child so the parent is ordered first.
        graph = Network()
        for scraped_id, payload in unique_by_id.items():
            graph.add_node(scraped_id)
            parent = payload.get('parent_id', None)
            if parent:
                graph.add_edge(parent, scraped_id)

        # Pair each id with its data, in the order the graph yields them.
        plan = [(node, unique_by_id[node]) for node in graph.sort()]
        planned_ids = {node for node, _ in plan}

        # Paranoid check that nothing was lost along the way.
        if planned_ids != set(unique_by_id.keys()):    # pragma: no cover
            raise Exception("import is missing nodes in network set")

        return plan
Ejemplo n.º 11
0
def test_internal_node_removal():
    """prune_node refuses an internal node unless remove_backrefs=True."""
    graph = Network()
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)

    # The extra A -> C edge keeps the final ordering deterministic.
    for source, target in (("A", "B"), ("B", "C"), ("C", "D"), ("A", "C")):
        graph.add_edge(source, target)

    # By default, pruning a node that sits inside the chain is rejected.
    with pytest.raises(ValueError):
        graph.prune_node("B")

    # Asking for back-references to be removed lets the prune go through.
    graph.prune_node("B", remove_backrefs=True)

    # "B" must no longer show up in the sorted output.
    remaining = list(graph.sort())
    assert remaining == ["A", "C", "D"]
Ejemplo n.º 12
0
def test_cycles_complex():
    """A four-node ring with extra back edges reports three cycles."""
    graph = Network()
    for name in ("A", "B", "C", "D"):
        graph.add_node(name)

    # Outer ring first, then the additional edges, in the same order the
    # edges were originally declared.
    ring_edges = [("A", "B"), ("B", "C"), ("C", "D"), ("D", "A")]
    extra_edges = [("D", "C"), ("C", "B"), ("B", "D")]
    for source, target in ring_edges + extra_edges:
        graph.add_edge(source, target)

    # Debugging aid: write graph.dot() to a file and render it to see the
    # graph this test builds.

    expected = [
        ('B', 'C', 'B'),
        ('C', 'D', 'C'),
        ('A', 'B', 'D', 'A'),
    ]
    assert chash(graph.cycles()) == chash(expected)
Ejemplo n.º 13
0
    def import_from_json(self, datadir):
        """Load ``<type>_*.json`` files from ``datadir`` and import them.

        Each file is decoded, given ``self.jurisdiction_id`` unless its
        ``_type`` is ``'person'``, passed through
        ``self.prepare_object_from_json`` and turned into a model object via
        ``self._model_class.from_dict``. Objects sharing the same ``_hash``
        are collapsed: ``self.duplicates`` is reset and then maps every
        later id to the first id seen with that hash.

        The de-duplicated objects are ordered with a ``Network`` so that an
        object with a ``parent_id`` follows its parent, and each one is then
        passed to ``self.import_object``.

        Returns a one-entry dict mapping ``self._type`` to ``self.results``.
        Raises ``ValueError`` if the ordered plan does not cover exactly the
        de-duplicated set of objects.
        """
        # Load every matching file into a model object, keyed by its json id.
        loaded = {}
        pattern = os.path.join(datadir, self._type + '_*.json')
        for path in glob.glob(pattern):
            with open(path) as handle:
                payload = json.load(handle)
                # Everything except a person is tagged with the jurisdiction.
                if payload['_type'] != 'person':
                    payload['jurisdiction_id'] = self.jurisdiction_id
                prepared = self.prepare_object_from_json(payload)
                # Convert dict => model class and index it by its own id.
                model = self._model_class.from_dict(prepared)
                loaded[model._id] = model

        # Group ids by object hash so repeats of the same object can be
        # pointed at the first occurrence.
        ids_by_hash = defaultdict(list)
        for scraped_id, model in loaded.items():
            ids_by_hash[_hash(model)].append(scraped_id)

        self.duplicates = {}
        for same_ids in ids_by_hash.values():
            canonical = same_ids[0]
            for repeat in same_ids[1:]:
                self.duplicates[repeat] = canonical

        # Only objects that are not duplicates take part in the import.
        pool = {}
        for scraped_id, model in loaded.items():
            if scraped_id not in self.duplicates:
                pool[scraped_id] = model

        # Record the import dependencies: an edge parent -> child means the
        # parent has to be imported before the child.
        graph = Network()
        for scraped_id, model in pool.items():
            parent = getattr(model, 'parent_id', None)
            if not parent:
                # No dependency; it can be imported at any point.
                graph.add_node(scraped_id)
                continue
            graph.add_edge(parent, scraped_id)

        # Pair each id with its object in the order the graph yields them,
        # and track which ids the plan actually covers.
        plan = [(node, pool[node]) for node in graph.sort()]
        scheduled = {node for node, _ in plan}

        if scheduled != set(pool.keys()):
            # Not expected to happen; fail loudly rather than silently skip
            # objects that should have been imported.
            raise ValueError("""Something went wrong internally with the
                                dependency resolution.""")

        for scraped_id, model in plan:
            parent = getattr(model, 'parent_id', None)
            if parent:
                # Swap the scrape-time parent id for whatever
                # resolve_json_id returns; the ordering above is relied on
                # to have imported the parent already.
                model.parent_id = self.resolve_json_id(parent)
            self.json_to_db_id[scraped_id] = self.import_object(model)

        return {self._type: self.results}
Ejemplo n.º 14
0
def test_sort_order_staged():
    """Check the head of the sort order for three linked groups of nodes.

    Builds three triangles (A1-A3, B1-B3, C1-C3) plus cross-group edges and
    asserts that the first four items yielded by ``sort()`` are "C1", "B1"
    and then "A1"/"B2" in either order.
    """
    network = Network()

    # Group A: A1 -> A2, A1 -> A3, A2 -> A3
    network.add_node("A1")
    network.add_node("A2")
    network.add_node("A3")

    network.add_edge("A1", "A2")
    network.add_edge("A1", "A3")
    network.add_edge("A2", "A3")

    # Group B: same triangle shape as group A.
    network.add_node("B1")
    network.add_node("B2")
    network.add_node("B3")

    network.add_edge("B1", "B2")
    network.add_edge("B1", "B3")
    network.add_edge("B2", "B3")

    # Cross-group edge: B1 comes before A1.
    network.add_edge("B1", "A1")

    # Group C: same triangle shape again.
    network.add_node("C1")
    network.add_node("C2")
    network.add_node("C3")

    network.add_edge("C1", "C2")
    network.add_edge("C1", "C3")
    network.add_edge("C2", "C3")

    # Cross-group edges: C1 comes before both A1 and B1.
    network.add_edge("C1", "A1")
    network.add_edge("C1", "B1")

    # The first two edges below repeat edges already added above; the last
    # two place A1 before C2 and C3.
    network.add_edge("C1", "B1")
    network.add_edge("B1", "A1")
    network.add_edge("A1", "C2")
    network.add_edge("A1", "C3")

    # Debugging aid (left disabled): dump the graph in dot format.
    # with open("/home/tag/debug.dot", 'w') as fd:
    #     fd.write(network.dot())

    sorted_order = list(network.sort())

    # Only the head of the ordering is pinned down here.
    assert sorted_order.pop(0) == "C1"
    assert sorted_order.pop(0) == "B1"
    assert sorted_order.pop(0) in ("A1", "B2")
    #                          ^^ This makes more sense after you dot debug it
    assert sorted_order.pop(0) in ("A1", "B2")