Esempio n. 1
0
def restore_pid2node(filename):
    """Convert a textual PID->int map read from stdin into its binary form.

    Each stdin line must consist of exactly two whitespace-separated fields:
    the PID followed by its integer. Records are written to ``filename`` in
    the order they are read; the file is created or overwritten.

    """
    with open(filename, "wb") as out:
        for raw in sys.stdin:
            fields = raw.split()
            # exactly two fields are expected; anything else raises ValueError
            pid, node = fields
            PidToNodeMap.write_record(out, pid, int(node))
Esempio n. 2
0
    def setUpClass(cls):
        """Build the on-disk PID->int map shared by the tests of this class.

        A fresh temporary directory is created and a binary map holding the
        10000 records produced by gen_records() is written into it.

        """
        cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.")
        cls.fname = os.path.join(cls.tmpdir, "pid2int.bin")
        with open(cls.fname, "wb") as dst:
            for record in gen_records(length=10000):
                pid, node = record
                PidToNodeMap.write_record(dst, pid, node)
Esempio n. 3
0
    def test_update(self):
        """Check that values written through a read-write map can be read back.

        Works on a private copy of the fixture map so that the shared file
        used by the other tests is never modified.

        """
        fname2 = self.fname + ".update"
        shutil.copy(self.fname, fname2)  # fresh map copy
        map2 = PidToNodeMap(fname2, mode="rb+")
        try:
            for (pid, node) in islice(map2, 11):  # update the first N items
                new_node = node + 42
                map2[pid] = new_node
                self.assertEqual(map2[pid], new_node)  # check updated value
        finally:
            # release the file handle before removing the copy, and do both
            # even when an assertion above fails
            map2.close()
            os.unlink(fname2)
Esempio n. 4
0
def write(ctx, map_type, filename):
    """Write a map to disk sequentially.

    read from stdin a textual PID->node mapping (for pid2node, or a simple
    sequence of PIDs for node2pid) and write it to disk in the requested binary
    map format

    note that no sorting is applied, so the input should already be sorted as
    required by the chosen map type (by PID for pid2node, by int for node2pid)

    ``ctx`` is accepted for interface compatibility and is not used here.

    Raises ValueError if ``map_type`` is neither "pid2node" nor "node2pid"; in
    that case ``filename`` is left untouched.

    """
    # Validate before opening: mode "wb" would create/truncate the destination
    # even though nothing valid can be written for an unknown map type.
    if map_type not in ("pid2node", "node2pid"):
        raise ValueError("invalid map type: " + map_type)

    with open(filename, "wb") as f:
        if map_type == "pid2node":
            for line in sys.stdin:
                # each line is "<PID> <int>"
                (pid, int_str) = line.rstrip().split(maxsplit=1)
                PidToNodeMap.write_record(f, pid, int(int_str))
        else:  # node2pid: one PID per line
            for line in sys.stdin:
                pid = line.rstrip()
                NodeToPidMap.write_record(f, pid)
Esempio n. 5
0
def map_lookup(graph, identifiers):
    """Lookup identifiers using on-disk maps.

    Depending on the identifier type lookup either a PID into a PID->node (and
    return the node integer identifier) or, vice-versa, lookup a node integer
    identifier into a node->PID (and return the PID).  The desired behavior is
    chosen depending on the syntax of each given identifier.

    Identifiers can be passed either directly on the command line or on
    standard input, separated by blanks. Logical lines (as returned by
    readline()) in stdin will be preserved in stdout.

    Identifiers that are syntactically invalid or missing from the maps are
    reported via logging.error() and omitted from the output; if any
    identifier failed, the process exits with status 1 (0 otherwise).

    """
    success = True  # no identifiers failed to be looked up
    pid2node = PidToNodeMap(f"{graph}.{PID2NODE_EXT}")
    node2pid = NodeToPidMap(f"{graph}.{NODE2PID_EXT}")

    def lookup(identifier):
        """Return the lookup result for identifier, or None if it failed."""
        nonlocal success
        is_pid = None
        try:
            int(identifier)
            is_pid = False
        except ValueError:
            try:
                parse_persistent_identifier(identifier)
                is_pid = True
            except swh.model.exceptions.ValidationError:
                success = False
                logging.error(f'invalid identifier: "{identifier}", skipping')
                # Stop here: falling through would call int() on a value
                # already known not to be an integer and crash.
                return None

        try:
            if is_pid:
                return str(pid2node[identifier])
            else:
                return node2pid[int(identifier)]
        except KeyError:
            success = False
            logging.error(f'identifier not found: "{identifier}", skipping')
            return None

    if identifiers:  # lookup identifiers passed via CLI
        for identifier in identifiers:
            result = lookup(identifier)
            if result is not None:  # failures were already logged
                print(result)
    else:  # lookup identifiers passed via stdin, preserving logical lines
        for line in sys.stdin:
            # drop failed lookups (None) so that join() only sees strings
            results = [
                result
                for result in map(lookup, line.split())
                if result is not None
            ]
            if results:  # might be empty if all IDs on the same line failed
                print(" ".join(results))

    sys.exit(0 if success else 1)
Esempio n. 6
0
 def __enter__(self):
     """Start the Java gateway, load the graph and open both on-disk maps.

     The gateway is launched with the JVM options and classpath taken from
     ``self.config``. Returns ``self`` so the object can be bound by a
     ``with`` statement.
     """
     jvm_options = self.config["java_tool_options"].split()
     jvm_classpath = self.config["classpath"]
     self.gateway = JavaGateway.launch_gateway(
         java_path=None,
         javaopts=jvm_options,
         classpath=jvm_classpath,
         die_on_exit=True,
         redirect_stdout=sys.stdout,
         redirect_stderr=_get_pipe_stderr(),
     )

     # Java-side entry point, used to load the graph stored at graph_path
     entry = self.gateway.jvm.org.softwareheritage.graph.Entry()
     self.entry = entry
     self.entry.load_graph(self.graph_path)

     # the map files live next to the graph and differ only by extension
     map_prefix = self.graph_path + "."
     self.node2pid = NodeToPidMap(map_prefix + NODE2PID_EXT)
     self.pid2node = PidToNodeMap(map_prefix + PID2NODE_EXT)

     self.stream_proxy = JavaStreamProxy(self.entry)
     return self
Esempio n. 7
0
 def setUp(self):
     """Open the map stored at ``self.fname`` and expose it as ``self.map``."""
     map_path = self.fname
     self.map = PidToNodeMap(map_path)
Esempio n. 8
0
class TestPidToNodeMap(unittest.TestCase):
    """Tests for PidToNodeMap, run against a generated on-disk map."""

    @classmethod
    def setUpClass(cls):
        """create reasonably sized (10000 records) PID->int map to test
        on-disk DB

        """
        cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.")
        cls.fname = os.path.join(cls.tmpdir, "pid2int.bin")
        with open(cls.fname, "wb") as f:
            for (pid, i) in gen_records(length=10000):
                PidToNodeMap.write_record(f, pid, i)

    @classmethod
    def tearDownClass(cls):
        """remove the temporary directory holding the generated map"""
        shutil.rmtree(cls.tmpdir)

    def setUp(self):
        """open the shared map before each test"""
        self.map = PidToNodeMap(self.fname)

    def tearDown(self):
        """close the map opened by setUp()"""
        self.map.close()

    def test_lookup(self):
        """every known (PID, position) pair must be found in the map"""
        for (pid, pos) in MAP_PAIRS:
            self.assertEqual(self.map[pid], pos)

    def test_missing(self):
        """looking up PIDs absent from the map must raise KeyError"""
        with self.assertRaises(KeyError):
            self.map["swh:1:ori:0101010100000000000000000000000000000000"]
        with self.assertRaises(KeyError):
            self.map["swh:1:cnt:0101010100000000000000000000000000000000"]

    def test_type_error(self):
        """non-string keys must be rejected with TypeError"""
        with self.assertRaises(TypeError):
            self.map[42]
        with self.assertRaises(TypeError):
            self.map[1.2]

    def test_update(self):
        """values written through a read-write map can be read back

        Works on a private copy of the fixture map so that the shared file
        used by the other tests is never modified.

        """
        fname2 = self.fname + ".update"
        shutil.copy(self.fname, fname2)  # fresh map copy
        map2 = PidToNodeMap(fname2, mode="rb+")
        try:
            for (pid, node) in islice(map2, 11):  # update the first N items
                new_node = node + 42
                map2[pid] = new_node
                self.assertEqual(map2[pid], new_node)  # check updated value
        finally:
            # release the file handle before removing the copy, and do both
            # even when an assertion above fails
            map2.close()
            os.unlink(fname2)

    def test_iter_type(self):
        """iter_type() yields consecutive records of the requested PID type"""
        for t in PID_TYPES:
            first_20 = list(islice(self.map.iter_type(t), 20))
            k = first_20[0][1]  # integer of the first record of this type
            expected = [("swh:1:{}:{:040x}".format(t, i), i)
                        for i in range(k, k + 20)]
            self.assertEqual(first_20, expected)

    def test_iter_prefix(self):
        """iter_prefix() yields consecutive records sharing the PID prefix"""
        for t in PID_TYPES:
            prefix = self.map.iter_prefix("swh:1:{}:00".format(t))
            first_20 = list(islice(prefix, 20))
            k = first_20[0][1]  # integer of the first matching record
            expected = [("swh:1:{}:{:040x}".format(t, i), i)
                        for i in range(k, k + 20)]
            self.assertEqual(first_20, expected)
Esempio n. 9
0
def dump_pid2node(filename):
    """Print the PID->node map stored in filename to stdout.

    One record per line, formatted as the PID, a TAB, then its integer, in
    the order the map yields them.

    """
    pid2node = PidToNodeMap(filename)
    try:
        for (pid, node) in pid2node:
            print("{}\t{}".format(pid, node))
    finally:
        # release the underlying file even if printing is interrupted
        pid2node.close()