def setUpClass(cls):
    """Build the shared int->PID map file used to test the on-disk DB.

    A temporary directory is created and remembered as ``cls.tmpdir``; the
    map itself is written to ``cls.fname`` inside it, one record per PID
    produced by ``gen_records(length=10000)`` (reasonably sized, ~1 MB).
    """
    workdir = tempfile.mkdtemp(prefix="swh.graph.test.")
    cls.tmpdir = workdir
    cls.fname = os.path.join(workdir, "int2pid.bin")
    with open(cls.fname, "wb") as out:
        for record in gen_records(length=10000):
            # only the PID is stored; the second item of the pair is unused
            pid, _pos = record
            NodeToPidMap.write_record(out, pid)
def restore_node2pid(filename, length):
    """Read a textual int->PID map from stdin and write its binary version.

    Each stdin line must hold exactly two whitespace-separated fields: the
    node integer and its PID. Every pair is stored at the given integer
    position of a ``NodeToPidMap`` opened in ``"wb"`` mode on ``filename``.

    Args:
        filename: path of the binary map to write
        length: forwarded as the ``length`` argument of ``NodeToPidMap``

    Raises:
        ValueError: if a line does not split into exactly two fields, or if
            its first field is not an integer

    """
    node2pid = NodeToPidMap(filename, mode="wb", length=length)
    try:
        for line in sys.stdin:
            (str_int, str_pid) = line.split()
            node2pid[int(str_int)] = str_pid
    finally:
        # close the map even when a malformed input line aborts the loop
        node2pid.close()
def test_update(self):
    """Check that records of a map opened in "rb+" mode can be overwritten.

    The test works on a private copy of the shared map file, so that the
    map used by the other tests is left untouched.
    """
    fname2 = self.fname + ".update"
    shutil.copy(self.fname, fname2)  # fresh map copy
    map2 = NodeToPidMap(fname2, mode="rb+")
    try:
        for (node, pid) in islice(map2, 11):  # update the first N items
            new_pid = pid.replace(":0", ":f")  # turn each ":0" into ":f"
            map2[node] = new_pid
            self.assertEqual(map2[node], new_pid)  # check updated value
    finally:
        # release the file handle and drop the copy even if an assertion
        # failed half-way through
        map2.close()
        os.unlink(fname2)
class TestNodeToPidMap(unittest.TestCase):
    """Tests for NodeToPidMap, run against a map file generated on disk."""

    @classmethod
    def setUpClass(cls):
        """create reasonably sized (~1 MB) int->PID map to test on-disk DB

        """
        cls.tmpdir = tempfile.mkdtemp(prefix="swh.graph.test.")
        cls.fname = os.path.join(cls.tmpdir, "int2pid.bin")
        with open(cls.fname, "wb") as f:
            for (pid, _i) in gen_records(length=10000):
                NodeToPidMap.write_record(f, pid)

    @classmethod
    def tearDownClass(cls):
        """Remove the temporary directory together with the map file."""
        shutil.rmtree(cls.tmpdir)

    def setUp(self):
        """Open the shared map file before each test."""
        self.map = NodeToPidMap(self.fname)

    def tearDown(self):
        """Close the map opened by setUp."""
        self.map.close()

    def test_lookup(self):
        """Every (PID, position) pair of MAP_PAIRS is found in the map."""
        for (pid, pos) in MAP_PAIRS:
            self.assertEqual(self.map[pos], pid)

    def test_out_of_bounds(self):
        """Positions far outside the map raise IndexError."""
        with self.assertRaises(IndexError):
            self.map[1000000]
        with self.assertRaises(IndexError):
            self.map[-1000000]

    def test_update(self):
        """Records of a map opened in "rb+" mode can be overwritten.

        The test works on a private copy of the shared map file, so that
        the map used by the other tests is left untouched.
        """
        fname2 = self.fname + ".update"
        shutil.copy(self.fname, fname2)  # fresh map copy
        map2 = NodeToPidMap(fname2, mode="rb+")
        try:
            for (node, pid) in islice(map2, 11):  # update the first N items
                new_pid = pid.replace(":0", ":f")  # turn each ":0" into ":f"
                map2[node] = new_pid
                self.assertEqual(map2[node], new_pid)  # check updated value
        finally:
            # release the file handle and drop the copy even if an
            # assertion failed half-way through
            map2.close()
            os.unlink(fname2)
def write(ctx, map_type, filename):
    """Write a map to disk sequentially.

    read from stdin a textual PID->node mapping (for pid2node, or a simple
    sequence of PIDs for node2pid) and write it to disk in the requested binary
    map format

    note that no sorting is applied, so the input should already be sorted as
    required by the chosen map type (by PID for pid2node, by int for node2pid)

    Args:
        ctx: not used by this function
        map_type: either "pid2node" or "node2pid"
        filename: path of the binary map file to (over)write

    Raises:
        ValueError: if map_type is not one of the supported map types; in
            that case filename is left untouched

    """
    # validate before opening: "wb" truncates, so opening first would wipe an
    # existing map file just to fail on a bad map type
    if map_type not in ("pid2node", "node2pid"):
        raise ValueError("invalid map type: " + map_type)

    with open(filename, "wb") as f:
        if map_type == "pid2node":
            for line in sys.stdin:
                (pid, int_str) = line.rstrip().split(maxsplit=1)
                PidToNodeMap.write_record(f, pid, int(int_str))
        else:  # node2pid
            for line in sys.stdin:
                pid = line.rstrip()
                NodeToPidMap.write_record(f, pid)
def map_lookup(graph, identifiers):
    """Lookup identifiers using on-disk maps.

    Depending on the identifier type lookup either a PID into a PID->node (and
    return the node integer identifier) or, vice-versa, lookup a node integer
    identifier into a node->PID (and return the PID).  The desired behavior is
    chosen depending on the syntax of each given identifier.

    Identifiers can be passed either directly on the command line or on
    standard input, separate by blanks. Logical lines (as returned by
    readline()) in stdin will be preserved in stdout.

    Identifiers that are invalid or cannot be found are reported with
    logging.error and skipped in the output. The process always terminates
    via sys.exit: with status 0 if every lookup succeeded, 1 otherwise.

    """
    success = True  # no identifiers failed to be looked up
    pid2node = PidToNodeMap(f"{graph}.{PID2NODE_EXT}")
    node2pid = NodeToPidMap(f"{graph}.{NODE2PID_EXT}")

    def lookup(identifier):
        """Return the lookup result for identifier, or None on failure."""
        nonlocal success

        # anything that parses as an integer is a node id, the rest must be
        # a valid PID
        try:
            node = int(identifier)
            is_pid = False
        except ValueError:
            try:
                parse_persistent_identifier(identifier)
            except swh.model.exceptions.ValidationError:
                success = False
                logging.error(f'invalid identifier: "{identifier}", skipping')
                return None  # do not attempt a lookup with a bogus identifier
            is_pid = True

        try:
            if is_pid:
                return str(pid2node[identifier])
            return node2pid[node]
        except (KeyError, IndexError):
            # node->PID lookups signal out-of-range positions with IndexError
            success = False
            logging.error(f'identifier not found: "{identifier}", skipping')
            return None

    if identifiers:  # lookup identifiers passed via CLI
        for identifier in identifiers:
            result = lookup(identifier)
            if result is not None:
                print(result)
    else:  # lookup identifiers passed via stdin, preserving logical lines
        for line in sys.stdin:
            results = [
                result
                for result in map(lookup, line.rstrip().split())
                if result is not None
            ]
            if results:  # might be empty if all IDs on the same line failed
                print(" ".join(results))

    sys.exit(0 if success else 1)
def __enter__(self):
    """Launch the Java gateway, load the graph and open the on-disk maps.

    The gateway is started with the Java options and classpath found in
    ``self.config``. The graph at ``self.graph_path`` is then loaded through
    a Java ``Entry`` object, and the node->PID and PID->node maps sitting
    next to the graph are opened.

    Returns:
        this very object, with ``gateway``, ``entry``, ``node2pid``,
        ``pid2node`` and ``stream_proxy`` set

    """
    launch_options = dict(
        java_path=None,
        javaopts=self.config["java_tool_options"].split(),
        classpath=self.config["classpath"],
        die_on_exit=True,
        redirect_stdout=sys.stdout,
        redirect_stderr=_get_pipe_stderr(),
    )
    self.gateway = JavaGateway.launch_gateway(**launch_options)

    graph_entry = self.gateway.jvm.org.softwareheritage.graph.Entry()
    self.entry = graph_entry
    graph_entry.load_graph(self.graph_path)

    # both maps share the graph path as basename, differing by extension
    basename = self.graph_path
    self.node2pid = NodeToPidMap(basename + "." + NODE2PID_EXT)
    self.pid2node = PidToNodeMap(basename + "." + PID2NODE_EXT)

    self.stream_proxy = JavaStreamProxy(graph_entry)
    return self
def setUp(self):
    """Open the map stored at ``self.fname`` and keep it as ``self.map``."""
    map_path = self.fname
    self.map = NodeToPidMap(map_path)
def dump_node2pid(filename):
    """Print the content of a node->PID map to stdout.

    One line is printed per (integer, PID) pair obtained by iterating over
    the map, with the two fields separated by a tab character.

    Args:
        filename: path of the binary map to dump

    """
    node2pid = NodeToPidMap(filename)
    try:
        for (node, pid) in node2pid:
            print("{}\t{}".format(node, pid))
    finally:
        # release the underlying file even if printing fails (e.g. on a
        # closed stdout)
        node2pid.close()