def compress(self):
    """Run the full compression pipeline and cache the results.

    Populates ``self.header_byts`` and ``self.node_byts`` and marks the
    instance initialized.
    """
    graph, collapsed, uf = self.pp.process()
    fwd, back, collapsed_nodes = self._construct_asymmetric_graphs(
        graph, collapsed, uf)

    # Delta-encode both edge directions; stats carry the bit widths
    # needed to store degrees and deltas per direction.
    delta_fwd, fwd_stats = self._delta_encode_graph(fwd, collapsed_nodes)
    delta_back, back_stats = self._delta_encode_graph(back, collapsed_nodes)

    # Size of each union-find component, keyed by numeric node id.
    sizes = {self.pp.id2num(leader): uf.get_size(leader)
             for leader in uf.leaders()}
    nbits_size_entry = nbits_for_int(max(sizes.values()))

    header_byts = self._compress_header(fwd_stats, back_stats,
                                        nbits_size_entry)
    node_byts, index = self._compress_nodes(delta_fwd, fwd_stats,
                                            delta_back, back_stats,
                                            collapsed_nodes)

    nbits_index_entry = nbits_for_int(max(index))
    header_byts.extend(nbits_index_entry.to_bytes(1, byteorder="big"))
    header_byts.extend(len(index).to_bytes(4, byteorder="big"))

    # Interleave (index entry, component size) pairs; sizes are emitted
    # in ascending numeric-id order.
    wbs = WriterBitString()
    ordered_sizes = (sizes[num] for num in sorted(sizes))
    for entry, size in zip(index, ordered_sizes):
        wbs.write_int(entry, nbits_index_entry)
        wbs.write_int(size, nbits_size_entry)
    header_byts.extend(wbs.to_bytearray())

    self.header_byts = header_byts
    self.node_byts = node_byts
    self.is_initialized = True
def compress(self):
    """Compress the preprocessed graph into ``self.compressed``.

    Layout: per-direction (degree, delta) bit widths, index-entry width,
    index length, packed index, then the compressed node data.
    """
    self.compressed = bytearray()

    # Gather bit-width stats for the forward (False) and transposed
    # (True) graphs, writing each pair of widths into the header.
    info = {}
    for transpose in (False, True):
        max_degree = max(abs(d)
                         for d in self.pp.get_degrees(transpose=transpose))
        max_delta = max(abs(d)
                        for d in self.pp.get_deltas(transpose=transpose))
        nbits_degree = util.nbits_for_int(max_degree)
        nbits_delta = util.nbits_for_int(max_delta)
        info[transpose] = (nbits_degree, nbits_delta,
                           self.pp.get_graph(transpose=transpose))
        self.compressed.extend(nbits_degree.to_bytes(1, byteorder="big"))
        self.compressed.extend(nbits_delta.to_bytes(1, byteorder="big"))

    compressed_nodes, index = self.compress_nodes(info[False], info[True])

    nbits_index_entry = util.nbits_for_int(max(index))
    self.compressed.extend(nbits_index_entry.to_bytes(1, byteorder="big"))
    self.compressed.extend(len(index).to_bytes(4, byteorder="big"))

    # Pack the node index with fixed-width entries, then append the
    # compressed node payload.
    bs = util.WriterBitString()
    for entry in index:
        bs.write_int(entry, width=nbits_index_entry)
    self.compressed.extend(bs.to_bytearray())
    self.compressed.extend(compressed_nodes)
def __init__(self, pp):
    """Initialize encoder state from a preprocessor instance ``pp``."""
    self.metadata = pp.get_metadata()
    self.iti = pp.get_id2num_map()
    self.id_bits = util.nbits_for_int(pp.get_graph().get_node_count())
    self.encoded_json_bits = ''
    self.default_node_data = {}
    self.default_relation_data = {}
    self.default_time = []

    def bit_table(strings, width):
        # Map each string to a fixed-width bit pattern by enumeration.
        return {s: util.int2bitstr(i, width)
                for i, s in enumerate(strings)}

    self.keys_dict = bit_table(Encoder.key_strings, Encoder.key_bits)
    self.vals_dict = bit_table(Encoder.val_strings, Encoder.val_bits)
    self.typs_dict = bit_table(Encoder.typ_strings, Encoder.typ_bits)
    self.labels_dict = bit_table(Encoder.prov_label_strings,
                                 Encoder.label_bits)
    self.common_strs_dict = self.construct_common_strs_dict()
def _delta_encode_graph(self, graph, collapsed_nodes):
    """Return ``(delta_graph, stats)`` for *graph*.

    Each vertex's outgoing edges are replaced by deltas: the first delta
    is relative to the vertex itself, subsequent ones to the previous
    (sorted) neighbor. ``stats`` maps collapsed-membership (True/False)
    to the maximum bit widths needed for degrees and absolute deltas in
    that group.
    """
    encoded = Graph()
    stats = {True: {"nbits_degree": 1, "nbits_delta": 1},
             False: {"nbits_degree": 1, "nbits_delta": 1}}

    for vertex in graph.get_vertices():
        encoded.add_vertex(vertex)
        neighbors = graph.get_outgoing_edges(vertex)
        if not neighbors:
            continue

        group = stats[vertex in collapsed_nodes]
        group["nbits_degree"] = max(group["nbits_degree"],
                                    nbits_for_int(len(neighbors)))

        last = vertex
        for neighbor in sorted(neighbors):
            encoded.add_edge(vertex, neighbor - last)
            last = neighbor

        largest = max(abs(d) for d in encoded.get_outgoing_edges(vertex))
        group["nbits_delta"] = max(group["nbits_delta"],
                                   nbits_for_int(largest))

    return (encoded, stats)
def construct_identifier_ids(self):
    '''
    Adds the identifiers for relations to the identifier-to-id
    dictionary and writes this to file. The first 32 bits of the file
    represent the number of nodes.

    Returns the identifier-to-id dictionary; cached on self.ids after
    the first call.
    '''
    if self.ids:
        return self.ids
    if not self.rankings:
        self.rank()
    node_bits = util.nbits_for_int(len(self.rankings))
    self.ids = self.rankings.copy()

    # (head key, tail key) for each supported relation type. A lookup
    # table replaces the old if/elif chain, which silently reused stale
    # head/tail values when a relation type matched none of the
    # branches.
    endpoint_keys = {
        'used': ("prov:entity", "prov:activity"),
        'wasGeneratedBy': ("prov:activity", "prov:entity"),
        'wasDerivedFrom': ("prov:usedEntity", "prov:generatedEntity"),
        'wasInformedBy': ("prov:informant", "prov:informed"),
        'relation': ("prov:sender", "prov:receiver"),
    }

    for identifier, metadata in self.metadata.items():
        # put IDs for relation identifiers in the dictionary
        if metadata.typ not in pj.RELATION_TYPS:
            continue
        try:
            head_key, tail_key = endpoint_keys[metadata.typ]
        except KeyError:
            # Fail loudly instead of corrupting the id map.
            raise ValueError(
                "unhandled relation type: %s" % metadata.typ) from None
        head = metadata.data[head_key]
        tail = metadata.data[tail_key]
        # add number of nodes to make sure we don't overlap with actual
        # node IDs
        self.ids[identifier] = ((self.rankings[head] << node_bits)
                                + self.rankings[tail] + len(self.g))

    sorted_idents = sorted(self.ids.keys(), key=lambda v: self.ids[v])
    # Single atomic open (the old code truncated in 'wb' and then
    # reopened in 'a'); file contents are byte-identical.
    with open(pj.PATH + "/identifiers.txt", 'wb') as f:
        f.write(len(self.rankings).to_bytes(4, byteorder='big'))
        for i in sorted_idents:
            f.write((i + ",").encode())
    return self.ids
def _number_identifiers(self, graph, collapsed, uf):
    """Assign numeric ids to all identifiers and persist them.

    Nodes are ranked by a transpose-BFS; each relation identifier is
    packed as (head_id << node_bits) + tail_id + node_count so relation
    ids never collide with node ids. Writes "identifiers.txt" (4-byte
    big-endian node count, then comma-separated identifiers in id
    order) and populates self.id_map / self.id_map_rev.
    """
    sizes = {x: uf.get_size(x) for x in uf.leaders()}
    ranker = TransposeBfsRanker(graph, collapsed, self.metadata, sizes)
    id_map = ranker.rank()
    node_count = len(id_map)
    node_bits = nbits_for_int(node_count)

    for _id, entry in self.metadata.items():
        if entry.typ in RELATION_TYPS:
            head = entry.data[RELATION_TYPS[entry.typ][0]]
            tail = entry.data[RELATION_TYPS[entry.typ][1]]
            # offset by node_count so relation ids don't overlap node ids
            id_map[_id] = ((id_map[head] << node_bits)
                           + id_map[tail] + node_count)

    sorted_ids = sorted(id_map.keys(), key=lambda v: id_map[v])
    # Single atomic open (previously truncated in 'wb' and reopened in
    # 'a'); output bytes are identical.
    with open("identifiers.txt", 'wb') as f:
        f.write(node_count.to_bytes(4, byteorder='big'))
        for _id in sorted_ids:
            f.write((_id + ",").encode())

    self.id_map = id_map
    self.id_map_rev = {v: k for (k, v) in id_map.items()}
class Encoder():
    """Encodes provenance-graph metadata into a compact bit-level form.

    NOTE(review): the ``*_strings`` collections are sets, so the integer
    code enumerate() assigns each string depends on set iteration order,
    which varies across interpreter runs (hash randomization). The
    encoding dictionaries are written out by write_to_file(), so a
    decoder must read those files rather than re-derive the codes —
    confirm this is intended before relying on cross-run stability.
    """

    MAX_STRING_SIZE_BITS = 10
    # marker key for values expressed relative to another node
    RELATIVE_NODE = '@'

    typ_strings = {
        'prefix', 'activity', 'relation', 'entity', 'agent', 'message',
        'used', 'wasGeneratedBy', 'wasInformedBy', 'wasDerivedFrom',
        'unknown'
    }

    key_strings = {
        # cf: keys
        'cf:id', 'cf:boot_id', 'cf:machine_id', 'cf:date', 'cf:taint',
        'cf:type', 'cf:version', 'cf:allowed', 'cf:sender',
        'cf:receiver', 'cf:jiffies', 'cf:offset', 'cf:hasParent',
        'cf:uid', 'cf:uuid', 'cf:gid', 'cf:pid', 'cf:vpid', 'cf:mode',
        'cf:sock_type', 'cf:family', 'cf:seq', 'cf:protocol',
        'cf:message', 'cf:address', 'cf:pathname', 'cf:camflow',
        'cf:machine', 'cf:sysname', 'cf:nodename', 'cf:release',
        # prov: keys
        'prov:label', 'prov:entity', 'prov:activity', 'prov:informant',
        'prov:informed', 'prov:usedEntity', 'prov:generatedEntity',
        'prov:type',
        # additional keys for relative versions
        RELATIVE_NODE
    }

    # (prov:label is the key, and have a value following)
    prov_label_strings = {
        '[address]', '[path]', '[TODO]', '[task]', '[unknown]',
        '[block special]', '[char special]', '[directory]', '[fifo]',
        '[link]', '[file]', '[socket]', '[mmaped_file]'
    }

    # Duplicate literals from the original ('version', 'link', 'ifc'
    # each listed twice) removed — the set value is unchanged.
    val_strings = {
        # Booleans
        'false', 'true',
        # Edge labels
        'unknown', 'read', 'write', 'create', 'pass', 'change',
        'mmap_write', 'attach', 'associate', 'bind', 'connect',
        'listen', 'accept', 'open', 'parent', 'version', 'link',
        'named', 'ifc', 'exec', 'clone', 'search', 'mmap_read',
        'mmap_exec', 'send', 'receive', 'perm_read', 'perm_write',
        'perm_exec',
        # Node types
        'string', 'relation', 'task', 'inode_unknown', 'file',
        'directory', 'char', 'block', 'fifo', 'socket', 'msg', 'shm',
        'sock', 'address', 'sb', 'file_name', 'disc_entity',
        'disc_activity', 'disc_agent', 'disc_node', 'packet',
        'mmaped_file',
        # For prov
        'cf:file'
    }

    key_bits = util.nbits_for_int(len(key_strings))
    label_bits = 8
    typ_bits = util.nbits_for_int(len(typ_strings))
    val_bits = util.nbits_for_int(len(val_strings))

    # fixed bit widths for each date component
    date_bits = {
        "year": 12,
        "month": 4,
        "day": 5,
        "hour": 5,
        "minute": 6,
        "sec": 6,
    }
    date_types = ["year", "month", "day", "hour", "minute", "sec"]
    date_type_bits = util.nbits_for_int(len(date_types))

    # number of common strings we will map to dictionary
    strdict_threshold = 300
    strdict_bits = util.nbits_for_int(strdict_threshold)

    def __init__(self, pp):
        """Initialize encoder state from a preprocessor instance."""
        self.metadata = pp.get_metadata()
        self.iti = pp.get_id2num_map()
        self.id_bits = util.nbits_for_int(
            pp.get_graph().get_node_count())
        self.encoded_json_bits = ''
        self.default_node_data = {}
        self.default_relation_data = {}
        self.default_time = []
        self.keys_dict = {
            elt: util.int2bitstr(i, Encoder.key_bits)
            for (i, elt) in enumerate(Encoder.key_strings)
        }
        self.vals_dict = {
            elt: util.int2bitstr(i, Encoder.val_bits)
            for (i, elt) in enumerate(Encoder.val_strings)
        }
        self.typs_dict = {
            elt: util.int2bitstr(i, Encoder.typ_bits)
            for (i, elt) in enumerate(Encoder.typ_strings)
        }
        self.labels_dict = {
            elt: util.int2bitstr(i, Encoder.label_bits)
            for (i, elt) in enumerate(Encoder.prov_label_strings)
        }
        self.common_strs_dict = self.construct_common_strs_dict()

    def construct_common_strs_dict(self):
        """Map the most frequent metadata string values to fixed-width
        bitstrings (at most ``strdict_threshold`` entries).

        Date values and relation endpoint identifiers are excluded,
        since they are encoded specially elsewhere.
        """
        # (head key, tail key) per relation type. A lookup table
        # replaces the old if/elif chain, which silently reused stale
        # head/tail values when a type matched none of the branches.
        endpoint_keys = {
            'used': ("prov:entity", "prov:activity"),
            'wasGeneratedBy': ("prov:activity", "prov:entity"),
            'wasDerivedFrom': ("prov:usedEntity",
                               "prov:generatedEntity"),
            'wasInformedBy': ("prov:informant", "prov:informed"),
            'relation': ("prov:sender", "prov:receiver"),
        }
        strval_counter = Counter()
        for identifier, metadata in self.metadata.items():
            typ = metadata.typ
            data = metadata.data
            strval_counter.update(str(v) for v in data.values())
            if 'cf:date' in data:
                # Counter.__delitem__ is a no-op for absent keys.
                del strval_counter[data['cf:date']]
            if typ in RELATION_TYPS:
                try:
                    head_key, tail_key = endpoint_keys[typ]
                except KeyError:
                    # Fail loudly instead of deleting stale entries.
                    raise ValueError(
                        "unhandled relation type: %s" % typ) from None
                del strval_counter[data[head_key]]
                del strval_counter[data[tail_key]]
        return {
            val: util.int2bitstr(i, Encoder.strdict_bits)
            for i, (val, _) in enumerate(
                strval_counter.most_common(Encoder.strdict_threshold))
        }

    def write_to_file(self, outfile):
        """Write the encoded bits to *outfile* and dump the encoding
        dictionaries / common-string tables next to PATH."""
        with open(outfile, 'wb') as f:
            bitstring.BitArray(bin=self.encoded_json_bits).tofile(f)
        # Persist the code dictionaries so a decoder can reverse the
        # (run-dependent) string-to-code assignment.
        with open(PATH + "/prov_data_dicts.txt", 'w') as f:
            f.write(str(self.keys_dict))
            f.write(str(self.vals_dict))
            f.write(str(self.labels_dict))
            f.write(str(self.typs_dict))
        keys = ''
        values = ''
        for key, val in self.common_strs_dict.items():
            values += val
            keys += str(key) + ","
        with open(PATH + "/common_strs.txt", 'w') as f:
            f.write(keys)
        with open(PATH + "/common_strs.bin", 'bw') as f:
            bitstring.BitArray(bin=values).tofile(f)