def compress(self):
        (graph, collapsed, uf) = self.pp.process()
        (afwd, aback, collapsed_nodes) = self._construct_asymmetric_graphs(
            graph, collapsed, uf)
        (delta_fwd,
         fwd_stats) = self._delta_encode_graph(afwd, collapsed_nodes)
        (delta_back,
         back_stats) = self._delta_encode_graph(aback, collapsed_nodes)
        sizes = {self.pp.id2num(k): uf.get_size(k) for k in uf.leaders()}
        nbits_size_entry = nbits_for_int(max(sizes.values()))

        header_byts = self._compress_header(fwd_stats, back_stats,
                                            nbits_size_entry)
        (node_byts, index) = self._compress_nodes(delta_fwd, fwd_stats,
                                                  delta_back, back_stats,
                                                  collapsed_nodes)

        nbits_index_entry = nbits_for_int(max(index))
        header_byts.extend(nbits_index_entry.to_bytes(1, byteorder="big"))
        header_byts.extend(len(index).to_bytes(4, byteorder="big"))
        wbs = WriterBitString()
        for (idx, sz) in zip(index, [sizes[x] for x in sorted(sizes.keys())]):
            wbs.write_int(idx, nbits_index_entry)
            wbs.write_int(sz, nbits_size_entry)
        header_byts.extend(wbs.to_bytearray())

        self.header_byts = header_byts
        self.node_byts = node_byts

        self.is_initialized = True
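
# The compress() path above sizes every field with nbits_for_int (called via
# util in the variant below). A minimal sketch of the assumed contract, the
# bit width needed to represent a non-negative integer; the real util helper
# may treat edge cases such as n == 0 differently:
def nbits_for_int(n):
    # int.bit_length() returns 0 for 0, so clamp to at least one bit.
    return max(1, int(n).bit_length())
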
    def compress(self):
        # Write header
        self.compressed = bytearray()
        info = {}
        for t in (False, True):
            degrees = [abs(x) for x in self.pp.get_degrees(transpose=t)]
            deltas = [abs(x) for x in self.pp.get_deltas(transpose=t)]
            nbits_degree = util.nbits_for_int(max(degrees))
            nbits_delta = util.nbits_for_int(max(deltas))
            graph = self.pp.get_graph(transpose=t)
            info[t] = (nbits_degree, nbits_delta, graph)
            self.compressed.extend(nbits_degree.to_bytes(1, byteorder="big"))
            self.compressed.extend(nbits_delta.to_bytes(1, byteorder="big"))

        compressed_nodes, index = self.compress_nodes(info[False], info[True])

        nbits_index_entry = util.nbits_for_int(max(index))
        self.compressed.extend(nbits_index_entry.to_bytes(1, byteorder="big"))
        self.compressed.extend(len(index).to_bytes(4, byteorder="big"))
        bs = util.WriterBitString()
        for i in index:
            bs.write_int(i, width=nbits_index_entry)
        self.compressed.extend(bs.to_bytearray())
        self.compressed.extend(compressed_nodes)
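
# Both compress() variants pack fixed-width integers with a WriterBitString.
# A minimal sketch of that interface, assuming MSB-first bit order and zero
# padding of the final byte (the real util class may differ):
class WriterBitString:
    def __init__(self):
        self.bits = []

    def write_int(self, value, width):
        # Emit `value` as exactly `width` bits, most significant bit first.
        self.bits.extend((value >> i) & 1 for i in range(width - 1, -1, -1))

    def to_bytearray(self):
        out = bytearray()
        for i in range(0, len(self.bits), 8):
            chunk = self.bits[i:i + 8]
            chunk += [0] * (8 - len(chunk))  # zero-pad the trailing byte
            byte = 0
            for bit in chunk:
                byte = (byte << 1) | bit
            out.append(byte)
        return out
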
    def _delta_encode_graph(self, graph, collapsed_nodes):
        '''
        Rewrites each adjacency list as deltas between consecutive sorted
        neighbors (the first delta is relative to the node itself) and
        tracks the bit widths needed for degrees and deltas, kept
        separately for collapsed and non-collapsed nodes.
        '''
        delta_graph = Graph()
        stats = {}
        for x in [True, False]:
            stats[x] = {"nbits_degree": 1, "nbits_delta": 1}
        for node in graph.get_vertices():
            delta_graph.add_vertex(node)
            edges = graph.get_outgoing_edges(node)
            if not edges:
                continue
            # Bucket the stats by whether this node was collapsed.
            c = node in collapsed_nodes
            stats[c]["nbits_degree"] = max(stats[c]["nbits_degree"],
                                           nbits_for_int(len(edges)))
            # Store each neighbor as its offset from the previous one,
            # seeded with the node id itself.
            prev = node
            for edge in sorted(edges):
                delta_graph.add_edge(node, edge - prev)
                prev = edge
            # Only the first delta can be negative, so size by magnitude.
            abs_deltas = [abs(x) for x in delta_graph.get_outgoing_edges(node)]
            stats[c]["nbits_delta"] = max(stats[c]["nbits_delta"],
                                          nbits_for_int(max(abs_deltas)))
        return (delta_graph, stats)
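    # Worked example of the delta encoding above: for node 5 with sorted
    # neighbors [3, 7, 8, 15] the stored deltas are [-2, 4, 1, 7], since
    # 3-5 = -2, 7-3 = 4, 8-7 = 1, 15-8 = 7. Decoding is a running sum
    # seeded with the node id: 5-2 = 3, 3+4 = 7, 7+1 = 8, 8+7 = 15. Only
    # the first delta can be negative, which is why the stats track
    # abs(delta) when sizing the per-edge width.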
    def construct_identifier_ids(self):
        '''
        Adds the identifiers for relations to the identifier-to-id
        dictionary and writes the result to file. The first 32 bits of
        the file hold the number of nodes.
        '''
        if self.ids:
            return self.ids
        if not self.rankings:
            self.rank()
        node_bits = util.nbits_for_int(len(self.rankings))
        self.ids = self.rankings.copy()

        for identifier, metadata in self.metadata.items():
            # put IDs for relation identifiers in the dictionary
            if metadata.typ in pj.RELATION_TYPS:
                if metadata.typ == 'used':
                    head = metadata.data["prov:entity"]
                    tail = metadata.data["prov:activity"]
                elif metadata.typ == 'wasGeneratedBy':
                    head = metadata.data["prov:activity"]
                    tail = metadata.data["prov:entity"]
                elif metadata.typ == 'wasDerivedFrom':
                    head = metadata.data["prov:usedEntity"]
                    tail = metadata.data["prov:generatedEntity"]
                elif metadata.typ == 'wasInformedBy':
                    head = metadata.data["prov:informant"]
                    tail = metadata.data["prov:informed"]
                elif metadata.typ == 'relation':
                    head = metadata.data["prov:sender"]
                    tail = metadata.data["prov:receiver"]
                # add number of nodes to make sure we don't overlap with actual node IDs
                self.ids[identifier] = ((self.rankings[head] << node_bits) +
                                        self.rankings[tail] + len(self.g))
        sorted_idents = sorted(self.ids.keys(), key=lambda v: self.ids[v])
        with open(pj.PATH + "/identifiers.txt", 'wb') as f:
            f.write(len(self.rankings).to_bytes(4, byteorder='big'))
        with open(pj.PATH + "/identifiers.txt", 'a') as f:
            for i in sorted_idents:
                f.write(i + ",")
        return self.ids
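    # Numeric example of the relation-id packing above: with 10 nodes,
    # node_bits = nbits_for_int(10) = 4, so a relation whose head ranks 3
    # and tail ranks 6 gets (3 << 4) + 6 + 10 = 64. Adding the node count
    # keeps relation ids disjoint from the node ids 0..9.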
    def _number_identifiers(self, graph, collapsed, uf):
        sizes = {x: uf.get_size(x) for x in uf.leaders()}
        ranker = TransposeBfsRanker(graph, collapsed, self.metadata, sizes)
        id_map = ranker.rank()

        node_count = len(id_map)
        node_bits = nbits_for_int(node_count)
        for _id, entry in self.metadata.items():
            if entry.typ in RELATION_TYPS:
                head = entry.data[RELATION_TYPS[entry.typ][0]]
                tail = entry.data[RELATION_TYPS[entry.typ][1]]
                id_map[_id] = ((id_map[head] << node_bits) + id_map[tail] +
                               node_count)
        sorted_ids = sorted(id_map.keys(), key=lambda v: id_map[v])
        with open("identifiers.txt", 'wb') as f:
            f.write(node_count.to_bytes(4, byteorder='big'))
        with open("identifiers.txt", 'a') as f:
            for _id in sorted_ids:
                f.write(_id + ",")
        self.id_map = id_map
        self.id_map_rev = {v: k for (k, v) in id_map.items()}
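
# _number_identifiers condenses the per-type branches spelled out in
# construct_identifier_ids into a RELATION_TYPS lookup of
# (head key, tail key) pairs. A sketch of that table, reconstructed from
# those branches (an assumption, not copied from the original source):
RELATION_TYPS = {
    'used': ('prov:entity', 'prov:activity'),
    'wasGeneratedBy': ('prov:activity', 'prov:entity'),
    'wasDerivedFrom': ('prov:usedEntity', 'prov:generatedEntity'),
    'wasInformedBy': ('prov:informant', 'prov:informed'),
    'relation': ('prov:sender', 'prov:receiver'),
}
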
class Encoder():
    MAX_STRING_SIZE_BITS = 10
    RELATIVE_NODE = '@'
    typ_strings = {
        'prefix', 'activity', 'relation', 'entity', 'agent', 'message', 'used',
        'wasGeneratedBy', 'wasInformedBy', 'wasDerivedFrom', 'unknown'
    }
    key_strings = {
        # cf: keys
        'cf:id',
        'cf:boot_id',
        'cf:machine_id',
        'cf:date',
        'cf:taint',
        'cf:type',
        'cf:version',
        'cf:allowed',
        'cf:sender',
        'cf:receiver',
        'cf:jiffies',
        'cf:offset',
        'cf:hasParent',
        'cf:uid',
        'cf:uuid',
        'cf:gid',
        'cf:pid',
        'cf:vpid',
        'cf:mode',
        'cf:sock_type',
        'cf:family',
        'cf:seq',
        'cf:protocol',
        'cf:message',
        'cf:address',
        'cf:pathname',
        'cf:camflow',
        'cf:machine',
        'cf:sysname',
        'cf:nodename',
        'cf:release',
        # prov: keys
        'prov:label',
        'prov:entity',
        'prov:activity',
        'prov:informant',
        'prov:informed',
        'prov:usedEntity',
        'prov:generatedEntity',
        'prov:type',
        # additional keys for relative versions
        RELATIVE_NODE
    }
    # (values that follow the 'prov:label' key)
    prov_label_strings = {
        '[address]', '[path]', '[TODO]', '[task]', '[unknown]',
        '[block special]', '[char special]', '[directory]', '[fifo]', '[link]',
        '[file]', '[socket]', '[mmaped_file]'
    }
    val_strings = {
        # Booleans
        'false',
        'true',
        # Edge labels
        'unknown',
        'read',
        'write',
        'create',
        'pass',
        'change',
        'mmap_write',
        'attach',
        'associate',
        'bind',
        'connect',
        'listen',
        'accept',
        'open',
        'parent',
        'version',
        'link',
        'named',
        'ifc',
        'exec',
        'clone',
        'search',
        'mmap_read',
        'mmap_exec',
        'send',
        'receive',
        'perm_read',
        'perm_write',
        'perm_exec',
        # Node types
        'string',
        'relation',
        'task',
        'inode_unknown',
        'link',
        'file',
        'directory',
        'char',
        'block',
        'fifo',
        'socket',
        'msg',
        'shm',
        'sock',
        'address',
        'sb',
        'file_name',
        'ifc',
        'disc_entity',
        'disc_activity',
        'disc_agent',
        'disc_node',
        'packet',
        'mmaped_file',
        # For prov
        'cf:file'
    }
    key_bits = util.nbits_for_int(len(key_strings))
    label_bits = 8
    typ_bits = util.nbits_for_int(len(typ_strings))
    val_bits = util.nbits_for_int(len(val_strings))
    date_bits = {
        "year": 12,
        "month": 4,
        "day": 5,
        "hour": 5,
        "minute": 6,
        "sec": 6,
    }
    date_types = ["year", "month", "day", "hour", "minute", "sec"]
    date_type_bits = util.nbits_for_int(len(date_types))

    # number of common strings we will map to dictionary
    strdict_threshold = 300
    strdict_bits = util.nbits_for_int(strdict_threshold)
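
    # The date_bits widths above total 38 bits per timestamp
    # (12 + 4 + 5 + 5 + 6 + 6). A hypothetical sketch of packing one
    # datetime in date_types order; not part of the original encoder:
    @staticmethod
    def _encode_date_sketch(dt):
        fields = [dt.year, dt.month, dt.day, dt.hour, dt.minute, dt.second]
        return ''.join(
            format(value, '0{}b'.format(Encoder.date_bits[name]))
            for (name, value) in zip(Encoder.date_types, fields))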

    def __init__(self, pp):
        self.metadata = pp.get_metadata()
        self.iti = pp.get_id2num_map()
        self.id_bits = util.nbits_for_int(pp.get_graph().get_node_count())
        self.encoded_json_bits = ''

        self.default_node_data = {}
        self.default_relation_data = {}
        self.default_time = []

        self.keys_dict = {
            elt: util.int2bitstr(i, Encoder.key_bits)
            for (i, elt) in enumerate(Encoder.key_strings)
        }
        self.vals_dict = {
            elt: util.int2bitstr(i, Encoder.val_bits)
            for (i, elt) in enumerate(Encoder.val_strings)
        }
        self.typs_dict = {
            elt: util.int2bitstr(i, Encoder.typ_bits)
            for (i, elt) in enumerate(Encoder.typ_strings)
        }
        self.labels_dict = {
            elt: util.int2bitstr(i, Encoder.label_bits)
            for (i, elt) in enumerate(Encoder.prov_label_strings)
        }

        self.common_strs_dict = self.construct_common_strs_dict()

    def construct_common_strs_dict(self):
        strval_counter = Counter()
        for identifier, metadata in self.metadata.items():
            typ = metadata.typ
            data = metadata.data

            values = [str(v) for v in data.values()]
            c = Counter(values)
            strval_counter.update(c)
            # Dates are dropped from the candidate pool; they are encoded
            # separately via date_bits.
            if 'cf:date' in data:
                del strval_counter[data['cf:date']]

            if typ in RELATION_TYPS:
                if typ == 'used':
                    head = data["prov:entity"]
                    tail = data["prov:activity"]
                elif typ == 'wasGeneratedBy':
                    head = data["prov:activity"]
                    tail = data["prov:entity"]
                elif typ == 'wasDerivedFrom':
                    head = data["prov:usedEntity"]
                    tail = data["prov:generatedEntity"]
                elif typ == 'wasInformedBy':
                    head = data["prov:informant"]
                    tail = data["prov:informed"]
                elif typ == 'relation':
                    head = data["prov:sender"]
                    tail = data["prov:receiver"]
                # Relation endpoints are dropped too; they are rewritten
                # as numeric ids rather than stored as strings.
                del strval_counter[head]
                del strval_counter[tail]

        return {
            val: util.int2bitstr(i, Encoder.strdict_bits)
            for i, (val, _) in enumerate(
                strval_counter.most_common(Encoder.strdict_threshold))
        }
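
    # With strdict_threshold = 300, strdict_bits = nbits_for_int(300) = 9,
    # so each of the 300 most common value strings gets a 9-bit code (0 for
    # the most common, 1 for the next, and so on); dates and relation
    # endpoints never claim a slot because they were removed from the
    # counter above.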

    def write_to_file(self, outfile):
        with open(outfile, 'wb') as f:
            bitstring.BitArray(bin=self.encoded_json_bits).tofile(f)

        with open(PATH + "/prov_data_dicts.txt", 'w') as f:
            f.write(str(self.keys_dict))
            f.write(str(self.vals_dict))
            f.write(str(self.labels_dict))
            f.write(str(self.typs_dict))

        keys = ''
        values = ''
        for key, val in self.common_strs_dict.items():
            values += val
            keys += str(key) + ","
        with open(PATH + "/common_strs.txt", 'w') as f:
            f.write(keys)
        with open(PATH + "/common_strs.bin", 'bw') as f:
            bitstring.BitArray(bin=values).tofile(f)
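
# A sketch of reading the common-string dictionary back, assuming the
# writer above: keys are comma-separated in common_strs.txt and the codes
# are fixed-width strdict_bits entries in common_strs.bin (hypothetical
# reader; tofile() pads the last byte with zero bits, which indexing by
# slice ignores):
def read_common_strs(path):
    with open(path + "/common_strs.txt") as f:
        keys = [k for k in f.read().split(",") if k]
    with open(path + "/common_strs.bin", 'rb') as f:
        bits = bitstring.BitArray(bytes=f.read()).bin
    w = Encoder.strdict_bits
    return {keys[i]: bits[i * w:(i + 1) * w] for i in range(len(keys))}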