Example #1
def batch_samples(args, samplefile, batch_size):
    """Batch samples and return batches in a generator."""
    batch = ([], [])
    count = 0
    index_of = None
    if not args.php:
        index_of = lambda x: NODE_MAP[x]
    else:
        index_of = lambda x: PHP_NODE_MAP[x]

    f = open(args.infile, 'rb')
    samples = ijson.items(f, 'item')
    for sample in samples:
        if sample['parent'] is not None:
            '''
            Each 'sample' holds the node's node_type, the parent's node_type, and an
            array of child node_types. The child refs are ignored; only the ids for
            the node and parent node_types are appended, in parallel.
            '''
            batch[0].append(index_of(sample['node']))
            batch[1].append(index_of(sample['parent']))
            count += 1
            if count >= batch_size:
                # yields two lists, like a 256 x 2 array, of ints (1-163?)
                yield batch
                batch, count = ([], []), 0
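A minimal sketch of how this generator might be consumed (note that the samplefile parameter is unused in the body above; the input path comes from args.infile). The args namespace, the file name, and the batch size here are assumptions for illustration only:

import argparse

args = argparse.Namespace(php=False, infile='samples.json')  # hypothetical args
for node_ids, parent_ids in batch_samples(args, None, batch_size=256):
    # node_ids[i] and parent_ids[i] are the NODE_MAP indices of a node and its parent
    assert len(node_ids) == len(parent_ids) == 256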
Example #2
def gen_samples_ijson(infile, labels, vectors, vector_lookup):
    """Creates a generator that returns a tree in BFS order with each node
    replaced by its vector embedding, and a child lookup table."""

    # encode labels as one-hot vectors
    label_lookup = {
        label: _onehot(i, len(labels))
        for i, label in enumerate(labels)
    }

    f = open(infile, 'rb')
    trees = ijson.items(f, 'item')

    for tree in trees:
        # one entry per tree node: the feature-weight vector (from the vectorizer) for that node's type
        nodes = []
        # one entry per tree node: the indices (into nodes) of that node's children
        children = []
        label = label_lookup[tree['label']]

        queue = [(tree['tree'], -1)]
        while queue:
            node, parent_ind = queue.pop(0)
            node_ind = len(nodes)
            # add children and the parent index to the queue
            queue.extend([(child, node_ind) for child in node['children']])
            # create a list to store this node's children indices
            children.append([])
            # add this child to its parent's child list
            if parent_ind > -1:
                children[parent_ind].append(node_ind)
            # look up this node's type in vector_lookup to find its index into the
            # vectorized features, then store its feature-weight vector
            nodes.append(vectors[vector_lookup[node['node']]])

        yield (nodes, children, tree['meta'], label)
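A sketch of consuming the generator, assuming labels, vectors, and vector_lookup were produced elsewhere (for example by the project's vectorizer); the file name is hypothetical:

for nodes, children, meta, label in gen_samples_ijson('trees.json', labels,
                                                      vectors, vector_lookup):
    # nodes[i] is the embedding of tree node i (BFS order),
    # children[i] lists the indices (into nodes) of node i's children,
    # and label is the one-hot class vector for this tree
    root_vector, root_children = nodes[0], children[0]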
Example #3
def JSONIterator(buffer):
    """Creates a ijson iterator over all items,
    then automatically closes the provided buffer.
    """
    try:
        yield from ijson.items(buffer, "item")
    finally:
        buffer.close()
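Usage is simply to hand the generator an open binary file; the buffer is closed in the finally clause once iteration finishes or the generator is closed. The file name and per-item handler below are hypothetical:

for item in JSONIterator(open('data.json', 'rb')):
    process(item)  # hypothetical per-item handler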
Example #4
def load_content_from_json_file(file_path):
    """Loads content from the file.

    By passing ``file_path`` parameter, the file is opened
    and the content from the file is extracted.

    :param str file_path: Path to the JSON file to read.

    :returns: The file content in a list
    :rtype: `list`
    """
    with open(file_path, 'rb') as file:
        items_generator = ijson.items(file, '')
        list_of_items = list(items_generator)

        return list_of_items
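Because the prefix passed to ijson.items is the empty string, the whole top-level JSON value is yielded as a single item, so the returned list normally has exactly one element, the entire document. A sketch with a hypothetical file name:

content = load_content_from_json_file('config.json')
document = content[0]  # the complete parsed JSON document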
Example #5
def parse(args):
    """Parse trees with the given arguments."""
    if args.ijson:
        print('Loading json file')

        sys.setrecursionlimit(1000000)
        f = open(args.infile, 'r')
        data_source = ijson.items(f, 'item')

        print('Json file load finished')

        train_samples = []
        cv_samples = []
        test_samples = []

        train_counts = defaultdict(int)
        cv_counts = defaultdict(int)
        test_counts = defaultdict(int)

        f1 = open("/tmp/cv.txt", 'w')
        f2 = open("/tmp/test.txt", 'w')
        f3 = open("/tmp/train.txt", 'w')
        '''
        Traverses the tree breadth-first, reformatting and flattening it.
        The resulting nodes contain only nodeType information, in a 'node'
        field plus a 'children' array. Parent links are not packaged, but the
        label and metadata accompanying the sample are kept.
        Samples whose node count is too large or too small are pruned; the
        rest are assigned to a random group - training, testing, or
        cross-validation.
        '''
        for item in data_source:
            root = item['tree']
            label = item['metadata'][args.label_key]
            sample, size = _traverse_tree(root)

            if size > args.maxsize or size < args.minsize:
                continue

            roll = random.randint(0, 100)
            if args.usecv and roll < args.cv:
                file_handler = f1
                cv_counts[label] += 1
            elif roll < args.test:
                file_handler = f2
                test_counts[label] += 1
            else:
                file_handler = f3
                train_counts[label] += 1
            datum = {
                'tree': sample,
                'label': label,
                'meta': json.loads(json.dumps(item['metadata']))
            }
            file_handler.write(json.dumps(datum) + ",\n")
        f1.close()
        f2.close()
        f3.close()

        # implement shuffling algorithm?
        for filelabel in ['train', 'test', 'cv']:
            tmpfile = "/tmp/" + filelabel
            with open(tmpfile + ".txt") as inhandle, \
                    open(tmpfile + ".shuffled.txt", "w") as fout:
                p = subprocess.Popen("terashuf", stdin=inhandle, stdout=fout)
                output, error = p.communicate()
            print(tmpfile + ".shuffled.txt")

        f1o = open(args.outfile + ".cv.json", 'w')
        f2o = open(args.outfile + ".test.json", 'w')
        f3o = open(args.outfile + ".train.json", 'w')
        out_dict = {'cv': f1o, 'test': f2o, 'train': f3o}
        f1o.write('[\n')
        f2o.write('[\n')
        f3o.write('[\n')
        #        labels = list(set(cv_counts.keys() + train_counts.keys() + test_counts.keys()))
        labels = list(
            set(
                itertools.chain(cv_counts.keys(), train_counts.keys(),
                                test_counts.keys())))
        print(labels)
        print('Dumping sample')
        with open(args.outfile, 'w') as out_handler:
            out_handler.write('(\t[\n')
            for filelabel in ['train', 'test', 'cv']:
                c = 0
                tmpfile = "/tmp/" + filelabel + '.shuffled.txt'
                with open(tmpfile, 'r') as in_handler:
                    for line in in_handler:
                        linestr = line.rstrip().rstrip(',')
                        if c:
                            out_handler.write(",\n" + linestr)
                            out_dict[filelabel].write(",\n" + linestr)
                        else:
                            out_handler.write(linestr)
                            out_dict[filelabel].write(linestr)
                        c += 1
                out_handler.write('\n],\t[\n')
                out_dict[filelabel].write('\n]')
                out_dict[filelabel].close()
            out_handler.write('],\n')
            out_handler.write(json.dumps(labels))
            out_handler.write('\n)')

        f4o = open(args.outfile + ".labels.json", 'w')
        f4o.write(json.dumps(labels))
        f4o.close()

        print('dump finished')
        print('Sampled tree counts: ')
        print('Cross-Validation:', cv_counts)
        print('Training:', train_counts)
        print('Testing:', test_counts)

        return
    """Parse trees with the given arguments."""
    print('Loading json file')

    sys.setrecursionlimit(1000000)
    with open(args.infile, 'r') as file_handler:
        #        data_source = pickle.load(file_handler)
        data_source = json.load(file_handler)

    print('Json file load finished')

    train_samples = []
    cv_samples = []
    test_samples = []

    train_counts = defaultdict(int)
    cv_counts = defaultdict(int)
    test_counts = defaultdict(int)

    for item in data_source:
        '''
        Traverses the tree breadth-first, reformatting and flattening it.
        The resulting nodes contain only nodeType information, in a 'node'
        field plus a 'children' array. Parent links are not packaged, but the
        label and metadata accompanying the sample are kept.
        Samples whose node count is too large or too small are pruned; the
        rest are assigned to a random group - training, testing, or
        cross-validation.
        '''
        root = item['tree']
        label = item['metadata'][args.label_key]
        sample, size = _traverse_tree(root)

        if size > args.maxsize or size < args.minsize:
            continue

        roll = random.randint(0, 100)

        datum = {
            'tree': sample,
            'label': label,
            'meta': json.loads(json.dumps(item['metadata']))
        }

        if args.usecv and roll < args.cv:
            cv_samples.append(datum)
            cv_counts[label] += 1
        elif roll < args.test:
            test_samples.append(datum)
            test_counts[label] += 1
        else:
            train_samples.append(datum)
            train_counts[label] += 1

    random.shuffle(cv_samples)
    random.shuffle(train_samples)
    random.shuffle(test_samples)

    # create a list of unique labels in the data
    labellist = []
    labellist.extend(cv_counts.keys())
    labellist.extend(train_counts.keys())
    labellist.extend(test_counts.keys())
    labels = list(set(labellist))

    print('Dumping sample')
    with open(args.outfile, 'wb') as file_handler:
        pickle.dump((train_samples, test_samples, cv_samples, labels),
                    file_handler)
    print('dump finished')
    print('Sampled tree counts: ')
    print('Cross-Validation:', cv_counts)
    print('Training:', train_counts)
    print('Testing:', test_counts)
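The comment blocks above describe _traverse_tree only by its effect: a breadth-first pass that rebuilds the tree keeping just nodeType information in a 'node' field and a 'children' array, and that also returns the node count. A minimal sketch matching that description, not the project's actual implementation, and assuming the input nodes expose a 'nodeType' key and a 'children' list:

def _traverse_tree_sketch(root):
    # breadth-first copy of the tree keeping only type information
    new_root = {'node': root['nodeType'], 'children': []}
    queue = [(root, new_root)]
    size = 0
    while queue:
        old, new = queue.pop(0)
        size += 1
        for child in old.get('children', []):
            new_child = {'node': child['nodeType'], 'children': []}
            new['children'].append(new_child)
            queue.append((child, new_child))
    return new_root, size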
Example #6
        parser = JsonSlicer(gen, (b'level1', b'level2', None),
                            path_mode='full',
                            binary=True)
        for n, (*path, item) in enumerate(parser):
            assert (item[b'id'] == n)

    with TestCase('**JsonSlicer (full paths, unicode output)**', 'str',
                  args.json_size, results):
        gen = io.StringIO(jsondata)
        parser = JsonSlicer(gen, ('level1', 'level2', None), path_mode='full')
        for n, (*path, item) in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.yajl2_c', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2_c.items(gen, b'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.yajl2_cffi', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2_cffi.items(gen, b'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)

    with TestCase('ijson.yajl2', 'bytes', args.json_size, results):
        gen = io.BytesIO(jsondata.encode('utf-8'))
        parser = ijson_yajl2.items(gen, 'level1.level2.item')
        for n, item in enumerate(parser):
            assert (item['id'] == n)
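The backend aliases used in this benchmark are not shown being imported; they presumably correspond to ijson's selectable backends and to the jsonslicer package, along the lines of:

from jsonslicer import JsonSlicer
import ijson.backends.yajl2_c as ijson_yajl2_c
import ijson.backends.yajl2_cffi as ijson_yajl2_cffi
import ijson.backends.yajl2 as ijson_yajl2

Example #7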
    def updateTopology(cls, source: str, info: dict, nodes: dict):
        cls.config = info
        config.load_kube_config()
        configuration.assert_hostname = False
        k8sClient = client.ApiClient()
        loader = YAML(typ='safe')
        files = cls._listFiles(source)
        newNodes = []
        try:
            #Search ingress controller already deployed
            cls.controllers = cls._searchIngressControllers()
            for controller in cls.controllers:
                newNode = cls._addControllerToTopology(controller, nodes)
                if newNode:
                    newNodes.append(newNode)
                else:
                    cls.controllers.remove(controller)
            #Deployment of the application
            print('   Deploying the application...')
            i = 0
            for k8sFile in files:
                yamls = re.split('^---\n',
                                 cls._readFile(k8sFile),
                                 flags=re.MULTILINE)
                for contentStr in yamls:
                    contentDict = loader.load(contentStr)
                    if not contentDict:
                        continue
                    cls._prepareYaml(contentStr, contentDict)
                    with open(
                            join(cls.config['modDeploymentFiles'],
                                 str(i) + '.yml'), 'w') as f:
                        try:
                            f.write(yaml.dump(contentDict))
                            utils.create_from_dict(k8sClient, contentDict)
                        except utils.FailToCreateError:
                            cls._cleanEnvironment()
                            raise DeploymentError('Error deploying ' + k8sFile)
                    i = i + 1

            #Wait until the deployment is completed
            v1 = client.CoreV1Api()
            deploymentCompleted = False

            while not deploymentCompleted:
                pods = v1.list_pod_for_all_namespaces(watch=False)
                deploymentCompleted = True
                for pod in pods.items:
                    if pod.spec.hostname in nodes:
                        if pod.status.phase != 'Running' and pod.status.phase != 'Succeeded':
                            deploymentCompleted = False
                            break
                        for containerStatus in pod.status.container_statuses:
                            if not containerStatus.ready:
                                deploymentCompleted = False
                                break
                    if not deploymentCompleted:
                        break
                if not deploymentCompleted:
                    time.sleep(3)
            print('   Deployment completed')

            #Start monitoring
            print('   Monitoring in progress...')
            pods = v1.list_pod_for_all_namespaces(watch=False)
            containerName = ''.join(c
                                    for c in cls.config['monitoringContainer']
                                    if c.isalnum())
            for pod in pods.items:
                if pod.spec.hostname in nodes or (
                        pod.metadata.annotations
                        and 'archMinerName' in pod.metadata.annotations
                        and pod.metadata.annotations['archMinerName']
                        in nodes) and pod.status.phase == 'Running':
                    fileName = pod.spec.hostname if pod.spec.hostname in nodes else pod.metadata.annotations[
                        'archMinerName']
                    filePath = join('/home/dump', fileName + '.json')
                    command = [
                        './bin/sh', '-c',
                        'tshark -i eth0 -a duration:' + str(info['time'] + 3) +
                        ' -N nNdt -T json > ' + filePath + ' 2>/dev/null &'
                    ]
                    try:
                        resp = stream(v1.connect_get_namespaced_pod_exec,
                                      pod.metadata.name,
                                      pod.metadata.namespace,
                                      command=command,
                                      container=containerName,
                                      stderr=False,
                                      stdin=False,
                                      stdout=True,
                                      tty=False)
                    except ApiException as e:
                        cls._cleanEnvironment()
                        raise MonitoringError(pod.metadata.name)

            #Start tests
            time.sleep(3)
            if info['test']:
                try:
                    testModule = importlib.import_module(info['test'])
                    testModule.runTest()
                except:
                    cls._cleanEnvironment()
                    raise TestError('')

            #Wait until monitoring is finished
            time.sleep(info['time'] + 5)
            print('   Monitoring completed')

            #Save on local host the packets
            pods = v1.list_pod_for_all_namespaces(watch=False)
            for pod in pods.items:
                if pod.spec.hostname in nodes or (
                        pod.metadata.annotations
                        and 'archMinerName' in pod.metadata.annotations
                        and pod.metadata.annotations['archMinerName']
                        in nodes) and pod.status.phase == 'Running':
                    fileName = pod.spec.hostname if pod.spec.hostname in nodes else pod.metadata.annotations[
                        'archMinerName']
                    remoteFilePath = join('home/dump', fileName + '.json')
                    localFilePath = join(cls.config['monitoringFiles'],
                                         fileName + '.json')
                    os.system('kubectl cp -c ' + containerName + ' ' +
                              pod.metadata.namespace + '/' +
                              pod.metadata.name + ':' + remoteFilePath + ' ' +
                              localFilePath)

            #Create edges
            print('   Analyzing packets...')
            try:
                files = cls._listFiles(cls.config['monitoringFiles'])
            except WrongFolderError:
                cls._cleanEnvironment()
                raise
            for monitoringFilePath in files:
                if os.path.getsize(monitoringFilePath) == 0:
                    continue
                srcNodeName = monitoringFilePath.split('/')[-1].replace(
                    '.json', '')
                with open(monitoringFilePath, 'rb') as monitoringFile:
                    for packet in ijson.items(monitoringFile, 'item'):
                        if cls._isOutgoingPacket(packet, nodes, srcNodeName):
                            cls._createEdge(packet, nodes, srcNodeName)

            #Create communications
            commFactory = ConcreteCommunicationFactory()
            for monitoringFilePath in files:
                if os.path.getsize(monitoringFilePath) == 0:
                    continue
                srcNodeName = monitoringFilePath.split('/')[-1].replace(
                    '.json', '')
                with open(monitoringFilePath, 'rb') as monitoringFile:
                    for packet in ijson.items(monitoringFile, 'item'):
                        if cls._isOutgoingPacket(packet, nodes, srcNodeName):
                            cls._createCommunication(packet, nodes,
                                                     commFactory, srcNodeName)

            for newNode in newNodes:
                edges = nodes[newNode['controller']].getEdges(
                    Direction.OUTGOING)
                if not edges:
                    nodes.pop(newNode['controller'], None)
                    for service in newNode['services']:
                        nodes.pop(service, None)

        finally:
            cls._cleanEnvironment()
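The tshark dumps read above can be large, which is presumably why they are streamed with ijson.items(monitoringFile, 'item') rather than loaded with json.load. A minimal sketch of that streaming pattern, assuming tshark's -T json layout in which each packet keeps its protocol layers under _source.layers (exact field names vary with the tshark version); the helper name and path are hypothetical:

import ijson

def iter_packet_layers(dump_path):
    # yield each packet's layer dictionary without loading the whole capture
    with open(dump_path, 'rb') as dump:
        for packet in ijson.items(dump, 'item'):
            yield packet.get('_source', {}).get('layers', {})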
Example #8
def parse_nodes(args, node_counts, samples, has_capacity, can_add_more):
    if args.fjson:
        #        f = open(args.infile, 'r')
        #        data_source = ijson.items(f,'item')
        fc = 0
        c = 0
        file_handler = open(args.outfile, 'w')
        file_handler.write("[\t")
        for (dirpath, dirnames, filenames) in os.walk(args.infile):
            for filepath in [
                    os.path.join(dirpath, file) for file in filenames
            ]:
                print(filepath)
                try:
                    with open(filepath, 'r') as file_reader:
                        data_source = json.load(file_reader)
                except json.decoder.JSONDecodeError:
                    continue
                if data_source is None:
                    continue
                for item in data_source:
                    samples = []

                    result = parse_node(item, samples, node_counts,
                                        has_capacity, can_add_more)
                    if result is False:
                        continue
                    if not can_add_more:
                        break
                    for sample in samples:
                        if c:
                            file_handler.write(",\n" +
                                               json.dumps(sample, indent=2))
                        else:
                            file_handler.write(json.dumps(sample, indent=2))
                        c += 1
    #            print(fc, len(new_samples), item['metadata']['name'])
            fc += 1
        file_handler.write("]")
        file_handler.close()

    elif args.ijson:
        f = open(args.infile, 'r')
        data_source = ijson.items(f, 'item')
        fc = 0
        c = 0
        file_handler = open(args.outfile, 'w')
        file_handler.write("[\t")
        for item in data_source:
            '''
            Reads the array of objects one at a time, each object containing:
                'metadata.label' (benign or malicious)
                'tree' (array of node trees)
            Only the 'tree' data is used, the label is ignored
            '''
            samples = []
            result = parse_node(item, samples, node_counts, has_capacity,
                                can_add_more)
            if result is None:
                continue
            if not can_add_more:
                break
            for sample in samples:
                if c:
                    file_handler.write(",\n" + json.dumps(sample, indent=2))
                else:
                    file_handler.write(json.dumps(sample, indent=2))
                c += 1


#            print(fc, len(new_samples), item['metadata']['name'])
            fc += 1
        file_handler.write("]")
        file_handler.close()
    else:
        with open(args.infile, 'r') as file_handler:
            data_source = json.load(file_handler)
        for item in data_source:
            result = parse_node(item, samples, node_counts, has_capacity,
                                can_add_more)
            if result is False:
                return
            if not can_add_more:
                break

        print('dumping sample')

        with open(args.outfile, 'wb') as file_handler:
            pickle.dump(samples, file_handler)
        file_handler = open(args.outfile + ".json", 'w')
        file_handler.write("[\t")
        file_handler.write(json.dumps(samples, indent=2))
        file_handler.write("]")
        file_handler.close()
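Both the fjson and ijson branches above build the output JSON array incrementally, writing a comma-newline separator before every element except the first. A standalone sketch of that pattern, with hypothetical file names and a hypothetical keep predicate:

import ijson
import json

def rewrite_filtered(inpath, outpath, keep):
    # stream items from a large JSON array and re-emit only the kept ones
    with open(inpath, 'rb') as src, open(outpath, 'w') as dst:
        dst.write('[\t')
        first = True
        for item in ijson.items(src, 'item'):
            if not keep(item):
                continue
            if not first:
                dst.write(',\n')
            dst.write(json.dumps(item, indent=2))
            first = False
        dst.write(']')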