def batch_samples(args, samplefile, batch_size):
    """Batch samples and return batches in a generator."""
    batch = ([], [])
    count = 0
    index_of = None
    if not args.php:
        index_of = lambda x: NODE_MAP[x]
    else:
        index_of = lambda x: PHP_NODE_MAP[x]
    f = open(args.infile, 'rb')
    samples = ijson.items(f, 'item')
    for sample in samples:
        if sample['parent'] is not None:
            # Each 'sample' is the node's node_type, the parent's node_type, and
            # an array of children node_types. This ignores the children refs and
            # just appends the ids for the node_type of node & parent in parallel.
            batch[0].append(index_of(sample['node']))
            batch[1].append(index_of(sample['parent']))
            count += 1
            if count >= batch_size:
                # yields two lists, like a 256 x 2 array, of ints (1-163?)
                yield batch
                batch, count = ([], []), 0
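# A minimal usage sketch for batch_samples. The file name and the train_step
# callback are hypothetical; note that the function reads args.infile rather
# than its samplefile argument, so both are given the same path here.
import argparse

def train_on_batches(train_step, sample_path='samples.json', batch_size=256):
    # args only needs the attributes batch_samples actually reads
    args = argparse.Namespace(php=False, infile=sample_path)
    for node_ids, parent_ids in batch_samples(args, sample_path, batch_size):
        # parallel lists of node-type ids for each node and its parent
        train_step(node_ids, parent_ids)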
def gen_samples_ijson(infile, labels, vectors, vector_lookup):
    """Creates a generator that returns a tree in BFS order with each node
    replaced by its vector embedding, and a child lookup table."""

    # encode labels as one-hot vectors
    label_lookup = {
        label: _onehot(i, len(labels))
        for i, label in enumerate(labels)
    }

    f = open(infile, 'rb')
    trees = ijson.items(f, 'item')
    for tree in trees:
        # one entry per node of the tree: the feature-weight vector (from the
        # vectorizer) for that node's type
        nodes = []
        # one entry per node of the tree: the list of that node's children,
        # given as indices into the nodes array
        children = []
        label = label_lookup[tree['label']]

        queue = [(tree['tree'], -1)]
        while queue:
            node, parent_ind = queue.pop(0)
            node_ind = len(nodes)

            # add children and the parent index to the queue
            queue.extend([(child, node_ind) for child in node['children']])

            # create a list to store this node's children indices
            children.append([])

            # add this child to its parent's child list
            if parent_ind > -1:
                children[parent_ind].append(node_ind)

            # get this node's feature weights by looking up the node type's
            # number (from the node map / vector lookup) to find its position
            # in the vectorized features
            nodes.append(vectors[vector_lookup[node['node']]])

        yield (nodes, children, tree['meta'], label)
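# A usage sketch for gen_samples_ijson; the tree file, label list, embedding
# matrix and node-type lookup below are hypothetical placeholders for whatever
# the vectorizer produced.
def iterate_embedded_trees(tree_file, labels, embeddings, node_lookup):
    for nodes, children, meta, label in gen_samples_ijson(
            tree_file, labels, embeddings, node_lookup):
        # nodes[i] is the embedding of the i-th node in BFS order, children[i]
        # lists the indices of its children, label is a one-hot list
        yield len(nodes), label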
def JSONIterator(buffer):
    """Creates an ijson iterator over all items, then automatically closes
    the provided buffer."""
    try:
        yield from ijson.items(buffer, "item")
    finally:
        buffer.close()
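# Usage sketch for JSONIterator (file name hypothetical): the generator owns
# the buffer, so the file is closed once iteration finishes, raises, or the
# generator itself is closed early.
def count_items(path='items.json'):
    total = 0
    for _ in JSONIterator(open(path, 'rb')):
        total += 1
    return total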
def load_content_from_json_file(file_path):
    """Loads content from the file.

    By passing the ``file_path`` parameter, the file is opened and the content
    from the file is extracted.

    :param str file_path: Path to the JSON file.
    :returns: The content in a list
    :rtype: `list`
    """
    with open(file_path, 'rb') as file:
        items_generator = ijson.items(file, '')
        list_of_items = [item for item in items_generator]
    return list_of_items
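# Usage sketch (path is hypothetical). With the empty prefix '' ijson yields
# the single top-level JSON value, so the returned list normally holds exactly
# one element: the whole document.
def load_single_document(path='config.json'):
    content = load_content_from_json_file(path)
    return content[0] if content else None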
def parse(args):
    """Parse trees with the given arguments."""
    if args.ijson:
        print('Loading json file')
        sys.setrecursionlimit(1000000)
        f = open(args.infile, 'r')
        data_source = ijson.items(f, 'item')
        print('Json file load finished')

        train_samples = []
        cv_samples = []
        test_samples = []

        train_counts = defaultdict(int)
        cv_counts = defaultdict(int)
        test_counts = defaultdict(int)

        f1 = open("/tmp/cv.txt", 'w')
        f2 = open("/tmp/test.txt", 'w')
        f3 = open("/tmp/train.txt", 'w')

        # Traverses each tree horizontally, reformats and flattens it. The
        # resulting nodes contain 'node' and a 'children' array, all carrying
        # only nodeType information. 'trees' does not package parent-node
        # information, but does include the label and metadata accompanying
        # the sample. If a tree is too big or too small it is pruned;
        # otherwise it is assigned to a random group: training, testing, or
        # cross-validation.
        for item in data_source:
            root = item['tree']
            label = item['metadata'][args.label_key]
            sample, size = _traverse_tree(root)
            if size > args.maxsize or size < args.minsize:
                continue
            roll = random.randint(0, 100)
            if args.usecv and roll < args.cv:
                file_handler = f1
                cv_counts[label] += 1
            elif roll < args.test:
                file_handler = f2
                test_counts[label] += 1
            else:
                file_handler = f3
                train_counts[label] += 1
            datum = {
                'tree': sample,
                'label': label,
                'meta': json.loads(json.dumps(item['metadata']))
            }
            file_handler.write(json.dumps(datum) + ",\n")

        f1.close()
        f2.close()
        f3.close()

        # implement shuffling algorithm?
        for filelabel in ['train', 'test', 'cv']:
            tmpfile = "/tmp/" + filelabel
            fout = open(tmpfile + ".shuffled.txt", "w")
            with open(tmpfile + ".txt") as inhandle:
                p = subprocess.Popen("terashuf", stdin=inhandle, stdout=fout)
                [output, error] = p.communicate()
                rc = p.wait()
            print(tmpfile + ".shuffled.txt")

        f1o = open(args.outfile + ".cv.json", 'w')
        f2o = open(args.outfile + ".test.json", 'w')
        f3o = open(args.outfile + ".train.json", 'w')
        out_dict = {'cv': f1o, 'test': f2o, 'train': f3o}
        f1o.write('[\n')
        f2o.write('[\n')
        f3o.write('[\n')

        # labels = list(set(cv_counts.keys() + train_counts.keys() + test_counts.keys()))
        labels = list(
            set(
                itertools.chain(cv_counts.keys(), train_counts.keys(),
                                test_counts.keys())))
        print(labels)

        print('Dumping sample')
        with open(args.outfile, 'w') as out_handler:
            out_handler.write('(\t[\n')
            for filelabel in ['train', 'test', 'cv']:
                c = 0
                tmpfile = "/tmp/" + filelabel + '.shuffled.txt'
                with open(tmpfile, 'r') as in_handler:
                    for line in in_handler:
                        linestr = line.rstrip().rstrip(',')
                        if c:
                            out_handler.write(",\n" + linestr)
                            out_dict[filelabel].write(",\n" + linestr)
                        else:
                            out_handler.write(linestr)
                            out_dict[filelabel].write(linestr)
                        c += 1
                out_handler.write('\n],\t[\n')
                out_dict[filelabel].write('\n]')
                out_dict[filelabel].close()
            out_handler.write('],\n')
            out_handler.write(json.dumps(labels))
            out_handler.write('\n)')

        f4o = open(args.outfile + ".labels.json", 'w')
        f4o.write(json.dumps(labels))
        f4o.close()

        print('dump finished')
        print('Sampled tree counts: ')
        print('Cross-Validation:', cv_counts)
        print('Training:', train_counts)
        print('Testing:', test_counts)
        return

    print('Loading json file')
    sys.setrecursionlimit(1000000)
    with open(args.infile, 'r') as file_handler:
        # data_source = pickle.load(file_handler)
        data_source = json.load(file_handler)
    print('Json file load finished')

    train_samples = []
    cv_samples = []
    test_samples = []

    train_counts = defaultdict(int)
    cv_counts = defaultdict(int)
    test_counts = defaultdict(int)

    for item in data_source:
        # Same traversal, pruning, and random split as in the streaming branch
        # above, but collecting the samples in memory.
        root = item['tree']
        label = item['metadata'][args.label_key]
        sample, size = _traverse_tree(root)
        if size > args.maxsize or size < args.minsize:
            continue
        roll = random.randint(0, 100)
        datum = {
            'tree': sample,
            'label': label,
            'meta': json.loads(json.dumps(item['metadata']))
        }
        if args.usecv and roll < args.cv:
            cv_samples.append(datum)
            cv_counts[label] += 1
        elif roll < args.test:
            test_samples.append(datum)
            test_counts[label] += 1
        else:
            train_samples.append(datum)
            train_counts[label] += 1

    random.shuffle(cv_samples)
    random.shuffle(train_samples)
    random.shuffle(test_samples)

    # create a list of unique labels in the data
    labellist = []
    labellist.extend(cv_counts.keys())
    labellist.extend(train_counts.keys())
    labellist.extend(test_counts.keys())
    labels = list(set(labellist))

    print('Dumping sample')
    # binary mode, since pickle writes bytes
    with open(args.outfile, 'wb') as file_handler:
        pickle.dump((train_samples, test_samples, cv_samples, labels),
                    file_handler)
    print('dump finished')

    print('Sampled tree counts: ')
    print('Cross-Validation:', cv_counts)
    print('Training:', train_counts)
    print('Testing:', test_counts)
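# A follow-up sketch (file names derived from args.outfile, split name
# hypothetical): the combined outfile above is written as a tuple-like text
# wrapping several arrays plus the label list and is not valid JSON, but the
# per-split files outfile + '.train.json' / '.test.json' / '.cv.json' are
# plain JSON arrays and can be streamed back with ijson:
def stream_split(outfile_prefix, split='train'):
    with open(outfile_prefix + '.' + split + '.json', 'rb') as f:
        for datum in ijson.items(f, 'item'):
            yield datum['tree'], datum['label'], datum['meta']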
# (continuation of a preceding TestCase block: JsonSlicer with full paths and
# binary output)
    parser = JsonSlicer(gen, (b'level1', b'level2', None),
                        path_mode='full', binary=True)
    for n, (*path, item) in enumerate(parser):
        assert item[b'id'] == n

with TestCase('**JsonSlicer (full paths, unicode output)**', 'str',
              args.json_size, results):
    gen = io.StringIO(jsondata)
    parser = JsonSlicer(gen, ('level1', 'level2', None), path_mode='full')
    for n, (*path, item) in enumerate(parser):
        assert item['id'] == n

with TestCase('ijson.yajl2_c', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2_c.items(gen, b'level1.level2.item')
    for n, item in enumerate(parser):
        assert item['id'] == n

with TestCase('ijson.yajl2_cffi', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2_cffi.items(gen, b'level1.level2.item')
    for n, item in enumerate(parser):
        assert item['id'] == n

with TestCase('ijson.yajl2', 'bytes', args.json_size, results):
    gen = io.BytesIO(jsondata.encode('utf-8'))
    parser = ijson_yajl2.items(gen, 'level1.level2.item')
    for n, item in enumerate(parser):
        assert item['id'] == n
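# A related sketch: instead of importing ijson.backends.yajl2_c and friends
# directly as in the benchmark above, a backend can be selected by name at
# runtime (assuming an ijson release that provides get_backend), with a
# fallback to the pure-Python backend when the C extension is unavailable.
import ijson

try:
    backend = ijson.get_backend('yajl2_c')
except ImportError:
    backend = ijson.get_backend('python')
# backend.items(...) exposes the same interface as ijson.items(...)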
def updateTopology(cls, source: str, info: dict, nodes: dict):
    cls.config = info
    config.load_kube_config()
    configuration.assert_hostname = False
    k8sClient = client.ApiClient()
    loader = YAML(typ='safe')
    files = cls._listFiles(source)
    newNodes = []
    try:
        # Search for ingress controllers already deployed
        cls.controllers = cls._searchIngressControllers()
        for controller in cls.controllers:
            newNode = cls._addControllerToTopology(controller, nodes)
            if newNode:
                newNodes.append(newNode)
            else:
                cls.controllers.remove(controller)

        # Deployment of the application
        print(' Deploying the application...')
        i = 0
        for k8sFile in files:
            yamls = re.split('^---\n', cls._readFile(k8sFile), flags=re.MULTILINE)
            for contentStr in yamls:
                contentDict = loader.load(contentStr)
                if not contentDict:
                    continue
                cls._prepareYaml(contentStr, contentDict)
                with open(join(cls.config['modDeploymentFiles'], str(i) + '.yml'), 'w') as f:
                    try:
                        f.write(yaml.dump(contentDict))
                        utils.create_from_dict(k8sClient, contentDict)
                    except utils.FailToCreateError:
                        cls._cleanEnvironment()
                        raise DeploymentError('Error deploying ' + k8sFile)
                i = i + 1

        # Wait until the deployment is completed
        v1 = client.CoreV1Api()
        deploymentCompleted = False
        while not deploymentCompleted:
            pods = v1.list_pod_for_all_namespaces(watch=False)
            deploymentCompleted = True
            for pod in pods.items:
                if pod.spec.hostname in nodes:
                    if pod.status.phase != 'Running' and pod.status.phase != 'Succeeded':
                        deploymentCompleted = False
                        break
                    for containerStatus in pod.status.container_statuses:
                        if not containerStatus.ready:
                            deploymentCompleted = False
                            break
                    if not deploymentCompleted:
                        break
            if not deploymentCompleted:
                time.sleep(3)
        print(' Deployment completed')

        # Start monitoring
        print(' Monitoring in progress...')
        pods = v1.list_pod_for_all_namespaces(watch=False)
        containerName = ''.join(c for c in cls.config['monitoringContainer'] if c.isalnum())
        for pod in pods.items:
            if pod.spec.hostname in nodes or (
                    pod.metadata.annotations
                    and 'archMinerName' in pod.metadata.annotations
                    and pod.metadata.annotations['archMinerName'] in nodes
            ) and pod.status.phase == 'Running':
                fileName = pod.spec.hostname if pod.spec.hostname in nodes \
                    else pod.metadata.annotations['archMinerName']
                filePath = join('/home/dump', fileName + '.json')
                command = [
                    './bin/sh', '-c',
                    'tshark -i eth0 -a duration:' + str(info['time'] + 3) +
                    ' -N nNdt -T json > ' + filePath + ' 2>/dev/null &'
                ]
                try:
                    resp = stream(v1.connect_get_namespaced_pod_exec,
                                  pod.metadata.name,
                                  pod.metadata.namespace,
                                  command=command,
                                  container=containerName,
                                  stderr=False,
                                  stdin=False,
                                  stdout=True,
                                  tty=False)
                except ApiException as e:
                    cls._cleanEnvironment()
                    raise MonitoringError(pod.metadata.name)

        # Start tests
        time.sleep(3)
        if info['test']:
            try:
                testModule = importlib.import_module(info['test'])
                testModule.runTest()
            except:
                cls._cleanEnvironment()
                raise TestError('')

        # Wait until monitoring is finished
        time.sleep(info['time'] + 5)
        print(' Monitoring completed')

        # Save the captured packets on the local host
        pods = v1.list_pod_for_all_namespaces(watch=False)
        for pod in pods.items:
            if pod.spec.hostname in nodes or (
                    pod.metadata.annotations
                    and 'archMinerName' in pod.metadata.annotations
                    and pod.metadata.annotations['archMinerName'] in nodes
            ) and pod.status.phase == 'Running':
                fileName = pod.spec.hostname if pod.spec.hostname in nodes \
                    else pod.metadata.annotations['archMinerName']
                remoteFilePath = join('home/dump', fileName + '.json')
                localFilePath = join(cls.config['monitoringFiles'], fileName + '.json')
                os.system('kubectl cp -c ' + containerName + ' ' +
                          pod.metadata.namespace + '/' + pod.metadata.name +
                          ':' + remoteFilePath + ' ' + localFilePath)

        # Create edges
        print(' Analyzing packets...')
        try:
            files = cls._listFiles(cls.config['monitoringFiles'])
        except WrongFolderError:
            cls._cleanEnvironment()
            raise
        for monitoringFilePath in files:
            if os.path.getsize(monitoringFilePath) == 0:
                continue
            srcNodeName = monitoringFilePath.split('/')[-1].replace('.json', '')
            with open(monitoringFilePath, 'rb') as monitoringFile:
                for packet in ijson.items(monitoringFile, 'item'):
                    if cls._isOutgoingPacket(packet, nodes, srcNodeName):
                        cls._createEdge(packet, nodes, srcNodeName)

        # Create communications
        commFactory = ConcreteCommunicationFactory()
        for monitoringFilePath in files:
            if os.path.getsize(monitoringFilePath) == 0:
                continue
            srcNodeName = monitoringFilePath.split('/')[-1].replace('.json', '')
            with open(monitoringFilePath, 'rb') as monitoringFile:
                for packet in ijson.items(monitoringFile, 'item'):
                    if cls._isOutgoingPacket(packet, nodes, srcNodeName):
                        cls._createCommunication(packet, nodes, commFactory, srcNodeName)

        for newNode in newNodes:
            edges = nodes[newNode['controller']].getEdges(Direction.OUTGOING)
            if not edges:
                nodes.pop(newNode['controller'], None)
                for service in newNode['services']:
                    nodes.pop(service, None)
    finally:
        cls._cleanEnvironment()
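# A minimal sketch of the packet-streaming step above taken on its own: tshark
# -T json emits one large JSON array, so ijson.items(f, 'item') visits each
# packet without holding the whole capture in memory. The capture path and the
# '_source'/'layers' keys are assumptions about tshark's JSON layout.
def iter_packet_layers(capture_path):
    with open(capture_path, 'rb') as capture:
        for packet in ijson.items(capture, 'item'):
            yield packet.get('_source', {}).get('layers', {})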
def parse_nodes(args, node_counts, samples, has_capacity, can_add_more):
    if args.fjson:
        # f = open(args.infile, 'r')
        # data_source = ijson.items(f, 'item')
        fc = 0
        c = 0
        file_handler = open(args.outfile, 'w')
        file_handler.write("[\t")
        for (dirpath, dirnames, filenames) in os.walk(args.infile):
            for filepath in [os.path.join(dirpath, file) for file in filenames]:
                print(filepath)
                try:
                    with open(filepath, 'r') as file_reader:
                        data_source = json.load(file_reader)
                except json.decoder.JSONDecodeError:
                    continue
                if data_source is None:
                    continue
                for item in data_source:
                    samples = []
                    result = parse_node(item, samples, node_counts,
                                        has_capacity, can_add_more)
                    if result is False:
                        continue
                    if not can_add_more:
                        break
                    for sample in samples:
                        if c:
                            file_handler.write(",\n" + json.dumps(sample, indent=2))
                        else:
                            file_handler.write(json.dumps(sample, indent=2))
                        c += 1
                # print(fc, len(new_samples), item['metadata']['name'])
                fc += 1
        file_handler.write("]")
        file_handler.close()
    elif args.ijson:
        f = open(args.infile, 'r')
        data_source = ijson.items(f, 'item')
        fc = 0
        c = 0
        file_handler = open(args.outfile, 'w')
        file_handler.write("[\t")
        for item in data_source:
            # Reads the array of objects one at a time, each object containing:
            #   'metadata.label' (benign or malicious)
            #   'tree' (array of node trees)
            # Only the 'tree' data is used; the label is ignored.
            samples = []
            result = parse_node(item, samples, node_counts, has_capacity,
                                can_add_more)
            if result is None:
                continue
            if not can_add_more:
                break
            for sample in samples:
                if c:
                    file_handler.write(",\n" + json.dumps(sample, indent=2))
                else:
                    file_handler.write(json.dumps(sample, indent=2))
                c += 1
            # print(fc, len(new_samples), item['metadata']['name'])
            fc += 1
        file_handler.write("]")
        file_handler.close()
    else:
        with open(args.infile, 'r') as file_handler:
            data_source = json.load(file_handler)
        for item in data_source:
            result = parse_node(item, samples, node_counts, has_capacity,
                                can_add_more)
            if result is False:
                return
            if not can_add_more:
                break
        print('dumping sample')
        with open(args.outfile, 'wb') as file_handler:
            pickle.dump(samples, file_handler)
        file_handler = open(args.outfile + ".json", 'w')
        file_handler.write("[\t")
        file_handler.write(json.dumps(samples, indent=2))
        file_handler.write("]")
        file_handler.close()
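# A minimal sketch of the incremental array-writing pattern used in both
# streaming branches of parse_nodes (the helper name is hypothetical): a comma
# is written before every element except the first, so the output stays a
# valid JSON array without buffering all samples in memory.
import json

def write_json_array(path, samples_iter):
    with open(path, 'w') as out:
        out.write('[')
        for i, sample in enumerate(samples_iter):
            if i:
                out.write(',\n')
            out.write(json.dumps(sample, indent=2))
        out.write(']')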