class DataSet:
    """Collection of log items discovered under a root directory.

    Every immediate sub-directory of the root whose name matches
    ``NAME_PATTERN`` is scanned recursively; each file found is wrapped by
    the constructor registered for its file name in ``LOG_TYPE_TABLE`` and
    stored in ``self.item_set``.
    """

    # Pattern a log directory name must match; its captured groups are handed
    # to every item as ``info``.  Written as a raw string so that ``\+`` is a
    # regex escape instead of an invalid Python string escape.
    # NOTE(review): the ``.`` in front of ``log`` is unescaped and therefore
    # matches any character -- kept as in the original; confirm intent.
    NAME_PATTERN = re.compile(
        r'(.*)([0-9]{2})_([0-9]{2})_([0-9]{4})_([0-9]{2})_([0-9]{2})\+?([0-9]+)?.log_?$'
    )

    def __init__(self, root_path):
        """Scan ``root_path`` and fill ``self.item_set``.

        root_path: The root path for shell logs.

        Side effect (unchanged from the original): the process working
        directory is switched to ``root_path``.
        """
        self.item_set = SList()
        os.chdir(root_path)
        self.ROOT_PATH = os.path.abspath(os.getcwd())
        for dir_item in os.listdir(self.ROOT_PATH):
            if os.path.isdir(os.path.join(self.ROOT_PATH, dir_item)):
                self._process_single_item(dir_item)
            else:
                print("Not a directory: {}".format(dir_item))

    def _process_single_item(self, dir_name):
        """Process a single log directory located directly under the root.

        Directories whose name does not match ``NAME_PATTERN`` are reported
        and skipped (the original raised ``AttributeError`` on them).

        Raises KeyError when a contained file name has no entry in
        ``LOG_TYPE_TABLE``.
        """
        match_group = self.NAME_PATTERN.match(dir_name)
        if match_group is None:
            print("Skip directory with unexpected name: {}".format(dir_name))
            return
        # Only groups up to the last one that actually matched are kept, so
        # ``info`` has no trailing entry when the optional numeric suffix is
        # absent from the directory name.
        info = [match_group.group(index)
                for index in range(1, match_group.lastindex + 1)]
        file_list = self._unwrap_directory(
            os.path.join(self.ROOT_PATH, dir_name))
        for item in file_list:
            # item is (full_path, file_name); the file name selects the type.
            self.item_set.append(LOG_TYPE_TABLE[item[1]](info, item))

    def _unwrap_directory(self, dir_path):
        """Get all files contained in the directory, recursively.

        Returns a list of ``(full_path, file_name)`` tuples in ``os.listdir``
        order, with the content of a sub-directory inserted at the position
        of that sub-directory.  Entries that are neither a file nor a
        directory are reported and left out.
        """
        file_set = []
        for item in os.listdir(dir_path):
            tem_path = os.path.join(dir_path, item)
            if os.path.isdir(tem_path):
                file_set += self._unwrap_directory(tem_path)
            elif os.path.isfile(tem_path):
                file_set.append((tem_path, item))
            else:
                print('ERROR::Detect an item that is neither file nor directory.')
                print("\t {}".format(tem_path))
        return file_set
class DataSet:
    """Collection of log items discovered under a root directory.

    Every immediate sub-directory of the root whose name matches
    ``NAME_PATTERN`` is scanned recursively; each file found is wrapped by
    the constructor registered for its file name in ``LOG_TYPE_TABLE`` and
    stored in ``self.item_set``.
    """

    # Pattern a log directory name must match; its captured groups are handed
    # to every item as ``info``.  Written as a raw string so that ``\+`` is a
    # regex escape instead of an invalid Python string escape.
    # NOTE(review): the ``.`` in front of ``log`` is unescaped and therefore
    # matches any character -- kept as in the original; confirm intent.
    NAME_PATTERN = re.compile(
        r'(.*)([0-9]{2})_([0-9]{2})_([0-9]{4})_([0-9]{2})_([0-9]{2})\+?([0-9]+)?.log_?$'
    )

    def __init__(self, root_path):
        """Scan ``root_path`` and fill ``self.item_set``.

        root_path: The root path for shell logs.

        Side effect (unchanged from the original): the process working
        directory is switched to ``root_path``.
        """
        self.item_set = SList()
        os.chdir(root_path)
        self.ROOT_PATH = os.path.abspath(os.getcwd())
        for dir_item in os.listdir(self.ROOT_PATH):
            if os.path.isdir(os.path.join(self.ROOT_PATH, dir_item)):
                self._process_single_item(dir_item)
            else:
                print("Not a directory: {}".format(dir_item))

    def _process_single_item(self, dir_name):
        """Process a single log directory located directly under the root.

        Directories whose name does not match ``NAME_PATTERN`` are reported
        and skipped (the original raised ``AttributeError`` on them).

        Raises KeyError when a contained file name has no entry in
        ``LOG_TYPE_TABLE``.
        """
        match_group = self.NAME_PATTERN.match(dir_name)
        if match_group is None:
            print("Skip directory with unexpected name: {}".format(dir_name))
            return
        # Only groups up to the last one that actually matched are kept, so
        # ``info`` has no trailing entry when the optional numeric suffix is
        # absent from the directory name.
        info = [match_group.group(index)
                for index in range(1, match_group.lastindex + 1)]
        file_list = self._unwrap_directory(
            os.path.join(self.ROOT_PATH, dir_name))
        for item in file_list:
            # item is (full_path, file_name); the file name selects the type.
            self.item_set.append(LOG_TYPE_TABLE[item[1]](info, item))

    def _unwrap_directory(self, dir_path):
        """Get all files contained in the directory, recursively.

        Returns a list of ``(full_path, file_name)`` tuples in ``os.listdir``
        order, with the content of a sub-directory inserted at the position
        of that sub-directory.  Entries that are neither a file nor a
        directory are reported and left out.
        """
        file_set = []
        for item in os.listdir(dir_path):
            tem_path = os.path.join(dir_path, item)
            if os.path.isdir(tem_path):
                file_set += self._unwrap_directory(tem_path)
            elif os.path.isfile(tem_path):
                file_set.append((tem_path, item))
            else:
                print('ERROR::Detect an item that is neither file nor directory.')
                print("\t {}".format(tem_path))
        return file_set
def generate_feature(self, data_item):
    """Build labelled pixel-window samples for one data item.

    For every point in ``data_item.tag`` a window of up to
    ``(2*MAT_SIZE[0]+1) x (2*MAT_SIZE[1]+1)`` pixels centred on the point is
    cut from ``data_item.image_data`` and labelled 1.  Afterwards
    ``int(len(data_item.tag) * POS_NEG_RATIO)`` random points are drawn; each
    one is labelled 1 when ``data_item.contain_tag`` reports a hit for the
    ranges around it and 0 otherwise.

    Prints ``<positives>/<total>`` for the unfiltered samples.

    Returns the samples ``(window, label, point)`` whose window has the full
    size; windows that were clipped at the image border are dropped.
    """
    half_x = self.MAT_SIZE[0]
    half_y = self.MAT_SIZE[1]
    pixels = data_item.image_data
    full_size = (2 * half_x + 1) * (2 * half_y + 1)

    def window(center):
        # Flatten the pixels around ``center`` row by row, clipping the
        # window to the image bounds.
        x_first = max(0, int(center[0] - half_x))
        x_last = min(data_item.img_dim[0] - 1, int(center[0] + half_x))
        y_first = max(0, int(center[1] - half_y))
        y_last = min(data_item.img_dim[1] - 1, int(center[1] + half_y))
        return numpy.array([
            pixels[x][y]
            for x in range(x_first, x_last + 1)
            for y in range(y_first, y_last + 1)
        ])

    samples = SList([])

    # Positive samples: one per tagged point.
    for tagged in data_item.tag:
        samples.append((window(tagged), 1, tagged))

    # Random samples, labelled by whether their neighbourhood holds a tag.
    extra_count = int(len(data_item.tag) * self.POS_NEG_RATIO)
    for _ in range(extra_count):
        candidate = (
            random.randint(half_x, data_item.img_dim[0] - half_x),
            random.randint(half_y, data_item.img_dim[1] - half_y),
        )
        hit = data_item.contain_tag(
            range(candidate[0] - half_x, candidate[0] + half_x),
            range(candidate[1] - half_y, candidate[1] + half_y),
        )
        label = 1 if hit else 0
        samples.append((window(candidate), label, candidate))

    positives = samples.filter_by(lambda x: x[1] == 1).count()
    print('{}/{}'.format(positives, samples.count()))
    return samples.filter_by(lambda x: len(x[0]) == full_size)
class LogDataItem(DataItem):
    """Shell-log item: parses lines of key codes and groups them into commands."""

    # Detects whether the log carries a 10-digit timestamp in front of codes.
    _TIMESTAMP_RE = re.compile('[0-9]{10} [0-9]+')

    # Codes that extend the command being typed instead of terminating it:
    # 8 (erases the previous character), 9 (rendered as "[TAB]") and 32..127.
    _TEXT_CODES = frozenset([8, 9] + list(range(32, 128)))

    def __init__(self, info_list, file_info):
        DataItem.__init__(self, info_list, file_info)
        self.log_type = LOG_TYPE_SHELL

    def _parse_operation(self, log_content):
        """Parse raw log text into ``self.operation_list``.

        Sets ``self.has_timestamp`` when the text contains at least one
        ``<10 digits> <digits>`` sequence.  Every entry of the resulting list
        is a ``(timestamp, codes)`` tuple of strings; without timestamps the
        timestamp is the placeholder ``'0'``.
        """
        self.has_timestamp = (
            self._TIMESTAMP_RE.search(log_content) is not None)
        # Start from an empty list.  The original seeded the list with the
        # raw regex matches (plain strings), which ``combine`` then mis-read
        # as (timestamp, codes) pairs and turned into bogus commands.
        self.operation_list = []
        for line in log_content.split("\n"):
            tokens = [token for token in line.split(' ') if token]
            if self.has_timestamp:
                # Lines with a single token carry no codes and are ignored.
                if len(tokens) > 1:
                    timestamp = tokens[0]
                    codes = [token for token in tokens if token != timestamp]
                    self.operation_list.append((timestamp, codes))
            elif tokens:
                self.operation_list.append(('0', tokens))

    def combine(self):
        """Group parsed key codes into commands stored in ``self.cmd_list``.

        Codes are accumulated until one outside ``_TEXT_CODES`` shows up;
        that code closes the command and becomes its delimiter.  Codes still
        pending at the end of the log are not emitted.

        Returns ``self`` so calls can be chained.
        """
        self.cmd_list = SList([])
        pending_codes = []
        pending_stamps = []
        for timestamp, codes in self.operation_list:
            # Skip entries where 27 comes together with other codes
            # (presumably escape sequences such as arrow keys -- confirm).
            if len(codes) > 1 and '27' in codes:
                continue
            if any(int(code) < 0 for code in codes):
                continue
            # NOTE(review): entries containing code 4 are dropped; looks
            # like end-of-transmission (Ctrl-D) -- confirm.
            if '4' in codes:
                continue
            for code in codes:
                pending_stamps.append(timestamp)
                pending_codes.append(code)
                if int(code) not in self._TEXT_CODES:
                    content, delimiter = self._convert_to_text(pending_codes)
                    self.cmd_list.append({
                        'action': 'shell',
                        'content': content,
                        'delimiter': delimiter,
                        'timestamp': int(pending_stamps[0]),
                        'timestamp_end': int(pending_stamps[-1]),
                    })
                    pending_codes = []
                    pending_stamps = []
        return self

    def _convert_to_text(self, cmd):
        """Render a sequence of key codes as ``(text, delimiter)``.

        Codes 32..126 are appended as characters, 8 removes the last
        character of the text and 9 appends ``[TAB]``.  Any other code sets
        the delimiter; the last such code wins.
        """
        text = ''
        rtn = ''
        for c in cmd:
            code = int(c)
            if 32 <= code < 127:
                text += chr(code)
            elif code == 8:
                text = text[:-1]
            elif code == 9:
                text += '[TAB]'
            elif code == 13:
                rtn = '[RETURN]'
            elif code == 27:
                rtn = '[ESC]'
            elif code == 127:
                rtn = '[DELETE]'
            else:
                rtn = "[{}]".format(c)
        return text, rtn
class EditorDataItem(DataItem):
    """Editor-log item: parses JSON operation lines and merges adjacent edits."""

    # (previous action, current action) pairs that may be merged.
    _MERGEABLE_PAIRS = (
        (u'insert', u'remove'),
        (u'insert', u'insert'),
        (u'remove', u'remove'),
    )

    def __init__(self, info_list, file_info):
        DataItem.__init__(self, info_list, file_info)
        self.log_type = LOG_TYPE_EDITOR

    def _parse_operation(self, log_content):
        """Append one decoded JSON object per non-blank line.

        Appends to ``self.operation_list``, which is expected to exist
        already (presumably created by ``DataItem.__init__`` -- confirm).
        Whitespace-only lines are skipped; the original passed them to
        ``json.loads``, which rejects them.
        """
        for line in log_content.split(u"\n"):
            if line.strip():
                self.operation_list.append(json.loads(line))

    def combine(self):
        """Collapse the parsed operations into ``self.cmd_list``.

        Every kept operation receives a ``'content'`` entry; operations with
        an action other than insert/remove/copy/paste/open/save are dropped.
        Consecutive operations are merged where ``_merge_into`` allows it.
        The operation dicts are modified in place.

        Returns ``self`` so calls can be chained.
        """
        prev_command = None
        self.cmd_list = SList([])
        for item in self.operation_list:
            action = item['action']
            if action in ('insert', 'remove'):
                item['content'] = u"\n".join(item['lines'])
            elif action in ('copy', 'paste'):
                item['content'] = item['text']
            elif action in ('open', 'save'):
                item['content'] = ''
            else:
                continue

            if prev_command is None:
                prev_command = item
                continue

            if not self._merge_into(prev_command, item):
                self.cmd_list.append(prev_command)
                prev_command = item

        if prev_command is not None:
            self.cmd_list.append(prev_command)
        return self

    def _merge_into(self, prev_command, item):
        """Fold ``item`` into ``prev_command``; return True when merged."""
        prev_action = prev_command['action']
        action = item['action']
        if (prev_action, action) not in self._MERGEABLE_PAIRS:
            return False

        if (action == u'insert' and prev_action == u'insert'
                and str(prev_command['end']) == str(item['start'])):
            # Insert continuing exactly where the previous insert ended.
            prev_command['content'] += item['content']
            prev_command['end'] = item['end']
            return True

        if (action == u'remove'
                and str(prev_command['start']) == str(item['end'])):
            # Removal ending exactly where the previous command starts.
            prev_command['content'] += item['content']
            prev_command['start'] = item['start']
            return True

        if (action == u'insert' and prev_action == u'remove'
                and str(prev_command['end']) == str(item['end'])):
            # NOTE(review): unreachable at the moment, because
            # (remove, insert) is not part of _MERGEABLE_PAIRS.  Kept to
            # preserve the original intent; confirm before enabling.
            prev_length = len(prev_command['content'])
            tem_length = len(item['content'])
            if prev_length - tem_length < 0:
                prev_command['content'] = item['content'][:tem_length - prev_length]
                prev_command['action'] = u'remove'
            else:
                # Fixed: the original subscripted the integer ``prev_length``
                # here, which would raise TypeError.
                prev_command['content'] = prev_command['content'][:tem_length - prev_length]
            return True

        return False
class TestSList(unittest.TestCase):
    """Unit tests for the basic operations of ``SList``."""

    def setUp(self):
        # Every test starts from a fresh, empty list.
        self.list = SList()

    def _fill_mixed(self):
        """Populate the list through append/add/insert.

        Expected resulting order: [6, 4, 3, 5, 1, 2, 7].
        """
        for value in ('1', '2'):
            self.list.append(value)
        for value in ('3', '4'):
            self.list.add(value)
        for position, value in ((2, '5'), (0, '6'), (7, '7')):
            self.list.insert(position, value)

    def test_isEmpty(self):
        self.assertTrue(self.list.isEmpty())
        self.list.append('new')
        self.assertFalse(self.list.isEmpty())

    def test_size(self):
        self.assertEqual(0, self.list.size())
        self.list.append('new1')
        self.assertEqual(1, self.list.size())
        for value in ('new2', 'new3'):
            self.list.append(value)
        self.assertEqual(3, self.list.size())

    def test_index(self):
        self.assertIsNone(self.list.index('not exist'))
        values = ('1', '2', '3')
        for value in values:
            self.list.append(value)
        for position, value in enumerate(values):
            self.assertEqual(position, self.list.index(value))
        # After popping the last element it can no longer be found.
        self.list.pop()
        self.assertIsNone(self.list.index('3'))

    def test_search(self):
        self.assertFalse(self.list.search('1'))
        self.list.append('1')
        self.assertTrue(self.list.search('1'))
        self.assertFalse(self.list.search('2'))

    def test_insertAppendAdd(self):
        self._fill_mixed()
        expected = ('6', '4', '3', '5', '1', '2', '7')
        for position, value in enumerate(expected):
            self.assertEqual(value, self.list.get(position))

    def test_popRemove(self):
        self._fill_mixed()
        self.assertEqual('7', self.list.pop())
        self.assertEqual('6', self.list.pop(0))
        self.assertEqual('5', self.list.remove('5'))
        # Remaining content: [4, 3, 1, 2]
        self.list.append('8')
class LogDataItem(DataItem):
    """Shell-log item: parses lines of key codes and groups them into commands."""

    # Detects whether the log carries a 10-digit timestamp in front of codes.
    _TIMESTAMP_RE = re.compile("[0-9]{10} [0-9]+")

    # Codes that extend the command being typed instead of terminating it:
    # 8 (erases the previous character), 9 (rendered as "[TAB]") and 32..127.
    _TEXT_CODES = frozenset([8, 9] + list(range(32, 128)))

    def __init__(self, info_list, file_info):
        DataItem.__init__(self, info_list, file_info)
        self.log_type = LOG_TYPE_SHELL

    def _parse_operation(self, log_content):
        """Parse raw log text into ``self.operation_list``.

        Sets ``self.has_timestamp`` when the text contains at least one
        ``<10 digits> <digits>`` sequence.  Every entry of the resulting list
        is a ``(timestamp, codes)`` tuple of strings; without timestamps the
        timestamp is the placeholder ``"0"``.
        """
        self.has_timestamp = self._TIMESTAMP_RE.search(log_content) is not None
        # Start from an empty list.  The original seeded the list with the
        # raw regex matches (plain strings), which ``combine`` then mis-read
        # as (timestamp, codes) pairs and turned into bogus commands.
        self.operation_list = []
        for line in log_content.split("\n"):
            tokens = [token for token in line.split(" ") if token]
            if self.has_timestamp:
                # Lines with a single token carry no codes and are ignored.
                if len(tokens) > 1:
                    timestamp = tokens[0]
                    codes = [token for token in tokens if token != timestamp]
                    self.operation_list.append((timestamp, codes))
            elif tokens:
                self.operation_list.append(("0", tokens))

    def combine(self):
        """Group parsed key codes into commands stored in ``self.cmd_list``.

        Codes are accumulated until one outside ``_TEXT_CODES`` shows up;
        that code closes the command and becomes its delimiter.  Codes still
        pending at the end of the log are not emitted.

        Returns ``self`` so calls can be chained.
        """
        self.cmd_list = SList([])
        pending_codes = []
        pending_stamps = []
        for timestamp, codes in self.operation_list:
            # Skip entries where 27 comes together with other codes
            # (presumably escape sequences such as arrow keys -- confirm).
            if len(codes) > 1 and "27" in codes:
                continue
            if any(int(code) < 0 for code in codes):
                continue
            # NOTE(review): entries containing code 4 are dropped; looks
            # like end-of-transmission (Ctrl-D) -- confirm.
            if "4" in codes:
                continue
            for code in codes:
                pending_stamps.append(timestamp)
                pending_codes.append(code)
                if int(code) not in self._TEXT_CODES:
                    content, delimiter = self._convert_to_text(pending_codes)
                    self.cmd_list.append(
                        {
                            "action": "shell",
                            "content": content,
                            "delimiter": delimiter,
                            "timestamp": int(pending_stamps[0]),
                            "timestamp_end": int(pending_stamps[-1]),
                        }
                    )
                    pending_codes = []
                    pending_stamps = []
        return self

    def _convert_to_text(self, cmd):
        """Render a sequence of key codes as ``(text, delimiter)``.

        Codes 32..126 are appended as characters, 8 removes the last
        character of the text and 9 appends ``[TAB]``.  Any other code sets
        the delimiter; the last such code wins.
        """
        text = ""
        rtn = ""
        for c in cmd:
            code = int(c)
            if 32 <= code < 127:
                text += chr(code)
            elif code == 8:
                text = text[:-1]
            elif code == 9:
                text += "[TAB]"
            elif code == 13:
                rtn = "[RETURN]"
            elif code == 27:
                rtn = "[ESC]"
            elif code == 127:
                rtn = "[DELETE]"
            else:
                rtn = "[{}]".format(c)
        return text, rtn
class EditorDataItem(DataItem):
    """Editor-log item: parses JSON operation lines and merges adjacent edits."""

    # (previous action, current action) pairs that may be merged.
    _MERGEABLE_PAIRS = (
        (u"insert", u"remove"),
        (u"insert", u"insert"),
        (u"remove", u"remove"),
    )

    def __init__(self, info_list, file_info):
        DataItem.__init__(self, info_list, file_info)
        self.log_type = LOG_TYPE_EDITOR

    def _parse_operation(self, log_content):
        """Append one decoded JSON object per non-blank line.

        Appends to ``self.operation_list``, which is expected to exist
        already (presumably created by ``DataItem.__init__`` -- confirm).
        Whitespace-only lines are skipped; the original passed them to
        ``json.loads``, which rejects them.
        """
        for line in log_content.split(u"\n"):
            if line.strip():
                self.operation_list.append(json.loads(line))

    def combine(self):
        """Collapse the parsed operations into ``self.cmd_list``.

        Every kept operation receives a ``"content"`` entry; operations with
        an action other than insert/remove/copy/paste/open/save are dropped.
        Consecutive operations are merged where ``_merge_into`` allows it.
        The operation dicts are modified in place.

        Returns ``self`` so calls can be chained.
        """
        prev_command = None
        self.cmd_list = SList([])
        for item in self.operation_list:
            action = item["action"]
            if action in ("insert", "remove"):
                item["content"] = u"\n".join(item["lines"])
            elif action in ("copy", "paste"):
                item["content"] = item["text"]
            elif action in ("open", "save"):
                item["content"] = ""
            else:
                continue

            if prev_command is None:
                prev_command = item
                continue

            if not self._merge_into(prev_command, item):
                self.cmd_list.append(prev_command)
                prev_command = item

        if prev_command is not None:
            self.cmd_list.append(prev_command)
        return self

    def _merge_into(self, prev_command, item):
        """Fold ``item`` into ``prev_command``; return True when merged."""
        prev_action = prev_command["action"]
        action = item["action"]
        if (prev_action, action) not in self._MERGEABLE_PAIRS:
            return False

        if (
            action == u"insert"
            and prev_action == u"insert"
            and str(prev_command["end"]) == str(item["start"])
        ):
            # Insert continuing exactly where the previous insert ended.
            prev_command["content"] += item["content"]
            prev_command["end"] = item["end"]
            return True

        if action == u"remove" and str(prev_command["start"]) == str(item["end"]):
            # Removal ending exactly where the previous command starts.
            prev_command["content"] += item["content"]
            prev_command["start"] = item["start"]
            return True

        if (
            action == u"insert"
            and prev_action == u"remove"
            and str(prev_command["end"]) == str(item["end"])
        ):
            # NOTE(review): unreachable at the moment, because
            # (remove, insert) is not part of _MERGEABLE_PAIRS.  Kept to
            # preserve the original intent; confirm before enabling.
            prev_length = len(prev_command["content"])
            tem_length = len(item["content"])
            if prev_length - tem_length < 0:
                prev_command["content"] = item["content"][: tem_length - prev_length]
                prev_command["action"] = u"remove"
            else:
                # Fixed: the original subscripted the integer ``prev_length``
                # here, which would raise TypeError.
                prev_command["content"] = prev_command["content"][: tem_length - prev_length]
            return True

        return False