def get_valid_java_dir_path():
    """Return the decompress paths that also have a decompiled-Java entry.

    Lists ``config.DATA_MALWARE_DECOMPRESS_PATH`` and
    ``config.DATA_MALWARE_DECOMPILE_JAVA_PATH`` through
    ``utils.get_files_from_dir`` and keeps every decompress path whose last
    backslash-separated component is also the last component of a Java path.

    :return: list of the matching decompress paths, in the order they were
        listed.
    """
    decompress_dir_paths = utils.get_files_from_dir(
        config.DATA_MALWARE_DECOMPRESS_PATH)
    print("Total get {0} decompress dir paths".format(
        len(decompress_dir_paths)))
    java_dir_paths = utils.get_files_from_dir(
        config.DATA_MALWARE_DECOMPILE_JAVA_PATH)
    java_dir_paths = [x.split("\\")[-1] for x in java_dir_paths]
    print("Total get {0} java dir paths".format(len(java_dir_paths)))

    # Constant-time membership tests; the list above is kept so that the
    # printed totals are unchanged.
    java_dir_names = set(java_dir_paths)

    res = []
    count = 0
    for decompress_dir_path in decompress_dir_paths:
        if decompress_dir_path.split("\\")[-1] in java_dir_names:
            count += 1
            if count % 100 == 0:
                print(count, " pass")
            res.append(decompress_dir_path)

    print("Total get {0} decompress dir paths".format(
        len(decompress_dir_paths)))
    print("Total get {0} java dir paths".format(len(java_dir_paths)))
    print("{0} files pass the filter".format(count))
    return res
def update_db(dataset_dir_path, decompress_dir_path):
    """Insert a ``malware_info`` row for every file that has no decompress entry.

    A dataset file counts as failed when its last backslash-separated path
    component is not the last component of any entry listed under
    ``decompress_dir_path``. For each such file a row is inserted with the
    text after the last ``_`` of the file name as MD5 and
    ``"decompress error;"`` as VALID.

    Only ``DEVICE == 'M'`` is handled. For any other value the function now
    reports it and returns; previously it hit a NameError because the list
    of decompressed names was never built.

    :param dataset_dir_path: directory holding the original files.
    :param decompress_dir_path: directory holding the decompress results.
    :return: None
    """
    origin_files = utils.get_files_from_dir(dataset_dir_path)
    print("Get {0} origin files from dataset".format(len(origin_files)))
    decompress_subdirs = utils.get_files_from_dir(decompress_dir_path)

    if DEVICE != 'M':
        print("DEVICE {0!r} is not handled by update_db, nothing inserted".format(
            DEVICE))
        return

    decompress_files = [x.split("\\")[-1] for x in decompress_subdirs]
    print("Get {0} decompress subdir paths".format(len(decompress_files)))
    print("{0} files occur error in decompress process".format(
        len(origin_files) - len(decompress_files)))

    # Set lookup instead of scanning the list once per origin file.
    decompress_names = set(decompress_files)

    print("start insert malware file info that not decompressed correctly")
    sql = "INSERT INTO malware_info(MD5,VALID) VALUES(%s,%s)"
    valid = "decompress error;"
    for origin_file in origin_files:
        filename = origin_file.split("\\")[-1]
        if filename in decompress_names:
            continue
        md5 = filename.split("_")[-1]
        print("Error in decompress:", filename)
        # One pool per insert, exactly as before: whether dispose() is what
        # commits the row is not visible from here.
        mysql = MySQLUtils.MyPymysqlPool()
        mysql.insert(sql, (md5, valid))
        mysql.dispose()
def get_malwares(dir_path, num_limit=2):
    """Map sub-directory names to the ``.apk`` files found in them.

    :param dir_path: directory whose entries are scanned.
    :param num_limit: minimum number of ``.apk`` files an entry must contain
        to be included.
    :return: dict keyed by the last backslash-separated component of each
        qualifying entry, with the list of its ``.apk`` files as value.
    """
    malwares = {}
    for sub_path in utils.get_files_from_dir(dir_path):
        apk_files = utils.get_files_from_dir(sub_path, file_type=".apk")
        if len(apk_files) < num_limit:
            continue
        malwares[sub_path.split("\\")[-1]] = apk_files
    return malwares
def __init__(self, split, img_size, tag, **kwargs):
    """Index the image files of one dataset split.

    :param split: name of the split; used as the last component of
        ``self.data_path``.
    :param img_size: an int (stored as a square size, with cropping enabled)
        or a sequence of length 2 (stored as is, cropping disabled).
    :param tag: sub-directory of ``self.root / self.name`` holding the splits.
    :param kwargs: accepted and ignored.
    :raises ValueError: if the smallest requested dimension exceeds the
        smallest dimension of the first image found.
    """
    tagged_root = coerce_to_path_and_check_exist(self.root / self.name / tag)
    self.data_path = tagged_root / split
    self.split = split
    self.tag = tag

    # A missing split directory is treated as an empty split.
    try:
        found = get_files_from_dir(self.data_path, IMG_EXTENSIONS, sort=True)
    except FileNotFoundError:
        found = []
    self.input_files = found
    self.labels = [-1] * len(found)
    self.n_classes = 0
    self.size = len(self.input_files)

    if not isinstance(img_size, int):
        assert len(img_size) == 2
        self.img_size = img_size
        self.crop = False
    else:
        self.img_size = (img_size, img_size)
        self.crop = True

    if self.size == 0:
        return

    # Only the first image is sampled for the size check.
    sample_size = Image.open(self.input_files[0]).size
    if min(self.img_size) > min(sample_size):
        raise ValueError(
            "img_size too big compared to a sampled image size, adjust it or upscale dataset"
        )
def update_db(src_md5, des_dir_path, mysql):
    """Record the outcome of a decompile step in ``malware_info``.

    For every MD5 in ``src_md5``: if no entry under ``des_dir_path`` ends
    with that MD5 (text after the last ``_`` of the entry name), VALID is set
    to a "decompile ... error;" message; otherwise the
    ``DECOMPILE_<DECOMPILE_TYPE>_DIR_PATH`` column is set to
    ``des_dir_path/VirusShare_<md5>``. Updates are only issued when
    ``DEVICE == 'M'``.

    :param src_md5: collection of MD5 strings that were submitted.
    :param des_dir_path: directory holding the decompile results.
    :param mysql: object exposing ``update(sql, params)``.
    :return: None
    """
    print("Get {0} src files".format(len(src_md5)))
    des_files = utils.get_files_from_dir(des_dir_path)
    print("Get {0} des files from {1}".format(len(des_files), des_dir_path))

    handled_device = DEVICE == 'M'
    # A set makes the per-MD5 membership test below constant time.
    des_md5 = ({x.split("\\")[-1].split("_")[-1] for x in des_files}
               if handled_device else set())

    print("{0} files occur error in decompile {1} process".format(
        len(src_md5) - len(des_files), DECOMPILE_TYPE))

    if not handled_device:
        return

    print("start update db")
    for md5 in src_md5:
        if md5 not in des_md5:
            valid = "decompile {0} error;".format(DECOMPILE_TYPE)
            sql = "UPDATE malware_info SET VALID=%s WHERE MD5=%s"
            print(sql % (valid, md5))
            mysql.update(sql, (valid, md5))
        else:
            decompile_path = os.path.join(des_dir_path, "VirusShare_" + md5)
            sql = "UPDATE malware_info SET DECOMPILE_{0}_DIR_PATH=%s WHERE MD5=%s".format(
                DECOMPILE_TYPE)
            print(sql % (decompile_path, md5))
            mysql.update(sql, (decompile_path, md5))
def _initialize_table(self):
    """Build the lookup table of resource files found under ``self.input_dir``.

    :return: dict keyed by resource name. The value is a list of file paths
        (as strings), except for ``FONT_RESRC_NAME`` where it is a dict
        mapping each font type to such a list.
    """
    def list_files(directory, extensions):
        # Recursive listing, with every path converted to str.
        found = get_files_from_dir(directory, valid_extensions=extensions,
                                   recursive=True)
        return list(map(str, found))

    table = {}
    for name in AVAILABLE_RESRC_NAMES:
        resrc_dir, extensions = self.input_dir / name, VALID_EXTENSIONS[name]
        if name == FONT_RESRC_NAME:
            per_font = {}
            for font in FONT_TYPES:
                per_font[font] = list_files(resrc_dir / font, extensions)
            table[name] = per_font
        else:
            table[name] = list_files(resrc_dir, extensions)
    return table
def get_valid_decompress_dir_path():
    """Return the benign decompress entries that contain exactly one dex file.

    Entries are listed from ``config.DATA_BENIGN_DECOMPRESS_PATH``; progress
    is printed every 100 accepted entries.

    :return: list of the accepted paths.
    """
    dir_paths = utils.get_files_from_dir(config.DATA_BENIGN_DECOMPRESS_PATH)
    print("Total get {0} dir paths".format(len(dir_paths)))

    accepted = []
    for candidate in dir_paths:
        dex_files = utils.get_files_from_dir(candidate, file_type=".dex")
        if len(dex_files) != 1:
            continue
        accepted.append(candidate)
        if len(accepted) % 100 == 0:
            print(len(accepted), " pass")

    print("Total get {0} dir paths".format(len(dir_paths)))
    print("{0} files pass the filter".format(len(accepted)))
    return accepted
def __init__(self,
             input_dir,
             output_dir,
             color_label_mapping=COLOR_TO_LABEL_MAPPING,
             img_extension='png',
             verbose=True):
    """Store the conversion settings and list the input files.

    :param input_dir: existing directory to read from.
    :param output_dir: directory to write to (created through
        ``coerce_to_path_and_create_dir``).
    :param color_label_mapping: mapping stored on the instance as is.
    :param img_extension: extension passed to ``get_files_from_dir``.
    :param verbose: verbosity flag stored on the instance.
    """
    source_dir = coerce_to_path_and_check_exist(input_dir)
    self.input_dir = source_dir
    self.files = get_files_from_dir(source_dir,
                                    valid_extensions=img_extension)
    self.output_dir = coerce_to_path_and_create_dir(output_dir)
    self.color_label_mapping = color_label_mapping
    self.verbose = verbose
def filter_by_dex(decompress_dir_path):
    """
    Classify each decompress entry by its number of dex files and save the
    result to the database.

    Two rejection rules: 1. no dex file; 2. multiple dex files. An entry with
    exactly one dex file is marked '1'.

    :param decompress_dir_path: directory whose entries are inspected.
    :return: None
    """
    dir_paths = utils.get_files_from_dir(decompress_dir_path)
    rows = []
    passed = 0
    for dir_path in dir_paths:
        dex_count = len(utils.get_files_from_dir(dir_path, file_type=".dex"))
        # MD5 is the text after the last "_" of the entry's last component.
        md5 = dir_path.split("\\")[-1].split("_")[-1]
        if dex_count == 1:
            valid = '1'
            passed += 1
            print("pass ", dir_path)
        else:
            valid = "no dex file;" if dex_count == 0 else "multiple dex file;"
            print(valid, dir_path)
        rows.append([md5, valid, dir_path])

    print("Total get {0} dir paths".format(len(dir_paths)))
    print("{0} files pass the filter".format(passed))

    print("start saving to database")
    insert_cmd = "INSERT INTO malware_info (MD5,VALID,DECOMPRESS_DIR_PATH) VALUES(%s,%s,%s)"
    mysql = MySQLUtils.MyPymysqlPool()
    mysql.insertMany(insert_cmd, rows)
    mysql.dispose()
def get_segment_set_of(dirpath, train_set_path):
    """
    Collect the segment names of the training files found in a directory.

    NOTE: Python 2 code (``viewkeys``, ``e.message``).

    :param dirpath: path to directory.
    :param train_set_path: path to trainLabels.csv .
    :return: set of segments-names extracted from all the files in the given directory.
    """
    segments = set()
    train_ids = utils.read_csv(train_set_path, 'Id', 'Class').viewkeys()

    # segments from .asm files: the text before the first ':' of every line
    asm_ext = utils.ASM_END
    for asm_name in utils.get_files_from_dir(dirpath, '.' + asm_ext):
        asm_path = dirpath + '/' + asm_name
        if asm_path not in train_ids:
            continue
        with open('%s.%s' % (asm_path, asm_ext)) as asm_file:
            for line in asm_file:
                segments.add(line.split(':', 1)[0].rstrip('\x00'))

    # segments from .dll files: the section names reported by pefile
    dll_ext = utils.DLL_END
    # TODO in ASAFIS the dll_files list is empty because the .bytes and .dll files are in different dirs,
    # TODO thus the dirpath here is of the .bytes dir but needed .dll dirpath
    for dll_name in utils.get_files_from_dir(dirpath, '.' + dll_ext):
        dll_path = dirpath + '/' + dll_name
        if dll_path not in train_ids:
            continue
        try:
            pe = pefile.PE('%s.%s' % (dll_path, dll_ext))
        except Exception as e:
            # Unparseable files are reported and skipped.
            print('Error with pefile on file: %s' % dll_name)
            print(e.message)
            continue
        for section in pe.sections:
            segments.add(section.Name.rstrip('\x00'))

    return segments
def _get_input_label_files(self):
    """Return the input files of ``self.data_path`` and their label files.

    :return: ``(input_files, label_files)``. ``label_files`` is ``None``
        when no label file exists and ``self.split == 'test'``; otherwise it
        is rebuilt from the input files so that both lists are aligned.
    :raises RuntimeError: if the numbers of inputs and labels differ.
    :raises FileNotFoundError: if (for fewer than 1e5 inputs) some input has
        no label file with the expected name.
    """
    input_files = get_files_from_dir(self.data_path, INPUT_EXTENSIONS, sort=True)
    label_files = get_files_from_dir(self.data_path, [LABEL_EXTENSION])
    if len(label_files) == 0 and self.split == 'test':
        return input_files, None
    elif len(input_files) != len(label_files):
        raise RuntimeError("The number of inputs and labels don't match")

    # Per-file name check, skipped for very large datasets as before. The
    # lengths are already known to match here, so no further assert is needed.
    if len(input_files) < 1e5:
        # Set lookup: one hash probe per input instead of a list scan.
        label_names = {str(p.name) for p in label_files}
        invalid = [
            p.stem for p in input_files
            if SEG_GROUND_TRUTH_FMT.format(p.stem, LABEL_EXTENSION) not in label_names
        ]
        if invalid:
            raise FileNotFoundError("Some inputs don't have corresponding labels: {}".format(' '.join(invalid)))

    # Derive the label paths from the (sorted) inputs so indices line up.
    label_files = [path.parent / SEG_GROUND_TRUTH_FMT.format(path.stem, LABEL_EXTENSION) for path in input_files]
    return input_files, label_files
def do_something(dirpath, ending):
    """Print a disassembly of the first two sections of one file.

    The file used is the first one returned by ``utils.get_files_from_dir``
    for the given directory and ending.

    NOTE: Python 2 code; the parenthesised single-argument prints below
    output the same text as the print statements they replace.

    :param dirpath: directory to look in.
    :param ending: file ending, appended to the listed name to open the file.
    """
    first_file = utils.get_files_from_dir(dirpath, ending)[0]
    print(first_file)

    pe = pefile.PE('%s/%s%s' % (dirpath, first_file, ending))
    disassembler = Cs(CS_ARCH_X86, CS_MODE_64)

    for section in pe.sections[:2]:
        # section attr - VirtualAddress, PointerToRawData
        code = section.get_data()
        start_address = section.PointerToRawData
        for insn in disassembler.disasm(code, start_address):
            print('0x%x:\t%s\t%s' % (insn.address, insn.mnemonic, insn.op_str))
        print('\n')
def __init__(self,
             input_dir,
             output_dir,
             suffix_fmt='-{}',
             out_ext='jpg',
             create_sub_dir=False,
             verbose=True):
    """Store the conversion settings and list the pdf files to process.

    :param input_dir: existing directory searched for ``pdf`` files.
    :param output_dir: directory to write to (created through
        ``coerce_to_path_and_create_dir``).
    :param suffix_fmt: format string stored on the instance.
    :param out_ext: output extension stored on the instance.
    :param create_sub_dir: flag stored on the instance.
    :param verbose: when true, the number of files found is reported.
    """
    self.input_dir = coerce_to_path_and_check_exist(input_dir)
    self.files = get_files_from_dir(self.input_dir, valid_extensions='pdf')
    self.output_dir = coerce_to_path_and_create_dir(output_dir)

    self.suffix_fmt = suffix_fmt
    self.out_ext = out_ext
    self.create_sub_dir = create_sub_dir
    self.verbose = verbose

    if not self.verbose:
        return
    n_found = len(self.files)
    print_info("Pdf2Image initialised: found {} files".format(n_found))
def __init__(self,
             input_dir,
             output_dir,
             labels_to_extract=None,
             in_ext=VALID_EXTENSIONS,
             out_ext='jpg',
             tag='default',
             save_annotations=True,
             straight_bbox=False,
             add_margin=True,
             draw_margin=False,
             verbose=True):
    """Set up directories, load the model selected by ``tag`` and log the setup.

    :param input_dir: existing directory, searched recursively for files
        with an ``in_ext`` extension.
    :param output_dir: output directory (created if needed); also receives
        the logger output.
    :param labels_to_extract: labels to extract; ``[1, 4]`` when None.
    :param in_ext: valid input extensions.
    :param out_ext: output extension, stored as ``self.out_extension``.
    :param tag: sub-directory of ``MODELS_PATH`` holding ``MODEL_FILE``.
    :param save_annotations: flag stored on the instance.
    :param straight_bbox: flag stored on the instance.
    :param add_margin: flag stored on the instance.
    :param draw_margin: only effective when ``add_margin`` is also true.
    :param verbose: verbosity flag stored on the instance.
    :raises ValueError: if ``labels_to_extract`` is not a subset of the
        labels the model was restricted to.
    """
    # Input / output locations
    self.input_dir = coerce_to_path_and_check_exist(input_dir).absolute()
    self.files = get_files_from_dir(self.input_dir,
                                    valid_extensions=in_ext,
                                    recursive=True,
                                    sort=True)
    self.output_dir = coerce_to_path_and_create_dir(output_dir).absolute()
    self.out_extension = out_ext
    self.logger = get_logger(self.output_dir, name='extractor')

    # Model
    model_path = coerce_to_path_and_check_exist(MODELS_PATH / tag / MODEL_FILE)
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)
    wanted_attributes = ['train_resolution', 'restricted_labels', 'normalize']
    loaded = load_model_from_path(model_path,
                                  device=self.device,
                                  attributes_to_return=wanted_attributes)
    self.model, (self.img_size, restricted_labels, self.normalize) = loaded
    self.model.eval()

    # Labels
    self.restricted_labels = sorted(restricted_labels)
    if labels_to_extract is None:
        self.labels_to_extract = [1, 4]
    else:
        self.labels_to_extract = sorted(labels_to_extract)
    if not set(self.labels_to_extract).issubset(self.restricted_labels):
        raise ValueError(
            'Incompatible `labels_to_extract` and `tag` arguments: '
            f'model was trained using {self.restricted_labels} labels only'
        )

    # Options
    self.save_annotations = save_annotations
    self.straight_bbox = straight_bbox
    self.add_margin = add_margin
    self.draw_margin = add_margin and draw_margin
    self.verbose = verbose

    # Report the configuration
    init_kwargs = {
        'tag': tag,
        'labels_to_extract': self.labels_to_extract,
        'save_annotations': save_annotations,
        'straight_bbox': straight_bbox,
        'add_margin': add_margin,
        'draw_margin': draw_margin
    }
    self.print_and_log_info(
        'Extractor initialised with kwargs {}'.format(init_kwargs))
    self.print_and_log_info(
        'Model characteristics: train_resolution={}, restricted_labels={}'.
        format(self.img_size, self.restricted_labels))
    self.print_and_log_info('Found {} input files to process'.format(
        len(self.files)))
) logger = logging.getLogger(__name__) # Directory in the root directory where the results will be saved # Useful directories ROOT_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) TRAIN_DIR = os.path.join('dataset', 'train') TRAIN_DIR_GT = os.path.join(TRAIN_DIR, 'gt') OUTPUT_DIR = os.path.join(ROOT_DIR, 'output') FIGURES_DIR = os.path.join(OUTPUT_DIR, 'figures') IMG_SHAPE = (1080, 1920) if __name__ == '__main__': # Get XML list from directory xml_gt = u.get_files_from_dir(TRAIN_DIR_GT) # Get GT from XML df_gt = u.get_bboxes_from_aicity(xml_gt) # Add noise to GT depending on noise parameter bboxes = u.add_noise_to_bboxes(df_gt, IMG_SHAPE, noise_size=True, noise_size_factor=5.0, noise_position=True, noise_position_factor=5.0) # Randomly create and destroy bounding boxes depending # on probability parameter bboxes = u.create_bboxes(bboxes, IMG_SHAPE, prob=0.5)
:param l2f: dict of label-number to file-name. :param f_list: list of file-names. :param add_labels_set: bool which tell if needed to add a line of set of labels in the file. """ csv_f = open('train_labels_filtered.csv', 'w') csv_f.write('Id,Class\n') labels_set = set() for f_name in f_list: # find the label of the file for label in l2f: if f_name in l2f[label]: csv_f.write('%s,%s\n' % (f_name, label)) # write the file and its label labels_set.add(label) break # continue to next file if add_labels_set: labels_set = sorted(labels_set) csv_f.write('\nlabels: ' + ','.join(labels_set)) csv_f.close() if __name__ == '__main__': label2files = csv_dict_to_new_dict() path = 'train50' ending = '.bytes' malware_file_list = utils.get_files_from_dir(path, ending) create_new_csv(label2files, malware_file_list)
import log
import utils


def convert(src_path, des_path):
    """Disassemble ``src_path`` into ``des_path`` with the baksmali jar.

    Runs ``java -jar <config.LIB_BAKSMALI_PATH> d <src_path> -o <des_path>``
    through ``os.system``. Any exception is written to ``apk2smali.txt``
    via ``log.write`` instead of being propagated.

    :param src_path: file to disassemble.
    :param des_path: output directory passed to ``-o``.
    """
    try:
        smali_jar_path = config.LIB_BAKSMALI_PATH
        cmd = "java -jar {0} d {1} -o {2}".format(smali_jar_path, src_path,
                                                  des_path)
        print(cmd)
        os.system(cmd)
    except Exception as e:
        log.write(filename="apk2smali.txt", message=str(e), remark=src_path)


if __name__ == "__main__":
    dir_path = "E:\\WorkPlaces\\PY_WorkPlace\\Tools\\APK_crawler\\apks"
    files = utils.get_files_from_dir(dir_path, file_type=".apk")
    count = len(files)
    print("Get {0} files".format(count))
    for file in files:
        des_path = os.path.join(config.DATA_BENIGN_SMALI_PATH,
                                file.split("\\")[-1])
        # Skip files that already have an output directory.
        # (Was `os.pat.exist`, which raises AttributeError.)
        if not os.path.exists(des_path):
            convert(file, des_path)
        count -= 1
        log.write(filename="apk2smali.txt",
                  message=str(count) + " left to convert",
                  type="INFO")
if train: RESULT_DIR = os.path.join('results', 'masks', 'train') IMAGE_DIR = TRAIN_DIR else: RESULT_DIR = os.path.join('results', 'masks', 'test') IMAGE_DIR = os.path.join('dataset', 'test') # If the directory already exists, delete it if os.path.exists(RESULT_DIR): shutil.rmtree(RESULT_DIR) # Create directory os.makedirs(RESULT_DIR) # Get list of test images in test directory test_images = get_files_from_dir(IMAGE_DIR) #test_images = os.listdir(TEST_DIR) # Set threshold based on ranges of interest ths_h = np.array([ [0.0, 0.05], # Red threshold [0.55, 0.65], # Blue threshold [0.95, 1.0] # Res threshold ]) ths_s = np.array([[0.0, 1.0]]) ths_v = np.array([[0.0, 1.0]]) # Get elapsed time t0 = time.time() t_frame = 0
logger = logging.getLogger(__name__) if __name__ == '__main__': logger.info("Starting Museum Painting Retrieval") """ ################################## TASK 1: TEXT #################################### """ candidates = list() candidates = [] if not os.path.exists("pkl/bboxes_iou_0.86658.pkl"): # Read groundtruth gt_annotations = ut.get_db(GTS_BBOXES_DIR) # Read images and find text_area for f in ut.get_files_from_dir(TRAIN_MUSEUM_DIR, excl_ext=['DS_Store']): img = ut.get_img(TRAIN_MUSEUM_DIR, f) candidates.append([ ut.get_number_from_filename(f), text.get_text_area( img, f, gt=gt_annotations[ut.get_number_from_filename(f)]) ]) # Sort bboxes candidates.sort(key=lambda x: x[0]) candidates = [x[1] for x in candidates] # Compute intersection over union mean_iou = text.compute_mean_iou(candidates, gt_annotations) # Export pkl
def __init__(self,
             input_dir,
             output_dir,
             tag="default",
             seg_fmt=SEG_GROUND_TRUTH_FMT,
             labels_to_eval=None,
             save_annotations=True,
             labels_to_annot=None,
             predict_bbox=False,
             verbose=True):
    """Set up directories, load the model selected by ``tag`` and log the setup.

    :param input_dir: existing directory, searched recursively for files
        with a ``VALID_EXTENSIONS`` extension.
    :param output_dir: output directory (created if needed); also receives
        the logger output.
    :param tag: sub-directory of ``MODELS_PATH`` holding ``MODEL_FILE``.
    :param seg_fmt: format string stored as ``self.seg_fmt``.
    :param labels_to_eval: labels to evaluate; ``[ILLUSTRATION_LABEL]`` when
        None. Every one of them must be among the model's restricted labels.
    :param save_annotations: flag stored on the instance.
    :param labels_to_annot: labels to annotate; falls back to the evaluated
        labels when falsy.
    :param predict_bbox: flag stored on the instance.
    :param verbose: verbosity flag stored on the instance.
    """
    # Input / output locations
    self.input_dir = coerce_to_path_and_check_exist(input_dir).absolute()
    self.files = get_files_from_dir(self.input_dir,
                                    valid_extensions=VALID_EXTENSIONS,
                                    recursive=True,
                                    sort=True)
    self.output_dir = coerce_to_path_and_create_dir(output_dir).absolute()
    self.seg_fmt = seg_fmt
    self.logger = get_logger(self.output_dir, name='evaluator')

    # Model
    model_path = coerce_to_path_and_check_exist(MODELS_PATH / tag / MODEL_FILE)
    device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.device = torch.device(device_name)
    wanted_attributes = ['train_resolution', 'restricted_labels', 'normalize']
    loaded = load_model_from_path(model_path,
                                  device=self.device,
                                  attributes_to_return=wanted_attributes)
    self.model, (self.img_size, restricted_labels, self.normalize) = loaded
    self.model.eval()

    # Labels
    self.restricted_labels = sorted(restricted_labels)
    if labels_to_eval is None:
        self.labels_to_eval = [ILLUSTRATION_LABEL]
    else:
        self.labels_to_eval = sorted(labels_to_eval)
    self.labels_to_rm = set(self.restricted_labels).difference(
        self.labels_to_eval)
    common_labels = set(self.labels_to_eval).intersection(
        self.restricted_labels)
    assert len(common_labels) == len(self.labels_to_eval)

    # Colour mappings; label indices start at 1.
    self.restricted_colors = [
        LABEL_TO_COLOR_MAPPING[label] for label in self.restricted_labels
    ]
    idx_to_color = {}
    for label, color in zip(self.restricted_labels, self.restricted_colors):
        idx_to_color[self.restricted_labels.index(label) + 1] = color
    self.label_idx_color_mapping = idx_to_color
    color_to_idx = {}
    for idx, color in self.label_idx_color_mapping.items():
        color_to_idx[color] = idx
    self.color_label_idx_mapping = color_to_idx

    # One RunningMetrics per key, created lazily on first access.
    def new_running_metrics():
        return RunningMetrics(self.restricted_labels, self.labels_to_eval)

    self.metrics = defaultdict(new_running_metrics)

    # Options
    self.save_annotations = save_annotations
    self.labels_to_annot = labels_to_annot or self.labels_to_eval
    self.predict_bbox = predict_bbox
    self.verbose = verbose

    # Report the configuration
    self.print_and_log_info('Output dir: {}'.format(
        self.output_dir.absolute()))
    init_kwargs = {
        'labels_to_eval': self.labels_to_eval,
        'save_annotations': save_annotations
    }
    self.print_and_log_info(
        'Evaluator initialised with kwargs {}'.format(init_kwargs))
    self.print_and_log_info('Model tag: {}'.format(model_path.parent.name))
    self.print_and_log_info(
        'Model characteristics: train_resolution={}, restricted_labels={}'.
        format(self.img_size, self.restricted_labels))
    self.print_and_log_info('Found {} input files to process'.format(
        len(self.files)))