def feature_extraction(process_function, outputfilename, delimeter=',',
                       dataset='dataset', extensionList=None):
    """Walk a dataset directory, run *process_function* on each file and
    write the collected feature dictionaries into one CSV file.

    Attention! Assumes each base filename is a file hash value or otherwise
    unique for the file.

    :param process_function: callable executed per file to extract features;
        must return a dict, or None to skip the file
    :type process_function: callable
    :param outputfilename: name of the CSV file the results are written to
    :type outputfilename: str
    :param delimeter: field delimiter passed to the extractor and the writer
    :type delimeter: str
    :param dataset: dataset file or directory path
    :type dataset: str
    :param extensionList: file extensions to process; defaults to
        ['.exe', '.dll'] (None default avoids the shared-mutable-default pitfall)
    :type extensionList: list
    :return: None
    """
    if extensionList is None:
        extensionList = ['.exe', '.dll']
    if os.path.isfile(dataset):
        # Single-file datasets are not handled yet.
        pass
    elif os.path.isdir(dataset):
        listOfFile = fileutil.getFilePaths(dataset, extensionList)
        if not listOfFile:
            # BUG FIX: the original referenced the loop variable `filename`
            # after the loop, raising NameError when no files matched.
            print('No files with extensions ' + str(extensionList) +
                  ' found in ' + dataset)
            return
        # Collect information from each source file.
        content_list = []
        for index, filename in enumerate(listOfFile):
            # Assume the base filename (before the first dot) is the hash.
            hash_id = os.path.basename(filename).split('.')[0]
            # The containing directory path is used as the class label.
            class_id = os.path.dirname(filename)
            content = process_function(filename, delimeter=delimeter)
            if content is not None:
                content['hash'] = hash_id
                content['class_id'] = class_id
                content_list.append(content)
            print(outputfilename + "-" + str(index) + " - " + str(len(listOfFile)))
        # Write collected rows next to the last processed file.
        # NOTE(review): this uses the *last* file's directory, not the dataset
        # root — confirm that is intended.
        outfile = os.path.dirname(
            filename) + os.sep + 'csv' + os.sep + outputfilename
        try:
            output.writeSingleIntoCSVFile(outfile, content_list, delimeter)
        except IOError as ioe:
            print(str(ioe))
            pause()
    else:
        print('File type must be file or directory.')
def mergeAll(filespath, csvOut, selectedColumn=None):
    """Merge every CSV file found under *filespath* into a single CSV file.

    :param filespath: directory containing the CSV files to merge
    :param csvOut: path of the merged output CSV file
    :param selectedColumn: optional list of column names to keep; when
        None/empty, all columns are kept (None default avoids the
        shared-mutable-default pitfall)
    :return: the return value of writeIntoCSVFile for the merged rows
    """
    if selectedColumn is None:
        selectedColumn = []
    files = fileutil.getFilePaths(filespath)
    dictlist = []
    for file in files:
        # `with` guarantees the handle is closed even if reading raises.
        with open(file) as f1:
            dicts = list(csv.DictReader(f1))
        # BUG FIX: the original tested `selectedColumn is not []`, an identity
        # comparison against a fresh literal that is *always* True, so
        # narrowDict ran even with no columns selected. Truthiness expresses
        # the apparent intent: narrow only when columns were requested.
        if selectedColumn:
            dicts = narrowDict(dicts, selectedColumn)
        dictlist += dicts
    return writeIntoCSVFile(csvOut, dictlist)
def opcodeExtraction(dir, delimeter=","):
    """Extract opcode sequences from every .asm file under *dir*.

    For each assembly file the opcode sequence is joined with *delimeter*
    and written to a companion '.opcode' file; files whose output already
    exists are skipped.

    :param dir: directory scanned for .asm files
    :param delimeter: separator placed between consecutive opcodes
    :return: None
    """
    for asm_path in fu.getFilePaths(dir, [".asm"]):
        _, _, out_path = fu.fileNameSettings(asm_path, ".opcode", "opcode")
        print(out_path)
        # Skip files already processed on a previous run.
        if os.path.isfile(out_path):
            continue
        parsed, _ = opcodeUtils.get(asm_path)
        sequence = opcodeUtils.opcodeSeq(parsed)
        writeFile(out_path, delimeter.join(sequence))
def asmbuilding(dir, disName, disassembler, isWriting=True):
    """Disassemble every .exe file under *dir* with the given disassembler.

    :param dir: directory whose files are disassembled
    :param disName: disassembler name, used to derive the output path
    :param disassembler: abstract disassembler exposing getDisassembledCode
    :param isWriting: when True, write the assembly to the derived .asm path
    :return: None
    """
    for exe_path in fu.getFilePaths(dir, [".exe"]):
        dirname, filename, out_path = fu.fileNameSettings(exe_path, ".asm", disName)
        code = disassembler.getDisassembledCode(exe_path)
        if isWriting:
            writeFile(out_path, code)
        # Progress message kept byte-identical to the original output.
        print("klasor:" + dirname + " dosyadi:" + filename + " disassembler:" + disName)
def make(dirname, apply, resultfile, cv=10):
    """Run *apply* over every CSV file under *dirname* in parallel and report.

    Train/test sets and their class values are prepared, according to
    cross-validation and tag type, by the *apply* callable.

    :param dirname: directory scanned for .csv feature files (one CSV per
        disassembly result is expected)
    :param apply: callable(csv_path, cv) executed once per CSV file
    :param resultfile: path the aggregated report is written to
    :param cv: cross-validation fold count
    :return: None (results are handed to report())
    """
    from joblib import Parallel, delayed
    import multiprocessing

    # For each disassembly result there should be a separate CSV file.
    csvfiles = fu.getFilePaths(dirname, extensionList=[".csv"])
    num_cores = multiprocessing.cpu_count()
    # BUG FIX: the loop variable was named `csv`, shadowing the csv module
    # used elsewhere in this file; renamed to avoid silent breakage if the
    # generator expression ever grows a csv-module call.
    results = Parallel(n_jobs=num_cores)(
        delayed(apply)(csvfile, cv) for csvfile in csvfiles)
    # The input file list is appended so report() can associate each result
    # with the CSV it came from.
    results.append(csvfiles)
    print(results)
    report(results, resultfile)
def core_process(process_function, output_ext, delimeter=',', dataset='dataset',
                 extensionList=None, feature_type="text"):
    """Walk a dataset directory, extract features per file and write each
    result to a per-file output next to the source.

    Attention! Assumes each filename is a file hash value or otherwise
    unique for the file.

    :param process_function: callable executed per file to extract features;
        must return the content to write, or None to skip the file
    :type process_function: callable
    :param output_ext: extension/suffix used for each per-file output
    :type output_ext: str
    :param delimeter: field delimiter passed through to the extractor
    :type delimeter: str
    :param dataset: dataset file or directory path
    :type dataset: str
    :param extensionList: file extensions to process; defaults to
        ['.exe', '.dll'] (None default avoids the shared-mutable-default pitfall)
    :type extensionList: list
    :param feature_type: "seq" writes via writeFeatureAsSequence, anything
        else via writeIntoFile
    :type feature_type: str
    :return: None
    """
    if extensionList is None:
        extensionList = ['.exe', '.dll']
    if os.path.isfile(dataset):
        # Single-file datasets are not handled yet.
        pass
    elif os.path.isdir(dataset):
        listOfFile = fileutil.getFilePaths(dataset, extensionList)
        # Collect information from each source file.
        for index, filename in enumerate(listOfFile):
            content = process_function(filename, delimeter=delimeter)
            if content is not None:
                try:
                    if feature_type == "seq":
                        output.writeFeatureAsSequence(filename, output_ext, content)
                    else:
                        output.writeIntoFile(filename, output_ext, content)
                except IOError as ioe:
                    print(str(ioe))
                    pause()
            print(str(index) + " - " + str(len(listOfFile)))
    else:
        print('File type must be file or directory.')