def model_first_predict(): ''' ''' # self.blackbox_detector for filename in tqdm.tqdm(self.dataset_iterator(args.datadir)): filepath = os.path.join(args.datadir, filename) binary = open(filepath, 'rb').read() ember.predict_sample(self.blackbox_detector, binary)
def predict():
    """Predict every sample under args.datadir and write hash/score pairs
    to <args.output>/result.csv.

    Returns:
        int: the number of samples whose prediction raised an error.
    """
    y_pred = []
    name = []
    err = 0
    # Count of regular files directly under the data dir (progress-bar total).
    end = len(next(os.walk(args.datadir))[2])
    for sample in tqdm.tqdm(sample_iterator(), total=end):
        fullpath = os.path.join(args.datadir, sample)
        if os.path.isfile(fullpath):
            # Use a context manager so the handle is closed promptly.
            with open(fullpath, "rb") as fh:
                binary = fh.read()
            name.append(sample)
            try:
                y_pred.append(ember.predict_sample(lgbm_model, binary))
            except KeyboardInterrupt:
                sys.exit()
            except Exception as e:
                # Record a neutral score so rows stay aligned with `name`.
                y_pred.append(0)
                # BUG FIX: message previously read "error is occuered".
                print("{}: {} error occurred".format(sample, e))
                err += 1
    series = OrderedDict([('hash', name), ('y_pred', y_pred)])
    r = pd.DataFrame.from_dict(series)
    r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)
    return err
def main():
    """CLI entry point: score one or more PE files with a trained ember model.

    Prints the bare score for a single input, or tab-separated
    ``path<TAB>score`` lines for multiple inputs.
    """
    prog = "classify_binaries"
    descr = "Use a trained ember model to make predictions on PE files"
    parser = argparse.ArgumentParser(prog=prog, description=descr)
    parser.add_argument("-m", "--modelpath", type=str, default=None, required=True, help="Ember model")
    parser.add_argument("binaries", metavar="BINARIES", type=str, nargs="+", help="PE files to classify")
    args = parser.parse_args()

    if not os.path.exists(args.modelpath):
        parser.error("ember model {} does not exist".format(args.modelpath))
    lgbm_model = lgb.Booster(model_file=args.modelpath)

    for binary_path in args.binaries:
        if not os.path.exists(binary_path):
            print("{} does not exist".format(binary_path))
            # BUG FIX: previously fell through and crashed trying to open
            # the missing file; skip to the next argument instead.
            continue
        with open(binary_path, "rb") as fh:
            file_data = fh.read()
        score = ember.predict_sample(lgbm_model, file_data)

        if len(args.binaries) == 1:
            print(score)
        else:
            print("\t".join((binary_path, str(score))))
def main():
    """CLI entry point: score every file in --datadir with a trained ember
    model and write hash/score pairs to <output>/result.csv.

    Prints the count of samples whose prediction raised an error.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("-m", "--modelpath", type=str, required=True, help="trained model path")
    parser.add_argument("-d", "--datadir", type=str, help="Directory for predicting dataSets", required=True)
    parser.add_argument("-o", "--output", type=str, help="output label and y_pred", required=True)
    args = parser.parse_args()

    if not os.path.exists(args.modelpath):
        parser.error("ember model {} does not exist".format(args.modelpath))
    # BUG FIX: these messages were copy-pasted and claimed "ember model ...
    # does not exist" for the data directory as well.
    if not os.path.exists(args.datadir):
        parser.error("data directory {} does not exist".format(args.datadir))
    if not os.path.exists(args.output):
        os.mkdir(args.output)

    lgbm_model = lgb.Booster(model_file=args.modelpath)

    errorcount = 0
    y_pred = []
    _name = []
    for filename in tqdm.tqdm(os.listdir(args.datadir)):
        _file = os.path.join(args.datadir, filename)
        if os.path.isfile(_file):
            with open(_file, "rb") as fh:
                binary = fh.read()
            _name.append(filename)
            try:
                y_pred.append(ember.predict_sample(lgbm_model, binary))
            except KeyboardInterrupt:
                sys.exit()
            # BUG FIX: was a bare ``except:`` which also swallowed SystemExit
            # and other non-error signals.
            except Exception:
                # Record a neutral score so rows stay aligned with _name.
                y_pred.append(0)
                errorcount += 1

    y_pred_01 = np.array(y_pred)
    r = pd.DataFrame({'hash': _name, 'y_pred': y_pred_01})
    r.to_csv(os.path.join(args.output, 'result.csv'), index=False, header=None)
    print("Error : %d" % (errorcount))
def scan(filelist, conf=DEFAULTCONF):
    """Score each PE32 file in *filelist* with the module-level ember model.

    Returns a ``(results, metadata)`` pair where ``results`` is a list of
    ``(filename, {'Prediction': score})`` tuples.
    """
    results = []
    for fname in filelist:
        # Nothing can be classified unless libmagic results are available.
        if REQUIRES[0] is None:
            continue
        file_type = _get_libmagicresults(REQUIRES[0][0], fname)
        # Only PE32 binaries are scored by the ember model.
        if not file_type.startswith('PE32'):
            continue
        with open(fname, 'rb') as fh:
            score = ember.predict_sample(LGBM_MODEL, fh.read())
        results.append((fname, {'Prediction': score}))

    metadata = {"Name": NAME, "Type": TYPE}
    return (results, metadata)
def predict(self, bytez):
    """Return True when the model's score for *bytez* exceeds the threshold."""
    score = predict_sample(self.model, bytez)
    return score > self.thresh
def predict_sample(self, binary_data):
    """Return the ember model's prediction score for *binary_data*."""
    return ember.predict_sample(self.lgbm_model, binary_data)
"""Score a single PE file with a pre-trained ember model and print the score."""
import ember
import lightgbm as lgb
import sys

# Verify the number of arguments.
if len(sys.argv) != 2:
    print("Usage: python {0} <file>".format(sys.argv[0]))
    # BUG FIX: use sys.exit() — the bare exit() helper is injected by the
    # `site` module and is not guaranteed to exist (e.g. under `python -S`).
    sys.exit(1)

# Load the trained model and score the given PE file.
# NOTE(review): model path is hard-coded to a local machine — consider
# taking it as a second argument.
MODEL_PATH = "/home/geoffryaf/Desktop/MDP/ember_dataset_2018_2/ember2018/ember_model_2018.txt"
lgbm_model = lgb.Booster(model_file=MODEL_PATH)
# BUG FIX: the handle was previously never closed and the variable shadowed
# the (Python 2) builtin name ``file``.
with open(sys.argv[1], "rb") as fh:
    file_data = fh.read()
print(ember.predict_sample(lgbm_model, file_data))
def analysis(filename=0, data=0, mode=0, mfl=False, ip=''):
    """Analyze a PE sample and cache the verdict in the MalDet.db SQLite DB.

    filename -- sample name (or full local path when ``mfl`` is True)
    data     -- raw file bytes; when given, they are written to disk first
    mode     -- '0' minimal info dict, '1' full info dict, '2' sandbox run.
                NOTE(review): mode is compared as a *string* throughout,
                despite the integer default — callers must pass strings.
    mfl      -- when True, treat ``filename`` as an absolute/local path
    ip       -- analyzer IP address recorded alongside the result

    Returns an error dict for non-PE input, a cached ``unit_pack`` result,
    an info dict (modes '0'/'1'), or None on some sandbox paths.
    """
    FLAG_S = False  # sample already cached in MalDet_S (static results)
    FLAG_B = False  # sample already cached in MalDet_B (sandbox blobs)
    conn = sqlite3.connect("MalDet.db")
    c = conn.cursor()
    # If raw bytes were supplied, persist them under the upload directory.
    if data:
        path_to_file = os.path.join(UPLOAD_DIRECTORY, filename)
        with open(path_to_file, "wb") as fp:
            fp.write(data)
    # If only a name/path was supplied, read the bytes from disk.
    if filename and not data:
        if mfl:
            path_to_file = filename
        else:
            path_to_file = os.path.join(UPLOAD_DIRECTORY, filename)
        with open(path_to_file, 'rb') as f:
            data = f.read()
    # Reject anything that pefile cannot parse as a PE.
    try:
        check = pefile.PE(path_to_file)
    except pefile.PEFormatError:
        err = {'error': 'this is not PE file'}
        return err
    imphash, ssdeep_hash, sha = hash_calc(path_to_file)
    # Probe both caches by SHA-1.
    c.execute("""SELECT sha_1 from MalDet_S where sha_1=?""", (sha,))
    rows = c.fetchall()
    if rows:
        FLAG_S = True
    c.execute("""SELECT sha1_hash from MalDet_B where sha1_hash=?""", (sha,))
    rows1 = c.fetchall()
    if rows1:
        FLAG_B = True
    if FLAG_S and mode != '2':
        # Static verdict cached: return it without re-analyzing.
        print('existing file, searching...')
        c.execute("""SELECT * from MalDet_S where sha_1=?""", (sha,))
        rows = c.fetchall()
        conn.commit()
        conn.close()
        return unit_pack(rows, mode)
    elif FLAG_B and mode == '2':
        # Sandbox result cached: stream the stored blob.
        print('existing file, searching...')
        readBlob(sha)
    else:
        print("new file, processing...")
        if mfl == True:
            filepath = filename
        else:
            filepath = os.path.join(UPLOAD_DIRECTORY, filename)
        if not FLAG_S:
            # Large files (>1.5 MB) go to the ember model; smaller ones are
            # rendered to an RGB image and classified by the TF image model.
            if os.path.getsize(filepath) > 1500000:
                data = open(filepath, "rb").read()
                prob = ember.predict_sample(ember_model, data)
                if prob >= 0.5:
                    result = ('malware', prob)
                else:
                    result = ('benign', 1-prob)
            else:
                picture = createRGBImage(filepath)
                result = predict_image(tf_model, picture)
                os.remove(picture)
            # Strip any directory components before storing the name.
            if '/' in filename:
                filename = filename.split('/')[-1]
            matches = search_for_matches(c, filename, ssdeep_hash, imphash, sha)
            # Probability is truncated (not rounded) to three decimal places.
            data_tuple = (filename, result[0], int(result[1]*1000)/1000, sha, imphash, ssdeep_hash, datetime.now().strftime("%d %B %Y"), ip, matches)
            query = """INSERT INTO MalDet_S (filename, filetype, type_probability, sha_1, imphash, ssdeep, analysis_date, analyzer_ip, possible_matches) VALUES (?,?,?,?,?,?,?,?,?)"""
            c.execute(query, data_tuple)
            if mode != '2':
                # Static-only run: delete the sample and persist the new row.
                os.remove(filepath)
                conn.commit()
                conn.close()
        if mode == '2':
            # Dynamic analysis: run the sandbox, then store result.zip as a blob.
            print('Sandbox here!!!')
            subprocess.call(['python3', 'sandbox.py', filepath])
            try:
                with open('result.zip', 'rb') as f:
                    blobData = f.read()
                query = """INSERT INTO MalDet_B (filename, sha1_hash, result, analysis_date, analyzer_ip) VALUES (?,?,?,?,?)"""
                data_tuple_2 = (filename.split('/')[-1], sha, blobData, datetime.now().strftime("%d %B %Y"), ip)
                c.execute(query, data_tuple_2)
                os.remove(filepath)
                conn.commit()
                conn.close()
            except Exception as e:
                # Sandbox failed to produce result.zip: reset the work dir.
                # NOTE(review): the exception itself is discarded — consider
                # logging ``e`` for diagnosis.
                shutil.rmtree('files')
                os.mkdir('files')
        elif mode == '1':
            # NOTE(review): this branch reads data_tuple, which only exists
            # when not FLAG_S; reachable only because FLAG_S+mode!='2'
            # returned earlier — confirm this invariant holds for all callers.
            info = dict()
            info['name'] = data_tuple[0]
            info['type'] = data_tuple[1]
            info['probability'] = int(data_tuple[2]*1000)/1000
            info['sha1'] = data_tuple[3]
            info['imphash'] = data_tuple[4]
            info['ssdeep'] = data_tuple[5]
            info['date'] = data_tuple[6]
            info['source_ip'] = data_tuple[7]
            info['matches'] = data_tuple[8]
            return info
        elif mode == '0':
            info = dict()
            info['name'] = data_tuple[0]
            info['type'] = data_tuple[1]
            return info