def main(targets):
    """Run the requested pipeline targets: 'clean', 'data', 'test', 'transform'."""
    if 'clean' in targets:
        # Remove every generated data directory; missing ones are ignored.
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)
    if 'data' in targets:
        cfg = load_params(TOP_PATH + DATA_PARAMS)
        get_data(**cfg)
    if 'test' in targets:
        cfg = load_params(TOP_PATH + TEST_PARAMS)
        get_data(**cfg)
    if 'transform' in targets:
        if not os.path.exists(TOP_PATH + '/data/cleaned'):
            os.makedirs(TOP_PATH + '/data/cleaned')
        for filename in os.listdir(TOP_PATH + '/data/raw'):
            raw_path = TOP_PATH + '/data/raw/' + str(filename)
            if 'STOPS' in filename:
                # Cleaning routine depends on the file's year range.
                if '2018' in filename:
                    temp_df = cleaning.clean_2018_2019(raw_path)
                elif '2017' in filename:
                    temp_df = cleaning.clean_2017(raw_path)
                else:
                    temp_df = cleaning.clean_2014_2016(raw_path)
            elif 'csv' in filename:
                temp_df = cleaning.clean_trends(raw_path)
            # NOTE(review): temp_df is never written out in this loop —
            # presumably the cleaning functions persist results as a side
            # effect; confirm, otherwise the cleaned frames are discarded.
    return
def main(targets):
    """Run the requested pipeline targets: 'clean', 'data', 'test-project', 'model'."""
    if 'clean' in targets:
        # Wipe all generated data and test artifacts.
        for stale_dir in ('data/raw', 'data/cleaned', 'test/raw', 'test/cleaned'):
            shutil.rmtree(stale_dir, ignore_errors=True)
    if 'data' in targets:
        # Download the raw data, then clean the stops files.
        get_data(**load_params(DATA_PARAMS))
        clean_stops(**load_params(CLEAN_PARAMS))
    if 'test-project' in targets:
        # End-to-end run (fetch -> clean -> model) on the test configuration.
        get_data(**load_params(TEST_DATA_PARAMS))
        clean_stops(**load_params(TEST_CLEAN_PARAMS))
        driver(**load_params(TEST_MODEL_PARAMS))
    if 'model' in targets:
        driver(**load_params(MODEL_PARAMS))
    return
def main(targets):
    '''
    Run the full project pipeline: data download, EDA, AutoPhrase, and
    visualization.

    NOTE(review): `targets` is accepted but never consulted — every stage
    runs unconditionally on each invocation; confirm this is intended.
    '''
    with open('config/data-params.json') as fh:
        data_cfg = json.load(fh)
    get_data(**data_cfg)

    with open('config/eda-params.json') as fh:
        eda_cfg = json.load(fh)
    do_eda(**eda_cfg)

    with open('config/auto-params.json') as fh:
        auto_cfg = json.load(fh)
    autophrase(**auto_cfg)

    with open('config/visual-params.json') as fh:
        visual_cfg = json.load(fh)
    visual(**visual_cfg)
    return
def main(targets):
    """Run targets: clean, clean-test, data, process, eda, analyze, test-project."""
    # Ensure the top-level output directories exist before any target runs.
    for out_dir in ('data/', 'viz/'):
        if not os.path.exists(out_dir):
            os.mkdir(out_dir)
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)
    if 'clean-test' in targets:
        shutil.rmtree('test_data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)
    if 'data' in targets:
        get_data(**load_params(DATA_PARAMS))
    if 'process' in targets:
        process(**load_params(PROCESS_PARAMS))
    if 'eda' in targets:
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')
        generate_viz(**load_params(EDA_PARAMS))
    if 'analyze' in targets:
        if not os.path.exists('viz/Analysis'):
            os.mkdir('viz/Analysis')
        analyze(**load_params(ANALYZE_PARAMS))
    if 'test-project' in targets:
        # End-to-end run (process -> eda -> analyze) on the test configs.
        process(**load_params(TEST_PROCESS_PARAMS))
        for viz_dir in ('viz/EDA', 'viz/Analysis'):
            if not os.path.exists(viz_dir):
                os.mkdir(viz_dir)
        generate_viz(**load_params(TEST_EDA_PARAMS))
        analyze(**load_params(TEST_ANALYZE_PARAMS))
    return
def main(targets):
    """Build the 'data' target using the parameters in data-params.json."""
    if 'data' in targets:
        with open('data-params.json') as fh:
            data_cfg = json.load(fh)
        # Download/generate the raw data from the loaded configuration.
        get_data(**data_cfg)
    return
def main(targets):
    """Run targets: clean, clean-test, data, process, eda, test."""
    # Create the top-level output directories if they are missing.
    if not os.path.exists('data/'):
        os.mkdir('data/')
    if not os.path.exists('viz/'):
        os.mkdir('viz/')
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)
    if 'clean-test' in targets:
        shutil.rmtree('test_data/cleaned', ignore_errors=True)
        shutil.rmtree('viz', ignore_errors=True)
    if 'data' in targets:
        get_data(**load_params(DATA_PARAMS))
    if 'process' in targets:
        process(**load_params(PROCESS_PARAMS))
    if 'eda' in targets:
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')
        # Load all three EDA configs, then render one viz pass per dataset.
        stops_cfg = load_params(EDA_STOPS_PARAMS)
        crimes_cfg = load_params(EDA_CRIMES_PARAMS)
        arrests_cfg = load_params(EDA_ARRESTS_PARAMS)
        gv_stops(**stops_cfg)
        gv_crimes(**crimes_cfg)
        gv_arrests(**arrests_cfg)
    if 'test' in targets:
        # Same process + EDA flow, driven by the test configuration.
        process(**load_params(TEST_PROCESS_PARAMS))
        if not os.path.exists('viz/EDA'):
            os.mkdir('viz/EDA')
        stops_cfg = load_params(TEST_EDA_STOPS_PARAMS)
        crimes_cfg = load_params(TEST_EDA_CRIMES_PARAMS)
        arrests_cfg = load_params(TEST_EDA_ARRESTS_PARAMS)
        gv_stops(**stops_cfg)
        gv_crimes(**crimes_cfg)
        gv_arrests(**arrests_cfg)
    return
def main(targets):
    """Run pipeline targets: 'clean' and 'test-project'."""
    # make the clean target
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/cleaned', ignore_errors=True)
        shutil.rmtree('data/sunset', ignore_errors=True)
        shutil.rmtree('data/model', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)
    if 'test-project' in targets:
        # Build the data target: download each configured year, then derive
        # the veil-of-darkness data and inter-twilight windows per year.
        cfg = load_params(DATA_PARAMS)
        data_years = cfg["year"]
        # Plain for-loops instead of the original list comprehensions:
        # these calls run purely for their side effects and their return
        # values were being discarded.
        for year in data_years:
            get_data(year)
        for year in data_years:
            get_veil(year)
        for year in data_years:
            build_intertw(year)
        # Build the test target on the first configured test year.
        cfg = load_params(TEST_DATA_PARAMS)
        test_year = cfg["year"][0]
        get_data_test(test_year)
def main(targets):
    """Run pipeline targets: 'clean', 'data', 'test'."""
    if 'clean' in targets:
        # Drop every generated directory; missing ones are ignored.
        for stale_dir in ('data/temp', 'data/out', 'data/test'):
            shutil.rmtree(stale_dir, ignore_errors=True)
    if 'data' in targets:
        get_data(**load_params(DATA_PARAMS))
    if 'test' in targets:
        get_data(**load_params(TEST_PARAMS))
    return
def run(save_data: bool = True) -> None:
    """
    Entry point: compute liquidity costs from limit-order-book data and,
    when `save_data` is true, persist the image and static data to files.
    """
    ticker_df, lob_df = etl.get_data()  # ticker_df is not used below
    asks_df, bids_df = get_processed_lob_time_series(lob_df)
    results = compute_liquidity_cost(asks_df, bids_df)
    if save_data:
        save_liquidity_data(results)
def main(targets):
    """Run targets: clean, convert, data, data-test, process, test-project."""
    if 'clean' in targets:
        shutil.rmtree('data/', ignore_errors=True)
    if 'convert' in targets:
        convert_data(**load_params(CONVERT_PARAMS)['data'])
    if 'data' in targets:
        get_data(**load_params(DATA_PARAMS)['data'])
    if 'data-test' in targets:
        get_data(**load_params(TEST_PARAMS)['data'])
    if 'process' in targets:
        # NOTE(review): this reads TEST_PARAMS, not a dedicated process
        # params file — confirm that is intentional.
        process_data(**load_params(TEST_PARAMS)['process'])
    if 'test-project' in targets:
        # Full end-to-end run on the test configuration.
        data_cfg = load_params(TEST_PARAMS)['data']
        get_data(**data_cfg)
        process_cfg = load_params(TEST_PARAMS)['process']
        process_data(**process_cfg, test=True)
    return
def main(targets):
    """
    Run targets: clean, data, test, transform.
    'test-project' is shorthand that expands into 'test' + 'transform'.
    """
    if 'test-project' in targets:
        targets.append('test')
        targets.append('transform')
    if 'clean' in targets:
        shutil.rmtree('data/raw', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/test', ignore_errors=True)
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)
    if 'test' in targets:
        cfg = load_params(TEST_PARAMS)
        get_data(**cfg)
    if 'transform' in targets:
        for directory in directories:
            if not os.path.exists(directory):
                continue
            for filename in os.listdir(directory):
                # Guard clause replaces the original nested if/else with
                # redundant trailing `continue` statements.
                if not filename.endswith("csv"):
                    continue
                if '2018' in filename:
                    temp_df = cleaning.clean_2018_2019(directory + '/' + filename)
                    df = calculations.get_inner_twilight_period(temp_df)
                    calculations.veil_of_darkness(df, 2018, notebook=False)
                else:
                    # Year is encoded in the first four filename characters.
                    year = int(filename[0:4])
                    temp_df = cleaning.clean_2014_2017(directory + '/' + filename)
                    df = calculations.get_inner_twilight_period(temp_df)
                    calculations.veil_of_darkness(df, year, notebook=False)
    return
def main(targets):
    """Run targets: clean, data, test. Prints total wall-clock time at the end."""
    start_time = time.time()
    if 'clean' in targets:
        shutil.rmtree('data/temp', ignore_errors=True)
        shutil.rmtree('data/out', ignore_errors=True)
        shutil.rmtree('data/raw', ignore_errors=True)
    if 'data' in targets:
        # NOTE(review): cfg is loaded but never used in this branch — a
        # get_data(**cfg) call looks like it is missing; confirm intent.
        cfg = load_params(DATA_PARAMS)
    if 'test' in targets:
        cfg = load_params(TEST_PARAMS)
        get_data(**cfg)
        smali_dict = load_smali_dict(**cfg)
        # Use the first app in the dict as the single-app example input.
        app = smali_dict[list(smali_dict.keys())[0]]
        # Build each feature matrix from its two-part calculation result.
        a_calc = a_matrix_calc(smali_dict)
        a_matrix = A_matrix(a_calc[0], a_calc[1])
        b_calc = b_matrix_calc(app)
        b_matrix = B_matrix(b_calc[0], b_calc[1])
        p_calc = p_matrix_calc(app)
        p_matrix = P_matrix(p_calc[0], p_calc[1])
        i_calc = i_matrix_calc(app)
        i_matrix = I_matrix(i_calc[0], i_calc[1])
        # Feature engineering, then logistic regression on the resulting split.
        cleaned = preprocessing_feature_engineering(smali_dict)
        logreg(cleaned[0], cleaned[1], cleaned[2], cleaned[3])
    print('Finished in: {} seconds'.format(time.time() - start_time))
    return
def main(targets):
    '''
    Runs the main project pipeline logic, given the targets.
    targets must contain: 'data', 'analysis', 'model'.
    `main` runs the targets in order of data=>analysis=>model.
    Only the 'data' stage currently does anything; the others are stubs.
    '''
    if 'data' in targets:
        data = get_data()
    if 'analysis' in targets:
        pass  # TODO: analysis stage not yet implemented
    if 'model' in targets:
        pass  # TODO: model stage not yet implemented
    return
def get_data(sin_cos_transform=False) -> "asks_merged_df, bids_merged_df":
    """
    Assemble the ask/bid datasets for ML modelling: merge the ticker data
    with each side's cost data on "Time", add (optionally sin/cos-encoded)
    time columns, and attach categorical instrument-code ids.

    :param sin_cos_transform: forwarded to get_time_cols to cyclically
        encode the time columns.
    :return: asks_merged_df, bids_merged_df
    """
    # Get data
    ticker_df, lob_df = etl.get_data()
    asks_costs_df, bids_costs_df = etl.get_costs_data()
    asks_costs_df.dropna(axis=0, inplace=True)
    bids_costs_df.dropna(axis=0, inplace=True)

    # Normalise the join-key column name to "Time" on every frame, then
    # parse it as a UTC timestamp so the merges align.
    ticker_df.rename({"Time_Hour": "Time"}, axis=1, inplace=True)
    for df in [asks_costs_df, bids_costs_df]:
        df.rename({"Time_Minute": "Time"}, axis=1, inplace=True)
    for df in [ticker_df, asks_costs_df, bids_costs_df]:
        df["Time"] = pd.to_datetime(df["Time"], utc=True)

    asks_merged_df = ticker_df.merge(asks_costs_df, on="Time")
    # BUG FIX: the bids frame was previously merged with the *asks* costs.
    bids_merged_df = ticker_df.merge(bids_costs_df, on="Time")

    asks_merged_df, asks_new_time_cols = get_time_cols(
        input_df=asks_merged_df, time_cols=TIME_COLS,
        sin_cos_transform=sin_cos_transform)
    bids_merged_df, bids_new_time_cols = get_time_cols(
        input_df=bids_merged_df, time_cols=TIME_COLS,
        sin_cos_transform=sin_cos_transform)

    asks_merged_df["Instrument_Code"] = asks_merged_df[
        "Instrument_Code"].astype("category")
    # BUG FIX: the bids column was previously assigned from the asks frame.
    bids_merged_df["Instrument_Code"] = bids_merged_df[
        "Instrument_Code"].astype("category")
    asks_merged_df["Instrument_Code_id"] = asks_merged_df[
        "Instrument_Code"].cat.codes
    bids_merged_df["Instrument_Code_id"] = bids_merged_df[
        "Instrument_Code"].cat.codes
    return asks_merged_df, bids_merged_df
def main(targets):
    '''
    Runs the main project pipeline logic, given the targets.
    targets must contain: 'data', 'train', 'analysis', 'results'.
    `main` runs the targets in order of data=>train=>analysis=>results.
    'test' (or 'all') re-runs the train, analysis and results stages.
    '''
    # Setup Logger: everything at DEBUG and above goes to a rotating file.
    logger = logging.getLogger('project_log')
    logger.setLevel(logging.DEBUG)
    fh = RotatingFileHandler('example.log', maxBytes=1000000, backupCount=0)
    fh.setLevel(logging.DEBUG)
    formatter = logging.Formatter(
        '%(asctime)s.%(msecs)03d %(levelname)s %(module)s - %(funcName)s: %(message)s'
    )
    fh.setFormatter(formatter)
    logger.addHandler(fh)
    logger.info('STARTING PROGRAM')

    # Data Target
    if 'data' in targets or 'all' in targets:
        logger.info('Starting data target')
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/twitter-api-keys.json') as fh:
            twitter_cfg = json.load(fh)
        get_data(logger, **data_cfg, **twitter_cfg)
        logger.info('Finishing data target')

    # Train / analysis / results targets share their implementation with
    # the TEST target below, so the bodies live in private helpers.
    if 'train' in targets or 'all' in targets:
        _run_train(logger, 'train')
    if 'analysis' in targets or 'all' in targets:
        _run_analysis(logger, 'analysis')
    if 'results' in targets or 'all' in targets:
        _run_results(logger, 'results')

    # Test target: re-runs train, analysis and results end to end.
    if 'test' in targets or 'all' in targets:
        logger.info('Starting TEST target')
        _run_train(logger, 'TEST train')
        _run_analysis(logger, 'TEST analysis')
        _run_results(logger, 'TEST results')
        logger.info('finished TEST target')

    logger.info('ENDING PROGRAM')
    return


def _run_train(logger, label):
    """Train models from config/train-params.json and render the train notebook."""
    logger.info('Starting {} target'.format(label))
    with open('config/train-params.json') as fh:
        train_cfg = json.load(fh)
    df = pd.read_csv(
        os.path.join(train_cfg['training_data_path'],
                     'data.csv')).drop(columns=['Unnamed: 0'])
    train_model(logger, df, **train_cfg)
    convert_notebook('train', **train_cfg)
    logger.info('finished {} target: wrote html file to {}'.format(
        label, os.path.join(train_cfg['outdir'], 'train.html')))


def _run_analysis(logger, label):
    """Compute per-user polarity stats and render the analysis notebook."""
    logger.info('Starting {} target'.format(label))
    with open('config/analysis-params.json') as fh:
        analysis_cfg = json.load(fh)
    # do user stats: load each pickled tweet, then attach per-user tweet CSVs.
    tweets = {}
    for tweet_id in analysis_cfg['tweet_ids']:
        path = os.path.join(analysis_cfg['user_data_path'],
                            'tweet_{}.csv'.format(tweet_id))
        # Use a context manager so the pickle file handle is closed
        # (the original left the file open).
        with open(path, 'rb') as pf:
            tweets[tweet_id] = pickle.load(pf)
    for key, value in tweets.items():
        for user_id in list(value['user_ids'].keys()):
            value['user_ids'][user_id] = pd.read_csv(
                os.path.join(analysis_cfg['user_data_path'],
                             'user_{}_tweets.csv'.format(user_id)))
    # Load one trained model per polarity dimension.
    mdls = []
    dims = analysis_cfg['dims']
    for dim in dims:
        path = os.path.join(analysis_cfg['model_path'], '{}.mdl'.format(dim))
        with open(path, 'rb') as pf:
            mdls.append(pickle.load(pf))
    compute_user_stats(logger, tweets, mdls, dims,
                       analysis_cfg['user_data_path'],
                       analysis_cfg['flagged'])
    convert_notebook('analysis', **analysis_cfg)
    logger.info('finished {} target: wrote html file to {}'.format(
        label, os.path.join(analysis_cfg['outdir'], 'analysis.html')))


def _run_results(logger, label):
    """Compute final results from the saved polarities and render the results notebook."""
    logger.info('Starting {} target'.format(label))
    with open('config/results-params.json') as fh:
        results_cfg = json.load(fh)
    fp = os.path.join(results_cfg['user_data_path'], 'polarities.csv')
    polarities = pd.read_csv(fp, usecols=results_cfg['dims'] +
                             ['flagged']).dropna()
    compute_results(logger, polarities, results_cfg['dims'],
                    results_cfg['outdir'])
    convert_notebook('results', **results_cfg)
    logger.info('finished {} target: wrote html file to {}'.format(
        label, os.path.join(results_cfg['outdir'], 'results.html')))
import dash_html_components as html import dash_table import datetime from etl import get_data external_stylesheets = ["https://codepen.io/chriddyp/pen/bWLwgP.css"] app = dash.Dash(__name__, external_stylesheets=external_stylesheets, requests_pathname_prefix='/app3/') datenow= datetime.datetime.now() df = get_data('SELECT * FROM result;', "SHOW columns FROM result") def generate_table(dataframe, max_rows=10): return html.Table( # Header [html.Tr([html.Th(col) for col in dataframe.columns])] + # Body [html.Tr([ html.Td(dataframe.iloc[i][col]) for col in dataframe.columns ]) for i in range(min(len(dataframe), max_rows))] ) app.layout = html.Div(children=[
return param if __name__ == "__main__": parser = argparse.ArgumentParser( description='PCA and visualization with Plink2') parser.add_argument('process', type=str, nargs=1, help='the process to deal with') args = parser.parse_args() if args.process[0] == "get_data": cfg = load_params(DATA_PARAMS) get_data(cfg['files'], 'data/') get_metal(cfg['metal']) elif args.process[0] == "filter": cfg = load_params(FINAL_PARAMS) filter_recode(cfg['filename'], cfg['covar_file'], cfg['data_dir'], cfg['filter_output'], cfg['hwe'], cfg['maf'], cfg['geno'], cfg['mind'], cfg['chr'], cfg['min']) elif args.process[0] == 'pca': cfg = load_params(FINAL_PARAMS) pca(cfg['data_dir'], cfg['filter_output']) elif args.process[0] == 'plot_pca': cfg = load_params(FINAL_PARAMS) plot_pca(cfg['data_dir'] + 'pca.eigenvec', cfg['output_dir'])
def main(targets):
    """
    Run targets: 'data' (train + evaluate on the configured APK corpus) and
    'test' (same pipeline on a tiny local fixture corpus).

    BUG FIX: the original rebound the `targets` parameter to a list of
    embeddings (`targets = [...]`), so the later `if 'test' in targets:`
    check tested membership in that embedding list whenever the 'data'
    branch had run. The embedding list is now `target_embeddings`.
    """
    if 'data' in targets:
        with open('../config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('../config/env.json') as fh:
            env_cfg = json.load(fh)
        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        malware_pos, benign_pos = data_cfg[
            'malware_position'], data_cfg['apk_out_path'] + '/decompiled/*'
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        malware_positions = glob.glob(malware_pos)
        benign_positions = glob.glob(benign_pos)
        decompiled_apks = benign_positions + malware_positions
        # 80/20 train/test split, sampled separately per class.
        train = np.random.choice(benign_positions, int(len(benign_positions)*0.8), replace=False).tolist() + \
            np.random.choice(malware_positions, int(len(malware_positions)*0.8), replace=False).tolist()
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        apk_classes_test = [get_class(file) for file in test]
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        # APKs are given negative node ids (-len..-1) in the walk graph.
        apk2node_train = dict(
            zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(
            zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))
        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(
            train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(
            test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train,
                                        apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train
        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train, \
            apk2node_train, node2apk_train, matrix_A_train, matrix_B_train, matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1,
                         workers=8, iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))
        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        # Only APKs whose node actually appears in the embedding vocabulary
        # contribute training examples.
        X_train = [
            model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        Y_train = [
            apk2class_train[apk] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)
        if algorithm == 'node2vec':
            X = [
                API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding(model, apk2idx_train[apk], matrix_A_train)
                for apk in apk2idx_train
            ]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic — `dic` is undefined here; this branch raises
            # NameError until it is supplied.
            X = [
                API_mean_embedding_metapath(apk2idx_test[apk], dic,
                                            matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding_metapath(apk2idx_train[apk], dic,
                                            matrix_A_train)
                for apk in apk2idx_train
            ]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=target_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(
            torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)

    if 'test' in targets:
        with open('../config/test-params.json') as fh:
            data_cfg = json.load(fh)
        with open('../config/env.json') as fh:
            env_cfg = json.load(fh)
        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        # Tiny fixed local fixture corpus: first 4 of each class train.
        benign_positions = glob.glob('../Data/benign/*')
        malware_positions = glob.glob('../Data/malwares/*')
        decompiled_apks = benign_positions + malware_positions
        train = benign_positions[:4] + malware_positions[:4]
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        # Labels are synthesised positionally (1 = benign, 0 = malware)
        # rather than read from the files as in the 'data' branch.
        apk_classes_train = [1] * int(len(benign_positions) * 0.8) + [0] * int(
            len(malware_positions) * 0.8)
        apk_classes_test = [1] * (len(benign_positions) - int(len(benign_positions)*0.8)) \
            + [0] * (len(malware_positions) - int(len(malware_positions)*0.8))
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(
            zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(
            zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))
        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(
            train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(
            test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train,
                                        apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train
        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train, \
            apk2node_train, node2apk_train, matrix_A_train, matrix_B_train, matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1,
                         workers=8, iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))
        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [
            model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        Y_train = [
            apk2class_train[apk] for apk in apk2idx_train
            if str(apk2node_train[apk]) in model.wv
        ]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)
        if algorithm == 'node2vec':
            X = [
                API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding(model, apk2idx_train[apk], matrix_A_train)
                for apk in apk2idx_train
            ]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic — `dic` is undefined here; this branch raises
            # NameError until it is supplied.
            X = [
                API_mean_embedding_metapath(apk2idx_test[apk], dic,
                                            matrix_A_test)
                for apk in apk2idx_test
            ]
            target_embeddings = [
                API_mean_embedding_metapath(apk2idx_train[apk], dic,
                                            matrix_A_train)
                for apk in apk2idx_train
            ]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=target_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(
            torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)
pca(filename=conf['temp_path'] + '/' + conf['name']) #'data/interim/chr22' elif args.process[0] == "remove_outlier_then_pca": remove_outlier(input='data/interim/chr22.eigenvec', graph_output1='data/before_remove_outliers1.png', graph_output2='data/before_remove_outliers2.png', remove_output='data/outliers.txt', filename=conf['temp_path'] + '/' + conf['name']) elif args.process[0] == "graph_after_remove_outlier": after_removal(input='data/interim/chr22.eigenvec', graph_output1='data/after_remove_outliers1.png', graph_output2='data/after_remove_outliers2.png') elif args.process[0] == "test-project": filter_recode(input_file=conf['input_file'], output_dir=conf['temp_path'], output_filename=conf['name'], maf=conf['maf'], geno=conf['geno'], mind=conf['mind']) pca(filename=conf['temp_path'] + '/' + conf['name']) #'data/interim/chr22' remove_outlier(input='data/interim/chr22.eigenvec', graph_output1='data/before_remove_outliers1.png', graph_output2='data/before_remove_outliers2.png', remove_output='data/outliers.txt', filename=conf['temp_path'] + '/' + conf['name']) after_removal(input='data/interim/chr22.eigenvec', graph_output1='data/after_remove_outliers1.png', graph_output2='data/after_remove_outliers2.png') elif args.process[0] == 'get_data': conf = json.load(open(DATA_PARAMS)) get_data(conf['person'], conf['files'], conf['config'])
def main(targets):
    """Run the requested pipeline targets for the HIN malware classifier.

    Recognized targets (order matters — later targets consume locals built
    by earlier ones):
      'clean'        -- remove the data/ directory tree.
      'test_project' -- end-to-end smoke run: load benign + sampled malware
                        smali files, build the A/B/P matrices, then train and
                        score SVMs on the AAT, ABAT and APAT metapath kernels.
      'test_data'    -- download/stage data and build the train/test app dicts.
      'process'      -- build all metapath kernels (incl. APBPA); requires the
                        app dicts from 'test_data'.
      'train'        -- train/score models on every kernel; requires 'process'.
      'analysis'     -- print the accuracy scores; requires 'train'.
    """
    if 'clean' in targets:
        # Wipe all staged/derived data; ignore_errors so a missing dir is fine.
        shutil.rmtree('data/', ignore_errors=True)

    if 'test_project' in targets:
        print('Loading Benign Data...')
        # Benign apps come from already-staged smali folders; malware is
        # sampled (20 apps) from the known-malware file paths.
        b_app_dict = etl.get_smali_files_per_app()
        malware_apps = etl.find_malware_filepaths()
        sampled = etl.sample_malware(malware_apps, 20)
        m_app_dict = etl.get_malware_smali_files_per_app(sampled)
        b_app_dict_train, b_app_dict_test = etl.split_dictionary(b_app_dict)
        m_app_dict_train, m_app_dict_test = etl.split_dictionary(m_app_dict)
        labels = pcs.get_labels(b_app_dict_train, m_app_dict_train)

        # A: app x API-call incidence matrix (train and test splits).
        print('Calculating A Data')
        train_api_calls, train_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        A_matrix_train = pcs.A_matrix_func(train_api_calls,
                                           train_apis_per_app).tocsr()
        test_api_calls, test_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        A_matrix_test = pcs.A_matrix_func(test_api_calls,
                                          test_apis_per_app).tocsr()

        # B: API x API co-occurrence within the same code block.
        print('Calculating B Data')
        B_api_calls_train, B_code_blocks_train = pcs.B_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        B_matrix_train = pcs.B_matrix_func(B_api_calls_train,
                                           B_code_blocks_train).tocsr()
        B_api_calls_test, B_code_blocks_test = pcs.B_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        B_matrix_test = pcs.B_matrix_func(B_api_calls_test,
                                          B_code_blocks_test).tocsr()

        # P: API x API co-occurrence within the same package.
        print('Calculating P Data')
        P_api_calls_train, P_packages_train = pcs.P_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        P_matrix_train = pcs.P_matrix_func(P_api_calls_train,
                                           P_packages_train).tocsr()
        P_api_calls_test, P_packages_test = pcs.P_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        P_matrix_test = pcs.P_matrix_func(P_api_calls_test,
                                          P_packages_test).tocsr()

        # Metapath kernel A(A^T): train, evaluate, report.
        print('Calculating AAT')
        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        model = trn.train_model(AAT, labels)
        AAT_score = trn.assess_model(model, AAT_test, labels)
        print(
            'Classifier accuracy on A(A^T) metapath kernel with 10 apps: {0}'.
            format(round(AAT_score, 8)))

        # Metapath kernel AB(A^T).
        print('Calculating ABAT')
        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(ABAT, labels)
        ABAT_score = trn.assess_model(model, ABAT_test, labels)
        print(
            'Classifier accuracy on (AB(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(ABAT_score, 8)))

        # Metapath kernel AP(A^T).
        print('Calculating APAT')
        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(APAT, labels)
        # BUG FIX: previously this evaluated the APAT-trained model against
        # ABAT_test and overwrote ABAT_score; score APAT on its own kernel.
        APAT_score = trn.assess_model(model, APAT_test, labels)
        print(
            'Classifier accuracy on (AP(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(APAT_score, 8)))

    if 'test_data' in targets:
        # Download/stage the raw data, then build the per-app smali file
        # dictionaries used by the 'process' target.
        loaded = load_params(DATA_PARAMS)
        urls, smali_folders = etl.get_data(**loaded)
        etl.smali_mover(smali_folders)
        b_app_dict = etl.get_smali_files_per_app()
        malware_apps = etl.find_malware_filepaths()
        sampled = etl.sample_malware(malware_apps, 20)
        m_app_dict = etl.get_malware_smali_files_per_app(sampled)
        b_app_dict_train, b_app_dict_test = etl.split_dictionary(b_app_dict)
        m_app_dict_train, m_app_dict_test = etl.split_dictionary(m_app_dict)

    if 'process' in targets:
        # Requires the app dicts created by 'test_data'.
        labels = pcs.get_labels(b_app_dict_train, m_app_dict_train)
        train_api_calls, train_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        A_matrix_train = pcs.A_matrix_func(train_api_calls,
                                           train_apis_per_app).tocsr()
        test_api_calls, test_apis_per_app = pcs.A_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        A_matrix_test = pcs.A_matrix_func(test_api_calls,
                                          test_apis_per_app).tocsr()
        B_api_calls_train, B_code_blocks_train = pcs.B_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        B_matrix_train = pcs.B_matrix_func(B_api_calls_train,
                                           B_code_blocks_train).tocsr()
        B_api_calls_test, B_code_blocks_test = pcs.B_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        B_matrix_test = pcs.B_matrix_func(B_api_calls_test,
                                          B_code_blocks_test).tocsr()
        P_api_calls_train, P_packages_train = pcs.P_matrix_calc(
            b_app_dict_train, m_app_dict_train)
        P_matrix_train = pcs.P_matrix_func(P_api_calls_train,
                                           P_packages_train).tocsr()
        P_api_calls_test, P_packages_test = pcs.P_matrix_calc(
            b_app_dict_test, m_app_dict_test)
        P_matrix_test = pcs.P_matrix_func(P_api_calls_test,
                                          P_packages_test).tocsr()
        # Dense metapath kernels over the train and test splits.
        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APBPA = (((
            A_matrix_train.dot(P_matrix_train)).dot(B_matrix_train)).dot(
                P_matrix_train.T)).dot(A_matrix_train.T).todense()
        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        APBPA_test = (((
            A_matrix_test.dot(P_matrix_test)).dot(B_matrix_test)).dot(
                P_matrix_test.T)).dot(A_matrix_test.T).todense()

    if 'train' in targets:
        # Requires the matrices from 'process'; trains and scores an SVM on
        # each metapath kernel.
        AAT = A_matrix_train.dot(A_matrix_train.T).todense()
        AAT_test = A_matrix_test.dot(A_matrix_test.T).todense()
        model = trn.train_model(AAT, labels)
        AAT_score = trn.assess_model(model, AAT_test, labels)

        ABAT = (A_matrix_train.dot(B_matrix_train)).dot(
            A_matrix_train.T).todense()
        ABAT_test = ((A_matrix_test.dot(B_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(ABAT, labels)
        ABAT_score = trn.assess_model(model, ABAT_test, labels)

        APAT = (A_matrix_train.dot(P_matrix_train)).dot(
            A_matrix_train.T).todense()
        APAT_test = ((A_matrix_test.dot(P_matrix_test)).dot(
            A_matrix_test.T)).todense()
        model = trn.train_model(APAT, labels)
        # BUG FIX: previously scored the APAT model against ABAT_test and
        # clobbered ABAT_score, leaving APAT_score undefined for 'analysis'.
        APAT_score = trn.assess_model(model, APAT_test, labels)

        APBPA = (((
            A_matrix_train.dot(P_matrix_train)).dot(B_matrix_train)).dot(
                P_matrix_train.T)).dot(A_matrix_train.T).todense()
        APBPA_test = (((
            A_matrix_test.dot(P_matrix_test)).dot(B_matrix_test)).dot(
                P_matrix_test.T)).dot(A_matrix_test.T).todense()
        model = trn.train_model(APBPA, labels)
        APBPA_score = trn.assess_model(model, APBPA_test, labels)

    if 'analysis' in targets:
        # Requires the *_score values from 'train'.
        print('Starting analysis...')
        print(
            'Classifier accuracy on A(A^T) metapath kernel with 10 apps: {0}'.
            format(round(AAT_score, 8)))
        print(
            'Classifier accuracy on (AB(A^T)) metapath kernel with 10 apps: {0}'
            .format(round(ABAT_score, 8)))
        print(
            'Classifier accuracy on AP(A^T) metapath kernel with 10 apps: {0}'.
            format(round(APAT_score, 8)))
        #print('Classifier accuracy on APB(P^T)(A^T) metapath kernel with 10 apps: {0}'.format(round(APBPA_score,8)))
        print('...Done')
    return
MEASURES = ['RevFreq', 'RevSpeed', 'RevDur'] pvlimit = 0.001 alpha = 0.05 independent_variable = 'groupname' for strain_dir in strain_dirs: strain_name = os.path.basename(strain_dir) print(f'\n\nProcessing: {strain_name}') # data directory raw_data_dir = os.path.join(strain_dir, sub_dir) # make output folder output_dir = os.path.join(os.path.dirname(raw_data_dir), OUTPUT_DIR_NAME) if not os.path.isdir(output_dir): os.mkdir(output_dir) # get data rawdata, db = etl.get_data(raw_data_dir) df_transform = rawdata.pivot(index='mwtid', columns='tap', values=MEASURES) # calculate integral (requires stats and etl package) intobj = stats.Integral(df_transform) data_integral = intobj.bycolumns(MEASURES) data_integral = etl.merge_data_mwtdb(data_integral, db) # save excel graphing output by measures--- iv = 'groupname' savefname = f'graph_data_{PRJ_TAG}.xlsx' savepath = os.path.join(output_dir, savefname) graphpack.save_excel_graphdata(data_integral, 'groupname', MEASURES, savepath) # set up anova report --- filename = os.path.join(output_dir, 'anova.txt')