def main(targets): if "test" in targets: targets = ["data", "eda", "classification", "clustering"] etl_params = json.load(open("config/etl_test.json")) eda_params = json.load(open("config/eda_test.json")) else: etl_params = json.load(open("config/etl.json")) eda_params = json.load(open("config/eda.json")) autophrase_params = json.load(open("config/autophrase.json")) clf_params = json.load(open("config/classification.json")) clustering_params = json.load(open("config/clustering.json")) if "all" in targets: targets = ["data", "eda", "classification", "clustering"] if "download" in targets: download_dataset() if "data" in targets: get_data(autophrase_params, **etl_params) if "eda" in targets: generate_figures(**eda_params) if "classification" in targets: model(clf_params) if "clustering" in targets: run_clustering(clustering_params)
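# All of the main functions in this section follow the same run.py convention:
# build targets are passed on the command line. A minimal sketch of the entry
# point they assume (the guard itself is an assumption, not shown in the
# original snippets):

import sys

if __name__ == '__main__':
    # e.g. `python run.py data eda` builds the data and eda targets
    main(sys.argv[1:])
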
def main(targets):
    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # get all edit history zip files
    if 'edit-history-data' in targets:
        cfg = load_params(EDIT_HISTORY_DATA_PARAMS)
        get_data(**cfg)

    # get all pageviews zip files
    if 'pageviews-data' in targets:
        cfg = load_params(PAGEVIEWS_DATA_PARAMS)
        get_data(**cfg)

    # get all articles related to COVID-19
    if 'covid-data' in targets:
        cfg = load_params(COVID_ARTICLES_DATA_PARAMS)
        get_wiki_category_articles(**cfg)

    # get the complete article graph related to COVID-19
    if 'covid-complete' in targets:
        cfg = load_params(COVID_ARTICLES_COMP_PARAMS)
        get_wiki_category_articles(**cfg)

    # extract covid articles from pageviews data
    if 'pageviews-extract' in targets:
        cfg = load_params(PAGEVIEWS_EXTRACT_PARAMS)
        extract_data(**cfg)

    # extract covid articles from edit history data
    if 'edit-history-extract' in targets:
        cfg = load_params(EDIT_HISTORY_EXTRACT_PARAMS)
        extract_data(**cfg)

    # make the media data target
    if 'media-data' in targets:
        cfg = load_params(MEDIA_DATA_PARAMS)
        get_media_data(**cfg)

    # cleans and prepares the media data for analysis
    if 'process-media' in targets:
        cfg = load_params(MEDIA_PROCESS_PARAMS)
        process_media_data(**cfg)

    # create Sankey diagram from pageview data
    if 'sankey' in targets:
        cfg = load_params(SANKEY_PARAMS)
        create_sankey_figure(**cfg)

    return

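# `load_params` is referenced but never defined in these snippets; given the
# variants elsewhere in this section that call json.load on config paths
# directly, it is presumably a thin wrapper around a JSON config file. A
# minimal sketch under that assumption:

import json

def load_params(fp):
    """Read a JSON config file and return its contents as a dict."""
    with open(fp) as fh:
        return json.load(fh)
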
def main(targets):
    # make the data target
    if 'data' in targets:
        cfg = load_params(data_params)
        get_data(**cfg)

    # make the test data target
    if 'test' in targets:
        cfg = load_params(test_params)
        get_data(**cfg)

    # compute the final M statistic
    if 'calc_m_stat' in targets:
        cfg = load_params(m_stat_params)
        final_M_stat(**cfg)

    return

def main(targets):
    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    # make the test data target
    if 'test-data' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        get_data(**cfg)

    # cleans and prepares the data for analysis
    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process_data(**cfg)

    # cleans and prepares the test data for analysis
    if 'test-process' in targets:
        cfg = load_params(TEST_PROCESS_PARAMS)
        process_data(**cfg)

    # runs m-statistic on processed data
    if 'm-stat' in targets:
        cfg = load_params(M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # runs m-statistic on processed test data
    if 'test-m-stat' in targets:
        cfg = load_params(TEST_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # m-statistic for the entire light dump
    if 'light-dump' in targets:
        data_cfg = load_params(LIGHT_DUMP_DATA_PARAMS)
        extract_cfg = load_params(LIGHT_DUMP_EXTRACT_PARAMS)
        m_stat_cfg = load_params(LIGHT_DUMP_M_STAT_PARAMS)
        evolution_cfg = load_params(LIGHT_DUMP_TIME_PARAMS)
        get_data(**data_cfg)
        extract_article(**extract_cfg)
        get_m_stat_data(**m_stat_cfg)
        grab_m_stat_over_time(**evolution_cfg)

    # searches through all the files from Wikimedia starting with
    # enwiki-20200201-pages-meta-history1.xml
    if 'deep-search' in targets:
        for i in range(6):
            suffix = 'params-' + str(i + 1)
            data_cfg = load_params(
                DEEP_SEARCH_DATA_PARAMS.replace('params', suffix))
            process_cfg = load_params(
                DEEP_SEARCH_PROCESS_PARAMS.replace('params', suffix))
            m_stat_cfg = load_params(
                DEEP_SEARCH_M_STAT_PARAMS.replace('params', suffix))
            get_data(**data_cfg)
            remove_dir('data/raw')
            process_data(**process_cfg)
            remove_dir('data/temp')
            get_m_stat_data(**m_stat_cfg)
            remove_dir('data/out')

    # complete pipeline for generating the M-statistic evolution
    if 'm-stat-time' in targets:
        data_cfg = load_params(OVER_TIME_DATA_PARAMS)
        process_cfg = load_params(OVER_TIME_PROCESS_PARAMS)
        extract_cfg = load_params(EXTRACT_PARAMS)
        evolution_cfg = load_params(OVER_TIME_M_STAT_PARAMS)
        get_data(**data_cfg)
        process_data(**process_cfg)
        extract_article(**extract_cfg)
        grab_m_stat_over_time(**evolution_cfg)

    # complete pipeline for the test set
    if 'test-project' in targets:
        data_cfg = load_params(TEST_DATA_PARAMS)
        process_cfg = load_params(TEST_PROCESS_PARAMS)
        m_stat_cfg = load_params(TEST_M_STAT_PARAMS)
        get_data(**data_cfg)
        process_data(**process_cfg)
        get_m_stat_data(**m_stat_cfg)

    return

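# The deep-search loop above derives six numbered config paths from a single
# base constant via str.replace. The constants are defined elsewhere in the
# project; the value below is hypothetical, purely to illustrate the mapping:

DEEP_SEARCH_DATA_PARAMS = 'config/deep-search-data-params.json'  # assumed value

for i in range(6):
    # prints config/deep-search-data-params-1.json ... -params-6.json
    print(DEEP_SEARCH_DATA_PARAMS.replace('params', 'params-' + str(i + 1)))
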
import glob
import json
import os

import numpy as np
import torch
from gensim.models import Word2Vec
from sklearn import svm

# NOTE: the graph/embedding helpers used below (get_data, get_name, get_class,
# get_all_APIs, apk_info_idx, build_matrix_A/B/P, generate_walks,
# API_mean_embedding, API_mean_embedding_metapath, train_net) are
# project-local functions assumed to be imported from the source tree.


def main(targets):
    # make the data target
    if 'data' in targets:
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        get_data(**data_cfg)

    # build the heterogeneous graph, embed it, and train the classifier
    if 'process' in targets:
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/env.json') as fh:
            env_cfg = json.load(fh)

        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        malware_pos = data_cfg['malware_position']
        benign_pos = data_cfg['apk_out_path'] + '/decompiled/*'
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)

        # 80/20 random split of the decompiled APKs
        malware_positions = glob.glob(malware_pos)
        benign_positions = glob.glob(benign_pos)
        decompiled_apks = benign_positions + malware_positions
        train = np.random.choice(benign_positions,
                                 int(len(benign_positions) * 0.8),
                                 replace=False).tolist() + \
            np.random.choice(malware_positions,
                             int(len(malware_positions) * 0.8),
                             replace=False).tolist()
        test = [apk for apk in decompiled_apks if apk not in train]

        apk_names_train = [get_name(file) for file in train]
        apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        apk_classes_test = [get_class(file) for file in test]

        # index maps: APKs get non-negative matrix indices and negative node ids
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))

        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(test, API2idx, 'test')

        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train, apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train,
                               idx2apk_train, apk2node_train, node2apk_train,
                               matrix_A_train, matrix_B_train, matrix_P_train,
                               matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]

        print('word2vec model')
        # gensim < 4.0 API: size/iter were renamed vector_size/epochs in 4.0
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1,
                         workers=8, iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
                   if str(apk2node_train[apk]) in model.wv]
        Y_train = [apk2class_train[apk] for apk in apk2idx_train
                   if str(apk2node_train[apk]) in model.wv]

        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        # regression targets for the embedding network; kept distinct from the
        # `targets` argument of main so later target checks still work
        if algorithm == 'node2vec':
            X = [API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                 for apk in apk2idx_test]
            train_embeddings = [API_mean_embedding(model, apk2idx_train[apk],
                                                   matrix_A_train)
                                for apk in apk2idx_train]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [API_mean_embedding_metapath(apk2idx_test[apk], dic, matrix_A_test)
                 for apk in apk2idx_test]
            train_embeddings = [API_mean_embedding_metapath(apk2idx_train[apk],
                                                            dic, matrix_A_train)
                                for apk in apk2idx_train]

        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=train_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)

    # same pipeline on a small fixed split for smoke testing
    if 'test' in targets:
        with open('config/test-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/env.json') as fh:
            env_cfg = json.load(fh)

        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)

        benign_positions = glob.glob('Data/benign/*')
        malware_positions = glob.glob('Data/malwares/*')
        decompiled_apks = benign_positions + malware_positions
        # train = np.random.choice(benign_positions, int(len(benign_positions)*0.8), replace=False).tolist() + \
        #     np.random.choice(malware_positions, int(len(malware_positions)*0.8), replace=False).tolist()
        train = benign_positions[:4] + malware_positions[:4]
        test = [apk for apk in decompiled_apks if apk not in train]

        apk_names_train = [get_name(file) for file in train]
        # apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        # apk_classes_test = [get_class(file) for file in test]
        apk_classes_train = [1] * int(len(benign_positions) * 0.8) \
            + [0] * int(len(malware_positions) * 0.8)
        apk_classes_test = [1] * (len(benign_positions) - int(len(benign_positions) * 0.8)) \
            + [0] * (len(malware_positions) - int(len(malware_positions) * 0.8))

        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))

        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(test, API2idx, 'test')

        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train, apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train,
                               idx2apk_train, apk2node_train, node2apk_train,
                               matrix_A_train, matrix_B_train, matrix_P_train,
                               matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]

        print('word2vec model')
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1,
                         workers=8, iter=5)
        model.wv.save_word2vec_format(
            model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [model.wv[str(apk2node_train[apk])] for apk in apk2idx_train
                   if str(apk2node_train[apk]) in model.wv]
        Y_train = [apk2class_train[apk] for apk in apk2idx_train
                   if str(apk2node_train[apk]) in model.wv]

        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        if algorithm == 'node2vec':
            X = [API_mean_embedding(model, apk2idx_test[apk], matrix_A_test)
                 for apk in apk2idx_test]
            train_embeddings = [API_mean_embedding(model, apk2idx_train[apk],
                                                   matrix_A_train)
                                for apk in apk2idx_train]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [API_mean_embedding_metapath(apk2idx_test[apk], dic, matrix_A_test)
                 for apk in apk2idx_test]
            train_embeddings = [API_mean_embedding_metapath(apk2idx_train[apk],
                                                            dic, matrix_A_train)
                                for apk in apk2idx_train]

        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=train_embeddings, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')
        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy: ', acc)

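# The 'process' target above reads config/data-params.json; the keys it
# consumes are visible in the code. A hypothetical example config (all values
# illustrative, not taken from the project):
#
# {
#     "metapath": "APA",
#     "p": 1,
#     "q": 1,
#     "k": 5,
#     "n": 20,
#     "algorithm": "node2vec",
#     "malware_position": "Data/malwares/*",
#     "apk_out_path": "Data/apks",
#     "model_out_path": "models"
# }
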
def main(targets):
    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    # make the test data target
    if 'test-data' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        get_data(**cfg)

    # cleans and prepares the data for analysis
    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process_data(**cfg)

    # cleans and prepares the test data for analysis
    if 'test-process' in targets:
        cfg = load_params(TEST_PROCESS_PARAMS)
        process_data(**cfg)

    # runs m-statistic on processed data
    if 'm-stat' in targets:
        cfg = load_params(M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # runs m-statistic on processed test data
    if 'test-m-stat' in targets:
        cfg = load_params(TEST_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # m-statistic for entire light dump
    if 'light-dump' in targets:
        cfg = load_params(LIGHT_DUMP_DATA_PARAMS)
        get_data(**cfg)
        cfg = load_params(LIGHT_DUMP_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # deep search over the six numbered Wikimedia dump files, clearing the
    # intermediate directories between stages
    if 'deep-search' in targets:
        for i in range(6):
            suffix = 'params-' + str(i + 1)
            cfg = load_params(DEEP_SEARCH_DATA_PARAMS.replace('params', suffix))
            get_data(**cfg)
            remove_dir('data/raw')
            cfg = load_params(DEEP_SEARCH_PROCESS_PARAMS.replace('params', suffix))
            process_data(**cfg)
            remove_dir('data/temp')
            cfg = load_params(DEEP_SEARCH_M_STAT_PARAMS.replace('params', suffix))
            get_m_stat_data(**cfg)
            remove_dir('data/out')

    # m-statistic evolution over time
    if 'm-stat-time' in targets:
        cfg = load_params(OVER_TIME_DATA_PARAMS)
        get_data(**cfg)
        cfg = load_params(OVER_TIME_PROCESS_PARAMS)
        process_data(**cfg)
        cfg = load_params(EXTRACT_PARAMS)
        extract_article(**cfg)
        cfg = load_params(OVER_TIME_M_STAT_PARAMS)
        grab_m_stat_over_time(**cfg)

    return

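# `remove_dir` is likewise project-local; the pipelines use it to clear the
# raw/temp/out directories between stages. A minimal sketch, assuming it
# simply deletes a directory tree if one exists:

import os
import shutil

def remove_dir(path):
    """Delete a directory tree, doing nothing if it does not exist."""
    if os.path.isdir(path):
        shutil.rmtree(path)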