def main(targets):
    if "test" in targets:
        targets = ["data", "eda", "classification", "clustering"]
        with open("config/etl_test.json") as fh:
            etl_params = json.load(fh)
        with open("config/eda_test.json") as fh:
            eda_params = json.load(fh)
    else:
        with open("config/etl.json") as fh:
            etl_params = json.load(fh)
        with open("config/eda.json") as fh:
            eda_params = json.load(fh)

    with open("config/autophrase.json") as fh:
        autophrase_params = json.load(fh)
    with open("config/classification.json") as fh:
        clf_params = json.load(fh)
    with open("config/clustering.json") as fh:
        clustering_params = json.load(fh)

    if "all" in targets:
        targets = ["data", "eda", "classification", "clustering"]

    if "download" in targets:
        download_dataset()

    if "data" in targets:
        get_data(autophrase_params, **etl_params)

    if "eda" in targets:
        generate_figures(**eda_params)

    if "classification" in targets:
        model(clf_params)

    if "clustering" in targets:
        run_clustering(clustering_params)
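
These `main(targets)` functions are the bodies of `run.py`-style build scripts; the snippets omit the entry point that supplies `targets`. A minimal sketch, assuming the usual convention of passing everything after the script name on the command line (the `__main__` guard itself is not shown in any example):

import sys

# Hypothetical entry point: `python run.py data eda` would run the
# "data" and "eda" targets through main() above.
if __name__ == '__main__':
    main(sys.argv[1:])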
Example 2
def main(targets):

    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # get all edit history zip files
    if 'edit-history-data' in targets:
        cfg = load_params(EDIT_HISTORY_DATA_PARAMS)
        get_data(**cfg)

    # get all pageviews zip files
    if 'pageviews-data' in targets:
        cfg = load_params(PAGEVIEWS_DATA_PARAMS)
        get_data(**cfg)

    # get all articles related to COVID-19
    if 'covid-data' in targets:
        cfg = load_params(COVID_ARTICLES_DATA_PARAMS)
        get_wiki_category_articles(**cfg)

    # get the complete article graph related to COVID-19
    if 'covid-complete' in targets:
        cfg = load_params(COVID_ARTICLES_COMP_PARAMS)
        get_wiki_category_articles(**cfg)

    # extract covid articles from pageviews data
    if 'pageviews-extract' in targets:
        cfg = load_params(PAGEVIEWS_EXTRACT_PARAMS)
        extract_data(**cfg)

    # extract covid articles from edit history data
    if 'edit-history-extract' in targets:
        cfg = load_params(EDIT_HISTORY_EXTRACT_PARAMS)
        extract_data(**cfg)

    # make the media data target
    if 'media-data' in targets:
        cfg = load_params(MEDIA_DATA_PARAMS)
        get_media_data(**cfg)

    # cleans and prepares the media data for analysis
    if 'process-media' in targets:
        cfg = load_params(MEDIA_PROCESS_PARAMS)
        process_media_data(**cfg)

    # create Sankey diagram from pageview data
    if 'sankey' in targets:
        cfg = load_params(SANKEY_PARAMS)
        create_sankey_figure(**cfg)

    return
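
Examples 2, 3, 4, and 6 build their keyword-argument dicts through a `load_params` helper that none of the snippets define. A plausible minimal version, assuming each `*_PARAMS` constant is the path to a JSON config file (both the helper body and that assumption are the editor's, not the source's):

import json

def load_params(fp):
    # Read a JSON config file and return it as a dict, ready to be
    # splatted into a target function via **cfg.
    with open(fp) as fh:
        return json.load(fh)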
Example 3
def main(targets):

    if 'data' in targets:
        cfg = load_params(data_params)
        get_data(**cfg)

    if 'test' in targets:
        cfg = load_params(test_params)
        get_data(**cfg)

    if 'calc_m_stat' in targets:
        cfg = load_params(m_stat_params)
        final_M_stat(**cfg)

    return
Example 4
def main(targets):

    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    # make the test data target
    if 'test-data' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        get_data(**cfg)

    # cleans and prepares the data for analysis
    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process_data(**cfg)

    # cleans and prepares the test data for analysis
    if 'test-process' in targets:
        cfg = load_params(TEST_PROCESS_PARAMS)
        process_data(**cfg)

    # runs m-statistic on processed data
    if 'm-stat' in targets:
        cfg = load_params(M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # runs m-statistic on processed test data
    if 'test-m-stat' in targets:
        cfg = load_params(TEST_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # m-statistic for entire light dump
    if 'light-dump' in targets:
        data_cfg = load_params(LIGHT_DUMP_DATA_PARAMS)
        extract_cfg = load_params(LIGHT_DUMP_EXTRACT_PARAMS)
        m_stat_cfg = load_params(LIGHT_DUMP_M_STAT_PARAMS)
        evolution_cfg = load_params(LIGHT_DUMP_TIME_PARAMS)

        get_data(**data_cfg)
        extract_article(**extract_cfg)
        get_m_stat_data(**m_stat_cfg)
        grab_m_stat_over_time(**evolution_cfg)

    # Searches through all the files from Wikimedia starting with
    # enwiki-20200201-pages-meta-history1.xml
    if 'deep-search' in targets:
        for i in range(6):
            suffix = 'params-' + str(i + 1)
            data_cfg = load_params(
                DEEP_SEARCH_DATA_PARAMS.replace('params', suffix))
            process_cfg = load_params(
                DEEP_SEARCH_PROCESS_PARAMS.replace('params', suffix))
            m_stat_cfg = load_params(
                DEEP_SEARCH_M_STAT_PARAMS.replace('params', suffix))

            get_data(**data_cfg)
            remove_dir('data/raw')
            process_data(**process_cfg)
            remove_dir('data/temp')
            get_m_stat_data(**m_stat_cfg)
            remove_dir('data/out')

    # Complete project for generating M-Statistic Evolution
    if 'm-stat-time' in targets:
        data_cfg = load_params(OVER_TIME_DATA_PARAMS)
        process_cfg = load_params(OVER_TIME_PROCESS_PARAMS)
        extract_cfg = load_params(EXTRACT_PARAMS)
        evolution_cfg = load_params(OVER_TIME_M_STAT_PARAMS)

        get_data(**data_cfg)
        process_data(**process_cfg)
        extract_article(**extract_cfg)
        grab_m_stat_over_time(**evolution_cfg)

    # Complete project for test set
    if 'test-project' in targets:
        data_cfg = load_params(TEST_DATA_PARAMS)
        process_cfg = load_params(TEST_PROCESS_PARAMS)
        m_stat_cfg = load_params(TEST_M_STAT_PARAMS)

        get_data(**data_cfg)
        process_data(**process_cfg)
        get_m_stat_data(**m_stat_cfg)

    return
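
The deep-search loop above derives six numbered config paths from one template constant via string replacement. Assuming `DEEP_SEARCH_DATA_PARAMS` holds a path like `'config/deep-search-data-params.json'` (the real value is not shown in the snippet), the rewrite behaves as follows:

# Hypothetical template value; the real constant lives elsewhere in the repo.
DEEP_SEARCH_DATA_PARAMS = 'config/deep-search-data-params.json'

for i in range(6):
    # 'params' -> 'params-1' ... 'params-6'
    print(DEEP_SEARCH_DATA_PARAMS.replace('params', 'params-' + str(i + 1)))
# config/deep-search-data-params-1.json
# ...
# config/deep-search-data-params-6.json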
Example 5
def main(targets):
    if 'data' in targets:
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        get_data(**data_cfg)
    if 'process' in targets:
        with open('config/data-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/env.json') as fh:
            env_cfg = json.load(fh)
        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        malware_pos = data_cfg['malware_position']
        benign_pos = data_cfg['apk_out_path'] + '/decompiled/*'
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        malware_positions = glob.glob(malware_pos)
        benign_positions = glob.glob(benign_pos)
        decompiled_apks = benign_positions + malware_positions
        train = (np.random.choice(benign_positions, int(len(benign_positions) * 0.8), replace=False).tolist()
                 + np.random.choice(malware_positions, int(len(malware_positions) * 0.8), replace=False).tolist())
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        apk_classes_test = [get_class(file) for file in test]
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train, apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train,
                               apk2node_train, node2apk_train, matrix_A_train, matrix_B_train,
                               matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, workers=8, iter=5)
        model.wv.save_word2vec_format(model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [model.wv[str(apk2node_train[apk])] for apk in apk2idx_train if str(apk2node_train[apk]) in model.wv]
        Y_train = [apk2class_train[apk] for apk in apk2idx_train if str(apk2node_train[apk]) in model.wv]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        if algorithm == 'node2vec':
            X = [API_mean_embedding(model, apk2idx_test[apk], matrix_A_test) for apk in apk2idx_test]
            targets = [API_mean_embedding(model, apk2idx_train[apk], matrix_A_train) for apk in apk2idx_train]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [API_mean_embedding_metapath(apk2idx_test[apk], dic, matrix_A_test) for apk in apk2idx_test]
            targets = [API_mean_embedding_metapath(apk2idx_train[apk], dic, matrix_A_train) for apk in apk2idx_train]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=targets, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy:', acc)

    if 'test' in targets:
        with open('config/test-params.json') as fh:
            data_cfg = json.load(fh)
        with open('config/env.json') as fh:
            env_cfg = json.load(fh)

        metapath, p, q = data_cfg['metapath'], data_cfg['p'], data_cfg['q']
        k, n = data_cfg['k'], data_cfg['n']
        algorithm = data_cfg['algorithm']
        model_out_path = data_cfg['model_out_path']
        if not os.path.exists(model_out_path):
            os.makedirs(model_out_path)
        benign_positions = glob.glob('Data/benign/*')
        malware_positions = glob.glob('Data/malwares/*')
        decompiled_apks = benign_positions + malware_positions
        # train = np.random.choice(benign_positions, int(len(benign_positions)*0.8), replace = False).tolist() + \
        # np.random.choice(malware_positions, int(len(malware_positions)*0.8), replace = False).tolist()
        train = benign_positions[:4] + malware_positions[:4]
        test = [apk for apk in decompiled_apks if apk not in train]
        apk_names_train = [get_name(file) for file in train]
        # apk_classes_train = [get_class(file) for file in train]
        apk_names_test = [get_name(file) for file in test]
        # apk_classes_test = [get_class(file) for file in test]
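        # NOTE: the class labels below are sized from the 80% split fractions,
        # while `train` above hard-codes the first four APKs of each class; the
        # two only line up when int(len(...) * 0.8) == 4 for both classes.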
        apk_classes_train = [1] * int(len(benign_positions) * 0.8) + [0] * int(len(malware_positions) * 0.8)
        apk_classes_test = ([1] * (len(benign_positions) - int(len(benign_positions) * 0.8))
                            + [0] * (len(malware_positions) - int(len(malware_positions) * 0.8)))
        apk2idx_train = dict(zip(apk_names_train, range(len(apk_names_train))))
        apk2idx_test = dict(zip(apk_names_test, range(len(apk_names_test))))
        apk2node_train = dict(zip(apk_names_train, range(-len(apk_names_train), 0)))
        node2apk_train = dict(zip(range(-len(apk_names_train), 0), apk_names_train))
        idx2apk_train = dict(zip(apk2idx_train.values(), apk2idx_train.keys()))

        print('Collecting All APIs in Training Data')
        APIs = list(get_all_APIs(train))
        API2idx = dict(zip(APIs, range(len(APIs))))
        idx2API = dict(zip(range(len(APIs)), APIs))
        print('Processing Training Data...')
        apk2code_blocks_train, apk2call_train = apk_info_idx(train, API2idx, 'train')
        print('Processing Test Data...')
        apk2code_blocks_test, apk2call_test = apk_info_idx(test, API2idx, 'test')
        print('Building matrix_A_train...')
        matrix_A_train = build_matrix_A(API2idx, apk2call_train, apk2idx_train)
        print('Building matrix_A_test...')
        matrix_A_test = build_matrix_A(API2idx, apk2call_test, apk2idx_test)
        print('Building matrix_B_train...')
        matrix_B_train = build_matrix_B(API2idx, apk2code_blocks_train, apk2idx_train)
        print('Building matrix_P_train...')
        matrix_P_train = build_matrix_P(idx2API, apk2call_train, apk2idx_train)
        print('Building matrix_P_test...')
        matrix_P_test = build_matrix_P(idx2API, apk2call_test, apk2idx_test)
        matrix_BP_train = matrix_B_train + matrix_P_train

        print('generating random walks')
        walks = generate_walks(metapath, apk_names_train, apk2idx_train, idx2apk_train,
                               apk2node_train, node2apk_train, matrix_A_train, matrix_B_train,
                               matrix_P_train, matrix_BP_train, p, q, k, n)
        walks = [list(map(str, walk)) for walk in walks]
        print('word2vec model')
        model = Word2Vec(walks, size=128, window=10, min_count=0, sg=1, workers=8, iter=5)
        model.wv.save_word2vec_format(model_out_path + '/{}_len{}_k{}_w2v.model'.format(metapath, n, k))

        apk2class_train = dict(zip(apk_names_train, apk_classes_train))
        X_train = [model.wv[str(apk2node_train[apk])] for apk in apk2idx_train if str(apk2node_train[apk]) in model.wv]
        Y_train = [apk2class_train[apk] for apk in apk2idx_train if str(apk2node_train[apk]) in model.wv]
        clf = svm.SVC(kernel='rbf', gamma='scale')
        clf.fit(X_train, Y_train)

        if algorithm == 'node2vec':
            X = [API_mean_embedding(model, apk2idx_test[apk], matrix_A_test) for apk in apk2idx_test]
            targets = [API_mean_embedding(model, apk2idx_train[apk], matrix_A_train) for apk in apk2idx_train]
        elif algorithm == 'metapath2vec':
            # TODO: Add dic
            X = [API_mean_embedding_metapath(apk2idx_test[apk], dic, matrix_A_test) for apk in apk2idx_test]
            targets = [API_mean_embedding_metapath(apk2idx_train[apk], dic, matrix_A_train) for apk in apk2idx_train]
        print('neural network')
        train_net(clf, out_path=model_out_path, epochs=20, inputs=X_train,
                  targets=targets, labels_train=Y_train,
                  labels_test=apk_classes_test, batch_size=1)
        net = torch.load(model_out_path + '/net.model')

        device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        X_test = net(torch.tensor(X).type(torch.DoubleTensor).to(device)).cpu().detach()
        Y_test = apk_classes_test
        acc = clf.score(X_test, Y_test)
        print('test accuracy:', acc)
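
The `Word2Vec(...)` calls in this example use the pre-4.0 gensim keyword names. gensim 4.x renamed `size` to `vector_size` and `iter` to `epochs`, so the equivalent call under a current gensim would be:

from gensim.models import Word2Vec

# Same model under gensim >= 4.0; only the two renamed keywords change.
model = Word2Vec(walks, vector_size=128, window=10, min_count=0,
                 sg=1, workers=8, epochs=5)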
Example 6
def main(targets):

    # make the clean target
    if 'clean' in targets:
        remove_dir('data/raw')
        remove_dir('data/temp')
        remove_dir('data/out')
        remove_dir('data/out_m_stat')

    # make the data target
    if 'data' in targets:
        cfg = load_params(DATA_PARAMS)
        get_data(**cfg)

    # make the test data target
    if 'test-data' in targets:
        cfg = load_params(TEST_DATA_PARAMS)
        get_data(**cfg)

    # cleans and prepares the data for analysis
    if 'process' in targets:
        cfg = load_params(PROCESS_PARAMS)
        process_data(**cfg)

    # cleans and prepares the test data for analysis
    if 'test-process' in targets:
        cfg = load_params(TEST_PROCESS_PARAMS)
        process_data(**cfg)

    # runs m-statistic on processed data
    if 'm-stat' in targets:
        cfg = load_params(M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # runs m-statistic on processed test data
    if 'test-m-stat' in targets:
        cfg = load_params(TEST_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

    # m-statistic for entire light dump
    if 'light-dump' in targets:
        cfg = load_params(LIGHT_DUMP_DATA_PARAMS)
        get_data(**cfg)
        cfg = load_params(LIGHT_DUMP_M_STAT_PARAMS)
        get_m_stat_data(**cfg)

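    # Searches through all the files from Wikimedia starting with
    # enwiki-20200201-pages-meta-history1.xml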
    if 'deep-search' in targets:
        for i in range(6):
            cfg = load_params(
                DEEP_SEARCH_DATA_PARAMS.replace('params',
                                                'params-' + str(i + 1)))
            get_data(**cfg)
            remove_dir('data/raw')
            cfg = load_params(
                DEEP_SEARCH_PROCESS_PARAMS.replace('params',
                                                   'params-' + str(i + 1)))
            process_data(**cfg)
            remove_dir('data/temp')
            cfg = load_params(
                DEEP_SEARCH_M_STAT_PARAMS.replace('params',
                                                  'params-' + str(i + 1)))
            get_m_stat_data(**cfg)
            remove_dir('data/out')

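    # Complete project for generating M-Statistic Evolution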
    if 'm-stat-time' in targets:
        cfg = load_params(OVER_TIME_DATA_PARAMS)
        get_data(**cfg)
        cfg = load_params(OVER_TIME_PROCESS_PARAMS)
        process_data(**cfg)
        cfg = load_params(EXTRACT_PARAMS)
        extract_article(**cfg)
        cfg = load_params(OVER_TIME_M_STAT_PARAMS)
        grab_m_stat_over_time(**cfg)

    return
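
The `*_PARAMS` constants throughout these examples follow the same convention of pointing at JSON files under `config/`; a hypothetical set of definitions (the actual paths are repo-specific and not shown in the snippets) would look like:

# Hypothetical config-path constants; real values live in each source repo.
DATA_PARAMS = 'config/data-params.json'
TEST_DATA_PARAMS = 'config/test-data-params.json'
M_STAT_PARAMS = 'config/m-stat-params.json'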