Ejemplo n.º 1
0
def handle_features_1dir(samples_dir, label, level, features_choice, n,
                         analysis_path):
    """ Run handle_features_1file for ALL files from a directory.
    Case one folder.

    The aggregated dict is pickled at
    <analysis_path>/<features_choice>/<level>_all_features_<label>. When that
    file already exists it is loaded first and handed to
    handle_features_1file instead of starting from an empty dict.

    Parameters:
    - samples_dir: directory whose files are analysed (forwarded to
      get_features_all_files_multiproc).
    - label: str, suffix of the pickle file name.
    - level: str, prefix of the pickle file name; also forwarded to the
      features extraction.
    - features_choice: str, sub-folder of analysis_path; also forwarded to
      the features extraction.
    - n: forwarded to get_features_all_files_multiproc.
    - analysis_path: str, root folder of the results; created if missing.

    Returns None; the result is only written to disk.
    """

    if not os.path.exists(analysis_path):
        os.makedirs(analysis_path)

    pickle_path = os.path.join(analysis_path, features_choice,
                               level + '_all_features_' + label)
    # NOTE(review): presumably creates the parent folder of pickle_path;
    # confirm against utility.check_folder_exists.
    utility.check_folder_exists(pickle_path)

    if os.path.isfile(pickle_path):
        # NOTE: pickle can execute arbitrary code while loading; only files
        # produced by a previous run of this tool should be found here.
        with open(pickle_path, 'rb') as pickle_file:
            all_features_dict = pickle.load(pickle_file)
    else:
        all_features_dict = dict()

    analyses = get_features_all_files_multiproc(samples_dir, level,
                                                features_choice, n)

    start = timeit.default_timer()

    for analysis in analyses:
        features_dict = analysis.features
        if features_dict is None:
            continue
        try:
            handle_features_1file(features_dict, all_features_dict)
        except Exception:
            # Best effort: a failure on one file is logged and the remaining
            # files are still processed. KeyboardInterrupt and SystemExit are
            # deliberately not caught so the run can still be aborted.
            logging.exception('Something went wrong with %s',
                              analysis.file_path)

    # The context manager guarantees the pickle is flushed and closed.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(all_features_dict, pickle_file)
    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)
Ejemplo n.º 2
0
def analyze_features_all(all_features_dict1, all_features_dict2,
                         samples_dir_list, labels_list, path_info, level,
                         features_choice, n, analysis_path):
    """ Produces a dict containing the number of occurrences (or not) of each expected feature
    with a distinction between benign and malicious files.

    The dict is initialised from all_features_dict1 and all_features_dict2,
    updated with the features of every analysed file, then pickled at
    <analysis_path>/<features_choice>/<level>_analyzed_features_<path_info>.

    Parameters:
    - all_features_dict1, all_features_dict2: inputs of
      initialize_analyzed_features_dict.
    - samples_dir_list: list of directories to analyse.
    - labels_list: list of labels, one per directory; has to contain both
      'benign' and 'malicious'.
    - path_info: str, suffix of the pickle file name.
    - level: str, prefix of the pickle file name; also forwarded to the
      features extraction.
    - features_choice: str, sub-folder of analysis_path; also forwarded.
    - n: forwarded to get_features_all_files_multiproc.
    - analysis_path: str, root folder of the results.

    Returns the analysed features dict, or None if the arguments are invalid.
    """

    if len(samples_dir_list) != len(labels_list):
        logging.error(
            "Something is wrong with the size of samples_dir_list and label_list. "
            + "Got %s and %s", str(len(samples_dir_list)),
            str(len(labels_list)))
        return None

    # Both labels are required, so the input is rejected as soon as ONE of
    # them is missing (the previous 'and' only caught the case where both
    # were missing).
    if "benign" not in labels_list or "malicious" not in labels_list:
        logging.error("Expected 'benign' and 'malicious' in labels_list")
        return None

    pickle_path = os.path.join(analysis_path, features_choice,
                               level + '_analyzed_features_' + path_info)
    # NOTE(review): presumably creates the parent folder of pickle_path;
    # confirm against utility.check_folder_exists.
    utility.check_folder_exists(pickle_path)

    analyzed_features_dict = initialize_analyzed_features_dict(
        all_features_dict1, all_features_dict2)

    analyses = get_features_all_files_multiproc(samples_dir_list, labels_list,
                                                level, features_choice, n)

    start = timeit.default_timer()

    for analysis in analyses:
        features_dict = analysis.features
        if features_dict is not None:
            analyze_features(analyzed_features_dict, features_dict,
                             analysis.label)

    # The context manager guarantees the pickle is flushed and closed.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(analyzed_features_dict, pickle_file)

    utility.micro_benchmark('Total elapsed time:',
                            timeit.default_timer() - start)
    return analyzed_features_dict
Ejemplo n.º 3
0
def analyze_features_all(all_features_dict1, all_features_dict2,
                         samples_dir_list, labels_list, path_info, level,
                         features_choice, n, analysis_path):
    """ Produces a dict containing the number of occurrences (or not) of each expected feature
    with a distinction between benign and malicious files.

    The dict is initialised from all_features_dict1 and all_features_dict2,
    updated with the features of every analysed file, then pickled at
    <analysis_path>/<features_choice>/<level>_analyzed_features_<path_info>.

    Parameters:
    - all_features_dict1, all_features_dict2: inputs of
      initialize_analyzed_features_dict.
    - samples_dir_list: list of directories to analyse (--vd option).
    - labels_list: list of labels, one per directory (--vl option); has to
      contain both 'benign' and 'malicious'.
    - path_info: str, suffix of the pickle file name.
    - level: str, prefix of the pickle file name; also forwarded to the
      features extraction.
    - features_choice: str, sub-folder of analysis_path; also forwarded.
    - n: forwarded to get_features_all_files_multiproc.
    - analysis_path: str, root folder of the results.

    Returns the analysed features dict, or None if the arguments are invalid.
    """

    if len(samples_dir_list) != len(labels_list):
        logging.error(
            "The number %s of directories (--vd option) does not match the number %s "
            "of labels (--vl option)", str(len(samples_dir_list)),
            str(len(labels_list)))
        return None

    # Both labels are required, so the input is rejected as soon as ONE of
    # them is missing (the previous 'and' only caught the case where both
    # were missing).
    if "benign" not in labels_list or "malicious" not in labels_list:
        logging.error(
            "Expected both the labels 'benign' and 'malicious' (--vl option).\nGot %s",
            labels_list)
        return None

    pickle_path = os.path.join(analysis_path, features_choice,
                               level + '_analyzed_features_' + path_info)
    # NOTE(review): presumably creates the parent folder of pickle_path;
    # confirm against utility.check_folder_exists.
    utility.check_folder_exists(pickle_path)

    analyzed_features_dict = initialize_analyzed_features_dict(
        all_features_dict1, all_features_dict2)

    analyses = get_features_all_files_multiproc(samples_dir_list, labels_list,
                                                level, features_choice, n)

    for analysis in analyses:
        features_dict = analysis.features
        if features_dict is not None:
            analyze_features(analyzed_features_dict, features_dict,
                             analysis.label)

    # The context manager guarantees the pickle is flushed and closed.
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(analyzed_features_dict, pickle_file)

    return analyzed_features_dict
Ejemplo n.º 4
0
def store_features(all_features_dict_path1,
                   all_features_dict_path2,
                   samples_dir_list,
                   labels_list,
                   path_info,
                   level,
                   features_choice,
                   analysis_path,
                   n=4,
                   analyzed_features_path=None,
                   chi_confidence=99):
    """ Stores the features selected by chi2 in a dict.
        The confidence has to be given in percent.

    The selected features are pickled at
    <analysis_path>/<features_choice>/<level>_selected_features_<chi_confidence>.

    Parameters:
    - all_features_dict_path1, all_features_dict_path2: paths of the two
      pickled features dicts; only read when analyzed_features_path is None.
    - samples_dir_list, labels_list, path_info: forwarded to
      analyze_features_all; only used when analyzed_features_path is None.
    - level: str, prefix of the pickle file name.
    - features_choice: str, sub-folder of analysis_path.
    - analysis_path: str, root folder of the results.
    - n: forwarded to analyze_features_all (default 4).
    - analyzed_features_path: path of an already pickled analysed features
      dict; when given, the analysis step is skipped and this file is loaded.
    - chi_confidence: confidence in percent, forwarded to select_features
      (default 99).

    Returns the dict produced by select_features.

    NOTE: pickle can execute arbitrary code while loading; the paths given
    here must point to trusted files only.
    """

    pickle_path = os.path.join(
        analysis_path, features_choice,
        level + '_selected_features_' + str(chi_confidence))
    # NOTE(review): presumably creates the parent folder of pickle_path;
    # confirm against utility.check_folder_exists.
    utility.check_folder_exists(pickle_path)

    if analyzed_features_path is None:
        # Context managers close each input file as soon as it is loaded.
        with open(all_features_dict_path1, 'rb') as features_file1:
            all_features_dict1 = pickle.load(features_file1)
        with open(all_features_dict_path2, 'rb') as features_file2:
            all_features_dict2 = pickle.load(features_file2)

        analyzed_features_dict = analyze_features_all(
            all_features_dict1, all_features_dict2, samples_dir_list,
            labels_list, path_info, level, features_choice, n, analysis_path)

    else:
        with open(analyzed_features_path, 'rb') as analyzed_file:
            analyzed_features_dict = pickle.load(analyzed_file)

    selected_features_dict = select_features(analyzed_features_dict,
                                             chi_confidence)
    with open(pickle_path, 'wb') as pickle_file:
        pickle.dump(selected_features_dict, pickle_file)

    return selected_features_dict
Ejemplo n.º 5
0
def main_learn(js_dirs=arg_obj['d'],
               js_dirs_validate=arg_obj['vd'],
               labels_validate=arg_obj['vl'],
               labels_d=arg_obj['l'],
               model_dir=arg_obj['md'],
               model_name=arg_obj['mn'],
               print_score=arg_obj['ps'],
               print_res=arg_obj['pr'],
               level=arg_obj['level'],
               n=arg_obj['n'][0],
               estimators=arg_obj['nt'],
               features_choice=arg_obj['features'],
               analysis_path=arg_obj['analysis_path'][0]):
    """
        Entry point of the learning phase: validates the arguments, runs the
        features pre-selection and selection, performs the static analysis of
        the JavaScript files given as input and builds a classification model
        from the results.

        Invalid arguments are reported with logging.error and the function
        returns without doing anything else.

        -------
        Parameters:
        - js_dirs: list of strings
            Directories containing the JS files to be analysed (at least 2).
        - js_dirs_validate: list of strings
            2 JS dir (1 benign, 1 malicious) to select the features with chi2.
        - labels_validate: list of strings
            Labels of the 2 JS dir for the features selection process.
        - labels_d: list of strings
            Label of each directory of js_dirs: either benign or malicious.
        - model_dir: list whose first item is the path to store the model in.
        - model_name: list whose first item is the name of the model.
        - print_score: list whose first item indicates whether to print or
            not the classifier's performance.
        - print_res: list whose first item indicates whether to print or not
            the classifier's predictions.
        - level: list whose first item is the kind of units to extract:
            'tokens', 'ast', 'cfg', 'pdg', or 'pdg-dfg'.
        - n: Integer
            Size of the sliding-window which goes through the units contained
            in the files to be analysed.
        - estimators: list whose first item is the number of trees in the
            forest.
        - features_choice: list whose first item is the kind of features
            wanted: 'ngrams' or 'value'.
        - analysis_path: str
            Folder to store the features' analysis results in.
        Default values are the ones given in the command lines or in the
        ArgumentParser object (function parsingCommands()).
    """

    # --- Arguments validation: each failed check logs and stops here. ---
    if js_dirs is None:
        logging.error(
            'Please, indicate at least 2 directories (--d option), at least one benign '
            'and one malicious (--l option to pass the corresponding labels).\nGot 0 '
            'directories with the following labels: %s', labels_d)
        return

    if not (len(js_dirs) >= 2 and labels_d is not None
            and 'benign' in labels_d and 'malicious' in labels_d):
        logging.error(
            'Please, indicate at least 2 directories (--d option), at least one benign '
            'and one malicious (--l option to pass the corresponding labels).\nGot %s '
            'directories with the following labels: %s', str(len(js_dirs)),
            labels_d)
        return

    # labels_d cannot be None anymore at this point.
    if len(js_dirs) != len(labels_d):
        logging.error(
            'Please, indicate as many directory labels (--l option) as the number %s of '
            'directories to analyze', str(len(js_dirs)))
        return

    if js_dirs_validate is None:
        logging.error(
            'Please, indicate the 2 JS directories (--vd option) '
            'with corresponding labels (--vl option, 1 benign and 1 malicious) '
            'for the features validation process')
        return

    if not (len(js_dirs_validate) == 2 and labels_validate is not None
            and 'benign' in labels_validate
            and 'malicious' in labels_validate):
        logging.error(
            'Please, indicate the 2 JS directories (--vd option) '
            'with corresponding labels (--vl option, 1 benign and 1 malicious) '
            'for the features validation process.\nGot %s directories with following '
            'labels %s', str(len(js_dirs_validate)), labels_validate)
        return

    if utility.check_params(level, features_choice) == 0:
        return

    # --- Features pre-selection and selection. ---
    features_dir = os.path.join(analysis_path, 'Features')
    utility.check_folder_exists(features_dir)

    chosen_features = features_choice[0]
    units_level = level[0]

    # File name ends in 99: the selection with a confidence of 99 is used.
    selected_features_name = units_level + '_selected_features_99'
    features2int_dict_path = os.path.join(features_dir, chosen_features,
                                          selected_features_name)

    features_preselection.handle_features_all(
        js_dirs, labels_d, units_level, chosen_features, features_dir, n)
    features_selection.store_features_all(
        js_dirs_validate, labels_validate, units_level, chosen_features,
        features_dir, n)

    # --- Static analysis of the training files. ---
    names, attributes, labels = static_analysis.main_analysis(
        js_dirs=js_dirs,
        labels_dirs=labels_d,
        js_files=None,
        labels_files=None,
        n=n,
        level=units_level,
        features_choice=chosen_features,
        features2int_dict_path=features2int_dict_path)

    if not names:
        logging.warning('No valid JS file found for the analysis')
        return

    # Uncomment to save the analysis results in pickle objects.
    # utility.save_analysis_results(os.path.join(model_dir[0], "Analysis-n" + str(n) + "-dict"
    #                                            + str(dict_not_hash)),
    #                               names, attributes, labels)

    # --- Model building. ---
    classify(names,
             labels,
             attributes,
             model_dir=model_dir[0],
             model_name=model_name[0],
             print_score=print_score[0],
             print_res=print_res[0],
             estimators=estimators[0])