Example #1
def evaluate_lgb(lightgbm_model_file,
                 results_dir,
                 db_path=config.db_path,
                 remove_missing_features='scan'
                 ):
    """
    Take a trained LightGBM model and perform an evaluation on it. Results will be saved to
    results.csv in the directory specified by results_dir.

    :param lightgbm_model_file: Full path to the trained LightGBM model
    :param results_dir: The directory to which to write the 'results.csv' file; WARNING -- this will overwrite any
        existing results in that location
    :param db_path: the path to the directory containing the meta.db file; defaults to the value in config.py
    :param remove_missing_features: See help for remove_missing_features in train.py / train_network
    """
    os.makedirs(results_dir, exist_ok=True)

    logger.info(f'Loading lgb model from {lightgbm_model_file}')
    model = lgb.Booster(model_file=lightgbm_model_file)
    generator = get_generator(mode='test', path=db_path, use_malicious_labels=True,
                              use_count_labels=False,
                              use_tag_labels=False, return_shas=True,
                              remove_missing_features=remove_missing_features)
    logger.info('running lgb evaluation')
    with open(os.path.join(results_dir, 'results.csv'), 'w') as f:
        first_batch = True
        for shas, features, labels in tqdm.tqdm(generator):
            predictions = {'malware': model.predict(features)}
            results = normalize_results(labels, predictions, use_malware=True, use_count=False, use_tags=False)
            pd.DataFrame(results, index=shas).to_csv(f, header=first_batch)
            first_batch = False
    print('...done')
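A minimal invocation sketch for the function above; the import path and file locations are illustrative assumptions, not part of the original project.

# Hypothetical usage -- module name and paths are placeholders.
from evaluate import evaluate_lgb  # assumed module holding the function above

evaluate_lgb(lightgbm_model_file='/models/lightgbm.model',  # trained LightGBM booster file
             results_dir='/results/lgb_eval',               # results.csv is (over)written here
             remove_missing_features='scan')                # same semantics as train.py / train_network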
Example #2
def process_line(line):
    if line.strip() == '':  # Don't process empty lines any further
        if cCountEmptyLines:
            return "\\State", None, False, 0
        else:
            return "\\Statex", None, False, 0

    sp = line.split("#")
    comment = ""
    if len(sp) > 1:
        if len(sp[-2]) == 0 or sp[-2][-1] != "\\":
            comment = sp[-1]
            line = "\\#".join(sp[:-1])
        else:
            if len(sp[-2]) != 0:
                sp[-2] = sp[-2][:-1]
            line = "\\#".join(sp)

    comment = comment.strip()
    line = line.strip()
    line = preprocess(line)

    terminator = None
    process_lvl = False
    transform = 0
    if line == "":
        line = generate_comment_line(comment)
    else:
        keyword = get_keyword(line)
        generator = get_generator(keyword)
        line, terminator, process_lvl, transform = generator(line)
        if comment != "":
            line += " \\Comment{\\ " + comment + "}"  # '\ ' was an invalid Python escape

    return line, terminator, process_lvl, transform  # Add generated line to result
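A sketch of driving process_line with one pseudocode line, assuming the module's keyword table, preprocess, and generate_comment_line helpers are configured as in the original project; the input line and the output shown are illustrative.

# Hypothetical usage -- the output in comments is approximate.
latex, terminator, process_lvl, transform = process_line("if x > 0:  # guard clause")
# 'latex' might read '\If{$x > 0$} \Comment{\ guard clause}', with 'terminator'
# naming the matching closer (e.g. '\EndIf') for later emission.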
Example #3
def dump_data_to_numpy(mode, output_file, workers=1, batchsize=1000, remove_missing_features='scan'):
    """
    Produce the numpy files required for training a LightGBM model from the SQLite + LMDB database.

    :param mode: One of 'train', 'validation', or 'test' representing which set of the
        data to process to file. Splits are obtained based on timestamps in config.py
    :param output_file: The name of the output file to produce for the indicated split.
    :param workers: How many worker processes to use (default 1)
    :param batchsize: The batch size to use in collecting samples (default 1000)
    :param remove_missing_features: How to check for and remove missing features; see
        README.md for recommendations (default 'scan')
    """
    _generator = get_generator(path=db_path,
                               mode=mode,
                               batch_size=batchsize,
                               use_malicious_labels=True,
                               use_count_labels=False,
                               use_tag_labels=False,
                               num_workers=workers,
                               remove_missing_features=remove_missing_features,
                               shuffle=False)
    feature_array = []
    label_array = []
    for i, (features, labels) in enumerate(_generator):
        feature_array.append(deepcopy(features.numpy()))
        label_array.append(deepcopy(labels['malware'].numpy()))
        sys.stdout.write(f"\r{i} / {len(_generator)}")
        sys.stdout.flush()
    # stack the per-batch arrays so the archive holds one 2-D feature array and one 1-D label array
    np.savez(output_file, np.concatenate(feature_array), np.concatenate(label_array))
    print(f"\nWrote output to {output_file}")
Example #4
def run(x_train,
        x_test,
        y_train_one_hot,
        y_test,
        model,
        fit_generator=False,
        epochs=8,
        steps_per_epoch=20):
    if fit_generator:
        generate = generators.get_generator(x_train, y_train_one_hot)
        model.fit_generator(generate(),
                            steps_per_epoch=steps_per_epoch,
                            epochs=epochs,
                            verbose=0)
    else:
        model.fit(x_train,
                  y_train_one_hot,
                  epochs=epochs,  # 'nb_epoch' was the pre-Keras-2 spelling
                  batch_size=BATCH_SIZE)

    pred = model.predict_classes(x_test, 10)
    print(pred)
    print(y_test)
    # sklearn metrics expect (y_true, y_pred); compute once and reuse
    acc = metrics.accuracy_score(y_test, pred)
    f1 = metrics.f1_score(y_test, pred)
    print('acc: ', acc)
    print('f1: ', f1)

    return acc, f1
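A sketch of calling run with a compiled Keras classifier; build_model and the dataset arrays are placeholders standing in for whatever the surrounding project provides.

# Hypothetical usage -- build_model and the arrays are assumed to exist.
model = build_model()  # a compiled Keras Sequential classifier
acc, f1 = run(x_train, x_test, y_train_one_hot, y_test, model,
              fit_generator=True,   # stream batches from generators.get_generator
              epochs=8,
              steps_per_epoch=20)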
Example #5
def play(pathinfo):
    content, info = _parse_path(pathinfo)
    if not content:
        xbmcgui.Dialog().notification(L(34201), pathinfo['full path'])
        return
    singlevideo = pathinfo.get('singlevideo', False)
    try:
        showbusy = get_main_addon().getSetting('hidebusydialog') == 'false'
        get_player(get_generator(content, info, singlevideo), showbusy).run()
    except quickjson.JSONException as ex:
        # json_result['error']['code'] == -32602 is the best we get, invalid params
        if content == 'other' and ex.json_result.get('error', {}).get('code', 0) == -32602 \
                and not any(1 for source in quickjson.get_sources('video') if info['path'].startswith(source['file'])):
            xbmcgui.Dialog().ok(L(ADD_SOURCE_HEADER), L(ADD_SOURCE_MESSAGE).format(info['path']))
        else:
            raise
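play expects the pathinfo dict produced by the add-on's URL router; a sketch with the two keys the function reads ('full path' and 'singlevideo'), using an illustrative plugin URL.

# Hypothetical call -- the plugin URL is a placeholder.
play({'full path': 'plugin://plugin.video.example/movie/42',
      'singlevideo': True})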
Example #6
def __init__(self, params):
    super(Trainer, self).__init__()
    # save params
    self.params = params
    # set device
    self.device = torch.device(params['training']['device'])
    # set niter to -1
    self.niter = -1
    # set attribute for best score
    self.best_psnr = None
    # create generator
    self.netG = get_generator(self.params['generator'])
    print(self.netG)
    # move it to device
    self.netG.to(self.device)
    # define output filename
    self.name = get_generator_name(self.params['generator'])
    # define dirs
    self.base_dir = os.path.join('./checkpoints', self.params['exp_name'])
    self.model_dir = self.base_dir
    self.logs_dir = self.base_dir
    self.images_dir = self.base_dir
    self.out_dir = os.path.join(self.base_dir, 'regen')
    # create them
    os.makedirs(self.base_dir, exist_ok=True)
    os.makedirs(self.model_dir, exist_ok=True)
    os.makedirs(self.logs_dir, exist_ok=True)
    os.makedirs(self.images_dir, exist_ok=True)
    os.makedirs(self.out_dir, exist_ok=True)
    # if not training, do not continue
    if not self.training:
        return
    # get loss
    self.loss = get_loss(self.params['training']['loss'])
    # create generator optimizer
    self.optimG = torch.optim.Adam(
        self.netG.parameters(),
        lr=self.params['training']['lr'],
        weight_decay=self.params['training']['weight_decay'])
    # init weights
    for m in self.modules():
        if isinstance(m, nn.Conv2d):
            nn.init.kaiming_normal_(m.weight,
                                    mode='fan_out',
                                    nonlinearity='relu')
        elif isinstance(m, nn.BatchNorm2d):
            nn.init.constant_(m.weight, 1)
            nn.init.constant_(m.bias, 0)
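Trainer appears to subclass nn.Module (it calls self.modules() and reads self.training), and the keys below are inferred from the attributes __init__ accesses; all values are illustrative.

# Hypothetical params dict -- keys inferred from __init__, values are placeholders.
params = {
    'exp_name': 'srgan_baseline',
    'generator': {},  # whatever get_generator expects in this project
    'training': {'device': 'cuda:0',
                 'loss': 'l1',
                 'lr': 1e-4,
                 'weight_decay': 0.0},
}
trainer = Trainer(params)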
Example #7
def add_generator(id, name):
    s = system.get_system()
    gen_class = generators.get_generator(id)
    gen = gen_class()
    params = gen.get_parameters()
    if params is not None:
        user_params = {}
        print(f"{id} generator setup:")
        for param in params:
            user_in = input(f"{param} ({params[param]}) $ ")
            if len(user_in) == 0:
                user_params[param] = params[param]
            elif isinstance(params[param], str):
                user_params[param] = user_in
            else:
                # parse non-string defaults (ints, floats, lists) safely;
                # ast.literal_eval only accepts Python literals, unlike bare
                # eval() on raw user input (requires 'import ast')
                user_val = ast.literal_eval(user_in)
                user_params[param] = user_val
        gen.set_parameters(user_params)
    s.add_generator(name, gen)
    print(f"Created gen {name}!")
Example #8
def evaluate_network(results_dir, checkpoint_file,
                     db_path=config.db_path,
                     evaluate_malware=True,
                     evaluate_count=True,
                     evaluate_tags=True,
                     remove_missing_features='scan'):
    """
    Take a trained feedforward neural network model and output evaluation results to a csv in the specified location.

    :param results_dir: The directory to which to write the 'results.csv' file; WARNING -- this will overwrite any
        existing results in that location
    :param checkpoint_file: The checkpoint file containing the weights to evaluate
    :param db_path: the path to the directory containing the meta.db file; defaults to the value in config.py
    :param evaluate_malware: defaults to True; whether or not to record malware labels and predictions
    :param evaluate_count: defaults to True; whether or not to record count labels and predictions
    :param evaluate_tags: defaults to True; whether or not to record individual tag labels and predictions
    :param remove_missing_features: See help for remove_missing_features in train.py / train_network
    """
    os.makedirs(results_dir, exist_ok=True)
    model = PENetwork(use_malware=True, use_counts=True, use_tags=True, n_tags=len(Dataset.tags),
                      feature_dimension=2381)
    model.load_state_dict(torch.load(checkpoint_file))
    model.to(device)
    generator = get_generator(mode='test', path=db_path, use_malicious_labels=evaluate_malware,
                              use_count_labels=evaluate_count,
                              use_tag_labels=evaluate_tags, return_shas=True,
                              remove_missing_features=remove_missing_features)
    logger.info('...running network evaluation')
    model.eval()  # run dropout / batch norm in inference mode
    with open(os.path.join(results_dir, 'results.csv'), 'w') as f, torch.no_grad():
        first_batch = True
        for shas, features, labels in tqdm.tqdm(generator):
            features = features.to(device)
            predictions = model(features)
            results = normalize_results(labels, predictions)
            pd.DataFrame(results, index=shas).to_csv(f, header=first_batch)
            first_batch = False
    print('...done')
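A minimal invocation sketch; the checkpoint and output paths are illustrative, and the flags mirror the keyword defaults documented above.

# Hypothetical usage -- paths are placeholders.
evaluate_network(results_dir='/results/ffnn_eval',
                 checkpoint_file='/checkpoints/epoch_10.pt',
                 evaluate_malware=True,
                 evaluate_count=False,  # skip the count head if it was not trained
                 evaluate_tags=True)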
Example #9
def main(argv=None):
    tf.set_random_seed(1237)
    np.random.seed(1237)

    # Load data
    x_train, sorted_x_train = \
            utils.load_image_data(FLAGS.dataset, n_xl, n_channels, FLAGS.mbs)
    xshape = (-1, n_xl, n_xl, n_channels)
    print('Data shape = {}'.format(x_train.shape))

    x_train = x_train * 2 - 1
    sorted_x_train = sorted_x_train * 2 - 1

    # Make some data
    is_training = tf.placeholder_with_default(False,
                                              shape=[],
                                              name='is_training')
    generator = get_generator(FLAGS.dataset, FLAGS.arch,
                              n_code if FLAGS.arch == 'ae' else n_x, n_xl,
                              n_channels, n_z, ngf, is_training,
                              'transformation')
    if FLAGS.arch == 'adv':
        discriminator = get_discriminator(FLAGS.dataset, FLAGS.arch, n_x, n_xl,
                                          n_channels, n_f, ngf // 2,
                                          is_training)
        decoder = get_generator(FLAGS.dataset, FLAGS.arch, n_x, n_xl,
                                n_channels, n_f, ngf, is_training, 'decoder')

    # Define training/evaluation parameters
    run_name = 'results/{}_{}_{}_{}_c{}_mbs{}_bs{}_lr{}_t0{}'.format(
        FLAGS.dataset, FLAGS.arch, FLAGS.dist, FLAGS.match, n_code, FLAGS.mbs,
        FLAGS.bs, FLAGS.lr0, FLAGS.t0)

    if not os.path.exists(run_name):
        os.makedirs(run_name)

    # Build the computation graph
    if FLAGS.arch == 'ae':
        ae = ConvAE(x_train, (None, n_xl, n_xl, n_channels), ngf)
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            ae.train(sess)
            x_code = ae.encode(x_train, sess)
            sorted_x_code = ae.encode(sorted_x_train, sess)

        model = MyPMD(x_code, sorted_x_code, xshape, generator, run_name, ae)
    elif FLAGS.arch == 'adv':
        model = MyPMD(x_train,
                      sorted_x_train,
                      xshape,
                      generator,
                      run_name,
                      F=discriminator,
                      D=decoder)
    else:
        model = MyPMD(x_train, sorted_x_train, xshape, generator, run_name)

    # Run the inference
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        if FLAGS.arch == 'ae':
            ae.train(sess)

        print('Training...')
        model.train(sess,
                    gen_dict={
                        model.batch_size_ph: FLAGS.mbs,
                        is_training: False
                    },
                    opt_dict={
                        model.batch_size_ph: FLAGS.bs,
                        is_training: True
                    },
                    iters=((x_train.shape[0] - 1) // FLAGS.mbs) + 1)
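In the TF1 idiom this script follows, main(argv=None) is normally dispatched through tf.app.run, which parses the module-level FLAGS before calling it; a sketch of the customary entry point.

# Customary TF1 entry point -- assumes FLAGS are defined at module scope.
if __name__ == '__main__':
    tf.app.run(main)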