def test_githubRootNoAuth():
    """An unauthenticated github ARL over a repo root yields at least one non-empty file."""
    with ARL("[github,refractionPOINT/python-limacharlie]") as resources:
        names = []
        for name, body in resources:
            assert name
            assert len(body) != 0
            names.append(name)
    assert names
def test_straightFileCompat():
    """A plain https URL is accepted like an ARL and yields exactly one file."""
    url = "https://raw.githubusercontent.com/refractionPOINT/sigma_rules/master/README.md"
    names = []
    with ARL(url) as resources:
        for name, body in resources:
            assert name
            assert len(body) != 0
            names.append(name)
    assert len(names) == 1
def test_httpsWithTar():
    """An https ARL pointing at a tarball expands into one or more non-empty files."""
    tarballArl = "[https,api.github.com/repos/refractionPOINT/sigma_rules/tarball/0.2.0]"
    seen = []
    with ARL(tarballArl) as resources:
        for name, body in resources:
            assert name
            assert len(body) != 0
            seen.append(name)
    assert seen
def test_githubBranch():
    """A github ARL with a ?ref= branch selector yields at least one non-empty file."""
    branchArl = "[github,refractionPOINT/sigma/lc-rules/windows_sysmon/?ref=lc-rules]"
    found = []
    with ARL(branchArl) as resources:
        for name, body in resources:
            assert name
            assert len(body) != 0
            found.append(name)
    assert found
def test_githubSigleFileNoAuth():
    """A github ARL targeting a single file yields exactly that one non-empty file."""
    fileArl = "[github,refractionPOINT/sigma/README.md]"
    found = []
    with ARL(fileArl) as resources:
        for name, body in resources:
            assert name
            assert len(body) != 0
            found.append(name)
    assert len(found) == 1
def test_githubSubdirNoAuth():
    """A github ARL scoped to a sub-directory yields at least one non-empty file."""
    subdirArl = "[github,refractionPOINT/sigma/rules/windows/builtin]"
    found = []
    with ARL(subdirArl) as resources:
        for name, body in resources:
            assert name
            assert len(body) != 0
            found.append(name)
    assert found
Example #7
0
def get_model(config: Dict[str, Any], args: argparse.Namespace,
              dataset: FairnessDataset) -> pl.LightningModule:
    """Inits the ARL model instance used for training.

    NOTE(review): despite the generic name, this variant unconditionally
    builds an Adversarially Reweighted Learning (ARL) model — no other
    model type is selected here.

    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings of
            the model, dataset and training.
        dataset: Dataset instance that will be used for training.

    Returns:
        An instantiated ARL model, optionally re-initialized with
        TensorFlow-default weights when args.tf_mode is set.
    """

    model: pl.LightningModule

    model = ARL(
        config=config,  # for hparam tuning
        input_shape=dataset.dimensionality,
        pretrain_steps=args.pretrain_steps,
        prim_hidden=args.prim_hidden,
        adv_hidden=args.adv_hidden,
        optimizer=OPT_BY_NAME[args.opt],
        dataset_type=args.dataset_type,
        adv_input=set(args.adv_input),
        num_groups=len(dataset.protected_index2value),
        # TF's Adagrad defaults to initial_accumulator_value=0.1; match it
        # when reproducing the TensorFlow setup.
        opt_kwargs={"initial_accumulator_value": 0.1} if args.tf_mode else {})

    if args.tf_mode:

        # Mimic TF default init: Xavier/Glorot uniform weights, zero bias.
        def init_weights(layer):
            if type(layer) == torch.nn.Linear:
                torch.nn.init.xavier_uniform_(layer.weight)
                torch.nn.init.zeros_(layer.bias)

        model.apply(init_weights)

    return model
def test_maxSizeGood():
    """A maxSize large enough for the target file still yields the file.

    The README is expected to fit within 1024 bytes, so exactly one
    element must be enumerated; any fetch error is treated as "no
    elements" and fails the final count assertion.
    """
    testArl = "https://raw.githubusercontent.com/refractionPOINT/sigma_rules/master/README.md"
    nElemFound = 0
    try:
        with ARL(testArl, maxSize=1024) as r:
            for fileName, fileContent in r:
                nElemFound += 1
    except Exception:
        # Narrowed from a bare except: so KeyboardInterrupt/SystemExit
        # are no longer swallowed; a fetch failure still just leaves the
        # count at 0 and fails the assertion below.
        pass
    assert (1 == nElemFound)
Example #9
0
    def downloadRules(self):
        """Fetch D&R rules and lookup resources from the configured repo.

        Enumerates the github repo via an authenticated ARL; rules are
        read from a single 'detections.yaml' file and every file under
        'resources/' is published as a lookup resource. Parsed rules are
        forced into the 'replicant' namespace, stored in self.svcRules,
        and a detection subscription is created for every 'report'
        response found in the rules.

        Raises:
            Exception: If detections.yaml cannot be parsed as YAML.
        """
        newRules = {}
        newDetections = set()

        # We assume all D&R rules are in detections.yaml and all internal
        # lookup resources those rules use are each in a "resources/RESOURCE_NAME"
        # sub-directory in the repo.
        with ARL('[github,%s/%s,token,%s]' % (
                GITHUB_ORG,
                REPO_NAME,
                GITHUB_TOKEN,
        ),
                 maxConcurrent=5) as r:
            for fileName, content in r:
                # The detections are in a single file "detections.yaml".
                # like: ruleName => {detect => ..., respond => ...}
                if 'detections.yaml' == fileName:
                    try:
                        newRules = yaml.safe_load(content)
                    except Exception:
                        # Narrowed from a bare except so that
                        # KeyboardInterrupt/SystemExit propagate untouched.
                        raise Exception(
                            "failed to parse yaml from rules file %s: %s" %
                            (fileName, traceback.format_exc()))

                # Resources are in a "resources/" directory.
                if fileName.startswith('resources/'):
                    # This is a resource, use the filename without extension as name.
                    resourceName = fileName.split('/', 1)[1]
                    # Only strip an extension when one exists: rfind()
                    # returns -1 for extensionless names, and slicing with
                    # it would silently drop the last character.
                    dotIndex = resourceName.rfind('.')
                    if dotIndex != -1:
                        resourceName = resourceName[:dotIndex]
                    # We assume all resources are lookups.
                    self.publishResource(resourceName, 'lookup', content)

        for ruleName, rule in newRules.items():
            # Make sure the rule goes in the "replicant" namespace. This way
            # we don't need to set the namespace in the yaml file.
            rule['namespace'] = 'replicant'

            for drResponse in rule['respond']:
                if 'report' == drResponse['action']:
                    # We want to be notified for all rule reports.
                    newDetections.add(drResponse['name'])

        # Update the rules in effect.
        self.svcRules = newRules

        # Make sure we're subscribed to all the notifications.
        for detection in newDetections:
            self.subscribeToDetect(detection)
Example #10
0
def train(config: Dict[str, Any],
          args: argparse.Namespace,
          train_dataset: FairnessDataset,
          val_dataset: Optional[FairnessDataset] = None,
          test_dataset: Optional[FairnessDataset] = None,
          version=str(int(time())),
          fold_nbr=None) -> Tuple[pl.LightningModule, pl.Trainer]:
    """Single training run on a given dataset.
    
    Inits a model and optimizes its parameters on the given training dataset  
    with a given set of hyperparameters. Logs various metrics and stops the 
    training when the micro-average AUC on the validation set stops improving.
    
    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings 
            of the model, dataset and training.
        train_dataset: Dataset instance to use for training.
        val_dataset: Optional; dataset instance to use for validation.
        test_dataset: Optional; dataset instance to use for testing.
        version: Version used for the logging directory.
        fold_nbr: Optional; used for the logging directory if training run
            is part of kfold cross validation.
    
    Returns:
        Model with the highest micro-average AUC on the validation set during 
        the training run.
            
    Raises:
        AssertionError: If no model checkpoint callback exists.
    """

    # create logdir if necessary
    logdir: str = args.log_dir
    os.makedirs(logdir, exist_ok=True)

    # create fold loaders and callbacks
    train_loader = DataLoader(train_dataset,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True)

    # Per-split metric loggers; scatter plots are only produced for the
    # ARL model variants.
    callbacks: List[pl.callbacks.Callback] = []
    callbacks.append(
        Logger(train_dataset,
               'train',
               batch_size=args.eval_batch_size,
               save_scatter=(args.model in ['ARL', 'ARL_strong', 'ARL_weak'])))

    if val_dataset is not None:
        callbacks.append(
            Logger(val_dataset, 'validation', batch_size=args.eval_batch_size))
        if not args.no_early_stopping:
            # Stop when validation micro-average AUC plateaus for 10 evals.
            callbacks.append(
                EarlyStopping(monitor='validation/micro_avg_auc',
                              min_delta=0.00,
                              patience=10,
                              verbose=True,
                              mode='max'))

    if test_dataset is not None:
        callbacks.append(
            Logger(test_dataset,
                   'test',
                   batch_size=args.eval_batch_size,
                   save_scatter=(args.model
                                 in ['ARL', 'ARL_strong', 'ARL_weak'])))

    # Select model and instantiate
    model: pl.LightningModule = get_model(config, args, train_dataset)

    # create logger
    # In a grid search each hyperparameter combination gets its own
    # directory, so the seed suffix is omitted.
    if args.grid_search:
        logger_version = ''
    else:
        logger_version = f'seed_{args.seed}'
    if fold_nbr is not None:
        logger_version += f'./fold_{fold_nbr}'

    logger = TensorBoardLogger(save_dir='./',
                               name=logdir,
                               version=logger_version)

    if not args.no_early_stopping:
        # create checkpoint
        # Track the best validation micro-average AUC so the best weights
        # can be restored after training below.
        checkpoint = ModelCheckpoint(save_weights_only=True,
                                     dirpath=logger.log_dir,
                                     mode='max',
                                     verbose=False,
                                     monitor='validation/micro_avg_auc')
        callbacks.append(checkpoint)

    # Create a PyTorch Lightning trainer
    # NOTE(review): gradient clipping only applies to DRO; total steps
    # include the adversary-free pretraining phase.
    trainer = pl.Trainer(
        logger=logger,
        gpus=1 if torch.cuda.is_available() else 0,
        max_steps=args.train_steps + args.pretrain_steps,
        callbacks=callbacks,
        gradient_clip_val=1 if args.model == 'DRO' else 0,
        progress_bar_refresh_rate=1 if args.p_bar else 0,
    )

    # Training
    fit_time = time()
    if val_dataset is not None:
        trainer.fit(model,
                    train_loader,
                    val_dataloaders=DataLoader(val_dataset,
                                               batch_size=args.eval_batch_size,
                                               num_workers=args.num_workers))
    else:
        trainer.fit(model, train_loader)
    print(f'time to fit was {time()-fit_time}')

    if not args.no_early_stopping:
        # necessary to make the type checker happy and since this is only run once,
        # runtime is not an issue
        assert trainer.checkpoint_callback is not None

        # Load best checkpoint after training
        if args.model == 'baseline':
            model = BaselineModel.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)

        elif args.model == 'ARL':
            model = ARL.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)

        elif args.model == 'DRO':
            model = DRO.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)

        elif args.model == 'IPW':
            model = IPW.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)

    return model, trainer
Example #11
0
def get_model(config: Dict[str, Any], args: argparse.Namespace,
              dataset: FairnessDataset) -> pl.LightningModule:
    """Selects and inits a model instance for training.
    
    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings of
            the model, dataset and training.
        dataset: Dataset instance that will be used for training.
    
    Returns:
        An instantiated model; one of the following:
                
        Model based on Adversarially Reweighted Learning (ARL).
        Model based on Distributionally Robust Optimization (DRO).
        Model based on Inverse Probability Weighting (IPW).
        Baseline model; simple fully-connected or convolutional (TODO) network.

    Raises:
        ValueError: If args.model is not one of the supported model names.
    """

    model: pl.LightningModule

    # TF's Adagrad uses initial_accumulator_value=0.1; match it in tf_mode.
    opt_kwargs = {"initial_accumulator_value": 0.1} if args.tf_mode else {}

    if args.model == 'ARL':
        model = ARL(
            config=config,  # for hparam tuning
            input_shape=dataset.dimensionality,
            pretrain_steps=args.pretrain_steps,
            prim_hidden=args.prim_hidden,
            adv_hidden=args.adv_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            adv_input=set(args.adv_input),
            num_groups=len(dataset.protected_index2value),
            opt_kwargs=opt_kwargs)

    elif args.model == 'ARL_strong':
        model = ARL(
            config=config,  # for hparam tuning
            input_shape=dataset.dimensionality,
            pretrain_steps=args.pretrain_steps,
            prim_hidden=args.prim_hidden,
            adv_hidden=args.adv_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            adv_input=set(args.adv_input),
            num_groups=len(dataset.protected_index2value),
            adv_cnn_strength='strong',
            opt_kwargs=opt_kwargs)

    elif args.model == 'ARL_weak':
        model = ARL(
            config=config,  # for hparam tuning
            input_shape=dataset.dimensionality,
            pretrain_steps=args.pretrain_steps,
            prim_hidden=args.prim_hidden,
            adv_hidden=args.adv_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            adv_input=set(args.adv_input),
            num_groups=len(dataset.protected_index2value),
            adv_cnn_strength='weak',
            opt_kwargs=opt_kwargs)

    elif args.model == 'DRO':
        model = DRO(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            pretrain_steps=args.pretrain_steps,
            k=args.k,
            optimizer=OPT_BY_NAME[args.opt],
            opt_kwargs=opt_kwargs)

    elif args.model == 'IPW':
        model = IPW(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            group_probs=dataset.group_probs,
            sensitive_label=args.sensitive_label,
            opt_kwargs=opt_kwargs)
        args.pretrain_steps = 0  # NO PRETRAINING

    elif args.model == 'baseline':
        model = BaselineModel(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            opt_kwargs=opt_kwargs)
        args.pretrain_steps = 0  # NO PRETRAINING

    else:
        # Previously an unknown model name fell through and crashed with
        # an UnboundLocalError on `model` below; fail fast instead.
        raise ValueError(f'Unknown model: {args.model!r}')

    # if Tensorflow mode is active, we use the TF default initialization,
    # which means Xavier/Glorot uniform (with gain 1) for the weights
    # and 0 bias
    if args.tf_mode:

        def init_weights(layer):
            if type(layer) == torch.nn.Linear:
                torch.nn.init.xavier_uniform_(layer.weight)
                torch.nn.init.zeros_(layer.bias)

        model.apply(init_weights)

    return model
Example #12
0
def train_for_n_iters(train_dataset,
                      test_dataset,
                      model_params,
                      lr_params,
                      n_iters=5,
                      train_steps=1000,
                      test_every=10,
                      pretrain_steps=250,
                      print_loss=True,
                      log_dir="logs/",
                      model_name="ARL"):
    """
    Trains the ARL model for n iterations, and averages the results. 

    Args:
        train_dataset: Data iterator of the train set.
        test_dataset: Data iterator of the test set. 
        model_params: A dictionary with model hyperparameters. 
        lr_params: A dictionary with hyperparmaeters for optimizers.
        n_iters: How often to train the model with different seeds.
        train_steps: Number of training steps. 
        test_every: How often to evaluate on test set. 
        pretrain_steps: Number of pretrain steps (steps with no adversary).
        print_loss: Wheter to print the loss during training. 
        log_dir: Directory where to save the tensorboard loggers. 
        model_name: Which model to train; either "ARL" or "baseline".

    Returns:
        FairnessMetrics object with results averaged over the iterations.

    Raises:
        ValueError: If model_name is neither "ARL" nor "baseline".
    """
    # Set the device on which to train.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_params["device"] = device

    # Initiate metrics object.
    metrics = FairnessMetrics(n_iters, test_every)

    # Preparation of logging directories.
    experiment_dir = os.path.join(
        log_dir,
        datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    os.makedirs(experiment_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Initialte TensorBoard loggers.
    summary_writer = SummaryWriter(experiment_dir)
    logger_learner = TensorBoardLogger(summary_writer, name="learner")
    logger_adv = TensorBoardLogger(summary_writer, name="adversary")
    logger_metrics = TensorBoardLogger(summary_writer, name="metrics")

    for i in range(n_iters):
        print(f"Training model {i + 1}/{n_iters}")
        # Different but reproducible seed per iteration.
        seed_everything(42 + i)

        # Load the train dataset as a pytorch dataloader.
        train_loader = DataLoader(train_dataset,
                                  batch_size=model_params["batch_size"],
                                  shuffle=True)

        # Create the model.
        if model_name == "ARL":
            model = ARL(**model_params)
        elif model_name == "baseline":
            model = baseline(**model_params)
        else:
            # Previously this only printed "Unknown model" and then
            # crashed with an UnboundLocalError; fail fast instead.
            raise ValueError(f"Unknown model: {model_name!r}")

        # Transfer model to correct device.
        model = model.to(device)

        # Adagrad is the defeault optimizer.
        optimizer_learner = torch.optim.Adagrad(model.learner.parameters(),
                                                lr=lr_params["learner"])
        if model_name == 'ARL':
            optimizer_adv = torch.optim.Adagrad(model.adversary.parameters(),
                                                lr=lr_params["adversary"])
        else:
            # The baseline has no adversary to optimize.
            optimizer_adv = None

        # Train the model with current seeds.
        if print_loss:
            print("Start training on device {}".format(device))
        train_model(
            model,
            train_loader,
            test_dataset,
            train_steps,
            test_every,
            pretrain_steps,
            optimizer_learner,
            optimizer_adv,
            metrics,
            checkpoint_dir,
            logger_learner,
            logger_adv,
            logger_metrics,
            n_iters=i,
            print_loss=print_loss,
            device=device,
        )

    # Average results and return metrics
    metrics.average_results()
    return metrics