Example #1
from elasticsearch import Elasticsearch, helpers
import numpy as np

from osas.data.datasources import CSVDataSource

# Pipeline and is_numeric are defined in the OSAS module this example
# was taken from


def process(params):
    # load and run pipeline
    datasource = CSVDataSource(params.input_file)
    p = Pipeline('DEV')
    p.load_config(params.conf_file)
    p.load_model(params.model_file)
    p(datasource)
    # save the scored dataset, if an output file was requested
    if params.output_file:
        with open(params.output_file, 'w') as f:
            datasource.save(f)
    # push to elasticsearch
    es = Elasticsearch([{
        'host': 'localhost',
        'port': 9200
    }],
                       http_auth=('admin', 'admin'))
    data = list(datasource)
    for item in data:
        item['model'] = p._scoring_model_name
        item['raw'] = str(item['labels'])
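        # Elasticsearch cannot index NaN values, so replace them with None (null)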
        for key in item:
            if item[key] == 'NaN' or (is_numeric(item[key])
                                      and np.isnan(item[key])):
                item[key] = None
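    # note: the doc_type argument is deprecated as of Elasticsearch 7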
    helpers.bulk(es, data, index="anomalies", doc_type="type")
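
The `params` argument only needs to expose `input_file`, `conf_file`, `model_file` and `output_file` attributes; in the project it is presumably filled in by a CLI argument parser. A minimal sketch of a call, with a hypothetical output path:

from types import SimpleNamespace

# any object carrying these four attributes works; 'scored.csv' is invented
params = SimpleNamespace(input_file='corpus/test.csv',
                         conf_file='tests/pipeline_test.conf',
                         model_file='tests/pipeline.json',
                         output_file='scored.csv')
process(params)
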
Example #2
import json

from osas.data.datasources import CSVDataSource

# Pipeline comes from the surrounding OSAS module


def process(params):
    # load the datasource and build the pipeline model
    datasource = CSVDataSource(params.input_file)
    p = Pipeline('DEV')
    p.load_config(params.conf_file)
    model = p.build_pipeline(datasource)
    # serialize the trained pipeline model to JSON
    with open(params.model_file, 'w') as f:
        json.dump(model, f)
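
This is the training counterpart of Example #1: it writes the model file that Example #1 later loads via `p.load_model(params.model_file)`. A hypothetical invocation, reusing paths that appear elsewhere in these examples:

from types import SimpleNamespace

# build a model from a configured pipeline and persist it
process(SimpleNamespace(input_file='corpus/test.csv',
                        conf_file='tests/pipeline_test.conf',
                        model_file='tests/pipeline.json'))
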
Example #3
import sys

from osas.data.datasources import CSVDataSource

# _detect_field_type, _get_generators and _write_conf are module-private
# helpers from the OSAS module this example was taken from


def process(params):
    datasource = CSVDataSource(params.input_file)
    sys.stdout.write('Preprocessing\n')
    field_type = _detect_field_type(datasource)
    sys.stdout.write('\t::Detected field types:\n')
    for key, value in field_type.items():
        sys.stdout.write('\t\t"{0}": {1}\n'.format(key, value))

    generators = _get_generators(datasource, field_type)
    sys.stdout.write('\t::Suggested generators:\n')
    for name, generator in generators:
        sys.stdout.write('\t\t{0}: {1}\n'.format(name, generator))

    _write_conf(generators, params.output_file)
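
Examples #1-#3 evidently form a three-step flow: this autoconfig step writes a starter config, Example #2 trains a model from it, and Example #1 applies the model. A hypothetical call:

from types import SimpleNamespace

# scan a CSV and emit a suggested pipeline configuration
process(SimpleNamespace(input_file='corpus/test.csv',
                        output_file='tests/pipeline_test.conf'))
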
Example #4
                    label_list.append(label)
            all_labels.append(label_list)
        dataset[dest_field_labels] = all_labels
        dataset['_labels'] = all_labels
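        # optionally score the freshly labeled dataset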
        if self._detect_anomalies is not None:
            scores = self._detect_anomalies(dataset)
            dataset[dest_field_score] = scores


if __name__ == '__main__':
    p = Pipeline('DEV')
    p.load_config('tests/pipeline_test.conf')
    import time

    ts1 = time.time()
    datasource = CSVDataSource('tests/test_small.csv')
    ts2 = time.time()
    pipeline_model = p.build_pipeline(datasource)
    ts3 = time.time()
    p(datasource)
    ts4 = time.time()
    with open('tests/pipeline.json', 'w') as f:
        json.dump(pipeline_model, f, indent=4)
    for item in datasource[:10]:
        print(item)
        print()
        print()

    print("Timing:\n"
          "\tLoad dataset: {0}\n"
          "\tBuild pipeline: {1}\n"
          "\tApply models: {2}\n"
          "\tDataset size: {3} entries\n".format(ts2 - ts1, ts3 - ts2,
                                                 ts4 - ts3, len(datasource)))
Example #5
    # rebuild a trained SupervisedClassifierAnomaly from its JSON/pickle form
    @staticmethod
    def from_pretrained(pretrained: str) -> AnomalyDetection:
        tmp = json.loads(pretrained)
        pre_model = pickle.loads(base64.b64decode(tmp['model']))
        model = SupervisedClassifierAnomaly()
        model._encoder = pre_model['encoder']
        model._ind_to_ground_truth = pre_model['ind_to_ground_truth']
        model._is_binary_preds = pre_model['is_binary_preds']
        model._model = pre_model['classifier']

        return model


if __name__ == "__main__":
    from osas.data.datasources import CSVDataSource

    data_source = CSVDataSource('corpus/hubble_test_tags.csv')

    def converter(x):
        return ast.literal_eval(x)

    data_source._data['_labels'] = data_source._data['_labels'].apply(
        converter)

    model = StatisticalNGramAnomaly()
    tmp = model.build_model(data_source)
    tmp = json.dumps(tmp)
    model2 = StatisticalNGramAnomaly.from_pretrained(tmp)
    scores = model(data_source)

    scores2 = model2(data_source)
    import operator
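    # The snippet is truncated after `import operator`. A purely illustrative
    # continuation (not from the original): check that the deserialized model
    # reproduces the original model's scores.
    for s1, s2 in zip(scores, scores2):
        assert abs(s1 - s2) < 1e-8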
Example #6
        reg_lab = list(zip(regex_list, label_list))
        kblg = KnowledgeBased(reg_lab, field_name)
        return kblg


if __name__ == '__main__':
    mfc = MultinomialFieldCombiner(['user', 'parent_process'],
                                   absolute_threshold=500,
                                   relative_threshold=0.005)
    nfc = NumericField('count')
    tf = TextField('command', lm_mode='token', ngram_range=(3, 5))
    klg = KeywordBased(
        keyword_list=['bash', 'java', 'netcat', 'sudo', 'apache2'],
        field_name='command')
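    # note: only the keyword-based generator (klg) is exercised below; the
    # other generators are constructed for illustration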
    from osas.data.datasources import CSVDataSource

    dataset = CSVDataSource('corpus/test.csv')
    print("Building model")
    klg.build_model(dataset)
    print("Done")

    #    rez = mfc.build_model(dataset)
    for item in dataset[:20]:
        print("\n\n")
        print(item)
        print("")
        print(klg(item))
        print("\n\n")
        print("=" * 20)