def preload_process(sites):
    # Stash the loaded raw features and the site data in module-level globals
    # so later calls (e.g. pool worker functions) can reuse them.
    global _raw_features, _sites
    _raw_features = load_raw_features()
    _sites = sites
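
A preload function like this is normally handed to a worker pool as an initializer, so each process loads the shared data once before any work arrives. A minimal usage sketch, assuming a hypothetical vectorize_site() worker that reads the _raw_features and _sites globals:

from multiprocessing import Pool

# Hypothetical usage: preload_process runs once in every worker process,
# so _raw_features and _sites are populated before vectorize_site() is called.
pool = Pool(processes=8, initializer=preload_process, initargs=(sites,))
vectors = pool.map(vectorize_site, sorted(sites.keys()))
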
Example #2
from idpanel.decision_tree import DecisionTree
from sklearn.cross_validation import cross_val_score, train_test_split  # sklearn.model_selection in newer scikit-learn
import json
import numpy as np

# load_labels, load_raw_features and load_raw_feature_vectors used below are
# project-local idpanel helpers; their imports are not shown in this example.

decision_trees = {}
if __name__ == "__main__":
    # TODO: add some command line options
    # Output path for model
    # Maximum number of attempts to generate a model for each label
    maximum_model_attempts = 100
    # Maximum number of models per label
    max_models_per_label = 3

    label_indeces = load_labels()
    raw_features = load_raw_features()
    original_labels, names, vectors = load_raw_feature_vectors()
    labels = [label_indeces.index(l) for l in original_labels]

    vectors = np.array(vectors)
    print "Creating training and testing sets"
    X_train, X_test, y_train, y_test = train_test_split(vectors,
                                                        labels,
                                                        stratify=labels)
    print X_train.shape[0], "samples in training set,", len(set(
        list(y_train))), "labels in training set"
    print X_test.shape[0], "samples in testing set,", len(set(
        list(y_test))), "labels in testing set"

    decision_trees = {}
    for label in label_indeces:
        pass  # per-label training loop body omitted in this example

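The body of the per-label loop above is cut off in this listing. As a rough sketch of the one-vs-rest training it sets up, here using sklearn's DecisionTreeClassifier as a stand-in because the project's own DecisionTree API is not shown, each label gets up to max_models_per_label binary trees, with repeated randomized attempts:

from sklearn.tree import DecisionTreeClassifier

for label in label_indeces:
    label_index = label_indeces.index(label)
    train_binary = np.array([1 if l == label_index else 0 for l in y_train])
    test_binary = np.array([1 if l == label_index else 0 for l in y_test])
    decision_trees[label] = []
    for attempt in range(maximum_model_attempts):
        if len(decision_trees[label]) >= max_models_per_label:
            break
        # randomized splits so repeated attempts can yield different trees
        clf = DecisionTreeClassifier(splitter="random")
        clf.fit(X_train, train_binary)
        # keep only trees that separate this label cleanly on the held-out set
        if clf.score(X_test, test_binary) == 1.0:
            decision_trees[label].append(clf)
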
Example #3

if __name__ == "__main__":
    print "Loading prevectors"
    data_points = []
    with open("prevectors.json", "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue

            line = json.loads(line)
            data_points.append(line)

    label_indeces = load_labels()
    raw_features = load_raw_features()
    print "Loaded {0} features".format(len(raw_features))

    print "Grouping prevectors by base_url"
    sites = {}
    site_labels = {}
    for dp in data_points:
        if dp['base_url'] not in sites:
            sites[dp['base_url']] = {}
            site_labels[dp['base_url']] = dp['label']

        sites[dp['base_url']][dp['offset']] = {"code": dp['code'], "content_ssdeep": dp['content_ssdeep']}

    print "Vectorizing {0} base urls".format(len(sites))
    labels = []
    names = []
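
The listing stops before the vectorization itself. A rough sketch of that step, assuming each raw feature is an (offset, code, content_ssdeep) triple (only the offset field, rf[0], is visible elsewhere in this listing) and using the ssdeep module to compare fuzzy hashes:

import ssdeep

vectors = []
for base_url in sites:
    names.append(base_url)
    labels.append(site_labels[base_url])
    vector = []
    for offset, code, content_ssdeep in raw_features:
        # a feature fires when the site returned the same status code at that
        # offset and the content is similar under ssdeep comparison
        hit = 0
        response = sites[base_url].get(offset)
        if response is not None and response["code"] == code:
            if ssdeep.compare(response["content_ssdeep"], content_ssdeep) > 0:
                hit = 1
        vector.append(hit)
    vectors.append(vector)
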
Example #4

    else:
        # it's probably a URL...
        base_url = reformat_url(base_url)
        base_urls = [base_url]

    model_path = args.model

    # Pool(size=16) matches gevent.pool.Pool's signature; multiprocessing.Pool
    # takes processes= instead.
    pool = Pool(size=16)

    offsets = set()
    with open(model_path, "rb") as f:
        model = pickle.load(f)
        classifier = model["model"]
        relevant_features = model["relevant_features"].flatten()
        for rfi, rf in enumerate(load_raw_features()):
            if relevant_features[rfi]:
                # rf[0] is the request offset; only offsets the model actually
                # uses need to be fetched
                offsets.add(rf[0])

    results = {}

    stderr.write("Identifying panels we can actually reach\n")
    for base_url, r1, r2 in pool.imap_unordered(get_result_wrapper,
                                                [(i, "") for i in base_urls]):
        if base_url is not None:
            stderr.write("We can reach {0}\n".format(base_url))
            results[base_url] = {}

    requests_to_make = []
    for offset in offsets:
        for base_url in results.keys():