def preload_process(sites):
    # Pool initializer: cache the raw feature list and the site data in
    # module-level globals so each worker process loads them only once.
    global _raw_features, _sites
    _raw_features = load_raw_features()
    _sites = sites
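# ---------------------------------------------------------------
# Sketch (not part of the original file): how preload_process is
# presumably wired up -- as a multiprocessing.Pool initializer, so
# the globals above are populated once per worker instead of once
# per task. `vectorize_site` and `run_pool` are hypothetical names
# introduced here for illustration.
from multiprocessing import Pool as MpPool

def vectorize_site(base_url):
    # Runs inside a worker; _raw_features and _sites were set by
    # preload_process via the pool's initializer hook.
    return base_url, len(_raw_features), len(_sites[base_url])

def run_pool(sites):
    pool = MpPool(processes=4, initializer=preload_process, initargs=(sites,))
    return pool.map(vectorize_site, sites.keys())
# ---------------------------------------------------------------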
from idpanel.decision_tree import DecisionTree
from sklearn.cross_validation import cross_val_score, train_test_split
# NOTE: module paths for these loaders are assumed; they are not
# shown in this excerpt.
from idpanel.training.features import load_raw_features
from idpanel.training.labels import load_labels
from idpanel.training.vectorization import load_raw_feature_vectors
import json
import numpy as np

if __name__ == "__main__":
    # todo Add some command line options
    # Output path for model
    # Maximum number of attempts to generate a model for each label
    maximum_model_attempts = 100
    # Maximum number of models per label
    max_models_per_label = 3

    label_indeces = load_labels()
    raw_features = load_raw_features()
    original_labels, names, vectors = load_raw_feature_vectors()
    labels = [label_indeces.index(l) for l in original_labels]
    vectors = np.array(vectors)

    print "Creating training and testing sets"
    X_train, X_test, y_train, y_test = train_test_split(vectors, labels, stratify=labels)
    print X_train.shape[0], "samples in training set,", len(set(list(y_train))), "labels in training set"
    print X_test.shape[0], "samples in testing set,", len(set(list(y_test))), "labels in testing set"

    decision_trees = {}
    for label in label_indeces:
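# ---------------------------------------------------------------
# Sketch (not the original loop body, which is cut off above):
# cross_val_score is imported but never called in this excerpt; one
# plausible use is scoring a one-vs-rest candidate model per label.
# sklearn's stock DecisionTreeClassifier stands in here for the
# project's own DecisionTree, whose API is not shown.
from sklearn.tree import DecisionTreeClassifier

def score_one_vs_rest(X, y, label_index):
    # Binarize the labels: 1 for this label, 0 for everything else.
    binary = np.array([1 if yi == label_index else 0 for yi in y])
    scores = cross_val_score(DecisionTreeClassifier(), X, binary, cv=5)
    return scores.mean()
# ---------------------------------------------------------------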
if __name__ == "__main__":
    print "Loading prevectors"
    data_points = []
    with open("prevectors.json", "r") as f:
        for line in f:
            line = line.strip()
            if len(line) == 0:
                continue
            line = json.loads(line)
            data_points.append(line)

    label_indeces = load_labels()
    raw_features = load_raw_features()
    print "Loaded {0} features".format(len(raw_features))

    print "Grouping prevectors by base_url"
    sites = {}
    site_labels = {}
    for dp in data_points:
        if dp['base_url'] not in sites:
            sites[dp['base_url']] = {}
            site_labels[dp['base_url']] = dp['label']
        sites[dp['base_url']][dp['offset']] = {
            "code": dp['code'],
            "content_ssdeep": dp['content_ssdeep'],
        }

    print "Vectorizing {0} base urls".format(len(sites))
    labels = []
    names = []
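# ---------------------------------------------------------------
# For reference, each line of prevectors.json must carry at least
# the five fields read above. An illustrative (made-up) record:
#
#   {"base_url": "http://example.com/panel/",
#    "label": "some_panel_family",
#    "offset": "login.php",
#    "code": 200,
#    "content_ssdeep": "3:abc...:xyz..."}
#
# so `sites` ends up as one dict per base_url, keyed by offset,
# holding the status code and ssdeep hash of each response.
# ---------------------------------------------------------------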
else:
    # it's probably a URL...
    base_url = reformat_url(base_url)
    base_urls = [base_url]

model_path = args.model
pool = Pool(size=16)

offsets = set()
with open(model_path, "rb") as f:
    model = pickle.load(f)
classifier = model["model"]
relevant_features = model["relevant_features"].flatten()
# Only request the offsets the model actually uses.
for rfi, rf in enumerate(load_raw_features()):
    if relevant_features[rfi]:
        offsets.add(rf[0])

results = {}
stderr.write("Identifying panels we can actually reach\n")
for base_url, r1, r2 in pool.imap_unordered(get_result_wrapper, [(i, "") for i in base_urls]):
    if base_url is not None:
        stderr.write("We can reach {0}\n".format(base_url))
        results[base_url] = {}

requests_to_make = []
for offset in offsets:
    for base_url in results.keys():
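# ---------------------------------------------------------------
# Sketch (separate from the original script): the concurrency
# pattern used above, reduced to a self-contained example. The
# Pool(size=16) call is consistent with gevent's pool, which is
# assumed here; imap_unordered yields results as greenlets finish,
# which is why the script can stream "We can reach ..." lines as
# soon as each probe returns.
from gevent import monkey
monkey.patch_all()
from gevent.pool import Pool as GeventPool
import requests

def fetch(url):
    try:
        r = requests.get(url, timeout=10)
        return url, r.status_code
    except requests.RequestException:
        return None, None

def probe(urls):
    pool = GeventPool(size=16)
    for url, code in pool.imap_unordered(fetch, urls):
        if url is not None:
            print url, code
# ---------------------------------------------------------------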