def __init__(self): """ Initialize FeatureExtractor """ config = config_loader.ConfigLoader().load() training_dir = config["training"]["root"] if (not os.path.isdir(training_dir)): os.mkdir(training_dir) # dictionary of initialized feture extractors self.extractors = {}

def __init__(self):
    config = config_loader.ConfigLoader().load()
    self.top_n = int(config["feature_extractors"]["top_n_speeds"])
    self.manager = otpmanager.OTPManager(
        config["feature_extractors"]["otp_name"],
        *[float(x) for x in
          config["feature_extractors"]["otp_bbox"].split(",")])
    self.manager.start()
    self.router = route_distances.OTPDistances(
        "localhost:%d" % self.manager.port)
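
# Minimal usage sketch: turning two geotagged points into a travel speed
# via the router started above. The distance(lat1, lon1, lat2, lon2, mode)
# call is an assumed shape for route_distances.OTPDistances, not a
# confirmed API.
def _sketch_travel_speed(router, point_a, point_b, seconds_apart):
    meters = router.distance(point_a[0], point_a[1],
                             point_b[0], point_b[1], "WALK")
    # Speed in meters/second; guard against identical timestamps
    return meters / seconds_apart if seconds_apart > 0 else float("inf")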
def __init__(self): """ Initializes Classifier """ config = config_loader.ConfigLoader().load() features = config["classifier"]["features"].split(",") self.collection = None self.feature_extractor = FeatureExtractor() self.classifier = sklearn.ensemble.RandomForestClassifier( n_estimators=int(config["classifier"]["n_estimators"])) if ((features == "all") or (features == ["all"])): features = FEATURE_EXTRACTORS.keys() self.features = sorted(features)

def __init__(self):
    config = config_loader.ConfigLoader().load()
    crm114_dir = "%s/%s" % (config["training"]["root"],
                            config["training"]["crm114"])
    if (config["setup"]["trained_crm114"] == "n"):
        print("The CRM114 discriminator must be trained first.")
        response = prompt_yn("Train now using the caverlee-2011 dataset?")
        if (response):
            assert train_crm114.train(crm114_dir), "Training failed"
            # Persist the updated flag through the loaded config object
            config["setup"]["trained_crm114"] = "y"
            with open(CONFIG_FILE, "w") as f:
                config.write(f)
        else:
            sys.exit(1)
    self.crm = crm114.Classifier(crm114_dir, ["spam", "ham"])
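
# Minimal usage sketch: classifying text with the crm instance created
# above. A classify() method returning a (category, probability) pair is
# an assumption about the crm114 wrapper, not a confirmed signature.
def _sketch_crm_score(crm, text):
    category, probability = crm.classify(text)
    # Treat the probability of the "spam" category as the feature value
    return probability if category == "spam" else 1.0 - probability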
def __init__(self): """ Initialize SafeBrowsing class Args: api_key: The Google API key to use to initialize sbserver expand_urls: Whether or not SafeBrowsing should attempt to expand any URL that passes the initial lookup db_path: The path to store the safe browsing database in """ config = config_loader.ConfigLoader().load() feature_config = config["feature_extractors"] self.expand_urls = bool( int(feature_config["google_safebrowsing_expand_urls"])) self.address = feature_config["google_sbserver_address"] bloom_path = feature_config["google_safebrowsing_bloom"] bloom_capacity = int( feature_config["google_safebrowsing_bloom_capacity"]) bloom_error_rate = float( feature_config["google_safebrowsing_bloom_err_rate"]) self.proc = subprocess.Popen([ "sbserver", "-apikey", config["credentials"]["google_api_key"], "-db", feature_config["google_sbserver_db_path"], "-srvaddr", self.address ]) atexit.register(self.proc.kill) if (os.path.isfile(bloom_path)): self.bloom_cache = pybloomfilter.BloomFilter.open(bloom_path) else: self.bloom_cache = pybloomfilter.BloomFilter( bloom_capacity, bloom_error_rate, bloom_path) # Wait for server to start start_time = time.time() while True: try: requests.get("http://%s" % self.address) break except: if (time.time() - start_time > MAX_STARTUP_TIME): raise Exception("sbserver took too long to start up") else: time.sleep(0.1)

def __init__(self):
    config = config_loader.ConfigLoader().load()
    self.tweet_sources = {
        "mostly_human": [],
        "mixed": [],
        "mostly_bot": []
    }
    # The CSV's MOSTLY_BOT column encodes -1 = mostly human, 0 = mixed,
    # 1 = mostly bot
    with open("%s/%s" % (config["training"]["root"],
                         config["training"]["tweet_sources"]), "r") as f:
        for row in csv.DictReader(f):
            mostly_bot = row["MOSTLY_BOT"]
            client = row["CLIENT"]
            if (mostly_bot == "-1"):
                self.tweet_sources["mostly_human"].append(client)
            elif (mostly_bot == "0"):
                self.tweet_sources["mixed"].append(client)
            elif (mostly_bot == "1"):
                self.tweet_sources["mostly_bot"].append(client)
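
# Minimal usage sketch: mapping a tweet's "source" client string onto the
# three lists built above. The numeric scores are arbitrary and purely
# illustrative; unknown clients fall back to the "mixed" score.
def _sketch_source_score(extractor, client):
    if client in extractor.tweet_sources["mostly_bot"]:
        return 1.0
    elif client in extractor.tweet_sources["mostly_human"]:
        return 0.0
    return 0.5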

    Args:
        input_file: The path to load a new classifier from
    """
    self.classifier = sklearn.externals.joblib.load(input_file)
    print("Loaded classifier from %s" % input_file)


if (__name__ == "__main__"):
    import random

    n_sample = 500
    classifier = Classifier()
    config = config_loader.ConfigLoader().load()
    training_root = config["training"]["root"]

    with open("%s/%s" % (training_root,
                         config["training"]["spam_geotagged"])) as f:
        spam_ids = [int(id_) for id_ in f.readlines()]
    with open("%s/%s" % (training_root,
                         config["training"]["ham_geotagged"])) as f:
        ham_ids = [int(id_) for id_ in f.readlines()]

    classifier.connect("caverlee_2011", "spam")
    classifier.gen_feature_matrix(random.sample(spam_ids, n_sample),
                                  "spam.csv")
    classifier.connect("caverlee_2011", "ham")