def __init__(self):
        """ Initialize FeatureExtractor

        Loads the configuration, makes sure the training root directory
        (``config["training"]["root"]``) exists, and starts with an empty
        registry of feature extractors.

        Raises:
            OSError: If the training directory is missing and cannot be
                created
        """

        config = config_loader.ConfigLoader().load()

        training_dir = config["training"]["root"]
        # Try to create the directory and tolerate "already exists" rather
        # than checking first: a check-then-create sequence can fail if the
        # directory appears between the two calls.
        try:
            os.mkdir(training_dir)
        except OSError:
            if not os.path.isdir(training_dir):
                raise

        # dictionary of initialized feature extractors
        self.extractors = {}
Example #2
0
    def __init__(self):
        """ Start the OTP manager and attach a distance router to it

        Everything is read from the "feature_extractors" section of the
        loaded configuration:
            top_n_speeds: Stored as an int in self.top_n
            otp_name: First argument passed to OTPManager
            otp_bbox: Comma-separated floats, passed to OTPManager as the
                remaining positional arguments
        """
        config = config_loader.ConfigLoader().load()
        extractor_config = config["feature_extractors"]

        self.top_n = int(extractor_config["top_n_speeds"])

        otp_name = extractor_config["otp_name"]
        bbox_values = [
            float(value) for value in extractor_config["otp_bbox"].split(",")
        ]
        self.manager = otpmanager.OTPManager(otp_name, *bbox_values)
        self.manager.start()

        # The router talks to the manager on whatever port it reports
        router_address = "localhost:%d" % self.manager.port
        self.router = route_distances.OTPDistances(router_address)
    def __init__(self):
        """ Initializes Classifier

        Reads the "classifier" section of the loaded configuration:
            features: Comma-separated feature names, or the single value
                "all" to use every key of FEATURE_EXTRACTORS
            n_estimators: Number of trees for the random forest

        The selected feature names are stored sorted in self.features.
        """

        config = config_loader.ConfigLoader().load()
        features = config["classifier"]["features"].split(",")

        self.collection = None
        self.feature_extractor = FeatureExtractor()
        self.classifier = sklearn.ensemble.RandomForestClassifier(
            n_estimators=int(config["classifier"]["n_estimators"]))

        # split() always returns a list, so "all" can only ever show up as
        # the one-element list ["all"]; comparing against the bare string
        # could never match.
        if features == ["all"]:
            features = FEATURE_EXTRACTORS.keys()
        self.features = sorted(features)
Example #4
0
    def __init__(self):
        """ Set up the CRM114 classifier, training it first if necessary

        The discriminator directory is "<training.root>/<training.crm114>"
        from the loaded configuration. When the "setup.trained_crm114"
        setting is "n", the user is asked whether to train now; declining
        exits the process with status 1. After a successful training run the
        setting is flipped to "y" and written to CONFIG_FILE.

        Raises:
            AssertionError: If training reports failure
        """
        config = config_loader.ConfigLoader().load()
        crm114_dir = "%s/%s" % (config["training"]["root"],
                                config["training"]["crm114"])

        if config["setup"]["trained_crm114"] == "n":
            print("The CRM114 discriminator must be trained first.")
            if not prompt_yn("Train now using the caverlee-2011 dataset?"):
                sys.exit(1)

            # Call train() outside of an assert statement: asserts are
            # removed under "python -O", which would silently skip training.
            # AssertionError is kept so existing callers see the same type.
            if not train_crm114.train(crm114_dir):
                raise AssertionError("Training failed")

            # NOTE(review): self.config is never assigned in this method; it
            # must be provided elsewhere (e.g. a class attribute). Confirm
            # this should not be the local `config` loaded above.
            self.config["setup"]["trained_crm114"] = "y"
            with open(CONFIG_FILE, "w") as f:
                self.config.write(f)

        self.crm = crm114.Classifier(crm114_dir, ["spam", "ham"])
Example #5
0
    def __init__(self):
        """ Initialize SafeBrowsing class

        Takes no arguments; all settings come from the loaded configuration.

        From the "feature_extractors" section:
            google_safebrowsing_expand_urls: Integer flag, stored as a bool
                in self.expand_urls
            google_sbserver_address: Address sbserver is started on
            google_sbserver_db_path: Passed to sbserver as -db
            google_safebrowsing_bloom: Path of the bloom filter file
            google_safebrowsing_bloom_capacity: Capacity used when the bloom
                filter file does not exist yet
            google_safebrowsing_bloom_err_rate: Error rate used when the
                bloom filter file does not exist yet

        From the "credentials" section:
            google_api_key: Passed to sbserver as -apikey

        Launches sbserver as a child process (killed at interpreter exit),
        opens or creates the bloom filter, then blocks until the server
        answers an HTTP request.

        Raises:
            Exception: If sbserver does not answer within MAX_STARTUP_TIME
        """

        config = config_loader.ConfigLoader().load()
        feature_config = config["feature_extractors"]

        self.expand_urls = bool(
            int(feature_config["google_safebrowsing_expand_urls"]))
        self.address = feature_config["google_sbserver_address"]

        bloom_path = feature_config["google_safebrowsing_bloom"]
        bloom_capacity = int(
            feature_config["google_safebrowsing_bloom_capacity"])
        bloom_error_rate = float(
            feature_config["google_safebrowsing_bloom_err_rate"])

        self.proc = subprocess.Popen([
            "sbserver", "-apikey", config["credentials"]["google_api_key"],
            "-db", feature_config["google_sbserver_db_path"], "-srvaddr",
            self.address
        ])
        # Make sure the child process does not outlive the interpreter
        atexit.register(self.proc.kill)

        # Reuse the bloom filter file if present, otherwise create it
        if os.path.isfile(bloom_path):
            self.bloom_cache = pybloomfilter.BloomFilter.open(bloom_path)
        else:
            self.bloom_cache = pybloomfilter.BloomFilter(
                bloom_capacity, bloom_error_rate, bloom_path)

        # Wait for server to start
        start_time = time.time()
        while True:
            try:
                requests.get("http://%s" % self.address)
                break
            except Exception:
                # Catch Exception rather than using a bare except so that
                # KeyboardInterrupt / SystemExit can still abort the wait.
                if time.time() - start_time > MAX_STARTUP_TIME:
                    raise Exception("sbserver took too long to start up")
                time.sleep(0.1)
Example #6
0
    def __init__(self):
        """ Load the tweet source lists from the training CSV

        Reads "<training.root>/<training.tweet_sources>" and files each
        row's CLIENT value under self.tweet_sources according to its
        MOSTLY_BOT column: "-1" -> "mostly_human", "0" -> "mixed",
        "1" -> "mostly_bot". Rows with any other MOSTLY_BOT value are
        ignored.
        """
        config = config_loader.ConfigLoader().load()
        self.tweet_sources = {
            "mostly_human": [],
            "mixed": [],
            "mostly_bot": []
        }

        # MOSTLY_BOT column value -> key in self.tweet_sources
        category_for_label = {
            "-1": "mostly_human",
            "0": "mixed",
            "1": "mostly_bot",
        }

        sources_path = "%s/%s" % (config["training"]["root"],
                                  config["training"]["tweet_sources"])
        with open(sources_path, "r") as f:
            for row in csv.DictReader(f):
                label = row["MOSTLY_BOT"]
                client = row["CLIENT"]
                category = category_for_label.get(label)
                if category is not None:
                    self.tweet_sources[category].append(client)
        Args:
            input_file: The path to load a new classifier from
        """

        self.classifier = sklearn.externals.joblib.load(input_file)
        print("Loaded classifier from %s" % input_file)


if __name__ == "__main__":
    # Script entry point: sample tweet ids from the geotagged id lists and
    # generate a feature matrix for them.
    import random

    n_sample = 500

    classifier = Classifier()
    config = config_loader.ConfigLoader().load()
    training_root = config["training"]["root"]

    # One integer tweet id per line
    spam_path = "%s/%s" % (training_root, config["training"]["spam_geotagged"])
    with open(spam_path) as f:
        spam_ids = [int(line) for line in f]

    # NOTE(review): "hpam_ids" presumably holds the ham ids; the name is kept
    # because code after this span may refer to it.
    ham_path = "%s/%s" % (training_root, config["training"]["ham_geotagged"])
    with open(ham_path) as f:
        hpam_ids = [int(line) for line in f]

    classifier.connect("caverlee_2011", "spam")
    spam_sample = random.sample(spam_ids, n_sample)
    classifier.gen_feature_matrix(spam_sample, "spam.csv")

    classifier.connect("caverlee_2011", "ham")
Example #8
0
 def __init__(self):
     config = config_loader.ConfigLoader().load()
     self.top_n = int(config["feature_extractors"]["top_n_speeds"])