Example 1
def seed(ctx, language):
    """
    Create a list of seed posts for our classifier by language
    """
    options = None
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break

    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit()

    with open(os.path.join(conf_dir, 'seeds_config.json'), 'r') as seeds_file:
        seeds_config = json.load(seeds_file)

    graph_token_url = 'https://graph.facebook.com/oauth/access_token?' \
                      'client_id={}&client_secret={}' \
                      '&grant_type=client_credentials'
    res = requests.get(graph_token_url.format(
        os.environ['FACEBOOK_APP_ID'],
        os.environ['FACEBOOK_APP_SECRET']))

    access_token = json.loads(res.text)['access_token']
    graph = facebook.GraphAPI(access_token, version=2.7)

    messages = {
        'political': fetch(seeds_config["political"], 100, graph),
        'not_political': fetch(seeds_config["not_political"], 100, graph)
    }

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(messages, out)
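
The `fetch` helper is defined elsewhere in the repository and not shown on this page. A minimal sketch of what it plausibly does, assuming each `seeds_config` entry is a list of Facebook page IDs and that a post's `message` field is the seed text (the body below is an assumption, not the project's actual code; `get_connections` is the real facebook-sdk call):

def fetch(page_ids, limit, graph):
    # Hypothetical sketch: gather up to `limit` post messages per page
    # through the Graph API client created in seed().
    messages = []
    for page_id in page_ids:
        posts = graph.get_connections(id=page_id, connection_name='posts',
                                      limit=limit)
        messages.extend(post['message']
                        for post in posts['data'] if 'message' in post)
    return messages
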
Example 2
def add_seeds_from_id(ctx, language):
    """
    Add seed posts for a language by looking up ad IDs from the database
    """
    options = None
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break

    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit()

    with open(os.path.join(conf_dir, 'additional_seed_ids.json'), 'r') as additional_seeds_file:
        additional_seed_ids = json.load(additional_seeds_file)

    with open(os.path.join(conf_dir, 'seeds.json'), 'r') as old_seeds_file:
        seeds = json.load(old_seeds_file)

    for politicality in ["political", "not_political"]:
        id_list = "','".join(map(str, additional_seed_ids[politicality]))
        records = DB.query(
            "select * from ads where id in ('{}')".format(id_list))
        for record in records:
            seeds[politicality].append(get_text(record))

    for politicality in ["political", "not_political"]:
        seeds[politicality] = list(set(seeds[politicality]))

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(seeds, out)
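
The `IN` clause above is assembled by string formatting, which is tolerable only because the IDs come from a local config file. With the `records` library (which the module-level `DB` appears to be, given `DB.query` and `DB.bulk_query`), the same lookup can use bound parameters instead; a sketch, assuming the IDs are strings:

ids = additional_seed_ids[politicality]
placeholders = ", ".join(":id{}".format(i) for i in range(len(ids)))
params = {"id{}".format(i): str(v) for i, v in enumerate(ids)}
records = DB.query(
    "select * from ads where id in ({})".format(placeholders), **params)
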
Example 3
def analyze(ctx, id):
    """
    Analyze permutes an ad's text to find which words contribute most to its political rating.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    records = DB.query("""select * from ads where id = '{}'""".format(id))

    for record in records:
        record_lang = record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = clean_text(get_text(record), record["advertiser"])
            baseline = text_probability(classifier, text)
            permuted_texts = permute_text(text)

            diffs = [(deleted_word,
                      baseline - text_probability(classifier, permuted_text))
                     for (deleted_word, permuted_text) in permuted_texts]

            print("text: {}".format(text))
            print("original probability: {}".format(baseline))
            biggest_diffs = sorted(
                diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
            print("top difference-makers:")
            for (deleted_word, diff) in biggest_diffs:
                print(" - {}, {}".format(deleted_word, diff))
Example 4
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
        else:
            langs = map(lambda x: "'{}'".format(x), classifiers.keys())
            langs = ','.join(langs)

            query = query + " and lang in ({})".format(langs)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = classifier["vectorizer"].transform([get_text(record)])
            probability = classifier["classifier"].predict_proba(text)[0][1]
            update = {"id": record["id"], "probability": probability}
            if record["political_probability"] > update[
                    "probability"] and record[
                        "political_probability"] >= 0.70 and update[
                            "probability"] < 0.70 and not record["suppressed"]:
                print("refusing to downgrade probability of ad {}".format(
                    record["id"]))
            updates.append(update)
            out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
            print(out.format(pid=update, info={"length": length, "idx": idx}))

            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []

    if updates:
        DB.bulk_query(query, updates)
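
`get_text` is another helper these examples assume; Example 4 passes the whole record while Example 6 passes `record["html"]`, so its signature evidently changed between revisions. A minimal sketch of the newer form, assuming the ad's rendered HTML lives in the `html` column and BeautifulSoup is used to strip it:

from bs4 import BeautifulSoup

def get_text(record):
    # Hypothetical sketch: reduce the stored ad HTML to its visible text.
    return BeautifulSoup(record["html"], "html.parser").get_text(separator=" ")
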
Example 5
def diagnostics(ctx):
    """
    Warning! Slow! Run all classifiers against our database
    """
    for (directory, conf) in confs(ctx.obj["base"]):
        for name, classifier in get_classifiers().items():
            print("Report for {} in {}".format(name, conf["language"]))
            train_classifier(classifier, get_vectorizer(conf), directory, conf["language"])
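
`get_classifiers` (plural) isn't shown either; since `diagnostics` trains and reports on each candidate by name, it presumably maps model names to fresh scikit-learn estimators. A hypothetical sketch; the particular estimators are an assumption:

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB

def get_classifiers():
    # Hypothetical sketch: the candidate models diagnostics() compares.
    return {
        "LogisticRegression": LogisticRegression(),
        "MultinomialNB": MultinomialNB(),
    }
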
Example 6
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.
    """
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    length = DB.query("select count(*) as length from ({}) as t1;".format(
        query))[0]["length"]
    records = DB.query(query)
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        if record["lang"] in classifiers:
            classifier = classifiers[record["lang"]]
            text = classifier["vectorizer"].transform(
                [get_text(record["html"])])
            update = {
                "id": record["id"],
                "probability":
                classifier["classifier"].predict_proba(text)[0][1]
            }
            updates.append(update)

            print(
                "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
                .format(p=update, l={
                    "length": length,
                    "idx": idx
                }))

            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []

    if updates:
        DB.bulk_query(query, updates)
Example 7
def build(ctx):
    """
    Build classifiers for each of our languages.
    """
    for (directory, conf) in confs(ctx.obj["base"]):
        model = train_classifier(get_classifier(), get_vectorizer(conf),
                                 directory, conf["language"])
        model_path = classifier_path(directory)
        with open(model_path, 'wb') as classy:
            dill.dump(model, classy)
        print("Saved model {}".format(model_path))
Example 8
def get_models(ctx, lang):
    """
    Download classifiers for each of our languages.
    """
    for (directory, conf) in confs(ctx.obj["base"]):
        if lang and conf["language"] != lang:
            continue
        model_path = "data/{}/classifier.dill".format(conf["language"])
        classifier_file = "{}/classifier.dill".format(conf["language"])
        # call(["wget", "-nv", "-O", model_path, "https://s3.amazonaws.com/pp-data/fbpac-models/{}/classifier.dill".format(conf["language"])])
        print("Fetching " + classifier_file + " from S3")
        s3.download_file("tgam-fbpac-models", classifier_file, model_path)
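
`s3` here is presumably a module-level boto3 client; `download_file(bucket, key, filename)` is the standard boto3 S3 call. A sketch of the assumed setup:

import boto3

# Assumed module-level client; credentials come from the environment.
s3 = boto3.client("s3")
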
Example 9
def get_models(ctx, lang):
    """
    Download classifiers for each of our languages.
    """
    for (directory, conf) in confs(ctx.obj["base"]):
        if lang and conf["language"] != lang:
            continue
        model_path = "data/{}/classifier.dill".format(conf["language"])
        call([
            "wget", "-nv", "-O", model_path,
            "https://s3.amazonaws.com/pp-data/fbpac-models/{}/classifier.dill".
            format(conf["language"])
        ])