Example #1
0
def add_seeds_from_id(ctx, language):
    """
    Create a list of seed posts for our classifier by language.

    Finds the config directory for `language`, loads extra seed ad ids from
    additional_seed_ids.json, pulls those ads' texts from the database, merges
    them (deduplicated) into seeds.json, and writes seeds.json back in place.
    """
    options = None
    conf_dir = None  # avoid an unbound name if no config matches
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break

    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit()

    with open(os.path.join(conf_dir, 'additional_seed_ids.json'), 'r') as additional_seeds_file:
        additional_seed_ids = json.load(additional_seeds_file)

    with open(os.path.join(conf_dir, 'seeds.json'), 'r') as old_seeds_file:
        seeds = json.load(old_seeds_file)

    for politicality in ["political", "not_political"]:
        # NOTE(review): ids are interpolated into SQL; they come from a local
        # JSON file rather than user input, but parameterizing would be safer.
        id_list = "','".join(map(str, additional_seed_ids[politicality]))
        records = DB.query("select * from ads where id in ('{}')".format(id_list))
        for record in records:
            seeds[politicality].append(get_text(record))
        # Deduplicate in the same pass (previously a redundant second loop).
        seeds[politicality] = list(set(seeds[politicality]))

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(seeds, out)
def analyze(ctx, id):
    """
    Analyze permutes an ad's text to find which words contribute most to its political rating.

    Computes the ad's baseline political probability, re-scores the text with
    each word deleted in turn, and prints the four words whose removal shifts
    the probability the most.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    # Parameterized query (same :named style as the update queries elsewhere
    # in this file) instead of interpolating `id` into the SQL string.
    records = DB.query("select * from ads where id = :id", id=id)

    for record in records:
        record_lang = record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = clean_text(get_text(record), record["advertiser"])
            baseline = text_probability(classifier, text)
            permuted_texts = permute_text(text)

            # Positive diff: removing the word lowered the political score.
            diffs = [(deleted_word,
                      baseline - text_probability(classifier, permuted_text))
                     for (deleted_word, permuted_text) in permuted_texts]

            print("text: {}".format(text))
            print("original probability: {}".format(baseline))
            biggest_diffs = sorted(
                diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
            print("top difference-makers:")
            # The second tuple element is the probability diff, not a text
            # (the original misnamed it `permuted_text`).
            for (deleted_word, diff) in biggest_diffs:
                print(" - {}, {}".format(deleted_word, diff))
Example #3
0
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.

    Loads one classifier/vectorizer pair per configured language, selects
    candidate ads (only never-scored ones when `newest` is set, optionally
    restricted to `lang`), scores each ad's text, and bulk-writes
    political_probability back in batches of 100.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        # NOTE(review): `lang` comes from the CLI and is interpolated straight
        # into SQL; parameterizing would be safer.
        if lang:
            query = query + " and lang = '{}'".format(lang)
        else:
            langs = map(lambda x: "'{}'".format(x), classifiers.keys())
            langs = ','.join(langs)

            query = query + " and lang in ({})".format(langs)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # en-IE ads are scored with the en-US model.
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = classifier["vectorizer"].transform([get_text(record)])
            probability = classifier["classifier"].predict_proba(text)[0][1]
            update = {"id": record["id"], "probability": probability}
            # Never push an already-political (>= 0.70), unsuppressed ad back
            # below the political threshold on a re-run.
            if record["political_probability"] > update[
                    "probability"] and record[
                        "political_probability"] >= 0.70 and update[
                            "probability"] < 0.70 and not record["suppressed"]:
                print("refusing to downgrade probability of ad {}".format(
                    record["id"]))
                # Bug fix: previously this fell through and appended the
                # update anyway, downgrading despite the message above.
                continue
            updates.append(update)
            out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
            print(out.format(pid=update, info={"length": length, "idx": idx}))

            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []

    if updates:
        DB.bulk_query(query, updates)
Example #4
0
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.

    Builds the candidate query (unscored ads only when `newest` is set,
    optionally filtered by `lang`), loads a classifier/vectorizer pair per
    configured language, scores each matching ad, and writes the resulting
    political_probability back in batches of 100.
    """
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query += " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query += " where lang = '{}'".format(lang)

    count_sql = "select count(*) as length from ({}) as t1;".format(query)
    length = DB.query(count_sql)[0]["length"]
    records = DB.query(query)
    classifiers = {}
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf),
            }

    print("found {} ads".format(length))
    query = "update ads set political_probability=:probability where id=:id"
    updates = []
    for idx, record in enumerate(records, start=1):
        model = classifiers.get(record["lang"])
        if model is None:
            continue
        # NOTE(review): the other classify variant passes the whole record to
        # get_text; here only record["html"] is passed — confirm which is right.
        features = model["vectorizer"].transform([get_text(record["html"])])
        probability = model["classifier"].predict_proba(features)[0][1]
        update = {"id": record["id"], "probability": probability}
        updates.append(update)

        print(
            "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
            .format(p=update, l={"length": length, "idx": idx}))

        # Flush in batches to keep memory and transaction size bounded.
        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []

    if updates:
        DB.bulk_query(query, updates)
def listbuilding_fundraising_classify(ctx, newest):
    """
    Classify the ads in the database at $DATABASE_URL.

    Scores political ads (political_probability > 0.70) with the spaCy
    en_fbpac3label model and writes LISTBUILDING + FUNDRAISING category
    probabilities to listbuilding_fundraising_proba in batches of 100. With
    `newest`, only ads not yet scored are processed.
    """
    lang = "en-US"  # hard coded: the spaCy model below is English-only
    try:
        nlp = spacy.load('en_fbpac3label')  # hard coded
    except OSError as e:
        print(
            "you need to do `pipenv install ~/code/fbpac-prodigy/packagedmodels/en_fbpac3label-2.0.0.tar.gz`"
        )
        raise e

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability > 0.70 and listbuilding_fundraising_proba is null"
    else:
        print("Running every")
        query = "select * from ads where political_probability > 0.70"
    # `lang` is a local constant (always truthy), so the dead `if lang:`
    # guards were dropped; interpolating it here is safe.
    query = query + " and lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set listbuilding_fundraising_proba=:listbuilding_fundraising_proba where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # (Removed an unused `record_lang` computation that was never read.)
        # Strip boilerplate video-player chrome before classification.
        text = get_text(record).replace(
            "Learn More Watch Again Resume Video Learn More", '')
        doc = nlp(text)
        listbuilding_fundraising_proba = doc.cats["LISTBUILDING"] + doc.cats[
            "FUNDRAISING"]

        update = {
            "id": record["id"],
            "listbuilding_fundraising_proba": listbuilding_fundraising_proba
        }
        updates.append(update)
        out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[listbuilding_fundraising_proba]}"
        print(out.format(pid=update, info={"length": length, "idx": idx}))

        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []

    if updates:
        DB.bulk_query(query, updates)