Example 1
def add_seeds_from_id(ctx, language):
    """
    Create a list of seed posts for our classifier by language
    """
    options = None
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break

    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit(1)

    with open(os.path.join(conf_dir, 'additional_seed_ids.json'), 'r') as additional_seeds_file:
        additional_seed_ids = json.load(additional_seeds_file)

    with open(os.path.join(conf_dir, 'seeds.json'), 'r') as old_seeds_file:
        seeds = json.load(old_seeds_file)

    for politicality in ["political", "not_political"]:
        records = DB.query("select * from ads where id in ('{}')".format("','".join(map(str, additional_seed_ids[politicality]))))
        for record in records:
            seeds[politicality].append(get_text(record))

    for politicality in ["political", "not_political"]:
        seeds[politicality] = list(set(seeds[politicality]))

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(seeds, out)
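
These snippets lean on helpers defined elsewhere in the repository, notably `confs` and `get_text`. A minimal sketch of what they plausibly do, judging from how they are called here; the config filename is an assumption, and `get_text` mirrors the `<p>`-tag extraction that `entities` below does inline:

import json
import os

from bs4 import BeautifulSoup


def confs(base):
    # Hypothetical: yield (directory, config) for each per-language config
    # directory under `base`. The JSON filename is assumed.
    for name in os.listdir(base):
        directory = os.path.join(base, name)
        path = os.path.join(directory, "classifier_config.json")
        if os.path.isfile(path):
            with open(path, "r") as conf_file:
                yield directory, json.load(conf_file)


def get_text(record):
    # Hypothetical: pull the visible ad copy out of the stored HTML, the
    # same way entities() below does.
    doc = BeautifulSoup(record["html"], "html.parser")
    return " ".join(graf.get_text() for graf in doc.select("p"))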
Example 2
def analyze(ctx, id):
    """
    Analyze permutes an ad's text to find which words contribute most to its political rating.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    records = DB.query("""select * from ads where id = '{}'""".format(id))

    for record in records:
        record_lang = record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = clean_text(get_text(record), record["advertiser"])
            baseline = text_probability(classifier, text)
            permuted_texts = permute_text(text)

            diffs = [(deleted_word,
                      baseline - text_probability(classifier, permuted_text))
                     for (deleted_word, permuted_text) in permuted_texts]

            print("text: {}".format(text))
            print("original probability: {}".format(baseline))
            biggest_diffs = sorted(
                diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
            print("top difference-makers:")
            for (deleted_word, diff) in biggest_diffs:
                print(" - {}, {}".format(deleted_word, diff))
Example 3
def entities(ctx):
    """
    Extract likely entities from the database; outputs a CSV for now
    """
    for (directory, conf) in entities_confs(ctx.obj["base"]):
        if conf:
            lang = directory.split('/')[1]

            print("running entity extraction for %s" % lang)
            nlp = en_core_web_lg.load()
            ads = DB.query(
                "select * from ads where political_probability > 0.70 and lang = '%s' and entities = '[]'::jsonb"
                % lang)
            query = "update ads set entities=:entities where id=:id"
            updates = []

            for advert in ads:
                doc = BeautifulSoup(advert["html"], "html.parser")
                text = ' '.join([graf.get_text() for graf in doc.select("p")])

                update = {"id": advert["id"], "entities": set()}
                for ent in nlp(text).ents:
                    if ent.text in conf["exclude"] or ent.text.isspace():
                        continue

                    has_parent = False
                    for parent, children in conf["parents"].items():
                        if ent.text in children["entities"]:
                            has_parent = True
                            update["entities"].add((parent, children["label"]))

                    if not has_parent:
                        update["entities"].add((ent.text, ent.label_))

                update["entities"] = json.dumps([{
                    "entity": e[0],
                    "entity_type": LABELS[e[1]]
                } for e in update["entities"] if e[1] in LABELS.keys()])
                updates.append(update)

                if len(updates) >= 100:
                    DB.bulk_query(query, updates)
                    updates = []

            if updates:
                DB.bulk_query(query, updates)
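
The per-language entity config read above isn't shown. From the lookups it performs (`exclude`, plus `parents` whose values carry an alias list under `entities` and a `label`), it presumably has a shape like the sketch below; the concrete values are invented for illustration. `LABELS`, used just above, presumably maps spaCy labels such as PERSON to the output `entity_type` strings.

# Hypothetical config shape inferred from the lookups above; values invented.
entities_conf = {
    # surface forms to skip entirely
    "exclude": ["Sponsored", "Facebook"],
    # fold aliases into a canonical parent entity with a fixed label
    "parents": {
        "Example Candidate": {
            "entities": ["Candidate", "Sen. Candidate"],
            "label": "PERSON",
        },
    },
}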
Example 4
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.
    """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
        else:
            langs = map(lambda x: "'{}'".format(x), classifiers.keys())
            langs = ','.join(langs)

            query = query + " and lang in ({})".format(langs)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = classifier["vectorizer"].transform([get_text(record)])
            probability = classifier["classifier"].predict_proba(text)[0][1]
            update = {"id": record["id"], "probability": probability}
            if record["political_probability"] > update[
                    "probability"] and record[
                        "political_probability"] >= 0.70 and update[
                            "probability"] < 0.70 and not record["suppressed"]:
                print("refusing to downgrade probability of ad {}".format(
                    record["id"]))
            updates.append(update)
            out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
            print(out.format(pid=update, info={"length": length, "idx": idx}))

            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []

    if updates:
        DB.bulk_query(query, updates)
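
`classifier_path` and `get_vectorizer` aren't shown either. A sketch under two assumptions: the trained model is dill-pickled next to its config, and the vectorizer is rebuilt from config parameters. The filename and the choice of HashingVectorizer are both guesses:

import os

from sklearn.feature_extraction.text import HashingVectorizer


def classifier_path(directory):
    # Hypothetical: where the dill-pickled classifier lives; filename assumed.
    return os.path.join(directory, "classifier.dill")


def get_vectorizer(conf):
    # Hypothetical: rebuild the text vectorizer from config settings. A
    # stateless HashingVectorizer would explain why only the classifier
    # needs to be pickled.
    return HashingVectorizer(alternate_sign=False,
                             n_features=conf.get("n_features", 2 ** 18))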
Example 5
def classify(ctx, newest, lang):
    """
    Classify the ads in the database at $DATABASE_URL.
    """
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    length = DB.query("select count(*) as length from ({}) as t1;".format(
        query))[0]["length"]
    records = DB.query(query)
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        if record["lang"] in classifiers:
            classifier = classifiers[record["lang"]]
            text = classifier["vectorizer"].transform(
                [get_text(record["html"])])
            update = {
                "id": record["id"],
                "probability":
                classifier["classifier"].predict_proba(text)[0][1]
            }
            updates.append(update)

            out = "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
            print(out.format(p=update, l={"length": length, "idx": idx}))

            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []

    if updates:
        DB.bulk_query(query, updates)
Example 6
def listbuilding_fundraising_classify(ctx, newest):
    """
    Classify the ads in the database at $DATABASE_URL as listbuilding/fundraising.
    """
    lang = "en-US"  # hard coded
    try:
        nlp = spacy.load('en_fbpac3label')  # hard coded
    except OSError as e:
        print(
            "you need to do `pipenv install ~/code/fbpac-prodigy/packagedmodels/en_fbpac3label-2.0.0.tar.gz`"
        )
        raise e

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability > 0.70 and listbuilding_fundraising_proba is null"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads where political_probability > 0.70"
        if lang:
            query = query + " and lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set listbuilding_fundraising_proba=:listbuilding_fundraising_proba where id=:id"
    idx = 0
    for record in records:
        idx += 1

        text = get_text(record).replace(
            "Learn More Watch Again Resume Video Learn More", '')
        doc = nlp(text)
        listbuilding_fundraising_proba = (doc.cats["LISTBUILDING"] +
                                          doc.cats["FUNDRAISING"])

        update = {
            "id": record["id"],
            "listbuilding_fundraising_proba": listbuilding_fundraising_proba
        }
        updates.append(update)
        out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[listbuilding_fundraising_proba]}"
        print(out.format(pid=update, info={"length": length, "idx": idx}))

        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []

    if updates:
        DB.bulk_query(query, updates)
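
For the spaCy text categorizer used above, `doc.cats` maps each trained label to a score, and the snippet simply sums two of them. A minimal usage illustration, assuming the `en_fbpac3label` package from the error message is installed:

import spacy

nlp = spacy.load("en_fbpac3label")  # hypothetical 3-label textcat model
doc = nlp("Chip in $5 and sign our petition before the deadline.")
print(doc.cats)  # e.g. {"LISTBUILDING": ..., "FUNDRAISING": ..., ...}
print(doc.cats["LISTBUILDING"] + doc.cats["FUNDRAISING"])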
Example 7
def parse_waist_json(ctx):
    """
    Classify the ads in the database at $DATABASE_URL.
    """

    # takes 8s locally
    query = "select * from ads where targets = '[]' and targeting is not null and targeting ilike '{%'"

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set targets=:targets where id=:id"
    idx = 0
    for record in records:
        idx += 1
        advertiser = record["advertiser"]

        data = json.loads(record["targeting"])
        if record["targeting"][0] == '{' and "waist_targeting_data" in data:
            targeting = data[
                "waist_targeting_data"]  # this is necessary for all post Nov 13 - Dec 6 data.
        elif record["targeting"][0] == '{' and "data" in data:
            targeting = data["data"][
                "waist_targeting_data"]  # this is necessary for all post Jan 29 (ATI code) data
        else:
            targeting = data
        if not advertiser and "waist_advertiser_info" in data:
            advertiser = data["waist_advertiser_info"]["name"]

        targets = parse_one_waist_json(targeting)

        # it appears there are never multiple distinct JSONs for one ad (besides diff profile_picture_url query strings and diff date formats)
        # TODO: once I have examples, implement unimplemented

        update = {"id": record["id"], "targets": json.dumps(targets)}
        updates.append(update)
        out = "Parsed {pid[id]} ({info[idx]} of {info[length]}) with {pid[targets]}"
        # print(out.format(pid=update, info={"length": length, "idx": idx}))

        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []

    if updates:
        DB.bulk_query(query, updates)
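
`parse_one_waist_json` isn't shown, and these snippets don't reveal the WAIST payload's field names. A heavily hedged sketch of the flattening step, with both the keys and the output shape assumed:

def parse_one_waist_json(targeting):
    # Hypothetical: flatten one "why am I seeing this" payload into a
    # list-of-dicts shape for ads.targets. Every key here is assumed.
    targets = []
    entries = targeting if isinstance(targeting, list) else [targeting]
    for entry in entries:
        target_type = entry.get("waist_ui_type")  # assumed key
        segment = entry.get("description")        # assumed key
        if target_type:
            targets.append({"target": target_type, "segment": segment})
    return targets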
Example 8
def targeting():
    """
    Extract the targeting parameters we've seen
    """
    ads = DB.query("""
       select * from ads
       where political_probability > 0.70 and targeting is not null
    """)
    counter = Counter()
    for advert in ads:
        doc = BeautifulSoup(advert["targeting"], "html.parser")
        targets = [
            bold for bold in doc.select("b")
            if bold.get('id') != "ad_prefs_advertiser"
        ]
        counter.update(targets)

    print("parameter,type,count")
    for (target, count) in counter.items():
        print("{},{},{}".format(target.get_text(), target.get('id'), count))