def add_seeds_from_id(ctx, language):
    """ Create a list of seed posts for our classifier by language """
    options = None
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break
    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit()

    with open(os.path.join(conf_dir, 'additional_seed_ids.json'),
              'r') as additional_seeds_file:
        additional_seed_ids = json.load(additional_seeds_file)
    with open(os.path.join(conf_dir, 'seeds.json'), 'r') as old_seeds_file:
        seeds = json.load(old_seeds_file)

    for politicality in ["political", "not_political"]:
        records = DB.query("select * from ads where id in ('{}')".format(
            "','".join(map(str, additional_seed_ids[politicality]))))
        for record in records:
            seeds[politicality].append(get_text(record))

    # de-duplicate seed texts before writing them back
    for politicality in ["political", "not_political"]:
        seeds[politicality] = list(set(seeds[politicality]))

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(seeds, out)
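
# The id lookup above splices ids straight into the SQL string; a minimal
# injection-safe sketch, assuming DB is a records.Database. fetch_ads_by_ids
# is a hypothetical helper, not part of the original module (one query per
# id, so slower, but the ids travel as bound parameters):
def fetch_ads_by_ids(ids):
    for seed_id in ids:
        for record in DB.query("select * from ads where id = :id",
                               id=str(seed_id)):
            yield record
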
def analyze(ctx, id):
    """ Analyze permutes an ad's text to find which words contribute most to
    its political rating. """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    records = DB.query("""select * from ads where id = '{}'""".format(id))
    for record in records:
        record_lang = record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = clean_text(get_text(record), record["advertiser"])
            baseline = text_probability(classifier, text)
            permuted_texts = permute_text(text)
            # score each word-deleted variant against the baseline
            diffs = [(deleted_word,
                      baseline - text_probability(classifier, permuted_text))
                     for (deleted_word, permuted_text) in permuted_texts]
            print("text: {}".format(text))
            print("original probability: {}".format(baseline))
            biggest_diffs = sorted(
                diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
            print("top difference-makers:")
            for (deleted_word, diff) in biggest_diffs:
                print(" - {}, {}".format(deleted_word, diff))
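
# permute_text and text_probability are defined elsewhere in this module.
# A minimal sketch of the leave-one-word-out permutation assumed above;
# permute_text_sketch is hypothetical, and the real helper may tokenize
# differently:
def permute_text_sketch(text):
    words = text.split()
    return [(words[i], ' '.join(words[:i] + words[i + 1:]))
            for i in range(len(words))]
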
def entities(ctx):
    """ Extract likely entities from high-probability political ads and
    store them in the ads table's entities column. """
    for (directory, conf) in entities_confs(ctx.obj["base"]):
        if conf:
            lang = directory.split('/')[1]
            print("running entity extraction for %s" % lang)
            # note: this loads the English model for every language config
            nlp = en_core_web_lg.load()
            ads = DB.query(
                "select * from ads where political_probability > 0.70 and lang = '%s' and entities = '[]'::jsonb"
                % lang)
            query = "update ads set entities=:entities where id=:id"
            updates = []
            for advert in ads:
                doc = BeautifulSoup(advert["html"], "html.parser")
                text = ' '.join([graf.get_text() for graf in doc.select("p")])
                update = {"id": advert["id"], "entities": set()}
                for ent in nlp(text).ents:
                    if ent.text in conf["exclude"] or ent.text.isspace():
                        continue
                    # fold known aliases into their canonical parent entity
                    has_parent = False
                    for parent, children in conf["parents"].items():
                        if ent.text in children["entities"]:
                            has_parent = True
                            update["entities"].add((parent, children["label"]))
                    if not has_parent:
                        update["entities"].add((ent.text, ent.label_))
                update["entities"] = json.dumps([{
                    "entity": e[0],
                    "entity_type": LABELS[e[1]]
                } for e in update["entities"] if e[1] in LABELS])
                updates.append(update)
                if len(updates) >= 100:
                    DB.bulk_query(query, updates)
                    updates = []
            if updates:
                DB.bulk_query(query, updates)
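
# The per-language entities config consumed above is loaded from disk by
# entities_confs(). A hypothetical example of its shape, inferred only from
# the keys this function reads (the real files may carry more fields, and
# the values here are invented for illustration):
EXAMPLE_ENTITIES_CONF = {
    "exclude": ["Facebook"],
    "parents": {
        "Donald Trump": {
            "label": "PERSON",
            "entities": ["Trump", "President Trump"],
        },
    },
}
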
def classify(ctx, newest, lang):
    """ Classify the ads in the database at $DATABASE_URL. """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
        else:
            langs = ','.join("'{}'".format(l) for l in classifiers.keys())
            query = query + " and lang in ({})".format(langs)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)
    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # en-IE ads are scored with the en-US model
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = classifier["vectorizer"].transform([get_text(record)])
            probability = classifier["classifier"].predict_proba(text)[0][1]
            update = {"id": record["id"], "probability": probability}
            if (record["political_probability"] > update["probability"]
                    and record["political_probability"] >= 0.70
                    and update["probability"] < 0.70
                    and not record["suppressed"]):
                print("refusing to downgrade probability of ad {}".format(
                    record["id"]))
                continue
            updates.append(update)
            out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
            print(out.format(pid=update, info={"length": length, "idx": idx}))
            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []
    if updates:
        DB.bulk_query(query, updates)
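
# The guard above keeps an ad already rated political (>= 0.70) from being
# silently demoted below the public threshold by a retrained model. The same
# rule as a standalone predicate -- a hypothetical helper, not part of the
# original module:
def is_downgrade(old, new, suppressed, threshold=0.70):
    """True when writing `new` would demote a political ad below threshold."""
    return old > new and old >= threshold and new < threshold and not suppressed
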
def classify(ctx, newest, lang):
    """ Classify the ads in the database at $DATABASE_URL. """
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)
    length = DB.query("select count(*) as length from ({}) as t1;".format(
        query))[0]["length"]
    records = DB.query(query)
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    print("found {} ads".format(length))
    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        if record["lang"] in classifiers:
            classifier = classifiers[record["lang"]]
            text = classifier["vectorizer"].transform(
                [get_text(record["html"])])
            update = {
                "id": record["id"],
                "probability": classifier["classifier"].predict_proba(text)[0][1]
            }
            updates.append(update)
            print(
                "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
                .format(p=update, l={"length": length, "idx": idx}))
            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []
    if updates:
        DB.bulk_query(query, updates)

def listbuilding_fundraising_classify(ctx, newest):
    """ Score political ads in the database at $DATABASE_URL for
    list-building and fundraising language. """
    lang = "en-US"  # hard coded
    try:
        nlp = spacy.load('en_fbpac3label')  # hard coded
    except OSError as e:
        print(
            "you need to do `pipenv install ~/code/fbpac-prodigy/packagedmodels/en_fbpac3label-2.0.0.tar.gz`"
        )
        raise e
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability > 0.70 and listbuilding_fundraising_proba is null"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads where political_probability > 0.70"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set listbuilding_fundraising_proba=:listbuilding_fundraising_proba where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # strip boilerplate Facebook video chrome before scoring
        text = get_text(record).replace(
            "Learn More Watch Again Resume Video Learn More", '')
        doc = nlp(text)
        listbuilding_fundraising_proba = (doc.cats["LISTBUILDING"] +
                                          doc.cats["FUNDRAISING"])
        update = {
            "id": record["id"],
            "listbuilding_fundraising_proba": listbuilding_fundraising_proba
        }
        updates.append(update)
        out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[listbuilding_fundraising_proba]}"
        print(out.format(pid=update, info={"length": length, "idx": idx}))
        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []
    if updates:
        DB.bulk_query(query, updates)
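
# doc.cats on a spaCy text-classification model maps each trained label to a
# score. A minimal usage sketch of the lookup done above; the helper name is
# hypothetical and assumes the en_fbpac3label model is installed:
def listbuilding_fundraising_score(nlp, text):
    cats = nlp(text).cats  # e.g. {"LISTBUILDING": 0.05, "FUNDRAISING": 0.91, ...}
    return cats["LISTBUILDING"] + cats["FUNDRAISING"]
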
def parse_waist_json(ctx):
    """ Parse the raw "Why am I seeing this?" targeting JSON into the
    structured targets column. """
    # takes 8s locally
    query = "select * from ads where targets = '[]' and targeting is not null and targeting ilike '{%'"
    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))
    updates = []
    query = "update ads set targets=:targets where id=:id"
    idx = 0
    for record in records:
        idx += 1
        advertiser = record["advertiser"]
        data = json.loads(record["targeting"])
        if record["targeting"][0] == '{' and "waist_targeting_data" in data:
            # this is necessary for all post Nov 13 - Dec 6 data
            targeting = data["waist_targeting_data"]
        elif record["targeting"][0] == '{' and "data" in data:
            # this is necessary for all post Jan 29 (ATI code) data
            targeting = data["data"]["waist_targeting_data"]
        else:
            targeting = data
        if not advertiser and "waist_advertiser_info" in data:
            advertiser = data["waist_advertiser_info"]["name"]
        targets = parse_one_waist_json(targeting)
        # it appears there are never multiple distinct JSONs for one ad
        # (besides diff profile_picture_url query strings and diff date
        # formats)
        # TODO: once I have examples, implement unimplemented
        update = {"id": record["id"], "targets": json.dumps(targets)}
        updates.append(update)
        out = "Parsed {pid[id]} ({info[idx]} of {info[length]}) with {pid[targets]}"
        # print(out.format(pid=update, info={"length": length, "idx": idx}))
        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []
    if updates:
        DB.bulk_query(query, updates)
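
# The three shapes of stored targeting JSON that the branches above handle;
# minimal hypothetical examples reconstructed from the keys the code reads,
# not actual captured payloads:
#
#   {"waist_targeting_data": {...}}              # post Nov 13 - Dec 6 data
#   {"data": {"waist_targeting_data": {...}}}    # post Jan 29 (ATI) data
#   {...}                                        # otherwise, the object itself
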
def targeting():
    """ Extract the targeting parameters we've seen """
    ads = DB.query("""
      select * from ads
        where political_probability > 0.70
        and targeting is not null
    """)
    counter = Counter()
    for advert in ads:
        doc = BeautifulSoup(advert["targeting"], "html.parser")
        # count (text, id) pairs; bs4 Tag objects aren't hashable, so they
        # can't be Counter keys directly
        targets = [(bold.get_text(), bold.get('id'))
                   for bold in doc.select("b")
                   if bold.get('id') != "ad_prefs_advertiser"]
        counter.update(targets)
    print("parameter,type,count")
    for ((text, kind), count) in counter.items():
        print("{},{},{}".format(text, kind, count))
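
# A hypothetical fragment of the targeting HTML parsed above, reconstructed
# from the selectors this function uses (real payloads vary):
#
#   <b id="ad_prefs_advertiser">Some Advertiser</b> wants to reach
#   <b>people ages 25 to 55</b> who live in <b>United States</b>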