def add_seeds_from_id(ctx, language):
    """ Create a list of seed posts for our classifier by language """
    options = None
    for directory, conf in confs(ctx.obj["base"]):
        if conf["language"] == language:
            options = conf
            conf_dir = directory
            break

    if options is None:
        print("Couldn't find a config for {}".format(language))
        exit()

    with open(os.path.join(conf_dir, 'additional_seed_ids.json'),
              'r') as additional_seeds_file:
        additional_seed_ids = json.load(additional_seeds_file)

    with open(os.path.join(conf_dir, 'seeds.json'), 'r') as old_seeds_file:
        seeds = json.load(old_seeds_file)

    for politicality in ["political", "not_political"]:
        # Fetch the hand-picked ads by id; the ids come from our own JSON
        # file, so interpolating them into the query is tolerable here.
        records = DB.query("select * from ads where id in ('{}')".format(
            "','".join(map(str, additional_seed_ids[politicality]))))
        for record in records:
            seeds[politicality].append(get_text(record))

    # De-duplicate the seed texts before writing them back.
    for politicality in ["political", "not_political"]:
        seeds[politicality] = list(set(seeds[politicality]))

    with open(os.path.join(conf_dir, 'seeds.json'), 'w') as out:
        json.dump(seeds, out)
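
# A note on the file shapes assumed above (nothing in this module pins them
# down): additional_seed_ids.json maps each politicality to a list of ad ids,
# and seeds.json maps each politicality to a list of ad texts, e.g.
#
#   {"political": [101, 102], "not_political": [201]}
#   {"political": ["Donate now ...", ...], "not_political": ["50% off ...", ...]}
#
# A minimal sketch of guarding against a malformed file before using it; the
# helper name is ours, not part of the repo:
def _validate_seed_file(parsed, expect_type):
    """Raise if `parsed` lacks a list of the expected kind per politicality."""
    for politicality in ["political", "not_political"]:
        values = parsed.get(politicality)
        if not isinstance(values, list) or not all(
                isinstance(v, expect_type) for v in values):
            raise ValueError("expected a list of {} under '{}'".format(
                expect_type.__name__, politicality))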
def analyze(ctx, id):
    """ Analyze permutes an ad's text to find which words contribute most
    to its political rating. """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    records = DB.query("""select * from ads where id = '{}'""".format(id))
    for record in records:
        record_lang = record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = clean_text(get_text(record), record["advertiser"])
            baseline = text_probability(classifier, text)
            # Score each permuted text against the baseline: a large
            # absolute difference means the deleted word mattered.
            permuted_texts = permute_text(text)
            diffs = [(deleted_word,
                      baseline - text_probability(classifier, permuted_text))
                     for (deleted_word, permuted_text) in permuted_texts]
            print("text: {}".format(text))
            print("original probability: {}".format(baseline))
            biggest_diffs = sorted(
                diffs, key=lambda word_diff: -abs(word_diff[1]))[:4]
            print("top difference-makers:")
            for (deleted_word, diff) in biggest_diffs:
                print(" - {}, {}".format(deleted_word, diff))
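
# `permute_text` and `text_probability` are imported from elsewhere in this
# repo. A minimal sketch of the behavior `analyze` relies on, assuming that
# permutation means leave-one-word-out and that `classifier` is the
# {"classifier": ..., "vectorizer": ...} dict built above; the `_sketch`
# names are ours:
def _permute_text_sketch(text):
    """Return (deleted_word, text with that word removed) for every word."""
    words = text.split()
    return [(words[i], " ".join(words[:i] + words[i + 1:]))
            for i in range(len(words))]


def _text_probability_sketch(classifier, text):
    """Probability of the 'political' class for a single text."""
    features = classifier["vectorizer"].transform([text])
    return classifier["classifier"].predict_proba(features)[0][1]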
def classify(ctx, newest, lang):
    """ Classify the ads in the database at $DATABASE_URL. """
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
        else:
            # Only fetch ads in languages we actually have a classifier for.
            langs = map(lambda x: "'{}'".format(x), classifiers.keys())
            langs = ','.join(langs)
            query = query + " and lang in ({})".format(langs)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))

    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # en-IE ads are scored with the en-US model.
        record_lang = "en-US" if record["lang"] == "en-IE" else record["lang"]
        if record_lang in classifiers:
            classifier = classifiers[record_lang]
            text = classifier["vectorizer"].transform([get_text(record)])
            probability = classifier["classifier"].predict_proba(text)[0][1]
            update = {"id": record["id"], "probability": probability}
            # Don't let a reclassification pull an already-political,
            # unsuppressed ad back below the 0.70 threshold.
            if (record["political_probability"] > update["probability"]
                    and record["political_probability"] >= 0.70
                    and update["probability"] < 0.70
                    and not record["suppressed"]):
                print("refusing to downgrade probability of ad {}".format(
                    record["id"]))
                continue
            updates.append(update)
            out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[probability]}"
            print(out.format(pid=update, info={"length": length, "idx": idx}))
            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []
    if updates:
        DB.bulk_query(query, updates)
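
# The flush-every-100 pattern above keeps memory flat on large tables, and
# the :probability/:id placeholders suggest DB is a records.Database, whose
# bulk_query executes one parameterized statement over a list of dicts. The
# same pattern factored out as a standalone sketch (helper name is ours):
def _flush_in_batches(db, sql, rows, batch_size=100):
    """Execute `sql` once per dict in `rows`, flushing in batches."""
    batch = []
    for row in rows:
        batch.append(row)
        if len(batch) >= batch_size:
            db.bulk_query(sql, batch)
            batch = []
    if batch:
        db.bulk_query(sql, batch)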
def classify(ctx, newest, lang):
    """ Classify the ads in the database at $DATABASE_URL. """
    if newest:
        print("Running newest")
        query = "select * from ads where political_probability = 0"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads"
        if lang:
            query = query + " where lang = '{}'".format(lang)

    length = DB.query("select count(*) as length from ({}) as t1;".format(
        query))[0]["length"]
    records = DB.query(query)
    classifiers = dict()
    for (directory, conf) in confs(ctx.obj["base"]):
        with open(classifier_path(directory), 'rb') as classy:
            classifiers[conf["language"]] = {
                "classifier": dill.load(classy),
                "vectorizer": get_vectorizer(conf)
            }
    print("found {} ads".format(length))

    updates = []
    query = "update ads set political_probability=:probability where id=:id"
    idx = 0
    for record in records:
        idx += 1
        if record["lang"] in classifiers:
            classifier = classifiers[record["lang"]]
            text = classifier["vectorizer"].transform(
                [get_text(record["html"])])
            update = {
                "id": record["id"],
                "probability":
                classifier["classifier"].predict_proba(text)[0][1]
            }
            updates.append(update)
            print(
                "Classified {p[id]} ({l[idx]} of {l[length]}) with {p[probability]}"
                .format(p=update, l={"length": length, "idx": idx}))
            if len(updates) >= 100:
                DB.bulk_query(query, updates)
                updates = []
    if updates:
        DB.bulk_query(query, updates)
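
# Both `classify` variants assume one model directory per language under
# ctx.obj["base"], discovered by `confs` and loaded with dill. A sketch of a
# `confs`-like walker over an assumed layout; the config filename below is
# hypothetical, and the sketch reuses the module's existing os/json imports:
#
#   <base>/en-US/classifier_config.json   (contains {"language": "en-US", ...})
#   <base>/en-US/classifier.dill          (pickled scikit-learn model)
def _confs_sketch(base):
    """Yield (directory, parsed config) for each language dir under base."""
    for name in sorted(os.listdir(base)):
        directory = os.path.join(base, name)
        conf_path = os.path.join(directory, "classifier_config.json")
        if os.path.isfile(conf_path):
            with open(conf_path, "r") as conf_file:
                yield (directory, json.load(conf_file))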
def listbuilding_fundraising_classify(ctx, newest):
    """ Classify the ads in the database at $DATABASE_URL. """
    lang = "en-US"  # hard coded
    try:
        nlp = spacy.load('en_fbpac3label')  # hard coded
    except OSError as e:
        print(
            "you need to do `pipenv install ~/code/fbpac-prodigy/packagedmodels/en_fbpac3label-2.0.0.tar.gz`"
        )
        raise e

    if newest:
        print("Running newest")
        query = "select * from ads where political_probability > 0.70 and listbuilding_fundraising_proba is null"
        if lang:
            query = query + " and lang = '{}'".format(lang)
    else:
        print("Running every")
        query = "select * from ads where political_probability > 0.70"
        if lang:
            query = query + " and lang = '{}'".format(lang)

    total = "select count(*) as length from ({}) as t1;"
    length = DB.query(total.format(query))[0]["length"]
    records = DB.query(query)
    print("found {} ads".format(length))

    updates = []
    query = "update ads set listbuilding_fundraising_proba=:listbuilding_fundraising_proba where id=:id"
    idx = 0
    for record in records:
        idx += 1
        # Strip the boilerplate Facebook video-player text before scoring.
        text = get_text(record).replace(
            "Learn More Watch Again Resume Video Learn More", '')
        doc = nlp(text)
        listbuilding_fundraising_proba = (
            doc.cats["LISTBUILDING"] + doc.cats["FUNDRAISING"])
        update = {
            "id": record["id"],
            "listbuilding_fundraising_proba": listbuilding_fundraising_proba
        }
        updates.append(update)
        out = "Classified {pid[id]} ({info[idx]} of {info[length]}) with {pid[listbuilding_fundraising_proba]}"
        print(out.format(pid=update, info={"length": length, "idx": idx}))
        if len(updates) >= 100:
            DB.bulk_query(query, updates)
            updates = []
    if updates:
        DB.bulk_query(query, updates)
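
# `doc.cats` on a spaCy pipeline with a text categorizer maps each label to a
# score, so the sum above treats LISTBUILDING and FUNDRAISING as one combined
# "asking for something" bucket. A quick way to check what the packaged model
# exposes, assuming en_fbpac3label installs a "textcat" component (as
# prodigy-trained spaCy models typically do); the helper name is ours:
def _inspect_fbpac_model():
    """Print the labels and a sample score dict from the 3-label model."""
    nlp = spacy.load('en_fbpac3label')
    print(nlp.get_pipe('textcat').labels)
    doc = nlp("Chip in $5 before tonight's deadline")
    print(doc.cats)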