def dedupe_table(mc, table): processed_docs = set() collection = evergreen_utils.get_table(mc, table) evergreens = evergreen_utils.load_evergreen_articles(mc, table, {}) for doc in evergreens: docid = doc["docid"] if docid in processed_docs: continue cur_docs = evergreen_utils.load_evergreen_articles(mc, table, {"docid": docid}) docs = [] for cur_doc in cur_docs: docs.append(cur_doc) if len(docs) == 1: continue # Sort all docs for the same docid, and keep the first one only. docs = sorted(docs, key = get_key) for i in range(1, len(docs)): if DRY_RUN: print "deleting", cur_doc["docid"], cur_doc["_id"] else: collection.delete_one({"_id": cur_doc["_id"]}) processed_docs.add(docid)
def update_adult(mc, news_data_table, table): evergreen_table = evergreen_utils.get_table(mc, table) evergreens = evergreen_utils.load_evergreen_articles(mc, table) for doc in evergreens: if doc.has_key("is_adult"): continue docid = doc["docid"] news_dict = news_data_table.find({"_id": docid}, projection = ['cat_class']) is_adult = False for news in news_dict: if news.has_key('cat_class'): for cat in news['cat_class']: if cat == 'adult': is_adult = True if is_adult: if DRY_RUN: print docid, is_adult else: evergreen_table.update_one({"docid": docid}, {"$set": {"is_adult": "1"}})
def dump_table(mc, table, date):
    """Dump every evergreen article for `table` to a dated text file.

    Writes one str()-formatted document per line to
    OUTPUT_DIR + table + "." + date + ".txt".

    Args:
        mc: connection handle passed through to evergreen_utils.
        table: name of the evergreen table to dump.
        date: date string embedded in the output filename.
    """
    evergreens = evergreen_utils.load_evergreen_articles(mc, table, {})
    # `with` guarantees the file is closed even if str()/write raises,
    # unlike the original explicit open()/close() pair.
    with open(OUTPUT_DIR + table + "." + date + ".txt", 'w') as fp:
        for candidate in evergreens:
            fp.write(str(candidate) + "\n")