Example #1
0
def dedupe_table(mc, table):
  """Remove duplicate documents that share the same docid from a table.

  Every docid found in ``table`` is handled once: all documents carrying
  that docid are loaded, ordered with ``get_key``, and every document after
  the first one in that order is deleted. When ``DRY_RUN`` is truthy the
  deletions are only printed, not executed.

  Args:
    mc: Connection handle forwarded unchanged to the ``evergreen_utils``
      helpers (presumably a MongoDB client -- confirm against callers).
    table: Name of the table to de-duplicate.
  """
  processed_docs = set()
  collection = evergreen_utils.get_table(mc, table)
  evergreens = evergreen_utils.load_evergreen_articles(mc, table, {})
  for doc in evergreens:
    docid = doc["docid"]
    if docid in processed_docs:
      continue
    # Mark the docid up front so later copies met by the outer loop are
    # skipped regardless of which branch below is taken.
    processed_docs.add(docid)
    docs = list(
        evergreen_utils.load_evergreen_articles(mc, table, {"docid": docid}))
    if len(docs) <= 1:
      # Nothing to de-duplicate.
      continue
    # Sort all docs for the same docid, and keep the first one only.
    docs.sort(key=get_key)
    # Iterate over the actual duplicates. The previous code indexed with
    # ``i`` but always acted on the leftover ``cur_doc`` loop variable, so it
    # repeatedly targeted one (possibly the wrong) document.
    for duplicate in docs[1:]:
      if DRY_RUN:
        # Single-argument parenthesised print yields the same output on
        # Python 2 and Python 3.
        print("deleting %s %s" % (duplicate["docid"], duplicate["_id"]))
      else:
        collection.delete_one({"_id": duplicate["_id"]})
Example #2
0
def update_adult(mc, news_data_table, table):
  """Flag evergreen documents whose news record has an 'adult' category.

  Documents in ``table`` that already carry an ``is_adult`` field are
  skipped. For every other document, ``news_data_table`` is queried by
  ``_id`` equal to the document's docid; if any returned record lists
  ``'adult'`` among its ``cat_class`` entries, the evergreen document gets
  ``is_adult`` set to the string ``"1"``. When ``DRY_RUN`` is truthy the
  update is only printed, not executed.

  Args:
    mc: Connection handle forwarded unchanged to the ``evergreen_utils``
      helpers (presumably a MongoDB client -- confirm against callers).
    news_data_table: Collection-like object exposing
      ``find(filter, projection=...)``.
    table: Name of the evergreen table to update.
  """
  evergreen_table = evergreen_utils.get_table(mc, table)
  evergreens = evergreen_utils.load_evergreen_articles(mc, table)
  for doc in evergreens:
    # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
    if "is_adult" in doc:
      continue
    docid = doc["docid"]
    news_cursor = news_data_table.find({"_id": docid}, projection=['cat_class'])
    # Compare element by element (not ``'adult' in value``) so a plain
    # string value for cat_class is never matched by substring.
    is_adult = any(
        cat == 'adult'
        for news in news_cursor
        for cat in news.get('cat_class', ()))
    if not is_adult:
      continue
    if DRY_RUN:
      # Single-argument parenthesised print yields the same output on
      # Python 2 and Python 3.
      print("%s %s" % (docid, is_adult))
    else:
      evergreen_table.update_one({"docid": docid}, {"$set": {"is_adult": "1"}})
Example #3
0
def dump_table(mc, table, date):
  """Write the string form of every document in a table to a text file.

  The output path is ``OUTPUT_DIR + table + "." + date + ".txt"``; each
  document is written as ``str(document)`` followed by a newline. An
  existing file at that path is overwritten.

  Args:
    mc: Connection handle forwarded unchanged to
      ``evergreen_utils.load_evergreen_articles``.
    table: Name of the table to dump; also used in the output file name.
    date: String appended to the output file name.
  """
  evergreens = evergreen_utils.load_evergreen_articles(mc, table, {})
  # The context manager closes the file even if iteration or a write fails.
  with open(OUTPUT_DIR + table + "." + date + ".txt", 'w') as fp:
    for candidate in evergreens:
      fp.write(str(candidate) + "\n")