def _log_and_back_off(retries, message, mined_from, count, error):
    """Handle one failed attempt: log it and pause, or end the thread.

    ``retries`` is the running number of failures *including* the current
    one.  While it is below 8 the failure is logged with ``message`` (a
    ``%``-style template taking ``mined_from``, ``count`` and ``error``) and
    the caller is delayed for 10-20 seconds.  From the eighth failure on the
    current thread is ended with ``thread.exit()`` and nothing is logged.
    """
    if retries < 8:
        logging.error(message, mined_from, count, error)
        sleep(randint(10, 20))
    else:
        thread.exit()


def mine(db, mined_from=None, entry_count=200):
    """Fetch entries and store new ones until ``entry_count`` are accepted.

    Entries are requested page by page from a ``SubredditParser`` built for
    ``mined_from``.  Every entry whose ``reddit_id`` is not yet present in
    ``db["entries"]`` gets its article fetched through ``EntryParser`` and is
    inserted.

    Args:
        db: Mapping of table names to tables; ``db["entries"]`` must offer
            ``find_one(reddit_id=...)`` and ``insert(entry)``.
        mined_from: Value handed to ``SubredditParser`` and included in log
            messages (presumably the subreddit name - confirm with caller).
        entry_count: Number of newly inserted entries to reach.  Pages are
            requested with a size of ``entry_count // 10``.

    Failures while fetching a page or an article are retried after a 10-20
    second pause.  The retry budget is shared by both kinds of failure and
    is never reset: the eighth failure ends the current thread via
    ``thread.exit()``.
    """
    subreddit_parser = SubredditParser(mined_from)
    entry_parser = EntryParser()
    ids = set()  # reddit ids already seen during this call
    last_id = ""  # id of the last entry looked at; passed to parse_entries
    step_size = entry_count // 10  # explicit integer division
    # NOTE(review): ``count`` is never updated in this function, so it is
    # always 0 in the log messages below.
    count = 0
    accepted = 0
    retries = 0

    while accepted < entry_count:
        # Fetch the next page, retrying until it succeeds or the retry
        # budget is exhausted.
        entries = None
        while entries is None:
            try:
                entries = subreddit_parser.parse_entries(step_size, last_id)
            except Exception as error:
                retries += 1
                _log_and_back_off(
                    retries, "Timeout: %s %s %s", mined_from, count, error)

        # NOTE(review): these two flags are set but never read here, so a
        # page that only repeats known ids is simply requested again.
        unchanged = False
        skipped = False
        for i, entry in enumerate(entries):
            if entry["reddit_id"] in ids:
                # The page repeats an id seen earlier in this call; drop the
                # rest of the page.
                unchanged = True
                logging.info(
                    "Unchanged: entries %d-%d in %s",
                    i, count + len(entries) - 1, mined_from)
                break

            last_id = entry["reddit_id"]
            ids.add(last_id)

            saved_entry = db["entries"].find_one(reddit_id=entry["reddit_id"])
            if saved_entry is None:
                entry["article"] = None
                # NOTE(review): if get_content() returns None without
                # raising, the entry is inserted and this loop runs again.
                # If insert() raises after a successful fetch, the loop ends
                # without the entry being stored or counted.
                while entry["article"] is None:
                    try:
                        entry["article"] = entry_parser.get_content(
                            entry["link"])
                        db["entries"].insert(entry)
                        accepted += 1
                    except Exception as error:
                        retries += 1
                        _log_and_back_off(
                            retries, "Error: %s %s %s",
                            mined_from, count, error)
            else:
                # Entry is already stored; drop the rest of the page.
                skipped = True
                logging.info(
                    "Skipped: %d-%d in %s",
                    i, count + len(entries) - 1, mined_from)
                break

        # Short pause before requesting the next page.
        # NOTE(review): throttle is applied once per fetched page - confirm
        # this is the intended granularity.
        sleep(randint(1, 3))
import json
import logging
from os import listdir
from random import shuffle

from helpers.db import setup_db
from helpers.EntryParser import EntryParser

# Load every entry from the JSON files in ./json into one list.  Each file
# is expected to contain a JSON array, since its content is passed to
# list.extend().
parsed = []
for f in listdir('./json'):
    with open('./json/%s' % f) as data:
        parsed.extend(json.load(data))

# Process the entries in random order.
shuffle(parsed)

db = setup_db()
entry_parser = EntryParser()
logging.basicConfig(format='%(asctime)s %(message)s',
                    datefmt='%m/%d/%Y %I:%M:%S %p',
                    filename='miner.log',
                    level=logging.DEBUG)

# Fetch the article for every entry that is not in the database yet and
# store it.  A failure is logged and the entry is skipped; there is no retry.
for entry in parsed:
    saved_entry = db['entries'].find_one(reddit_id=entry['reddit_id'])
    if saved_entry is None:
        entry['article'] = None
        try:
            entry['article'] = entry_parser.get_content(entry['link'])
            db['entries'].insert(entry)
        except Exception as error:
            # .get() keeps the handler itself from raising KeyError when the
            # failure was caused by a missing key.
            logging.error('Error: %s %s %s',
                          entry.get('mined_from'), entry.get('link'), error)