def main():
    """Run the crawl loop until the module-level ``isKilled`` flag is truthy.

    Each iteration:

    1. asks ``database.get_meta()`` for the next blog; when the returned
       ``succeed`` value is falsy the failure is logged and the function
       returns;
    2. fetches the blog's article list with ``get_article_list``; on any
       exception the error is logged, ``database.flag_rollback(b_id)`` is
       called and the loop moves on to the next blog;
    3. fetches every article with ``get_article``; articles that raise or
       come back empty are skipped, the others are passed to
       ``database.save_article`` and counted when it returns a truthy value;
    4. logs the accepted count and calls ``database.flag(b_id, 0)``.

    Any other exception raised during an iteration is logged as a global
    error. The blog is rolled back only if its metadata was actually obtained
    in this iteration.
    """
    while True:
        if isKilled:
            break

        # Reset per iteration: without this, a failure inside get_meta() left
        # b_id unbound (NameError in the handler below) on the first pass, or
        # pointing at the *previous* blog on later passes, which then got
        # rolled back by mistake.
        b_id = None
        meta_loaded = False
        try:
            # last_crawl is not used here, but it is part of the returned tuple.
            b_id, host, realm, last_crawl, last_post, succeed = database.get_meta()
            meta_loaded = True
            if not succeed:
                logger.log("Getting blog entity is failed")
                return

            try:
                article_list = get_article_list(host, realm, last_post)
            except Exception as e:
                # NOTE: `.message` exists only on Python 2 exceptions.
                logger.log("##ERROR:get_list", host, e.message)
                database.flag_rollback(b_id)
                continue

            logger.log(host, " [", len(article_list), "]")

            success_count = 0
            for article in article_list:
                try:
                    data = get_article(article, realm)
                except Exception as e:
                    # A single broken article must not abort the whole blog.
                    logger.log("##ERROR:get_article", article, e.message)
                    continue
                if len(data) == 0:
                    continue
                if database.save_article(b_id, data):
                    success_count += 1

            logger.log(success_count, "accepted")
            # NOTE(review): the meaning of the second argument (0) is defined
            # by database.flag -- confirm against that module.
            database.flag(b_id, 0)
        except Exception as e:
            logger.log("##ERROR:global_error:", b_id, e.message)
            # Only roll back a blog that this iteration really obtained.
            if meta_loaded:
                database.flag_rollback(b_id)


def get_article_list(host, realm=None, lp=None):
    if "http://" not in host:
        host = "http://" + host
    re = requests.get(host, headers={"User-agent": UserAgent}, timeout=5.0)
    article_list = []
    if re.status_code == 404:
        return article_list
    if realm == "Tistory" or "tistory.com" in host:
        article_list = mTistory.get_article_list(host, lp) #