def read_run_id():
    """Read the persisted run id from var/cache/run_id.

    Returns:
        The stored integer, or 0 when the file is missing, unreadable,
        or does not contain an integer (the failure is logged, never
        raised).
    """
    try:
        with open(os.path.join('var', 'cache', 'run_id'), 'r') as f:
            return int(f.read())
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate instead of being swallowed as "run id 0".
        logger.error(traceback.format_exc())
        return 0
def retry_info(fetch_type, mysql_db):
    """Look up the retry parameters for one failure category.

    Args:
        fetch_type: one of 'Photo', 'Video', 'VDDB', 'TDDB', 'BUSY', 'OUT'.
        mysql_db: DB facade whose ``*_type`` attributes tag failure rows.

    Returns:
        (meta_type, max_retry_num, sleep_seconds) for a known type;
        (None, None, None) for an unknown one (logged as an error).
    """
    # fetch_type -> (mysql_db attr name, cf retry-limit attr name,
    # cf interval attr name). Attribute *names* are stored so that only
    # the selected entry is evaluated, matching the lazy access of the
    # original if/elif chain.
    table = {
        'Photo': ('img_type', 'photo_fail_max_retry', 'photo_fail_interval_sec'),
        'Video': ('video_type', 'video_fail_max_retry', 'video_fail_interval_sec'),
        'VDDB': ('vddb_type', 'vddb_fail_max_retry', 'vddb_fail_interval_sec'),
        'TDDB': ('tddb_type', 'tddb_fail_max_retry', 'tddb_fail_interval_sec'),
        'BUSY': ('busy_type', 'service_busy_max_retry', 'service_busy_interval_sec'),
        'OUT': ('out_type', 'out_exception_fail_max_retry', 'out_exception_fail_interval_sec'),
    }
    entry = table.get(fetch_type)
    if entry is None:
        # Fixed typo in the original log message ('unkown' -> 'unknown').
        logger.error('get retry_info, unknown type [{}]'.format(fetch_type))
        return None, None, None
    meta_attr, retry_attr, interval_attr = entry
    # NOTE(review): min(..., 10) caps every configured sleep interval at
    # 10 seconds; if the intent was a 10-second *floor* this should be
    # max() -- confirm against the config values before changing.
    return (getattr(mysql_db, meta_attr),
            getattr(cf, retry_attr),
            min(getattr(cf, interval_attr), 10))
# Main worker: consumes MQ task batches. parse_one_task pulls doc_id /
# doc_lib_id out of a task dict (keyed via the module-level `ld` list),
# fetches the article through article_obj.fetch, and ingests it;
# add_out_retry records an 'out'-type retry row for the various failure
# exceptions. 'add' tasks for doc_lib_id 6 are deliberately dropped.
# NOTE(review): this line is a whitespace-mangled fragment -- the
# exception-handler chain is truncated here and continues in a later
# fragment of this file. Kept byte-identical; re-indenting it in
# isolation would require guessing the missing structure.
def run(): def process_task(tasks, message): def parse_one_task(task): def add_out_retry(): mysql_db.down_fail(doc_lib_id, doc_id, mysql_db.out_type, 1) try: doc_id = str(task[ld[0]]) doc_lib_id = str(task[ld[1]]) main_log = '{}: {}, {}: {}, {}: {}, {}: {}'.format( ld[0], doc_id, ld[1], doc_lib_id, ld[2], task[ld[2]], ld[3], task[ld[3]]) except: logger.error('task:[{}], can not found [{}], {}'.format( str(task), ld, traceback.format_exc())) else: try: logger.message_decorate(main=main_log) bAdd = True if 'add' == task[ld[3]]: if 6 == task[ld[1]]: bAdd = False logger.warn('drop task, not add 6 报刊') else: bAdd = False logger.warn('drop task, not add type, {}'.format(task)) if bAdd: article_info = article_obj.fetch(doc_id, doc_lib_id) if 'fetch_error' in article_info: raise FetchArticleException() elif 'not_in_config_error' in article_info: raise LineNotConfigException( article_info['not_in_config_error']) else: ingest(mysql_db, download_obj, article_info, 1) except TokenExpiresException: add_out_retry() except DownloadFailException, e: logger.warn(str(e)) add_out_retry() except (ParameterException, HTTPConnectException), e: try: j = json.loads(str(e)) logger.error(j) except: logger.error(str(e)) add_out_retry() except FetchArticleException: logger.error(article_info) if 'message' in article_info: if '系统异常,请联系相关负责人' in article_info['message']: logger.error('ack this task (系统异常,请联系相关负责人)') else: add_out_retry() else: add_out_retry() except NoDocIdException: logger.error(article_info)
# Inner wrapper of a retry decorator: call f (from the enclosing scope,
# not visible in this chunk), retrying up to self.tries times on the
# configured exception classes with a fixed self.delay between attempts.
# Uses Python 2 `except X, e` syntax.
# NOTE(review): whitespace was lost on this line, so the exact nesting
# of the statements inside the except suite (whether logger.error and
# `exception = e` sit inside the `if` or after it) is ambiguous; the
# code is kept byte-identical rather than re-indented by guesswork.
# NOTE(review): `exception` is assigned (`= self.exceptions`, then `= e`)
# but never raised or returned -- when all tries fail the wrapper
# silently returns None. A trailing `raise exception` was probably
# intended; confirm with callers before changing, they may rely on the
# None return.
def fn(*args, **kwargs): exception = self.exceptions for i in range(self.tries): try: return f(*args, **kwargs) except self.exceptions, e: if (i + 1) < self.tries: time.sleep(self.delay) logger.error(str(e)) exception = e
def always_send(send_mq, send):
    """Send *send* to the message queue, retrying until it succeeds.

    Skipped entirely under cf.local_test. The payload is JSON-encoded
    and base64-wrapped on every attempt; a failure is logged and retried
    after 3 seconds. A pending quit signal (go_quit) aborts the loop.
    """
    if cf.local_test:
        return
    while not go_quit():
        try:
            send_mq.send(base64.b64encode(json.dumps(send)))
        except:
            # Broad catch is intentional here: this sender must survive
            # any transport error and keep retrying.
            logger.error(traceback.format_exc())
            time.sleep(3)
        else:
            return
def get_error_type(source):
    """Classify an API error payload into an ERROR_TYPE_* constant.

    Inspects 'resultmsg', then 'resultcode', then 'message' (first match
    wins) for known error substrings. Returns None when nothing matches;
    non-dict input is logged as an error and also yields None.
    """
    if not isinstance(source, dict):
        logger.error('source is not dict, [{}]'.format(source))
        return
    if 'resultmsg' in source:
        msg = source['resultmsg']
        if '系统繁忙,此时请开发者稍候再试' in msg:
            return ERROR_TYPE_BUSY
        if 'access_token已过期' in msg:
            return ERROR_TYPE_TOKEN
    elif 'resultcode' in source:
        if '系统繁忙,此时请开发者稍候再试' in source['resultcode']:
            return ERROR_TYPE_BUSY
    elif 'message' in source:
        if '找不到附件' in source['message']:
            return ERROR_TYPE_NO_ATTACHE
def save_tmp_web_html(file_name, content):
    """Debug helper: dump fetched page *content* to *file_name* as UTF-8.

    Only active when cf.local_test is set. Tries writing the content
    as-is, then decoded as GBK, then as UTF-8 (same order as before);
    all failures are logged, never raised.
    """
    if not cf.local_test:
        return
    if not content:
        logger.warn('content is None, save to %s error' % file_name)
        return
    # Candidate decodings, tried in order; stop at the first successful
    # write. (Python 2: `unicode` decodes a byte string.)
    candidates = (
        lambda c: c,
        lambda c: unicode(c, encoding='gbk'),
        lambda c: unicode(c, encoding='utf-8'),
    )
    error_text = None
    for decode in candidates:
        try:
            # with-block fixes the original's leaked file handles
            # (codecs.open(...).write(...) never closed the file).
            with codecs.open(file_name, 'wb', encoding='utf-8') as fh:
                fh.write(decode(content))
            return
        except Exception:
            error_text = traceback.format_exc()
    # All decodings failed: log the last traceback, as the original did.
    logger.error(error_text)
# URL filter used before accepting a crawl link: returns True when the
# URL should be rejected. Checks, in order: the shared com_invalid_url /
# domain_invalid_url helpers, the strategy's add_index_pages list,
# domain-suffix matching for article-type links (domain may be a single
# string or a list), file-extension blocklists (cf.invalid_ends +
# cf.media_ends), exact-match 'invalid_full' URLs, and regex 'invalid'
# patterns. Falls through to an implicit None (falsy = URL accepted)
# when nothing matches.
# NOTE(review): whitespace was lost on this line; the else/if pairing
# (notably which `if` the `else: return False` belongs to, and whether
# the extension/blocklist loops sit inside the article-type branch)
# cannot be recovered with confidence, so the code is kept
# byte-identical rather than re-indented by guesswork.
def invalid_url(url, strategy): if com_invalid_url(url, strategy): logger.error('com_invalid_url [{}]'.format(url)) return True if domain_invalid_url(url, strategy): logger.error('domain_invalid_url [{}]'.format(url)) return True domain = strategy.get('domain') link_type = strategy.get('link_type') add_index_pages = strategy.get('add_index_pages') if add_index_pages: if url in add_index_pages: if link_type and (link_type == TYPE_ARITICLE): return True else: return False if link_type and (link_type == TYPE_ARITICLE): if domain: if isinstance(domain, (str, unicode)): if url.lower().rstrip('/').endswith(domain.lower()): return True elif isinstance(domain, list): for one_domain in domain: if url.lower().rstrip('/').endswith(one_domain.lower()): return True for ends in cf.invalid_ends + cf.media_ends: if url.lower().endswith(ends): return True full_remove_urls = strategy.get('invalid_full') if full_remove_urls: for r_url in full_remove_urls: if url.rstrip('/') == r_url.lower().rstrip('/'): return True remove_urls = strategy.get('invalid') if remove_urls: for r_url in remove_urls: m = re.compile(r_url).findall(url) if m: return True
def load_sites_first():
    """Load the priority site id list from sites_first.json.

    Returns:
        The file's 'ids' entry, or None when the file is absent, is not
        valid JSON, or lacks the key (errors are logged, never raised).
    """
    try:
        if os.path.exists('sites_first.json'):
            # with-block fixes the original's leaked file handle
            # (json.load(open(...)) never closed the file).
            with open('sites_first.json', 'r') as fh:
                return json.load(fh).get('ids')
    except Exception:
        # Narrowed from a bare except so KeyboardInterrupt/SystemExit
        # still propagate.
        logger.error(traceback.format_exc())
def save_run_id(id):
    """Persist *id* to var/cache/run_id (counterpart of read_run_id).

    Any write failure is logged, never raised.
    """
    path = os.path.join('var', 'cache', 'run_id')
    try:
        with open(path, 'w') as fh:
            fh.write(str(id))
    except:
        logger.error(traceback.format_exc())
def main():
    """Process entry point: log startup, then run the ingest loop.

    Any uncaught failure is logged rather than propagated, so the
    process exits cleanly on error.
    """
    try:
        logger.info('ingest start')
        run()
    except:
        logger.error(traceback.format_exc())
# Tail fragment of the run()/process_task definitions begun earlier in
# this file: the remaining exception handlers of parse_one_task
# (LineNotConfigException, SaveMysqlRetryException, a catch-all that
# records an out-retry), the finally-block logger.message_undecorate(),
# and the process_task body that acks empty batches or evaluates the
# task payload before acking.
# NOTE(review): whitespace was lost on this line and the fragment starts
# mid-try, so it is kept byte-identical; re-indenting it in isolation
# would require guessing the missing enclosing structure.
# NOTE(review, security): `eval(tasks)` executes the raw MQ payload as
# Python code -- arbitrary code execution if the queue is ever fed
# untrusted data. This should be json.loads / ast.literal_eval plus an
# explicit dispatch; flagged only, not changed here.
else: add_out_retry() else: add_out_retry() except NoDocIdException: logger.error(article_info) except LineNotConfigException, e: logger.warn(str(e)) except SaveMysqlRetryException, e: try: j = json.loads(str(e)) logger.warn(j) except: logger.warn(str(e)) except: logger.error(traceback.format_exc()) add_out_retry() finally: logger.message_undecorate() if not tasks: logger.warn('receive empty task') message.ack() return else: try: eval(tasks) except: logger.error('tasks:[{}], {}'.format(str(tasks), traceback.format_exc())) message.ack()