Example #1
0
def read_run_id():
    """Return the run id persisted in ``var/cache/run_id``.

    Returns:
        The integer stored in the cache file, or ``0`` when the file cannot
        be opened or its content is not an integer. The failure is logged
        with its traceback in that case.
    """
    run_id_path = os.path.join('var', 'cache', 'run_id')
    try:
        with open(run_id_path, 'r') as f:
            return int(f.read())
    except Exception:
        # Best effort: any ordinary failure falls back to 0. Catching
        # Exception rather than using a bare except lets KeyboardInterrupt
        # and SystemExit propagate instead of being reported as run id 0.
        logger.error(traceback.format_exc())
        return 0
Example #2
0
def retry_info(fetch_type, mysql_db):
    """Look up the retry settings that apply to one kind of failure.

    Args:
        fetch_type: One of 'Photo', 'Video', 'VDDB', 'TDDB', 'BUSY', 'OUT'.
        mysql_db: Object providing the ``*_type`` attributes named in the
            rule table below.

    Returns:
        A ``(meta_type, max_retry_num, sleep_seconds)`` tuple, where
        ``sleep_seconds`` is the configured interval capped at 10. For an
        unrecognised ``fetch_type`` an error is logged and all three values
        are ``None``.
    """
    # (fetch type, mysql_db attribute, cf max-retry attribute,
    #  cf interval attribute)
    rules = (
        ('Photo', 'img_type',
         'photo_fail_max_retry', 'photo_fail_interval_sec'),
        ('Video', 'video_type',
         'video_fail_max_retry', 'video_fail_interval_sec'),
        ('VDDB', 'vddb_type',
         'vddb_fail_max_retry', 'vddb_fail_interval_sec'),
        ('TDDB', 'tddb_type',
         'tddb_fail_max_retry', 'tddb_fail_interval_sec'),
        ('BUSY', 'busy_type',
         'service_busy_max_retry', 'service_busy_interval_sec'),
        ('OUT', 'out_type',
         'out_exception_fail_max_retry', 'out_exception_fail_interval_sec'),
    )
    for kind, type_attr, retry_attr, interval_attr in rules:
        if fetch_type == kind:
            # Attributes are only read for the matching rule.
            return (getattr(mysql_db, type_attr),
                    getattr(cf, retry_attr),
                    min(getattr(cf, interval_attr), 10))

    logger.error('get retry_info, unkown type [{}]'.format(fetch_type))
    return None, None, None
Example #3
0
# Entry point of the ingest loop. The portion visible here only defines the
# nested helpers process_task -> parse_one_task -> add_out_retry; how they are
# wired to a message source is not shown in this excerpt.
# Relies on names defined elsewhere: ld, mysql_db, article_obj, download_obj,
# ingest, logger and the project exception classes.
def run():
    # Handles one received message; `tasks` is the payload and `message` the
    # queue message object (its use is outside this excerpt).
    def process_task(tasks, message):
        # Validates a single task dict and ingests the article it refers to.
        def parse_one_task(task):
            # Registers the current document via mysql_db.down_fail with the
            # "out" type -- presumably schedules a later retry; confirm
            # against mysql_db.down_fail.
            def add_out_retry():
                mysql_db.down_fail(doc_lib_id, doc_id, mysql_db.out_type, 1)

            try:
                # ld holds the four task keys read here: ld[0] -> doc id,
                # ld[1] -> doc library id, ld[2] and ld[3] are only logged
                # here (ld[3] is compared against 'add' further down).
                doc_id = str(task[ld[0]])
                doc_lib_id = str(task[ld[1]])
                main_log = '{}: {}, {}: {}, {}: {}, {}: {}'.format(
                    ld[0], doc_id, ld[1], doc_lib_id, ld[2], task[ld[2]],
                    ld[3], task[ld[3]])
            except:
                # Any failure while reading the keys: log and drop the task.
                logger.error('task:[{}], can not found [{}], {}'.format(
                    str(task), ld, traceback.format_exc()))
            else:
                try:
                    logger.message_decorate(main=main_log)
                    bAdd = True
                    # Only tasks whose ld[3] value is 'add' are processed,
                    # and among those the one whose ld[1] value equals the
                    # integer 6 is dropped (compared on the raw task value,
                    # not on the stringified doc_lib_id).
                    if 'add' == task[ld[3]]:
                        if 6 == task[ld[1]]:
                            bAdd = False
                            logger.warn('drop task, not add 6 报刊')
                    else:
                        bAdd = False
                        logger.warn('drop task, not add type, {}'.format(task))
                    if bAdd:
                        article_info = article_obj.fetch(doc_id, doc_lib_id)
                        # The fetch result signals problems through marker
                        # keys rather than by raising; convert them to
                        # exceptions so the handlers below deal with them.
                        if 'fetch_error' in article_info:
                            raise FetchArticleException()
                        elif 'not_in_config_error' in article_info:
                            raise LineNotConfigException(
                                article_info['not_in_config_error'])
                        else:
                            ingest(mysql_db, download_obj, article_info, 1)
                except TokenExpiresException:
                    add_out_retry()
                except DownloadFailException, e:
                    logger.warn(str(e))
                    add_out_retry()
                except (ParameterException, HTTPConnectException), e:
                    # Log the exception text as parsed JSON when it parses,
                    # otherwise as the plain string.
                    try:
                        j = json.loads(str(e))
                        logger.error(j)
                    except:
                        logger.error(str(e))
                    add_out_retry()
                except FetchArticleException:
                    logger.error(article_info)
                    # No retry is registered when the fetch result's message
                    # contains the system-exception text below; every other
                    # fetch error is registered through add_out_retry().
                    if 'message' in article_info:
                        if '系统异常,请联系相关负责人' in article_info['message']:
                            logger.error('ack this task (系统异常,请联系相关负责人)')
                        else:
                            add_out_retry()
                    else:
                        add_out_retry()
                except NoDocIdException:
                    # Logged only; no retry is registered.
                    logger.error(article_info)
Example #4
0
 # Retry wrapper around `f` (both `f` and `self` come from an enclosing scope
 # that is not visible here). Calls f up to self.tries times and returns its
 # result on the first call that does not raise one of self.exceptions.
 def fn(*args, **kwargs):
     # Placeholder until a real exception instance is caught below.
     exception = self.exceptions
     for i in range(self.tries):
         try:
             return f(*args, **kwargs)
         except self.exceptions, e:
             # Wait self.delay seconds before the next attempt; no sleep
             # after the final attempt.
             if (i + 1) < self.tries:
                 time.sleep(self.delay)
             logger.error(str(e))
             # Remember the most recent failure. What happens with it once
             # the loop is exhausted is not visible in this excerpt.
             exception = e
Example #5
0
def always_send(send_mq, send):
    """Send ``send`` through ``send_mq``, retrying until it succeeds.

    The payload is JSON-serialised and base64-encoded on every attempt.
    Nothing is sent when ``cf.local_test`` is truthy. The loop stops after
    the first successful send or as soon as ``go_quit()`` returns true.

    Args:
        send_mq: Object exposing ``send(payload)``.
        send: JSON-serialisable payload.
    """
    if cf.local_test:
        return
    while not go_quit():
        try:
            send_mq.send(base64.b64encode(json.dumps(send)))
        except Exception:
            # Catch Exception rather than everything: a bare except in an
            # unbounded retry loop would also trap KeyboardInterrupt and
            # SystemExit, making the process impossible to stop from here.
            logger.error(traceback.format_exc())
            time.sleep(3)
        else:
            break
Example #6
0
def get_error_type(source):
    """Classify an error response dict into one of the ERROR_TYPE_* values.

    Only the first key present among 'resultmsg', 'resultcode' and 'message'
    (in that order) is inspected.

    Args:
        source: Response dict. Anything else is logged as an error.

    Returns:
        ``ERROR_TYPE_BUSY``, ``ERROR_TYPE_TOKEN`` or
        ``ERROR_TYPE_NO_ATTACHE`` when the inspected text contains the
        corresponding marker, otherwise ``None``.
    """
    if not isinstance(source, dict):
        logger.error('source is not dict, [{}]'.format(source))
        return None

    if 'resultmsg' in source:
        if '系统繁忙,此时请开发者稍候再试' in source['resultmsg']:
            return ERROR_TYPE_BUSY
        if 'access_token已过期' in source['resultmsg']:
            return ERROR_TYPE_TOKEN
        return None

    if 'resultcode' in source:
        if '系统繁忙,此时请开发者稍候再试' in source['resultcode']:
            return ERROR_TYPE_BUSY
        return None

    if 'message' in source and '找不到附件' in source['message']:
        return ERROR_TYPE_NO_ATTACHE
    return None
Example #7
0
def save_tmp_web_html(file_name, content):
    """Dump fetched page content to ``file_name`` as UTF-8 (local test only).

    Does nothing unless ``cf.local_test`` is truthy. The content is written
    as-is first; if that fails it is decoded as GBK and then as UTF-8 before
    writing. The traceback of the last failure is logged only when every
    attempt fails.

    Args:
        file_name: Destination path; it is truncated on every attempt.
        content: Page content. Empty or ``None`` content is reported with a
            warning and nothing is written.
    """
    if not cf.local_test:
        return
    if not content:
        logger.warn('content is None, save to %s error' % file_name)
        return

    # Conversions tried in order. NOTE(review): GBK is attempted before
    # UTF-8, as in the original code -- confirm this order is intended.
    conversions = (
        lambda raw: raw,
        lambda raw: unicode(raw, encoding='gbk'),
        lambda raw: unicode(raw, encoding='utf-8'),
    )
    last_trace = None
    for convert in conversions:
        try:
            # The with-block closes the handle on success and on failure;
            # previously the handle was left to the garbage collector.
            with codecs.open(file_name, 'wb', encoding='utf-8') as out:
                out.write(convert(content))
            return
        except Exception:
            last_trace = traceback.format_exc()
    logger.error(last_trace)
Example #8
0
def invalid_url(url, strategy):
    """Tell whether ``url`` must be discarded under ``strategy``.

    Checks, in order:
      1. ``com_invalid_url`` / ``domain_invalid_url`` (a hit is logged).
      2. URLs listed in ``strategy['add_index_pages']``: invalid only when
         the strategy's link type is ``TYPE_ARITICLE``.
      3. For ``TYPE_ARITICLE`` strategies: URLs that end with the strategy
         domain (a string or a list of strings) or with one of
         ``cf.invalid_ends`` / ``cf.media_ends``, case-insensitively.
      4. Exact matches against ``strategy['invalid_full']``, ignoring case
         and trailing slashes.
      5. Regex matches against ``strategy['invalid']``.

    Args:
        url: Candidate URL string.
        strategy: Dict-like crawl strategy.

    Returns:
        ``True`` when the URL is invalid, ``False`` otherwise.
    """
    if com_invalid_url(url, strategy):
        logger.error('com_invalid_url [{}]'.format(url))
        return True

    if domain_invalid_url(url, strategy):
        logger.error('domain_invalid_url [{}]'.format(url))
        return True

    link_type = strategy.get('link_type')
    is_article = bool(link_type and link_type == TYPE_ARITICLE)

    add_index_pages = strategy.get('add_index_pages')
    if add_index_pages and url in add_index_pages:
        return is_article

    if is_article:
        lowered = url.lower()
        domain = strategy.get('domain')
        if domain:
            if isinstance(domain, (str, unicode)):
                domains = [domain]
            elif isinstance(domain, list):
                domains = domain
            else:
                domains = []
            bare = lowered.rstrip('/')
            if any(bare.endswith(one.lower()) for one in domains):
                return True
        if any(lowered.endswith(ends)
               for ends in cf.invalid_ends + cf.media_ends):
            return True

    full_remove_urls = strategy.get('invalid_full')
    if full_remove_urls:
        # Lower-case both sides. Previously only the configured URL was
        # lower-cased, so a candidate containing upper-case characters
        # could never match.
        bare = url.lower().rstrip('/')
        if any(bare == r_url.lower().rstrip('/')
               for r_url in full_remove_urls):
            return True

    remove_urls = strategy.get('invalid')
    if remove_urls:
        if any(re.search(r_url, url) for r_url in remove_urls):
            return True

    # Explicit result instead of falling off the end with None.
    return False
Example #9
0
def load_sites_first():
    """Load the ``ids`` entry of ``sites_first.json`` from the working dir.

    Returns:
        The value stored under the ``ids`` key, or ``None`` when the file
        does not exist, has no such key, or cannot be read or parsed (the
        failure is logged with its traceback).
    """
    sites_path = 'sites_first.json'
    try:
        if os.path.exists(sites_path):
            # The with-block closes the handle; it used to be left open.
            with open(sites_path, 'r') as f:
                return json.load(f).get('ids')
    except Exception:
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        logger.error(traceback.format_exc())
    return None
Example #10
0
def save_run_id(id):
    """Persist ``id`` as text in ``var/cache/run_id``.

    The file is overwritten. Failures (for example a missing ``var/cache``
    directory) are logged with their traceback and otherwise ignored.

    Args:
        id: Run id to store; written as ``str(id)``.
    """
    run_id_path = os.path.join('var', 'cache', 'run_id')
    try:
        with open(run_id_path, 'w') as f:
            f.write(str(id))
    except Exception:
        # Best effort. Exception rather than a bare except, so
        # KeyboardInterrupt and SystemExit still propagate.
        logger.error(traceback.format_exc())
Example #11
0
def main():
    """Log the start of the ingest and execute ``run()``.

    Any ordinary exception escaping ``run()`` is logged with its traceback
    instead of propagating.
    """
    try:
        logger.info('ingest start')
        run()
    except Exception:
        # Top-level boundary: log unexpected errors. SystemExit and
        # KeyboardInterrupt are deliberately not caught, so an explicit
        # exit or Ctrl-C is no longer logged as an error and swallowed.
        logger.error(traceback.format_exc())
Example #12
0
                        else:
                            add_out_retry()
                    else:
                        add_out_retry()
                except NoDocIdException:
                    logger.error(article_info)
                except LineNotConfigException, e:
                    logger.warn(str(e))
                except SaveMysqlRetryException, e:
                    try:
                        j = json.loads(str(e))
                        logger.warn(j)
                    except:
                        logger.warn(str(e))
                except:
                    logger.error(traceback.format_exc())
                    add_out_retry()
                finally:
                    logger.message_undecorate()

        if not tasks:
            logger.warn('receive empty task')
            message.ack()
            return
        else:
            try:
                eval(tasks)
            except:
                logger.error('tasks:[{}], {}'.format(str(tasks),
                                                     traceback.format_exc()))
                message.ack()