import re
import pathlib
from datetime import datetime, timedelta

# `session`, `log`, `crawl_log`, `parse_datetime`, `Announcement`, `ProfitLog`,
# `setup_config` and the crawl_all/realtime_all/history_all/trade_account_all
# helpers are assumed to be imported from elsewhere in this project.


def parse_index(ex, type_, content, conf):
    """Parse an exchange's announcement index page and upsert new entries."""
    text = content.decode(conf['encoding'], 'ignore')
    for values in re.compile(conf['detail'], re.DOTALL).findall(text):
        # Map regex groups to field names, stripping simple tags and whitespace.
        d = {key: re.sub(r'(</?[a-zA-Z]+>|\s+)', '', value.strip())
             for key, value in zip(conf['fields'], values)}
        if 'relative' in conf and not d['url'].startswith('http'):
            d['url'] = conf['relative'] + d['url']
        # Skip announcements we have already stored.
        if Announcement.query_one({'url': d['url']}):
            continue
        # Per-exchange quirks in the published_at markup.
        if ex.abbr == '中港邮币卡':
            d['published_at'] = re.sub('<[^>]*>', '-', d['published_at'])
        if ex.abbr == '三点零':
            pa = re.sub('<[^>]*>', '', d['published_at'])
            d['published_at'] = pa[2:] + '/' + pa[:2]
        # Timestamps on the pages are local (UTC+8); store them as UTC.
        d['published_at'] = parse_datetime(d['published_at']) \
            - timedelta(hours=8)
        d['exchange'] = ex._id
        d['type_'] = type_
        # Fetch the detail page and rewrite its charset declaration so the
        # stored HTML can later be served as utf-8.
        detail = session.get(d['url'], timeout=(5, 10)).content
        d['html'] = detail.decode(conf['encoding'], 'ignore')
        d['html'] = d['html'].replace(conf['encoding'], 'utf-8')
        log.info('[{exchange}]{published_at}: {title}'.format(**d))
        Announcement(d).upsert()
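
# A minimal sketch of the `conf` mapping parse_index() consumes. Only the key
# names ('encoding', 'detail', 'fields', 'relative') come from the code above;
# the regex, field order and URL here are illustrative placeholders, not a
# real exchange config. 'fields' must name one entry per regex group and
# include at least 'url', 'title' and 'published_at' (the keys parse_index
# reads and logs).
_EXAMPLE_CONF = {
    'encoding': 'gb18030',   # charset the exchange serves its pages in
    'detail': r'<li><a href="(.*?)">(.*?)</a><span>(.*?)</span></li>',
    'fields': ['url', 'title', 'published_at'],
    'relative': 'http://www.example-exchange.com',  # prefix for relative links
}
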
def do_cron(parser, args):
    """Run the periodic crawl/update jobs, guarded by a simple lock file."""
    setup_config(args)
    lockfile = '/tmp/ybk.cron.lock'
    path = pathlib.Path(lockfile)

    class doing(object):
        """Hold the lock file while a job runs; log and swallow its errors."""

        def __enter__(self):
            path.open('w').write('')

        def __exit__(self, type, value, traceback):
            if value:
                crawl_log.exception('something went wrong')
            path.unlink()
            return True  # suppress the exception so later jobs still run

    if not path.exists():
        with doing():
            crawl_all()
        now = datetime.utcnow() + timedelta(hours=8)  # Beijing time (UTC+8)
        with doing():
            if 9 <= now.hour <= 20:
                realtime_all()
        with doing():
            if now.hour == 6 and now.minute < 5:
                history_all()
        with doing():
            if 9 <= now.hour <= 20:
                # generate profit-history records for all users
                ProfitLog.ensure_all_profits()
            # update the status of all trading accounts
            if now.hour == 22 and 30 <= now.minute <= 35:
                trade_account_all()
    else:
        crawl_log.info('another cron is already running, exiting')
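
# A plausible way to drive do_cron() from the system scheduler. The 5-minute
# wide windows it checks (e.g. `30 <= now.minute <= 35`, `now.minute < 5`)
# suggest it is meant to run every five minutes; the command path below is a
# hypothetical example, not taken from this repo.
#
#   */5 * * * * /usr/local/bin/ybk cron >> /var/log/ybk.cron.log 2>&1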