def parse_index(ex, type_, content, conf):
    """Parse an exchange's announcement index page and upsert new items.

    For every entry matched on the index page: clean the captured fields,
    skip already-stored URLs, apply per-site date fixups, fetch the detail
    page, and upsert an ``Announcement`` record.

    Args:
        ex: exchange object; ``ex.abbr`` selects site-specific quirks and
            ``ex._id`` is stored on each announcement.
        type_: announcement category, stored verbatim on each record.
        content: raw bytes of the index page.
        conf: per-site config dict with keys ``encoding``, ``detail``
            (regex whose groups align with ``fields``), ``fields`` (field
            names for the captured groups) and optionally ``relative``
            (base-URL prefix for relative links).
    """
    text = content.decode(conf['encoding'], 'ignore')
    # Hoist both patterns out of the loops: the entry matcher and the
    # cleaner that strips simple HTML tags and all whitespace from fields.
    detail_re = re.compile(conf['detail'], re.DOTALL)
    clean_re = re.compile(r'(</?[a-zA-Z]+>|\s+)')
    for values in detail_re.findall(text):
        d = {
            key: clean_re.sub('', value.strip())
            for key, value in zip(conf['fields'], values)
        }
        # Resolve relative detail-page links against the configured base.
        if 'relative' in conf and not d['url'].startswith('http'):
            d['url'] = conf['relative'] + d['url']
        # Skip announcements already stored (URL is the identity key here).
        if Announcement.query_one({'url': d['url']}):
            continue
        # Site-specific date cleanups before parsing.
        if ex.abbr == '中港邮币卡':
            # HTML tags act as separators in this site's date markup.
            d['published_at'] = re.sub('<[^>]*>', '-', d['published_at'])
        if ex.abbr == '三点零':
            # This site puts a 2-char fragment first; move it to the end.
            pa = re.sub('<[^>]*>', '', d['published_at'])
            d['published_at'] = pa[2:] + '/' + pa[:2]
        # Shift from local time (UTC+8 — presumably China Standard Time;
        # TODO confirm) to UTC before storing.
        d['published_at'] = parse_datetime(d['published_at']) \
            - timedelta(hours=8)
        d['exchange'] = ex._id
        d['type_'] = type_
        # Fetch the detail page under a distinct name — the original
        # reassigned the `content` parameter here, shadowing the index
        # page bytes for the rest of the loop.
        page = session.get(d['url'], timeout=(5, 10)).content
        d['html'] = page.decode(conf['encoding'], 'ignore')
        # Rewrite charset declarations so the stored HTML reads as UTF-8.
        d['html'] = d['html'].replace(conf['encoding'], 'utf-8')
        log.info('[{exchange}]{published_at}: {title}'.format(**d))
        Announcement(d).upsert()