def ensure_articles(articles: typing.Sequence[dict]) -> list:
    valid_articles = []
    # ensure_keys = ("url_key", "title", "cover", "desc", "source",
    #                "review", "ts_publish", "lang")
    keys_set = None
    now = ttime()
    today_0_0 = f'{now[:10]} 00:00:00'
    for article in articles:
        if not isinstance(article, dict):
            continue
        if not keys_set:
            keys_set = set(article.keys())
        else:
            # If the keys differ from the first article's keys, executemany
            # cannot be used, so skip this article.
            if set(article.keys()) != keys_set:
                continue
        # All of these keys must exist before the article can be stored.
        source = content_sources_dict.get(article['source'])
        if not source:
            continue
        # Both url_key and title must be non-empty, otherwise skip the article.
        if not (article.get('url_key') and article.get('title')):
            continue
        article.setdefault('cover', '')
        article.setdefault('desc', '')
        article.setdefault('source', 'unknown')
        article.setdefault('review', '')
        article.setdefault('level', source.get('level', 3))
        article.setdefault('lang', source.get('lang', 'CN'))
        article.setdefault('ts_publish', '1970-01-01 08:00:01')
        article['desc'] = re.sub(
            r'<script[\s\S]*?</script>|<style[\s\S]*?</style>', '',
            article['desc']).strip()
        article['title'] = article['title'].strip()
        # MySQL rejects '0000-00-00 00:00:00' as an invalid datetime;
        # also try to normalize broken publish times here.
        if ttime(ptime(article['ts_publish'])) == '1970-01-01 08:00:00':
            article['ts_publish'] = '1970-01-01 08:00:01'
        if not article.get('ts_create'):
            # Published today: use the current time as the crawl time.
            # If the publish time is missing, also use the current time.
            if article['ts_publish'] >= today_0_0 or article[
                    'ts_publish'] == '1970-01-01 08:00:01':
                article['ts_create'] = now
            else:
                # Not published today: use the publish time as the crawl time.
                article['ts_create'] = article['ts_publish']
        valid_articles.append(article)
    return valid_articles

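# Minimal sketch (not part of the original module; _demo_clean_desc is a
# hypothetical name): the <script>/<style> stripping that ensure_articles
# applies to `desc` before saving, shown on a concrete input.
def _demo_clean_desc(desc: str) -> str:
    import re
    return re.sub(r'<script[\s\S]*?</script>|<style[\s\S]*?</style>', '',
                  desc).strip()


# _demo_clean_desc('intro <script>alert(1)</script> body') -> 'intro  body'
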
def get_ts_latest(cursor):
    cursor.execute('select max(ts_update) from articles')
    result = cursor.fetchone()[0]
    if result:
        return result
    else:
        return ttime(0)

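# Minimal sketch (not part of the original module; uses an in-memory sqlite3
# table as a stand-in for the real `articles` table, and time.strftime in
# place of ttime): the max(ts_update)-or-epoch fallback used by get_ts_latest.
def _demo_get_ts_latest():
    import sqlite3
    import time

    conn = sqlite3.connect(':memory:')
    cursor = conn.cursor()
    cursor.execute('create table articles (ts_update text)')
    cursor.execute('select max(ts_update) from articles')
    result = cursor.fetchone()[0]
    # With no rows, max() returns NULL, so fall back to the epoch,
    # mirroring the `return ttime(0)` branch above.
    latest = result or time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(0))
    conn.close()
    return latest
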
def main():
    # work()
    while 1:
        work()
        print(ttime())
        tick = 10
        for _ in range(interval // tick):
            print(_, '.', end='', flush=1)
            time.sleep(tick)
        print()

async def daily_python_list(req):
    """Python Daily list: a page faked per date, used for RSS subscription."""
    language = req.path_params['language'].lower()
    if language not in {'cn', 'en', 'any'}:
        return PlainTextResponse('language should be cn / en / any.')
    limit: int = int(req.query_params.get('limit') or 10)
    xml_data: dict = {
        'channel': {
            'title': 'Python Daily',
            'description': 'Python Daily Newspaper',
            'link':
                f'https://{ONLINE_HOST}/newspaper/daily.python.list.rss.{language}',
            'language': {
                'cn': 'zh-cn',
                'any': 'zh-cn'
            }.get(language, 'en'),
        },
        'items': []
    }
    for date_delta in range(1, limit):
        title_date: str = ttime(time.time() - 86400 * date_delta)[:10]
        # The previous day's digest is published at 00:00 of the current day.
        pubDate: str = ttime(ptime(
            ttime(time.time() - 86400 * (date_delta - 1))[:10],
            fmt='%Y-%m-%d'),
                             fmt='%a, %d %b %Y')
        link: str = f'https://{ONLINE_HOST}/newspaper/daily.python/{title_date}?lang={language}'
        item: dict = {
            'title': f'Python Daily [{title_date}]',
            'link': link,
            'guid': link,
            'pubDate': pubDate
        }
        xml_data['items'].append(item)
    xml: str = gen_rss(xml_data)
    return Response(xml, media_type='text/xml')

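# Minimal sketch (an assumption, not the project's real gen_rss helper): one
# way a function accepting daily_python_list's input shape
# ({'channel': {...}, 'items': [...]}) could serialize it to RSS 2.0 XML
# with only the standard library.
def _demo_gen_rss(xml_data: dict) -> str:
    from xml.etree import ElementTree as ET

    rss = ET.Element('rss', version='2.0')
    channel = ET.SubElement(rss, 'channel')
    # Channel metadata: title, description, link, language.
    for key, value in xml_data['channel'].items():
        ET.SubElement(channel, key).text = value
    # One <item> per entry, with title / link / guid / pubDate children.
    for item in xml_data['items']:
        node = ET.SubElement(channel, 'item')
        for key, value in item.items():
            ET.SubElement(node, key).text = value
    return ET.tostring(rss, encoding='unicode')
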
async def log(request: Request,
              max_lines: int = 50,
              refresh_every: int = 0,
              log_names: str = 'info-server-error'):
    window: deque = deque((), max_lines)
    names: list = log_names.split('-')
    items = []
    for name in names:
        file_name = f'{name}.log'
        fp: Path = Config.CONFIG_DIR / file_name
        if not fp.is_file():
            continue
        fp_stat = fp.stat()
        file_size = format_size(fp_stat.st_size)
        st_mtime = ttime(fp_stat.st_mtime)
        line_no = 0
        async with aiofiles.open(fp, encoding=Config.ENCODING) as f:
            async for line in f:
                line_no += 1
                window.append(line)
        item = {
            'name': name,
            'line_no': line_no,
            'file_size': file_size,
            'st_mtime': st_mtime,
            'log_text': "".join(window),
            'file_size_mb': Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
                'file_size_mb', '-1'),
        }
        items.append(item)
        window.clear()
    context = {
        'request': request,
        'items': items,
        'log_names': log_names,
        'refresh_every': refresh_every,
        'max_lines': max_lines,
    }
    return templates.TemplateResponse("logs.html", context=context)

async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
    """task_name means force crawl"""
    db: Database = Config.db
    now = datetime.now()
    logger = Config.logger
    logger.info(f'crawl_once task_name={task_name} start.')
    # sqlite does not have datediff...
    if task_name:
        query = tasks.select().where(tasks.c.name == task_name)
    else:
        query = tasks.select().where(tasks.c.enable == 1).where(
            tasks.c.next_check_time <= now)
    query = query.limit(chunk_size)
    todo = []
    now = datetime.now()
    update_values = []
    CLEAR_CACHE_NEEDED = False
    fetched_tasks = await db.fetch_all(query=query)
    has_more = len(fetched_tasks) >= chunk_size
    for _task in fetched_tasks:
        task = Task(**dict(_task))
        # check work hours
        need_crawl, next_check_time = find_next_check_time(task, now)
        if task_name:
            # always crawl for the given task_name
            need_crawl = True
        if need_crawl:
            t = ensure_future(crawl(task))
            # add task_name for the logger
            setattr(t, 'task_name', task.name)
            todo.append(t)
        # update next_check_time
        values = {
            'last_check_time': now,
            'next_check_time': next_check_time,
            'task_id': task.task_id
        }
        # update the task variable for the callback
        task.__dict__.update(values)
        update_values.append(values)
        if not need_crawl:
            logger.info(
                f'Task [{task.name}] is not on work time, next_check_time reset to {next_check_time}'
            )
    update_query = 'update tasks set `last_check_time`=:last_check_time,`next_check_time`=:next_check_time where task_id=:task_id'
    await db.execute_many(query=update_query, values=update_values)
    if update_values:
        CLEAR_CACHE_NEEDED = True
    logger.info(f'crawl_once crawling {len(todo)} valid tasks.')
    if todo:
        done, pending = await wait(todo, timeout=Config.default_crawler_timeout)
        if pending:
            names = [getattr(t, 'task_name', None) for t in pending]
            logger.error(f'crawl timeout {len(names)}: {names}')
        ttime_now = ttime()
        changed_tasks = []
        update_counts = 0
        crawl_errors = []
        for t in done:
            task, error, result_list = t.result()
            if error != task.error:
                crawl_errors.append({'task_id': task.task_id, 'error': error})
            if error or result_list is None:
                # skip updating this task
                continue
            # compare latest_result and the new list;
            # newest first, just like the saved result_list ordering
            old_latest_result = loads(task.latest_result)
            # try to use the key, or the result itself
            old_latest_result_key = get_result_key(old_latest_result)
            try:
                old_result_list = loads(
                    task.result_list) if task.result_list else []
            except JSONDecodeError:
                old_result_list = []
            if old_latest_result.get('unique'):
                # unique mode skips all duplicated results
                exist_keys = {
                    get_result_key(_old_result['result'])
                    for _old_result in old_result_list
                }
            else:
                exist_keys = {old_latest_result_key}
            # list of dict
            to_insert_result_list = []
            for result in result_list:
                result_key = get_result_key(result)
                if result_key in exist_keys:
                    break
                to_insert_result_list.append(result)
            if to_insert_result_list:
                # update db
                update_counts += 1
                # new result updated
                query = UpdateTaskQuery(task.task_id)
                # JSON
                new_latest_result = dumps(to_insert_result_list[0],
                                          sort_keys=True)
                query.add('latest_result', new_latest_result)
                query.add('last_change_time', now)
                # insert older results first, keeping the newer ones on top
                new_seeds = []
                for result in to_insert_result_list[::-1]:
                    # result is a dict, not a json string
                    old_result_list.insert(0, {
                        'result': result,
                        'time': ttime_now
                    })
                    new_seeds.append(result)
                await save_feed(new_seeds, db, task)
                new_result_list = dumps(
                    old_result_list[:task.max_result_count])
                query.add('result_list', new_result_list)
                logger.info(f'[Updated] {task.name}. +++')
                await db.execute(**query.kwargs)
                task.latest_result = new_latest_result
                task.last_change_time = now
                task.result_list = new_result_list
                changed_tasks.append(task)
        if crawl_errors:
            update_query = 'update tasks set `error`=:error where task_id=:task_id'
            await db.execute_many(query=update_query, values=crawl_errors)
        logger.info(
            f'Crawl task_name={task_name} finished. Crawled: {len(done)}, Error: {len(crawl_errors)}, Timeout: {len(pending)}, Update: {update_counts}.{" +++" if update_counts else ""}'
        )
        for task in changed_tasks:
            ensure_future(try_catch(Config.callback_handler.callback, task))
        query_feeds.cache_clear()
    else:
        logger.info(f'Crawl task_name={task_name} finished. 0 todo.')
    if CLEAR_CACHE_NEEDED:
        logger.info('Clear cache for crawling new tasks.')
        query_tasks.cache_clear()
    if task_name:
        query = tasks.select().where(tasks.c.name == task_name)
        _task = await db.fetch_one(query=query)
        return dict(_task)
    else:
        return has_more

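# Minimal sketch (not part of the original module; _demo_new_results and the
# `key` parameter are hypothetical stand-ins for get_result_key): the core of
# _crawl_once's diffing step -- walk the freshly crawled results, newest
# first, and stop at the first key that was already saved, so only results
# newer than the last stored one get inserted.
def _demo_new_results(result_list: list, exist_keys: set, key=str) -> list:
    to_insert = []
    for result in result_list:
        if key(result) in exist_keys:
            break
        to_insert.append(result)
    return to_insert


# _demo_new_results(['c', 'b', 'a'], {'b'}) -> ['c']
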
def fetch_detail(item):
    if not item['room_id'].isdigit():
        return item
    if item['room_id'] in ss.rooms:
        exist_item = ss.rooms[item['room_id']]
        if 'tags' in exist_item and 'string' not in exist_item and '√' in exist_item.get(
                'status', '') and 'release' not in exist_item.get(
                    'status', ''):
            # Tags already fetched, the deprecated "string" field has been
            # cleaned up, the room is signable and not pending release.
            item.update(exist_item)
            return item
        else:
            item['time'] = exist_item.get('time') or ttime()
    print(cc.x, '/', total_rooms_count, '采集房间', item.get('title', ''),
          item['url'], flush=1)
    scode = ''
    for _ in range(5):
        if 'Z_name' in scode:
            break
        r = req.get(item['url'], **kwargs)
        scode = r.text
    else:
        print(scode)
        print('程序崩溃, fetch_detail 重试次数过多', item['url'])
        raise RequestErrorForRetry()
    html = BeautifulSoup(scode, features='html.parser')
    item['title'] = html.select_one('h1.Z_name').text.replace('自如友家·', '')
    neighbors = html.select('#meetinfo ul.rent_list>li')
    item['rooms'] = len(neighbors) + 1
    genders = {'女', '男'}
    other_rooms = ''
    for n in neighbors:
        gender = n.select_one('.info>.mt10>span').text.strip()
        if gender in genders:
            other_rooms += gender
        else:
            other_rooms += '空'
    item['other_rooms'] = other_rooms
    duration = '未知时长'
    for i in html.select('#live-tempbox .jiance>li'):
        if '签约时长' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                duration = tag.text
            break
    # air quality check
    air = '无空气检测结果'
    for i in html.select('#areacheck .jiance>li'):
        if '检测日期' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                air = '检测日期: %s' % tag.text
            break
        elif '空置时长' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                air = '空置时长: %s' % tag.text
            break
    ok = '-'
    if not ('√' in item['status'] or 'X' in item['status']):
        if html.select_one('[class="Z_prelook active"]'):
            ok = '√'
        else:
            ok = 'X'
    if not ('检测日期' in item['status'] or '空置时长' in item['status']):
        item[
            'status'] = f'{"√" if ok else "X"}: {item["status"]}({duration}|{air})'
    item['target'] = html.select_one(
        '.Z_home_info>.Z_home_b>dl:nth-of-type(2)>dd').text
    tags = [i.text for i in html.select('.Z_tags>.tag')]
    tags = ", ".join(tags)
    item['tags'] = tags or '-'
    item['girls'] = item['other_rooms'].count('女')
    item['score'] = get_score(item)
    item['price'] = '-'
    item['time'] = item.get('time') or ttime()
    print(get_string(item), flush=1)
    return item

def work():
    with open('list_urls.txt', 'r', encoding='u8') as f:
        list_urls = f.read()
    list_urls = [i.strip() for i in list_urls.splitlines()]
    list_urls = set([i for i in list_urls if i])
    try:
        with open('ziru_old_metas_dict.txt') as f:
            old_metas_dict = json.load(f)
    except FileNotFoundError:
        old_metas_dict = {}
    detail_metas = fetch_list(list_urls) + list(old_metas_dict.values())
    # print(fetch_list(list_urls))
    # return
    detail_metas_unique = {}
    for meta in detail_metas:
        rid = meta['rid']
        if rid not in detail_metas_unique:
            detail_metas_unique[rid] = meta
        else:
            if int(meta['distance']) < int(
                    detail_metas_unique[rid]['distance']):
                detail_metas_unique[rid] = meta
    detail_metas = list(detail_metas_unique.values())
    # print(len(details), 'rooms')
    metas = fetch_detail(detail_metas)
    now = ttime()
    for meta in metas:
        score = 0
        score -= int(meta['female']) * 0.5
        if int(meta['floor']) <= 6:
            score += 0
        elif int(meta['floor']) < 12:
            score += 0.5
        else:
            score += 1
        score += 3 - int(meta['rooms'])
        score += (float(meta['area']) - 10) * 0.1
        distance = int(meta['distance'])
        price = int(meta['price'])
        score += round((2500 - price) / 100) * 0.2
        if re.search('0[34567]卧', meta['name']):
            score -= 0.5
        if meta['floor'] == meta['max_floor']:
            score -= 1
        if distance < 500:
            score += 1
        elif distance < 1000:
            score += 0.5
        elif distance > 1500:
            score -= 0.5
        meta['score'] = round(score, 2)
        if meta['rid'] in old_metas_dict:
            meta['create_time'] = old_metas_dict[meta['rid']].get(
                'create_time') or now
        else:
            meta['create_time'] = now
    metas.sort(key=lambda x: x['score'], reverse=1)
    keys = 'rid name subway station distance price area rooms floor max_floor orient neighbor female score create_time button url'.split()
    has_new = 0
    with open('ziru_now.txt', 'w', encoding='u8') as f:
        with open('ziru_new.txt', 'a', encoding='u8') as ff:
            print(*keys, sep='\t', file=f)
            for i in metas:
                print(*[re.sub(r'\s+', ' ', str(i[key])) for key in keys],
                      sep='\t',
                      file=f)
                if i['create_time'] == now:
                    print(*[re.sub(r'\s+', ' ', str(i[key])) for key in keys],
                          sep='\t',
                          file=ff)
                    print('new!')
                    has_new = 1
    # save
    metas_dict = {i['rid']: i for i in metas}
    with open('ziru_old_metas_dict.txt', 'w') as f:
        json.dump(metas_dict, f)
    if has_new:
        alarm()

async def query_articles(
        self,
        query: str = None,
        start_time: str = "",
        end_time: str = "",
        source: str = "",
        order_by: str = 'ts_create',
        sorting: str = 'desc',
        limit: int = 30,
        offset: int = 0,
        date: str = '',
        lang: str = 'ANY',
) -> dict:
    args: list = []
    where_list: list = []
    result: dict = {}
    source = str(source)
    order_by = order_by.strip(' `')
    limit = min((self.max_limit, int(limit)))
    offset = int(offset)
    lang = str(lang).upper()
    extra_select_words: str = ''
    if query:
        # A search query is present: also select the relevance column to
        # make sorting easier.
        extra_select_words = ', MATCH (`title`, `desc`, `url`) AGAINST (%s IN BOOLEAN MODE) as relevance'
        args.append(query)
        where_list.append(
            'MATCH (`title`, `desc`, `url`) AGAINST (%s in BOOLEAN MODE)')
        args.append(query)
    # Sanitize order_by and sorting before building the ORDER BY clause,
    # since they are interpolated directly into the SQL string.
    if sorting.lower() not in ('desc', 'asc'):
        sorting = 'desc'
    if order_by not in self.articles_table_columns and order_by != 'relevance':
        order_by = 'ts_create'
    order_by_sorting = f'order by {order_by} {sorting}'
    if date:
        if date == 'today':
            date = ttime()[:10]
        elif date == 'yesterday':
            date = ttime(time.time() - 86400)[:10]
        # Convert date into a start/end time range, overriding
        # start_time and end_time.
        date = str(date)
        if not re.match(r'\d\d\d\d-\d\d-\d\d', date):
            raise ValueError(f'日期参数的格式不对 {date}, 例: 2019-05-14')
        start_time = f'{date} 00:00:00'
        end_time = f'{date} 23:59:59'
        limit = 9999
    if start_time:
        where_list.append("`ts_publish` >= %s")
        args.append(start_time)
        result['start_time'] = start_time
    if end_time:
        where_list.append("`ts_publish` <= %s")
        args.append(end_time)
        result['end_time'] = end_time
    if source:
        where_list.append("`source` = %s")
        args.append(source)
        result['source'] = source
    if lang in {'CN', 'EN'}:
        where_list.append("`lang` = %s")
        args.append(lang)
    else:
        lang = 'ANY'
    result['order_by'] = order_by
    result['query'] = query or ''
    result['sorting'] = sorting
    result['limit'] = limit
    result['offset'] = offset
    result['date'] = date
    args.extend([limit + 1, offset])
    if where_list:
        where_string = 'where ' + ' and '.join(where_list)
    else:
        where_string = ''
    sql = f"SELECT *{extra_select_words} from articles {where_string} {order_by_sorting} limit %s offset %s"
    logger.info(f'fetching articles sql: {sql}, args: {args}')
    items = await self.execute(sql, args)
    result['has_more'] = 1 if len(items) > limit else 0
    articles = self.format_output_articles(items[:limit])
    result['articles'] = articles
    result['lang'] = lang
    return result

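# Minimal sketch (not part of the original class; _demo_compose_sql is a
# hypothetical name and only covers two of the filters): how query_articles
# composes its parameterized SQL -- each active filter appends one WHERE
# fragment plus its value, and limit/offset are always the trailing
# placeholders.
def _demo_compose_sql(start_time='', source='', limit=30, offset=0):
    args, where_list = [], []
    if start_time:
        where_list.append('`ts_publish` >= %s')
        args.append(start_time)
    if source:
        where_list.append('`source` = %s')
        args.append(source)
    where_string = 'where ' + ' and '.join(where_list) if where_list else ''
    # limit + 1 rows are fetched so the caller can set has_more
    # without issuing an extra COUNT query.
    args.extend([limit + 1, offset])
    sql = f'SELECT * from articles {where_string} order by ts_create desc limit %s offset %s'
    return sql, args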