Example #1
def ensure_articles(articles: typing.Sequence[dict]) -> list:
    valid_articles = []
    # ensure_keys = ("url_key", "title", "cover", "desc", "source",
    #                "review", "ts_publish", "lang")
    keys_set = None
    now = ttime()
    today_0_0 = f'{now[:10]} 00:00:00'
    for article in articles:
        if not isinstance(article, dict):
            continue
        if not keys_set:
            keys_set = set(article.keys())
        else:
            # if the keys differ from the first article's, executemany can not
            # be used, so skip this article
            if set(article.keys()) != keys_set:
                continue
        # these keys must all be present before the article can be stored
        source = content_sources_dict.get(article['source'])
        if not source:
            continue
        if not all(article.get(key) for key in ('url_key', 'title')):
            continue
        article.setdefault('cover', '')
        article.setdefault('desc', '')
        article.setdefault('source', 'unknown')
        article.setdefault('review', '')
        article.setdefault('level', source.get('level', 3))
        article.setdefault('lang', source.get('lang', 'CN'))
        article.setdefault('ts_publish', '1970-01-01 08:00:01')
        article['desc'] = re.sub(
            r'<script[\s\S]*?</script>|<style[\s\S]*?</style>', '',
            article['desc']).strip()
        article['title'] = article['title'].strip()
        # MySQL rejects 0000-00-00 00:00:00 as malformed; also try to convert
        # away any invalid publish time
        if ttime(ptime(article['ts_publish'])) == '1970-01-01 08:00:00':
            article['ts_publish'] = '1970-01-01 08:00:01'
        if not article.get('ts_create'):
            # published today: use the current time as the crawl time
            # missing publish time: also use the current time as the crawl time
            if article['ts_publish'] >= today_0_0 or article[
                    'ts_publish'] == '1970-01-01 08:00:01':
                article['ts_create'] = now
            else:
                # not published today: use the publish time as the crawl time
                article['ts_create'] = article['ts_publish']
        valid_articles.append(article)
    return valid_articles
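
A minimal usage sketch for Example #1, assuming ttime/ptime are the timestamp helpers used throughout these snippets and that content_sources_dict is a module-level mapping from source name to its config (both defined elsewhere):

sample_articles = [
    # hypothetical input rows; only sources known to content_sources_dict survive
    {'url_key': 'a1b2c3', 'title': ' Async IO in Python ',
     'desc': '<script>x()</script>intro', 'source': 'python-weekly',
     'ts_publish': '2020-03-15 09:00:00'},
    {'url_key': '', 'title': 'dropped: empty url_key / different key set',
     'source': 'python-weekly'},
    'dropped: not a dict',
]
rows = ensure_articles(sample_articles)
# every surviving dict now shares the same key set, so the list can be passed
# to cursor.executemany() with named placeholders in a single batch insert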
Example #2
def get_ts_latest(cursor):
    cursor.execute('select max(ts_update) from articles')
    result = cursor.fetchone()[0]
    if result:
        return result
    else:
        return ttime(0)
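
All of these snippets lean on ttime/ptime, presumably the timestamp helpers from torequests.utils: ttime formats a unix timestamp (defaulting to now) as '%Y-%m-%d %H:%M:%S' in UTC+8, and ptime parses such a string back into a timestamp. A rough sketch of the behavior the examples rely on, stated as an assumption rather than the library's actual source:

from torequests.utils import ptime, ttime

print(ttime(0))                      # '1970-01-01 08:00:00' (epoch shown in UTC+8)
print(ttime()[:10])                  # today's date, e.g. '2020-03-15'
print(ptime('1970-01-01 08:00:01'))  # 1, i.e. one second past the epoch
# Example #1 exploits this: an unparseable ts_publish round-trips to the epoch
# string, which is then replaced with '1970-01-01 08:00:01' to keep MySQL happy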
Example #3
def main():
    # work()
    while 1:
        work()
        print(ttime())
        tick = 10
        for _ in range(interval // tick):
            print(_, '.', end='', flush=1)
            time.sleep(tick)
        print()
Example #4
async def daily_python_list(req):
    """Python 日报列表, 其实就是个按照日期伪造的页面, 用来订阅 rss"""
    language = req.path_params['language'].lower()
    if language not in {'cn', 'en', 'any'}:
        return PlainTextResponse('language should be cn / en / any.')
    limit: int = int(req.query_params.get('limit') or 10)
    xml_data: dict = {
        'channel': {
            'title': 'Python Daily',
            'description': 'Python Daily Newspaper',
            'link':
            f'https://{ONLINE_HOST}/newspaper/daily.python.list.rss.{language}',
            'language': {
                'cn': 'zh-cn',
                'any': 'zh-cn'
            }.get(language, 'en'),
        },
        'items': []
    }
    for date_delta in range(1, limit):
        title_date: str = ttime(time.time() - 86400 * date_delta)[:10]
        # the previous day's results are published at 00:00 of the current day
        pubDate: str = ttime(ptime(ttime(time.time() - 86400 *
                                         (date_delta - 1))[:10],
                                   fmt='%Y-%m-%d'),
                             fmt='%a, %d %b %Y')
        link: str = f'https://{ONLINE_HOST}/newspaper/daily.python/{title_date}?lang={language}'
        item: dict = {
            'title': f'Python Daily [{title_date}]',
            'link': link,
            'guid': link,
            'pubDate': pubDate
        }
        xml_data['items'].append(item)
    xml: str = gen_rss(xml_data)
    return Response(xml, media_type='text/xml')
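
A worked trace of the pubDate expression above, assuming ttime/ptime behave as sketched after Example #2 (dates are illustrative):

# suppose time.time() falls on 2020-03-16 (UTC+8) and date_delta == 1
day_str = ttime(time.time() - 86400 * (1 - 1))[:10]  # '2020-03-16'
midnight = ptime(day_str, fmt='%Y-%m-%d')             # timestamp of 2020-03-16 00:00:00
pubDate = ttime(midnight, fmt='%a, %d %b %Y')         # 'Mon, 16 Mar 2020'
# so the item titled with title_date '2020-03-15' is declared published at
# 00:00 the following day, matching the comment in the loop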
Example #5
async def log(request: Request,
              max_lines: int = 50,
              refresh_every: int = 0,
              log_names: str = 'info-server-error'):
    window: deque = deque((), max_lines)
    names: list = log_names.split('-')
    items = []
    for name in names:
        file_name = f'{name}.log'
        fp: Path = Config.CONFIG_DIR / file_name
        if not fp.is_file():
            continue
        fp_stat = fp.stat()
        file_size = format_size(fp_stat.st_size)
        st_mtime = ttime(fp_stat.st_mtime)
        line_no = 0
        async with aiofiles.open(fp, encoding=Config.ENCODING) as f:
            async for line in f:
                line_no += 1
                window.append(line)
        item = {
            'name': name,
            'line_no': line_no,
            'file_size': file_size,
            'st_mtime': st_mtime,
            'log_text': "".join(window),
            'file_size_mb': Config.LOGGING_FILE_CONFIG.get(file_name, {}).get(
                'file_size_mb', '-1'),
        }
        items.append(item)
        window.clear()
    context = {
        'request': request,
        'items': items,
        'log_names': log_names,
        'refresh_every': refresh_every,
        'max_lines': max_lines,
    }
    return templates.TemplateResponse("logs.html", context=context)
Example #6
async def _crawl_once(task_name: Optional[str] = None, chunk_size: int = 20):
    """task_name means force crawl"""
    db: Database = Config.db
    now = datetime.now()
    logger = Config.logger
    logger.info(f'crawl_once task_name={task_name} start.')
    # sqlite does not have datediff...
    if task_name:
        query = tasks.select().where(tasks.c.name == task_name)
    else:
        query = tasks.select().where(tasks.c.enable == 1).where(
            tasks.c.next_check_time <= now)
        query = query.limit(chunk_size)
    todo = []
    now = datetime.now()
    update_values = []
    CLEAR_CACHE_NEEDED = False
    fetched_tasks = await db.fetch_all(query=query)
    has_more = len(fetched_tasks) >= chunk_size
    for _task in fetched_tasks:
        task = Task(**dict(_task))
        # check work hours
        need_crawl, next_check_time = find_next_check_time(task, now)
        if task_name:
            # always crawl for given task_name
            need_crawl = True
        if need_crawl:
            t = ensure_future(crawl(task))
            # add task_name for logger
            setattr(t, 'task_name', task.name)
            todo.append(t)
        # update next_check_time
        values = {
            'last_check_time': now,
            'next_check_time': next_check_time,
            'task_id': task.task_id
        }
        # update task variable for callback
        task.__dict__.update(values)
        update_values.append(values)
        if not need_crawl:
            logger.info(
                f'Task [{task.name}] is not on work time, next_check_time reset to {next_check_time}'
            )
    update_query = 'update tasks set `last_check_time`=:last_check_time,`next_check_time`=:next_check_time where task_id=:task_id'
    await db.execute_many(query=update_query, values=update_values)
    if update_values:
        CLEAR_CACHE_NEEDED = True
    logger.info(f'crawl_once crawling {len(todo)} valid tasks.')
    if todo:
        done, pending = await wait(todo,
                                   timeout=Config.default_crawler_timeout)
        if pending:
            names = [getattr(t, 'task_name', None) for t in pending]
            logger.error(f'crawl timeout {len(names)}: {names}')
        ttime_now = ttime()
        changed_tasks = []
        update_counts = 0
        crawl_errors = []
        for t in done:
            task, error, result_list = t.result()
            if error != task.error:
                crawl_errors.append({'task_id': task.task_id, 'error': error})
            if error or result_list is None:
                # skip updating this task
                continue
            # compare latest_result with the new list
            # newest first, matching the order used by the saved result_list
            old_latest_result = loads(task.latest_result)
            # try to use the key, or fall back to the result itself
            old_latest_result_key = get_result_key(old_latest_result)
            try:
                old_result_list = loads(
                    task.result_list) if task.result_list else []
            except JSONDecodeError:
                old_result_list = []
            if old_latest_result.get('unique'):
                # unique mode skips all duplicated results
                exist_keys = {
                    get_result_key(_old_result['result'])
                    for _old_result in old_result_list
                }
            else:
                exist_keys = {old_latest_result_key}
            # list of dict
            to_insert_result_list = []
            for result in result_list:
                result_key = get_result_key(result)
                if result_key in exist_keys:
                    break
                to_insert_result_list.append(result)
            if to_insert_result_list:
                # update db
                update_counts += 1
                # new result updated
                query = UpdateTaskQuery(task.task_id)
                # JSON
                new_latest_result = dumps(to_insert_result_list[0],
                                          sort_keys=True)
                query.add('latest_result', new_latest_result)
                query.add('last_change_time', now)
                # insert older results first so the newest stays on top
                new_seeds = []
                for result in to_insert_result_list[::-1]:
                    # result is dict, not json string
                    old_result_list.insert(0, {
                        'result': result,
                        'time': ttime_now
                    })
                    new_seeds.append(result)
                await save_feed(new_seeds, db, task)
                new_result_list = dumps(
                    old_result_list[:task.max_result_count])
                query.add('result_list', new_result_list)
                logger.info(f'[Updated] {task.name}. +++')
                await db.execute(**query.kwargs)
                task.latest_result = new_latest_result
                task.last_change_time = now
                task.result_list = new_result_list
                changed_tasks.append(task)
        if crawl_errors:
            update_query = 'update tasks set `error`=:error where task_id=:task_id'
            await db.execute_many(query=update_query, values=crawl_errors)
        logger.info(
            f'Crawl task_name={task_name} finished. Crawled: {len(done)}, Error: {len(crawl_errors)}, Timeout: {len(pending)}, Update: {update_counts}.{" +++" if update_counts else ""}'
        )
        for task in changed_tasks:
            ensure_future(try_catch(Config.callback_handler.callback, task))
        query_feeds.cache_clear()
    else:
        logger.info(f'Crawl task_name={task_name} finished. 0 todo.')
    if CLEAR_CACHE_NEEDED:
        logger.info('Clear cache for crawling new tasks.')
        query_tasks.cache_clear()
    if task_name:
        query = tasks.select().where(tasks.c.name == task_name)
        _task = await db.fetch_one(query=query)
        return dict(_task)
    else:
        return has_more
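
The dedup step above hinges on get_result_key, which is not shown here. Going only by the inline comment ("try to use the key, or fall back to the result itself"), a plausible minimal sketch follows; this is an assumption for illustration, not the project's actual implementation:

from json import dumps

def get_result_key(result):
    # assumed behavior: prefer an explicit unique key if the result carries one,
    # otherwise fall back to a stable serialized form of the whole result
    if isinstance(result, dict) and result.get('key'):
        return result['key']
    return dumps(result, sort_keys=True) if isinstance(result, dict) else result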
Example #7
def fetch_detail(item):
    if not item['room_id'].isdigit():
        return item
    if item['room_id'] in ss.rooms:
        exist_item = ss.rooms[item['room_id']]
        if 'tags' in exist_item and 'string' not in exist_item and '√' in exist_item.get(
                'status', '') and 'release' not in exist_item.get(
                    'status', ''):
            # tags already fetched, the obsolete 'string' field has been cleaned
            # up, the room is signable and not pending release
            item.update(exist_item)
            return item
        else:
            item['time'] = exist_item.get('time') or ttime()
    print(cc.x,
          '/',
          total_rooms_count,
          'fetching room',
          item.get('title', ''),
          item['url'],
          flush=1)
    scode = ''
    for _ in range(5):
        r = req.get(item['url'], **kwargs)
        scode = r.text
        if 'Z_name' in scode:
            break
    else:
        print(scode)
        print('Fatal: too many retries in fetch_detail', item['url'])
        raise RequestErrorForRetry()

    html = BeautifulSoup(scode, features='html.parser')
    item['title'] = html.select_one('h1.Z_name').text.replace('自如友家·', '')
    neighbors = html.select('#meetinfo ul.rent_list>li')
    item['rooms'] = len(neighbors) + 1
    genders = {'女', '男'}
    other_rooms = ''
    for n in neighbors:
        gender = n.select_one('.info>.mt10>span').text.strip()
        if gender in genders:
            other_rooms += gender
        else:
            other_rooms += '空'
    item['other_rooms'] = other_rooms
    duration = '未知时长'
    for i in html.select('#live-tempbox .jiance>li'):
        if '签约时长' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                duration = tag.text
            break
    # air quality inspection
    air = '无空气检测结果'
    for i in html.select('#areacheck .jiance>li'):
        if '检测日期' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                air = '检测日期: %s' % tag.text
            break
        elif '空置时长' in i.text:
            tag = i.select_one('.info_value')
            if tag:
                air = '空置时长: %s' % tag.text
            break
    ok = '-'
    if not ('√' in item['status'] or 'X' in item['status']):
        if html.select_one('[class="Z_prelook active"]'):
            ok = '√'
        else:
            ok = 'X'
    if not ('检测日期' in item['status'] or '空置时长' in item['status']):
        item['status'] = f'{ok}: {item["status"]}({duration}|{air})'
    item['target'] = html.select_one(
        '.Z_home_info>.Z_home_b>dl:nth-of-type(2)>dd').text
    tags = [i.text for i in html.select('.Z_tags>.tag')]
    tags = ", ".join(tags)
    item['tags'] = tags or '-'
    item['girls'] = item['other_rooms'].count('女')
    item['score'] = get_score(item)
    item['price'] = '-'
    item['time'] = item.get('time') or ttime()
    print(get_string(item), flush=1)
    return item
Example #8
def work():

    with open('list_urls.txt', 'r', encoding='u8') as f:
        list_urls = f.read()
        list_urls = [i.strip() for i in list_urls.splitlines()]
        list_urls = set([i for i in list_urls if i])
    try:
        with open('ziru_old_metas_dict.txt') as f:
            old_metas_dict = json.load(f)
    except FileNotFoundError:
        old_metas_dict = {}

    detail_metas = fetch_list(list_urls) + list(old_metas_dict.values())
    # print(fetch_list(list_urls))
    # return
    detail_metas_unique = {}
    for meta in detail_metas:
        rid = meta['rid']
        if rid not in detail_metas_unique:
            detail_metas_unique[rid] = meta
        else:
            if int(meta['distance']) < int(
                    detail_metas_unique[rid]['distance']):
                detail_metas_unique[rid] = meta
    detail_metas = list(detail_metas_unique.values())
    # print(len(details), 'rooms')
    metas = fetch_detail(detail_metas)
    now = ttime()
    for meta in metas:
        score = 0
        score -= int(meta['female']) * 0.5
        if int(meta['floor']) <= 6:
            score += 0
        elif int(meta['floor']) < 12:
            score += 0.5
        else:
            score += 1
        score += 3 - int(meta['rooms'])
        score += (float(meta['area']) - 10) * 0.1
        distance = int(meta['distance'])
        price = int(meta['price'])
        score += round((2500 - price) / 100) * 0.2
        if re.search('0[34567]卧', meta['name']):
            score -= 0.5
        if meta['floor'] == meta['max_floor']:
            score -= 1
        if distance < 500:
            score += 1
        elif distance < 1000:
            score += 0.5
        elif distance > 1500:
            score -= 0.5
        meta['score'] = round(score, 2)
        if meta['rid'] in old_metas_dict:
            meta['create_time'] = old_metas_dict[meta['rid']].get(
                'create_time') or now
        else:
            meta['create_time'] = now

    metas.sort(key=lambda x: x['score'], reverse=1)
    keys = 'rid name subway station distance price area rooms floor max_floor orient neighbor female score create_time button url'.split(
    )
    has_new = 0
    with open('ziru_now.txt', 'w', encoding='u8') as f:
        with open('ziru_new.txt', 'a', encoding='u8') as ff:
            print(*keys, sep='\t', file=f)
            for i in metas:
                print(*[re.sub(r'\s+', ' ', str(i[key])) for key in keys],
                      sep='\t',
                      file=f)
                if i['create_time'] == now:
                    print(*[re.sub(r'\s+', ' ', str(i[key])) for key in keys],
                          sep='\t',
                          file=ff)
                    print('new!')
                    has_new = 1

    # save
    metas_dict = {i['rid']: i for i in metas}
    with open('ziru_old_metas_dict.txt', 'w') as f:
        json.dump(metas_dict, f)
    if has_new:
        alarm()
Example #9
    async def query_articles(
        self,
        query: str = None,
        start_time: str = "",
        end_time: str = "",
        source: str = "",
        order_by: str = 'ts_create',
        sorting: str = 'desc',
        limit: int = 30,
        offset: int = 0,
        date: str = '',
        lang: str = 'ANY',
    ) -> dict:
        args: list = []
        where_list: list = []
        result: dict = {}
        source = str(source)
        order_by = order_by.strip(' `')
        limit = min((self.max_limit, int(limit)))
        offset = int(offset)
        lang = str(lang).upper()
        extra_select_words: str = ''

        if query:
            # with a search term, also select the relevance column to make ordering easier
            extra_select_words = ', MATCH (`title`, `desc`, `url`) AGAINST (%s IN BOOLEAN MODE)  as relevance'
            args.append(query)
            where_list.append(
                'MATCH (`title`, `desc`, `url`) AGAINST (%s in BOOLEAN MODE)')
            args.append(query)
        if order_by not in self.articles_table_columns and order_by != 'relevance':
            order_by = 'ts_create'
        if sorting.lower() not in ('desc', 'asc'):
            sorting = 'desc'
        order_by_sorting = f'order by {order_by} {sorting}'
        if date:
            if date == 'today':
                date = ttime()[:10]
            elif date == 'yesterday':
                date = ttime(time.time() - 86400)[:10]
            # convert date into a start/end time range and override them
            date = str(date)
            if not re.match('\\d\\d\\d\\d-\\d\\d-\\d\\d', date):
                raise ValueError(f'bad date format {date}, expected e.g. 2019-05-14')
            start_time = f'{date} 00:00:00'
            end_time = f'{date} 23:59:59'
            limit = 9999
        if start_time:
            where_list.append("`ts_publish` >= %s")
            args.append(start_time)
            result['start_time'] = start_time
        if end_time:
            where_list.append("`ts_publish` <= %s")
            args.append(end_time)
            result['end_time'] = end_time
        if source:
            where_list.append("`source` = %s")
            args.append(source)
            result['source'] = source

        if lang in {'CN', 'EN'}:
            where_list.append("`lang` = %s")
            args.append(lang)
        else:
            lang = 'ANY'

        result['order_by'] = order_by
        result['query'] = query or ''
        result['sorting'] = sorting
        result['limit'] = limit
        result['offset'] = offset
        result['date'] = date
        args.extend([limit + 1, offset])
        if where_list:
            where_string = 'where ' + ' and '.join(where_list)
        else:
            where_string = ''
        sql = f"SELECT *{extra_select_words} from articles {where_string} {order_by_sorting} limit %s offset %s"
        logger.info(f'fetching articles sql: {sql}, args: {args}')
        items = await self.execute(sql, args)
        result['has_more'] = 1 if len(items) > limit else 0
        articles = self.format_output_articles(items[:limit])
        result['articles'] = articles
        result['lang'] = lang
        return result
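
A hypothetical call site for query_articles, assuming the method lives on a DAO-style object (here called dao) with max_limit, articles_table_columns, execute and format_output_articles already wired up:

# hypothetical usage inside an async view
result = await dao.query_articles(query='asyncio', lang='CN',
                                   order_by='ts_publish', limit=20)
for article in result['articles']:
    print(article['ts_publish'], article['title'])
if result['has_more']:
    # fetch the next page by bumping offset
    page2 = await dao.query_articles(query='asyncio', lang='CN',
                                      order_by='ts_publish', limit=20, offset=20)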