Beispiel #1
0
def fetch_all_pages(city_id, threads_num=10):
    """Fetch detail pages for every un-fetched community of a city.

    Communities with ``page_fetched_at IS NULL`` are loaded, queued, and
    consumed by ``threads_num`` worker threads running :func:`do_fetch`.
    Blocks until the queue reports all items done.

    :param city_id: id of the target city
    :param threads_num: number of concurrent worker threads
    """
    db_session = Session()
    try:
        all_communities = db_session.query(Community).filter(
            Community.city_id == city_id,
            Community.page_fetched_at == None  # noqa: E711 — SQLAlchemy IS NULL
        ).all()
    finally:
        # release the connection even if the query raises
        db_session.close()

    communities_queue = Queue()
    for a_community in all_communities:
        communities_queue.put(a_community)

    _counts['total'] = len(all_communities)

    logging.info(f'city_id={city_id}, 待抓取={_counts["total"]}')
    logging.info('抓取中...')

    for _ in range(threads_num):
        # daemon=True: a worker that ends up blocked in Queue.get() (possible
        # race between empty() and get() in do_fetch) can no longer prevent
        # interpreter exit after join() returns.
        worker = Thread(target=do_fetch, args=[communities_queue], daemon=True)
        worker.start()

    communities_queue.join()

    logging.info('已全部抓取完成.')
Beispiel #2
0
def update_communities(city_id, days=3):
    """
    获取/更新小区信息 — refresh community data for biz circles whose
    records are older than *days* days (or were never updated).

    :param city_id: id of the target city
    :param days: refresh biz circles not updated within this many days
                 (default 3, matching the previous hard-coded value)
    """
    deadline = datetime.now() - timedelta(days=days)
    logging.info('更新久于 {} 天的小区信息...'.format(days))

    db_session = Session()
    try:
        # NULL updated_at (never fetched) OR stale beyond the deadline
        biz_circles = db_session.query(BizCircle).filter(
            BizCircle.city_id == city_id,
            (BizCircle.communities_updated_at == None) |
            (BizCircle.communities_updated_at < deadline)
        ).all()

        total_count = len(biz_circles)
        logging.info('需更新总商圈数量: {}'.format(total_count))

        for i, biz_circle in enumerate(biz_circles):
            logging.info(
                '进度={}/{}, 商圈={}'.format(i + 1, total_count, biz_circle.name)
            )
            communities = get_communities_by_biz_circle(city_id, biz_circle.id)
            logging.info('小区数={}'.format(communities['count']))
            update_db(db_session, biz_circle, communities)
    finally:
        # previously leaked when a fetch/update step raised
        db_session.close()

    logging.info('小区信息更新完毕.')
def do_fetch(city: City, communities_queue: Queue):
    """Worker thread body: fetch community pages until the queue is drained.

    Each worker owns its own Session — sessions are not thread-safe:
    http://docs.sqlalchemy.org/en/latest/orm/session_basics.html#is-the-session-thread-safe
    """
    from queue import Empty  # local import keeps the module header untouched

    db_session = Session()

    while True:
        # get_nowait() instead of `while not empty(): get()`: with multiple
        # workers, another thread can consume the last item between empty()
        # and get(), leaving this worker blocked in get() forever and the
        # final task_done() never issued.
        try:
            a_community = communities_queue.get_nowait()
        except Empty:
            break

        try:
            fetch_page(city, a_community.id)
            a_community.page_fetched_at = datetime.now()
        except Exception as e:
            # best-effort: record the failure and move on to the next item
            _counts['failed'] += 1
            logging.error(
                f'# 抓取失败, community_id={a_community.id}, message="{e}"')

        else:
            db_session.add(a_community)
            db_session.commit()

        _counts['completed'] += 1

        # progress line every 10 items
        if _counts['completed'] % 10 == 0:
            count_remaining = _counts["total"] - _counts["completed"]
            logging.info(
                f'进度={_counts["completed"]}/{_counts["total"]}, 剩余={count_remaining}'
            )

        communities_queue.task_done()

    db_session.close()
Beispiel #4
0
def parse_all_communities(city_id):
    """Parse detail info for fetched-but-unparsed communities of a city.

    Selects communities whose page has been fetched
    (``page_fetched_at IS NOT NULL``) but whose ``detail`` is still empty,
    parses each, and commits in batches of 100 plus once for the final
    partial batch.

    :param city_id: id of the target city
    """
    db_session = Session()
    try:
        communities = db_session.query(Community).filter(
            Community.city_id == city_id, Community.detail == None,
            Community.page_fetched_at != None).all()

        total_count = len(communities)
        logging.info(f'city_id={city_id}, 待分析={total_count}')

        for i, a_community in enumerate(communities):
            detail = parse_community_detail(a_community.id)

            # keep the old value when parsing yields nothing
            if detail:
                a_community.detail = detail

            # batch commit every 100 items, and once more on the last item
            if (i + 1) % 100 == 0 or (i == total_count - 1):
                logging.info(f'进度={i + 1}/{total_count}, 剩余={total_count - i - 1}')
                db_session.commit()

        logging.info('已全部分析完成.')
    finally:
        # previously leaked when parse_community_detail raised
        db_session.close()
def fetch_all_pages(city_id, threads_num=10):
    """Fetch pages for every community of *city_id* that has no address yet.

    Looks the city up first and aborts with an error log when it is unknown.
    Work items are spread across ``threads_num`` worker threads running
    :func:`do_fetch`; blocks until the queue reports all items done.

    :param city_id: id of the target city
    :param threads_num: number of concurrent worker threads
    """
    db_session = Session()

    city = db_session.query(City).filter(
        City.id == city_id
    ).first()

    if not city:
        # previously returned without closing the session (connection leak)
        db_session.close()
        logging.error('请先获取目标城市信息后再进行抓取~')
        return

    all_communities = db_session.query(Community).filter(
        Community.city_id == city_id,
        Community.address == None  # noqa: E711 — SQLAlchemy IS NULL
    ).all()

    db_session.close()

    communities_queue = Queue()
    for a_community in all_communities:
        communities_queue.put(a_community)

    _counts['total'] = len(all_communities)

    logging.info(f'city_id={city.id}, city_name={city.name}, 待抓取={_counts["total"]}')
    logging.info('抓取中...')

    for _ in range(threads_num):
        # daemon=True: a worker stuck in Queue.get() cannot block interpreter exit
        worker = Thread(target=do_fetch, args=[city, communities_queue], daemon=True)
        worker.start()

    communities_queue.join()

    logging.info('已全部抓取完成.')
Beispiel #6
0
def update_city(city_id):
    """
    初始化/更新城市信息 — initialise or refresh the city, its districts
    and their biz circles for *city_id*, committing once at the end.
    """
    logging.info('初始化/更新城市信息... city_id={}'.format(city_id))

    city_info = get_city_info(city_id)
    city = City(city_info)

    db_session = Session()
    try:
        db_session.merge(city)

        for district_info in city_info['district']:
            district = District(city.id, district_info)
            logging.info(
                '城市={}, 区域={}, 商圈数={}'.format(
                    city.name, district.name, district.biz_circles_count
                )
            )
            DISTRICT_MAP[district.name] = district.id
            db_session.merge(district)

            for biz_circle_info in district_info['bizcircle']:
                biz_circle = db_session.query(BizCircle).filter(
                    BizCircle.id == int(biz_circle_info['bizcircle_id'])
                ).first()

                if biz_circle:
                    # record exists — may need its district_id list extended
                    if district.id not in biz_circle.district_id:
                        # rebind the whole list: in-place append()/+= does not
                        # mark the column dirty, so the UPDATE would be skipped
                        biz_circle.district_id = biz_circle.district_id + [
                            district.id
                        ]
                else:
                    biz_circle = BizCircle(city.id, district.id, biz_circle_info)
                    db_session.add(biz_circle)

        db_session.commit()
    finally:
        # previously leaked when a query or the commit raised
        db_session.close()

    logging.info('初始化/更新城市信息结束.')
Beispiel #7
0
import config
from util.orm import Session
from model.community import Community
import requests
import logging
from pyquery import PyQuery
import json

# NOTE(review): session created as a module-level side effect at import time;
# fetch_detail_page below takes its own `db` parameter, so this global looks
# unused in the visible code — confirm against the rest of the file before
# removing.
db_session = Session()


def fetch_detail_page(db, city_index):
    communities = db.query(Community).filter(
        Community.city_index == city_index,
        Community.detail_info == None
    ).all()
    for community in communities:
        logging.info(f'开始抓取{community.alias}小区详情页数据')
        url = f'http://m.58.com/xiaoqu/{community.listname}/'
        try:
            res = requests.get(url, headers=config.config.headers, timeout=5)
            res.raise_for_status()
        except Exception as e:
            logging.error(f' 错误信息: {e}')
            continue
        print(url)
        doc = PyQuery(res.content)
        keys = []
        values = []
        keys_values = doc('.xq-info .info-con span')
        index = 0