Ejemplo n.º 1
0
def xc_scene(url):
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)
    nodes = selector.xpath(settings.NODES_XC)
    if nodes:
        for node in nodes:
            name = xpath_handler(node.xpath(settings.NAME_XC))
            address = xpath_handler(node.xpath(settings.ADDRESS_XC))
            grade = xpath_handler(node.xpath(settings.SCORE_XC))
            comment = re.search('\d+',
                                xpath_handler(node.xpath(settings.COMMENT_XC)))
            comment = comment.group() if comment else 0
            url = detail_url = settings.DOMAIN_XC + xpath_handler(
                node.xpath(settings.DETAIL_XC))
            intro, website, contact = get_detail(detail_url)
            params = dict(
                name=name.encode('utf-8'),
                address=address.encode('utf-8'),
                grade=float(grade.encode('utf-8')) if grade else 0,
                comment=comment.encode('utf-8'),
                url=url.encode('utf-8'),
                intro=intro.encode('utf-8'),
                website=website.encode('utf-8'),
                contact=contact.encode('utf-8'),
            )
            print name
            MY_DB.insert(settings.QUERY_XC, params)
    else:
        print "***warning url:%s" % url
Ejemplo n.º 2
0
def qne_scene(url=settings.SCENE_QNE_URL):
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)
    nodes = selector.xpath(settings.NODES_QNE)
    if nodes:
        for node in nodes:
            detail_url = node.strip()
            print detail_url
            MY_DB.insert('insert into qne_url(url) VALUES (%(url)s)', dict(url=detail_url))
            # get_detail(detail_url)
            REDIS_CLIENT.set(detail_url, 1)
    else:
        print "***warning url:%s" % url
Ejemplo n.º 3
0
Archivo: crawl.py Proyecto: w1024k/wbb
def spider(params, level, page_num):
    """Fetch one page of comments and store each one.

    ``params`` is updated in place with the requested page number and sent
    as the query parameters of a POST to ``settings.COMMENT_URL``. Every
    node matched by ``settings.ROOT_PATH`` becomes one record (nick, date,
    comment, level) written with ``MY_DB.insert``.

    :param params: request parameters; its ``'pagenow'`` key is overwritten.
    :param level: value stored unchanged in each record's ``level`` field.
    :param page_num: page number to request.
    """
    params['pagenow'] = page_num
    response = requests.post(url=settings.COMMENT_URL,
                             headers=settings.HEADER,
                             params=params)
    tree = etree.HTML(response.text)

    for entry in tree.xpath(settings.ROOT_PATH):
        nick = xpath_handler(entry.xpath(settings.NICK_PATH))
        date = xpath_handler(entry.xpath(settings.DATE_PATH))
        comment = xpath_handler(entry.xpath(settings.COMMENT_PATH))
        MY_DB.insert(settings.SQL_QUERY, {
            'nick': nick,
            'date': date,
            'comment': comment,
            'level': level,
        })
Ejemplo n.º 4
0
def get_detail(url):
    rsp = requests.get(url=url, headers=HEADER).text
    selector = etree.HTML(rsp)
    name = xpath_handler(selector.xpath(settings.NAME_MFW))
    if name:
        print 111, name
        address, intro, comment, open, time, contact, website = list(detail_old(selector))
        print address, intro, comment, open, time, contact, website
        params = dict(
            name=name,
            address=address,
            intro=intro,
            comment=comment,
            open=open,
            time=time,
            contact=contact,
            website=website,
            url=url
        )
        MY_DB.insert(settings.QUERY_MFW, params)
    else:
        name = xpath_handler(selector.xpath(settings.NAME_MFW_NEW))
        if name:
            print 222, name
            address, intro, comment, contact, grade = list(detail_new(selector))
            print address, intro, comment, contact, grade
            params = dict(
                name=name,
                address=address,
                intro=intro,
                comment=comment,
                contact=contact,
                grade=grade,
                url=url
            )
            MY_DB.insert(settings.QUERY_MFW_NEW, params)

        else:
            print 333
Ejemplo n.º 5
0
def get_param():
    router_url = 'http://www.mafengwo.cn/ajax/router.php'
    domain = 'http://www.mafengwo.cn'
    sql = 'insert into mfw_url(url) VALUES (%(url)s)'
    for num in xrange(1, 195):
        print num
        params = {
            'sAct': 'KMdd_StructWebAjax|GetPoisByTag',
            'iMddid': 10099,
            'iTagId': 0,
            'iPage': num
        }
        html = requests.post(url=router_url, data=params,
                             headers=HEADER).json()
        if html.get('succ') == 1:
            html = html['data']['list']
            selector = etree.HTML(html)

            links = selector.xpath('//@href')
            for link in links:
                url = domain + link.strip()
                print url
                MY_DB.insert(sql, dict(url=url))
Ejemplo n.º 6
0
def get_detail(url):
    rsp = requests.get(url=url, headers=HEADER, proxies=get_proxy()).text

    print 'get_detail_end...'

    selector = etree.HTML(rsp)
    name = xpath_handler(selector.xpath(settings.NAME_QNE))
    address_phone = selector.xpath(settings.ADDRESS_PHONE_QNE)
    address = address_phone[0].strip() if len(address_phone) else ''
    contact = address_phone[1].strip() if len(address_phone) > 1 else ''
    coord = xpath_handler(selector.xpath(settings.COORD_QNE))
    if coord:
        coord = coord.split(',')
        lon = coord[0]
        lat = coord[1]
    else:
        lon = lat = None
    grade = xpath_handler(selector.xpath(settings.GRADE_QNE)) or None
    comment = xpath_handler(selector.xpath(settings.COMMENT_QNE), 0)
    if comment:
        comment = re.search('\d+', comment)
        comment = comment.group() if comment else 0
    open_time = xpath_handler(selector.xpath(settings.OPEN_QNE))
    time_advise = xpath_handler(selector.xpath(settings.TIME_QNE))
    time_advise = time_advise and time_advise.split(u':')[1]
    website = xpath_handler(selector.xpath(settings.WEBSITE_QNE))
    intro = xpath_handler(selector.xpath(settings.INTRO_QNE))
    if not name:
        return
    print name
    print address
    print contact
    print lon
    print lat
    try:
        grade = float(grade)
    except:
        grade = 0
    print grade
    print comment
    print open_time
    print time_advise
    print website
    print intro
    params = dict(
        name=name,
        address=address,
        grade=grade,
        comment=comment,
        url=url,
        intro=intro,
        website=website,
        contact=contact,
        lon=lon,
        lat=lat,
        open=open_time,
        time=time_advise
    )
    MY_DB.insert(settings.QUERY_QNE, params)
    # MY_DB.insert('delete from qne_url where url=%(url)s', dict(url=url))
    if name:
        REDIS_CLIENT.set(url, 0)
Ejemplo n.º 7
0
def main():
    """Run ``qne_scene`` over the URLs from ``url_product`` via ``gevent_download``."""
    listing_urls = url_product()
    gevent_download(urls=listing_urls, func=qne_scene)


def get_proxy():
    """Fetch a proxy from the pool service and return a ``proxies`` mapping.

    The service response is split on ':' into exactly two parts, which are
    formatted into an ``http://<first>:<second>`` address. The result maps
    the ``"http"`` scheme to that address.
    """
    raw = requests.get("http://123.207.35.36:5010/get").content
    host, port = raw.split(":")
    return {"http": "http://%s:%s" % (host, port)}


if __name__ == '__main__':
    # 先调main生成redis记录
    # main()
    rows = MY_DB.select('select url from qne_url')
    urls = []
    for row in rows:
        if int(REDIS_CLIENT.get(row[0])) == 0:
            print 'skip ...\n\n\n\n'
            continue
        print row[0]
        urls.append(row[0])

    while True:
        t = Process(target=gevent_download, kwargs=dict(urls=urls, func=get_detail))
        t.start()
        t.join(10)
        print 'kill ..'
        t.terminate()
Ejemplo n.º 8
0
def url_product(start=1, stop=2908):
    """Yield the first column of the row stored for each id in a range.

    ``settings.MFW_SQL_URL`` is run once per id. Ids for which the query
    returns no row are skipped, so a gap in the id sequence no longer
    raises IndexError and ends the generator early.

    :param start: first id to query (inclusive); defaults to 1.
    :param stop: id at which to stop (exclusive); defaults to 2908.
    """
    for row_id in xrange(start, stop):
        rows = MY_DB.select(settings.MFW_SQL_URL, dict(id=row_id))
        if not rows:
            continue
        yield rows[0][0]