# Example #1
from urllib3 import *
import json
import re
from Spider_Def import Read_Headers
from bs4 import BeautifulSoup

# Load request headers (including the Cookie value) from a local file;
# Read_Headers is a project helper from Spider_Def — presumably it parses
# a saved JD request-header dump. TODO confirm format of head_jd.txt.
headers = Read_Headers('head_jd.txt')

# Suppressing warnings is an unsafe choice: TLS certificate errors
# will be silently skipped from here on.
disable_warnings()

# PoolManager manages connection pooling/concurrency for urllib3; see
# http://www.cnblogs.com/shadowwalker/p/5283372.html
http = PoolManager()


def getJSONDetail(url, productId, page):
    """Rewrite a JD comment-JSON template URL for a given product and page.

    Parameters:
        url: template URL containing callback, productId, score, page and
             pageSize query parameters.
        productId: product id to substitute into the URL (int or str).
        page: page number to substitute into the URL (int or str).

    Raises:
        AttributeError: if `url` does not match the expected query layout
            (re.match returns None and .groups() is called on it).

    NOTE(review): this snippet appears truncated — the rewritten `url` is
    never returned or used afterwards; confirm against the original source.
    """
    # Extract the current productId and page values from the template URL.
    # (The callback capture group is unused here.)
    re_result = re.match(
        r'.+callback=(.+)&productId=(.+)&score.+&page=(.+)&pageSize.+',
        url).groups()
    url_productId = re_result[1]

    # Substitute the requested productId.
    url = url.replace(url_productId, str(productId))
    # Substitute the requested page. str(page) mirrors the str(productId)
    # above — the original concatenated `page` raw, which raised TypeError
    # whenever an int page was passed.
    url1 = url.split('&page=')
    url2 = url.split('&pageSize')
    url = url1[0] + '&page=' + str(page) + '&pageSize' + url2[1]
# Example #2
from urllib3 import *
import json
import re
from Spider_Def import Read_Headers
from bs4 import BeautifulSoup

# Load request headers, mainly the Cookie value, from a local file;
# Read_Headers is a project helper from Spider_Def — presumably it parses
# a saved Tmall request-header dump. TODO confirm format of head_tm.txt.
headers = Read_Headers('head_tm.txt')

# Suppressing warnings is an unsafe choice: TLS certificate errors
# will be silently skipped from here on.
disable_warnings()

# PoolManager manages connection pooling/concurrency for urllib3; see
# http://www.cnblogs.com/shadowwalker/p/5283372.html
http = PoolManager()


def getJSONDetail(url, itemId, currentPage):
    # 找到url中itemId,currentPage,callback内容
    re_result = re.match(
        '.+itemId=(.+)&spuId.+&currentPage=(.+)&append.+callback=(.+)$',
        url).groups()
    url_itemId = re_result[0]
    url_currentPage = re_result[1]
    url_callback = re_result[2]

    # 替换itemId
    url = url.replace(url_itemId, str(itemId))
    # 替换currentPage
    url1 = url.split('&currentPage=')
    url2 = url.split('&append=')