def make_request(self, seed):
    """Describe one HTTP request for JD's product-comment JSON endpoint.

    Args:
        seed: crawl seed whose ``value`` is the JD product (sku) id.

    Returns:
        dict: request description (url / encoding / method / proxies /
        headers) consumed by the downstream fetcher. The endpoint returns a
        ``fetchJSON_comment98(...)`` JSONP payload, GBK-encoded.
    """
    sku = seed.value
    headers = {
        'Host': 'club.jd.com',
        'Connection': 'close',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
        # Referer mimics the product detail page for the same sku.
        'Referer': 'https://item.jd.com/{0}.html'.format(sku),
    }
    # One randomly chosen proxy per scheme, drawn from the shared pools.
    proxies = {
        "http": random.choice(HttpProxy.getHttpProxy()),
        "https": random.choice(HttpProxy.getHttpsProxy()),
    }
    return {
        "url": "https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1".format(sku),
        "encoding": "gbk",
        "method": "get",
        "proxies": proxies,
        "headers": headers,
    }
def __init__(self, **kwargs):
    """Initialize the job and seed its queue from the latest brand collection.

    Looks up the newest ``brandYYYYMMDD`` collection in the ``jingdong`` db,
    keeps documents that have a non-null cate_id and brand_id and
    ``_status == 0``, and enqueues each distinct (cate_id, brand_id, name)
    tuple as a crawl seed. Also precompiles the scraping regexes.

    Args:
        **kwargs: forwarded unchanged to the base class.
    """
    super(GetProductId, self).__init__(**kwargs)
    self.retries = 3  # per-seed retry budget
    self.proxies = HttpProxy.getHttpsProxy()
    self.ua = UserAgent()
    with op.DBManger() as m:
        # Newest collection whose name matches brandYYYYMMDD (20xx dates).
        last_brand_collect = m.get_lasted_collection(
            "jingdong",
            filter={"name": {"$regex": r"^brand20\d\d\d\d\d\d$"}})
        # Keep only rows with both ids present and still unprocessed.
        pipeline = [
            {"$match": {"cate_id": {"$ne": None}}},
            {"$match": {"brand_id": {"$ne": None}}},
            {"$match": {"_status": 0}},
        ]
        data_set = collections.DataSet(
            m.read_from(db_collect=("jingdong", last_brand_collect),
                        out_field=("cate_id", "brand_id", "name"),
                        pipeline=pipeline))
        # Fix: the original used enumerate() but never used the index.
        for seed in data_set.distinct():
            self.seeds_queue.put(
                Seed(value=seed, retries=self.retries, type=0))
    # NOTE(review): attribute names keep the original misspellings
    # ("pettern"/"perttern") — other code may reference them by name.
    self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
    self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
    self.totalpage_perttern = re.compile(
        r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
"Referer": "https://item.m.jd.com/72321801855.html", 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36', #"cookie":"shshshfpa=230b299e-b267-3f39-748a-5274ba04573e-1526388430; shshshfpb=0e4a63e00c3146f1205679ecef0af468fb452b7038a3edfd15afad6d12; __jdu=1595213903005116662704; pin=jd_49e6f74229a5c; unick=jd_188014ctk; _tp=f8skMf7S6k8VPMVjjhCn8S7vk6UqsuFMW8o68xx3ddc%3D; _pst=jd_49e6f74229a5c; ipLocation=%u5317%u4eac; pinId=CL2LG1jQi0fBGlwodztkXrV9-x-f3wj7; unpl=V2_ZzNtbRAFShd8AUZWfk0IB2JTRwgSBxBBfAtGUHseXFFkCxINclRCFnQURldnG1wUZwQZWUNcRhJFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRpdS1BKFnQLRlZLKV8FVwMTbUJTSxF2CERcehtdBGMDElpFUEATdA12ZHwpbDVjCxVUQVdzFEUJdhYvRVsNbwAaWw9XRx1xC0ZWcxheBGYHEl1FUEQWcwlDZHopXw%3d%3d; __jdv=76161171|direct|-|none|-|1609750532540; TrackID=14z86bRECmD_c8hnyUzWqPbiv0pHgxgGJ0tgMH9b8UmBPkuTndrN5VhNCH5t8h3LTmlYuJbzhHXbftdRKDtXKBnPgOEXXzqhXAH9ZY-6s5MAR2ncnCvnEbToPqbFrYgEt; user-key=43a6e8ea-993d-49e9-8763-4e756d81ae6f; cn=0; PCSYCityID=CN_110000_110100_110105; areaId=1; ipLoc-djd=1-72-55653-0; wxa_level=1; jxsid=16109508048628837173; webp=1; visitkey=31014972499970792; __jda=122270672.1595213903005116662704.1595213903.1610948455.1610955353.123; __jdc=122270672; 3AB9D23F7A4B3C9B=HV7XTTHFGASMIJRSRKK34KLHMYELLS47K4NBCIR2PEFYCZUMIX225JHQCMEJUTEKYFDA47E3QEMFC3TYKKQRYXFS2Q; shshshfp=8e6807b1ccf37dd2a527f63ee133d3e6; shshshsID=48908f8b4d08dd6a4ad6ea045c548f30_2_1610955836722; wq_logid=1610955927.1063071573; retina=1; cid=9; wqmnx1=MDEyNjM4NHMubXQxMzQyL25yOzVNQUszTEdoLjFsaTFzZjQyRUgmUg%3D%3D; __jdb=122270672.12.1595213903005116662704|123.1610955353; mba_muid=1595213903005116662704; mba_sid=16109559266537842608963232693.1" }} cate_pattern = re.compile(r'navThird[1-9]: (\[.*\])') cate_pattern1 = re.compile(r'<li data-sku="(\d+)"[\s\S]*?class="gl-item">[\s\S]*?<em>([^¥][\s\S]*?)</em>[\s\S]*?</li>') first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',") comments_pattern = 
re.compile(r'"comments":[\s\S]*?(\[[\s\S]*\])') allcnt_pattern = re.compile(r'"CommentCount": \"(\d+)\",') import json import time from ast import literal_eval import json from fake_useragent import UserAgent from multiprocess.core import HttpProxy proxies = HttpProxy.getHttpsProxy() countlist={} countlisttimeout={} for proxy in proxies: countlist[proxy] = 0 countlisttimeout[proxy] = 0 ua = UserAgent() #,proxies={"https": "https://*****:*****@192.168.0.71:3128","http": "http://*****:*****@192.168.0.71:3128"} for i in range(100000): time.sleep(0.5) src = requests.get(**request) print(src.headers) print(allcnt_pattern.findall(src.text)) # first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',") # shopid_pettern = re.compile(r'shopId:\'(\d*)\',') # venderid_pettern = re.compile(r'venderId:(\d*),')
import logging

# One timestamp drives both the job's dateindex and the output collection name.
current_date = timeUtil.current_time()

# Kill any stale instance of this script before starting a fresh crawl.
process_manger.kill_old_process(sys.argv[0])

# Destination MongoDB: one collection per run date.
_mongo_config = {
    "addr": "mongodb://192.168.0.13:27017",
    "db": "jicheng",
    "collection": "comment" + current_date,
}
# File logging next to the script, appended across runs.
_log_config = {
    "level": logging.DEBUG,
    "filename": sys.argv[0] + '.logging',
    "filemode": 'a',
    "format": '%(asctime)s - %(filename)s - %(processName)s - [line:%(lineno)d] - %(levelname)s: %(message)s',
}

new_config = {
    "job_name": "jdcomment",
    "spider_num": 1,
    "retries": 3,
    "rest_time": 5,
    "complete_timeout": 60,  # 1 * 60 seconds
    "seeds_file": "resource/month202007",
    "dateindex": current_date,
    "mongo_config": _mongo_config,
    "log_config": _log_config,
    "proxies_pool": HttpProxy.getHttpsProxy(),
}

p = GetComment(**new_config)
p.main_loop(show_process=True)