Beispiel #1
0
 def make_request(self, seed):
     """Describe the GET request for one product's comment listing.

     ``seed.value`` is substituted into the ``productId`` query parameter
     of the comment URL and into the ``Referer`` header. One HTTP and one
     HTTPS proxy are picked at random from the lists returned by
     ``HttpProxy``.

     :param seed: object exposing a ``value`` attribute.
     :return: dict with the keys ``url``, ``encoding``, ``method``,
         ``proxies`` and ``headers``.
     """
     product_id = seed.value
     comment_url = "https://club.jd.com/comment/skuProductPageComments.action?callback=fetchJSON_comment98&productId={0}&score=0&sortType=5&page=0&pageSize=10&isShadowSku=0&fold=1".format(
         product_id)

     # Draw the HTTP proxy first, then the HTTPS one.
     chosen_proxies = {
         "http": random.choice(HttpProxy.getHttpProxy()),
         "https": random.choice(HttpProxy.getHttpsProxy()),
     }

     request_headers = {
         'Host': 'club.jd.com',
         'Connection': 'close',
         'User-Agent':
         'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
         'Referer': 'https://item.jd.com/{0}.html'.format(product_id),
     }

     return {
         "url": comment_url,
         "encoding": "gbk",
         "method": "get",
         "proxies": chosen_proxies,
         "headers": request_headers,
     }
Beispiel #2
0
 def __init__(self, seeds_file, **kwargs):
     """Initialise the spider and queue every line of ``seeds_file`` that
     passes the phone-number check.

     :param seeds_file: path of a text file read line by line; each line,
         with its trailing newline removed, is a candidate seed.
     :param kwargs: forwarded unchanged to the parent initialiser.
         ``kwargs["retries"]`` is additionally read here and handed to
         every ``Seed`` that is queued.
     """
     super(NewPhone2, self).__init__(**kwargs)
     self.proxies = HttpProxy.getHttpProxy()
     self.ua = UserAgent()
     self.phone_regx = re.compile(r'^\d{11,11}$')
     # Callable used below as the accept/reject predicate for each line.
     self.phone_number_checker = stringUtils.check_legality(
         pattern=r'^\d{11,11}$')
     # Read the seeds under a context manager so the file handle is closed
     # as soon as the loop finishes (or raises) instead of being leaked.
     with open(seeds_file) as seeds:
         for seed in seeds:
             seed = seed.strip("\n")
             if self.phone_number_checker(seed):
                 self.seeds_queue.put(Seed(seed, kwargs["retries"]))
             else:
                 # NOTE(review): this branch handles lines that FAILED the
                 # check, yet the prefix reads "legal_format" - confirm
                 # whether "illegal_format" was intended before changing it.
                 self.log.info("legal_format: " + seed)
     self.pro_city_pattern = re.compile(
         r'<dd><span>号码归属地:</span>(.*?) (.*?)</dd>')
     self.telcompany_pattern = re.compile(
         r'<dd><span>手机卡类型:</span>(.*?)</dd>')
Beispiel #3
0
 def __init__(self, **kwargs):
     """Initialise the spider and fill the seed queue from the database.

     The collection name is obtained from ``get_lasted_collection`` on
     the ``jingdong`` database, restricted to names matching
     ``^brand20\\d\\d\\d\\d\\d\\d$``. The distinct values of the data set
     read from that collection are queued as ``Seed`` objects of type 0.

     :param kwargs: forwarded unchanged to the parent initialiser.
     """
     super(GetProductId, self).__init__(**kwargs)
     self.retries = 3
     self.proxies = HttpProxy.getHttpProxy()
     self.ua = UserAgent()

     wanted_fields = ("cate_id", "brand_id", "name")
     # One "$match ... $ne None" stage per wanted field, in order, followed
     # by the stage that keeps only documents whose _status is 0.
     pipeline = [{"$match": {field: {"$ne": None}}}
                 for field in wanted_fields]
     pipeline.append({"$match": {"_status": 0}})
     collect_name_filter = {"name": {"$regex": r"^brand20\d\d\d\d\d\d$"}}

     with op.DBManger() as manager:
         last_brand_collect = manager.get_lasted_collection(
             "jingdong", filter=collect_name_filter)
         rows = manager.read_from(
             db_collect=("jingdong", last_brand_collect),
             out_field=wanted_fields,
             pipeline=pipeline)
         data_set = collections.DataSet(rows)
         for seed in data_set.distinct():
             self.seeds_queue.put(
                 Seed(value=seed, retries=self.retries, type=0))

     # Patterns compiled once per instance.
     self.first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
     self.skuids_pettern = re.compile(r'{.*?"skuId":(\d+).*?}')
     self.totalpage_perttern = re.compile(
         r'<div id="J_topPage"[\s\S]*?<b>\d+</b><em>/</em><i>(\d+)</i>')
Beispiel #4
0
    #                     info = str(info) + '\t' + str(sale[0])
    #                 info = info.lstrip("\t")
    #                 result.append({"values": info})
    #     print(result)
    #     if result:
    #         self.write(result)
    #     else:
    #         self.write([{"_seed": seed.value}])
    #     print(result)
    #     seed.ok()


if __name__ == "__main__":
    # Script entry point: assemble the job configuration and start the
    # JDPriceMiss spider.
    current_date = timeUtil.current_time()
    # Hand this script's own path to the process manager before starting.
    process_manger.kill_old_process(sys.argv[0])
    import logging
    from multiprocess.core import HttpProxy

    # Results go to a collection whose name ends with the current date string.
    mongo_config = {
        "addr": "mongodb://192.168.0.13:27017",
        "db": "jingdong",
        "collection": "jdprice" + current_date,
    }
    # The log file is named after the script itself.
    log_config = {
        "level": logging.INFO,
        "filename": sys.argv[0] + '.logging',
        "format": '%(asctime)s - %(filename)s - %(processName)s - [line:%(lineno)d] - %(levelname)s: %(message)s',
    }
    config = {
        "job_name": "jdprice",
        "spider_num": 40,
        "retries": 10,
        "request_timeout": 10,
        "complete_timeout": 5 * 60,
        "sleep_interval": 1,
        "rest_time": 5,
        "write_seed": False,
        "mongo_config": mongo_config,
        "log_config": log_config,
        "proxies_pool": HttpProxy.getHttpProxy(),
    }
    p = JDPriceMiss(**config)
    p.main_loop(show_process=True)
Beispiel #5
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from multiprocess.core import HttpProxy
import logging
# Job settings for the "shoujiguishudi" spider, gathered in a single dict.
config = {
    "job_name": "shoujiguishudi",
    "spider_num": 3,
    "retries": 3,
    "request_timeout": 3,
    # NOTE(review): spelled without an underscore, unlike "request_timeout"
    # above - confirm this is the key the consumer actually reads.
    "completetimeout": 5 * 60,
    "seeds_file": "resource/buyer_phone.3",
    "mongo_config": {"addr": "mongodb://192.168.0.13:27017", "db": "jicheng"},
    "proxies": HttpProxy.getHttpProxy(),
    "log_config": {
        "level": logging.DEBUG,
        "format": '%(asctime)s - %(filename)s - %(processName)s - [line:%(lineno)d] - %(levelname)s: %(message)s',
    },
    "headers": {"Connection": "close"},
}
Beispiel #6
0
               "Referer": "https://item.m.jd.com/72321801855.html",
               'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
               #"cookie":"shshshfpa=230b299e-b267-3f39-748a-5274ba04573e-1526388430; shshshfpb=0e4a63e00c3146f1205679ecef0af468fb452b7038a3edfd15afad6d12; __jdu=1595213903005116662704; pin=jd_49e6f74229a5c; unick=jd_188014ctk; _tp=f8skMf7S6k8VPMVjjhCn8S7vk6UqsuFMW8o68xx3ddc%3D; _pst=jd_49e6f74229a5c; ipLocation=%u5317%u4eac; pinId=CL2LG1jQi0fBGlwodztkXrV9-x-f3wj7; unpl=V2_ZzNtbRAFShd8AUZWfk0IB2JTRwgSBxBBfAtGUHseXFFkCxINclRCFnQURldnG1wUZwQZWUNcRhJFCEdkeB5fA2AFEFlBZxBFLV0CFi9JH1c%2bbRpdS1BKFnQLRlZLKV8FVwMTbUJTSxF2CERcehtdBGMDElpFUEATdA12ZHwpbDVjCxVUQVdzFEUJdhYvRVsNbwAaWw9XRx1xC0ZWcxheBGYHEl1FUEQWcwlDZHopXw%3d%3d; __jdv=76161171|direct|-|none|-|1609750532540; TrackID=14z86bRECmD_c8hnyUzWqPbiv0pHgxgGJ0tgMH9b8UmBPkuTndrN5VhNCH5t8h3LTmlYuJbzhHXbftdRKDtXKBnPgOEXXzqhXAH9ZY-6s5MAR2ncnCvnEbToPqbFrYgEt; user-key=43a6e8ea-993d-49e9-8763-4e756d81ae6f; cn=0; PCSYCityID=CN_110000_110100_110105; areaId=1; ipLoc-djd=1-72-55653-0; wxa_level=1; jxsid=16109508048628837173; webp=1; visitkey=31014972499970792; __jda=122270672.1595213903005116662704.1595213903.1610948455.1610955353.123; __jdc=122270672; 3AB9D23F7A4B3C9B=HV7XTTHFGASMIJRSRKK34KLHMYELLS47K4NBCIR2PEFYCZUMIX225JHQCMEJUTEKYFDA47E3QEMFC3TYKKQRYXFS2Q; shshshfp=8e6807b1ccf37dd2a527f63ee133d3e6; shshshsID=48908f8b4d08dd6a4ad6ea045c548f30_2_1610955836722; wq_logid=1610955927.1063071573; retina=1; cid=9; wqmnx1=MDEyNjM4NHMubXQxMzQyL25yOzVNQUszTEdoLjFsaTFzZjQyRUgmUg%3D%3D; __jdb=122270672.12.1595213903005116662704|123.1610955353; mba_muid=1595213903005116662704; mba_sid=16109559266537842608963232693.1"
           }}
# Regular expressions compiled once when this module is executed.

# Captures the bracketed list that follows a "navThird<1-9>: " label.
cate_pattern = re.compile(
    r'navThird[1-9]: (\[.*\])'
)
# Captures (data-sku digits, <em> text) for each "gl-item" <li> element whose
# <em> text does not begin with the yen sign.
cate_pattern1 = re.compile(
    r'<li data-sku="(\d+)"[\s\S]*?class="gl-item">[\s\S]*?<em>([^¥][\s\S]*?)</em>[\s\S]*?</li>'
)
# Captures the comma-separated digits inside search000014_log:{wids:'...'.
first_pettern = re.compile(
    r"search000014_log:{wids:'([,\d]*?)',"
)
# Captures the bracketed array that follows a "comments": key.
comments_pattern = re.compile(
    r'"comments":[\s\S]*?(\[[\s\S]*\])'
)
# Captures the digits of a quoted "CommentCount" value.
allcnt_pattern = re.compile(
    r'"CommentCount": \"(\d+)\",'
)
import json
import time
from ast import literal_eval
import json
from fake_useragent import UserAgent
from multiprocess.core import HttpProxy
# HTTPS proxy list plus two per-proxy counters, every entry starting at zero.
proxies = HttpProxy.getHttpsProxy()
countlist, countlisttimeout = {}, {}
for proxy in proxies:
    countlist[proxy] = countlisttimeout[proxy] = 0
ua = UserAgent()
#,proxies={"https": "https://*****:*****@192.168.0.71:3128","http": "http://*****:*****@192.168.0.71:3128"}
# Send the same request up to 100000 times, pausing half a second before each
# call, and print the response headers and every "CommentCount" match.
# NOTE(review): no timeout is passed here; whether `request` already carries
# one is not visible from this section - confirm.
for i in range(100000):
    time.sleep(0.5)
    src = requests.get(**request)
    print(src.headers)
    print(allcnt_pattern.findall(src.text))
# first_pettern = re.compile(r"search000014_log:{wids:'([,\d]*?)',")
# shopid_pettern = re.compile(r'shopId:\'(\d*)\',')
# venderid_pettern = re.compile(r'venderId:(\d*),')