Exemple #1
0
 def parse1(self, response):
     """Yield the follow-up list request for the page described by the seed.

     The seed in ``response.meta["_seed"]`` carries ``(cate_id, brand_id,
     page, s)``. The first match of ``self.first_pettern`` in the response
     text is forwarded as ``show_items``; when the pattern does not match,
     or its first match is empty, nothing is yielded.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = seed.value

     matches = self.first_pettern.findall(response.text)
     if not matches:
         return
     shown_items = matches[0]
     if not shown_items:
         return

     # The follow-up request uses the next page index and an offset moved
     # on by 30; the Referer points back at the current page/offset.
     next_page, next_s = page + 1, s + 30
     cat_query = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         brand_query = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
         url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             cat_query, brand_query, next_page, next_s, shown_items)
         referer = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             cat_query, brand_query, next_page - 1, next_s - 30)
     else:
         url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             cat_query, next_page, next_s, shown_items)
         referer = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             cat_query, next_page - 1, next_s - 30)

     request = Request(url=url, callback=self.parse2, priority=2)
     request.headers["Referer"] = referer
     request.meta["_seed"] = str(
         Seed((cate_id, brand_id, next_page, next_s), type=2))
     request.meta["last_page_pids"] = shown_items
     yield request
Exemple #2
0
    def __init__(self, seeds_file, **kwargs):
        """Queue the ids listed in ``seeds_file`` in batches and compile patterns.

        The first tab-separated field of every line is taken as an id. Up to
        60 ids are joined with ``'%2CJ_'`` into one seed value, and each
        batch is put on ``self.seeds_queue``.

        Args:
            seeds_file: Path of a text file with one record per line.
            **kwargs: Forwarded to the parent constructor; ``retries`` is
                also read here and handed to every ``Seed``.
        """
        super(JDPrice, self).__init__(**kwargs)
        self.proxies = [
            "http://u{}:[email protected]:3128".format(x) for x in range(28)
        ]
        self.ua = UserAgent()

        batch_size = 60
        batch = []
        with open(seeds_file) as infile:
            for line in infile:
                batch.append(line.strip('\n').split("\t")[0])
                if len(batch) == batch_size:
                    self.seeds_queue.put(
                        Seed('%2CJ_'.join(batch), kwargs["retries"]))
                    batch = []
        # An empty file leaves ``batch`` empty; the previous implementation
        # hit an unbound local here. A trailing batch is queued only when
        # its joined value is non-empty.
        remainder = '%2CJ_'.join(batch)
        if remainder:
            self.seeds_queue.put(Seed(remainder, kwargs["retries"]))
        self.price_ad = 'http://p.3.cn/prices/mgets?&type=1&skuIds=J_'

        # NOTE(review): the '.' in the numeric patterns below is unescaped
        # and therefore matches any character -- confirm this is intended.
        self.block_pattern = re.compile(r'{.*?}')
        self.innerid_pattern = re.compile(r'\d+')
        self.innerprice_pattern = re.compile(r'"\d+.\d+"')
        self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
        self.p_pattern = re.compile(r'(\d+.\d+)"')
        self.p2_pattern = re.compile(r'(-\d+.\d+)')
        self.p1 = re.compile(r'id":.*?p":".*?"}')
        self.id_pattern = re.compile(r'id:"(\d+)"')
        self.first_pattern = re.compile(r'([a-zA-Z]*)":')
        self.rid = random.randint(100000000, 999999999)
        self.usrid = str(self.rid)
        self.up_pattern = re.compile('"up":"tpp"')
Exemple #3
0
    def __init__(self, **kwargs):
        """Queue skuids from the latest summary collection and compile patterns.

        The distinct ``skuid`` values of the most recent collection whose
        name matches ``summary_201905_20dddd`` are joined in groups of up to
        60 with ``'%2CJ_'``; every group becomes one ``Seed`` on
        ``self.seeds_queue``.

        Args:
            **kwargs: Forwarded to the parent constructor; ``retries`` is
                also read here and handed to every ``Seed``.
        """
        super(JDPriceMiss, self).__init__(**kwargs)
        self.ua = UserAgent()
        with op.DBManger() as m:
            table = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
            skuid_set = set()
            for item in m.read_from(db_collect=("jingdong", table), out_field=("skuid",)):
                skuid_set.add(item[0])

            batch_size = 60
            batch = []
            for skuid in skuid_set:
                batch.append(skuid.strip())
                if len(batch) == batch_size:
                    self.seeds_queue.put(
                        Seed('%2CJ_'.join(batch), kwargs["retries"]))
                    batch = []
            # With no skuids at all ``batch`` stays empty; the previous
            # implementation raised on an unbound local in that case.
            remainder = '%2CJ_'.join(batch)
            if remainder:
                self.seeds_queue.put(Seed(remainder, kwargs["retries"]))

        # NOTE(review): the '.' in the numeric patterns below is unescaped
        # and therefore matches any character -- confirm this is intended.
        self.block_pattern = re.compile(r'{.*?}')
        self.innerid_pattern = re.compile(r'\d+')
        self.innerprice_pattern = re.compile(r'"\d+.\d+"')
        self.op_pattern = re.compile(r'"op":"(\d+.\d+)"')
        self.p_pattern = re.compile(r'"(\d+.\d+)"')
        self.p2_pattern = re.compile(r'"(-\d+.\d+)"')
        self.p1 = re.compile(r'"id":.*?"}')
        self.id_pattern = re.compile(r'id:"J_(\d+)"')
        self.first_pattern = re.compile(r'([a-zA-Z]*)":')
        self.rid = random.randint(100000000, 999999999)
        self.usrid = str(self.rid)
        self.up_pattern = re.compile('"up":"tpp"')
        self.price_pattern = re.compile(r'^\d+\.\d\d$')
Exemple #4
0
 def init_start_urls(self):
     """Reset the Redis keys and load one type-0 seed per line of the phone file.

     Both ``self.start_urls_redis_key`` and ``self.items_redis_key`` are
     deleted first. Seeds are added to the start-urls set in chunks of
     1024, with a final partial chunk if any lines remain.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     buffer = []
     buffer_size = 1024
     # The context manager closes the file even if Redis raises part-way;
     # the previous bare open() leaked the handle.
     with open("shoujiguishudi/resource/buyer_phone.3") as infile:
         for line in infile:
             buffer.append(str(Seed(value=line.strip(), type=0)))
             if len(buffer) >= buffer_size:
                 self.redis.sadd(self.start_urls_redis_key, *buffer)
                 buffer = []
     if buffer:
         self.redis.sadd(self.start_urls_redis_key, *buffer)
 def init_start_urls(self):
     """Reset the Redis keys and enqueue batched skuid seeds.

     Reads every non-null ``skuid`` from ``jingdong.jdprice_miss_seed``,
     de-duplicates them as ints, joins them in groups of up to 60 with
     ``'%2CJ_'`` and stores each group as a type-0 seed. Seeds are pushed
     to ``self.start_urls_redis_key`` in shuffled chunks of 10000.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     with op.DBManger() as m:
         pipeline = [
             {
                 "$match": {
                     "skuid": {
                         "$ne": None
                     }
                 }
             },
         ]
         skuid_set = set()
         for item in m.read_from(db_collect=("jingdong",
                                             "jdprice_miss_seed"),
                                 out_field=("skuid", ),
                                 pipeline=pipeline):
             skuid_set.add(int(item[0]))
         self.logger.info(
             "total new skuid of comment larger than 0 is: {}".format(
                 len(skuid_set)))

         # Group the skuids 60 at a time. Starting from an empty batch
         # means an empty collection simply produces no seeds; the previous
         # implementation raised on an unbound local in that case.
         group_size = 60
         seeds = []
         group = []
         for skuid in skuid_set:
             group.append(str(skuid))
             if len(group) == group_size:
                 seeds.append(str(Seed(value='%2CJ_'.join(group), type=0)))
                 group = []
         if group:
             seeds.append(str(Seed(value='%2CJ_'.join(group), type=0)))

         chunk_size = 10000
         for start in range(0, len(seeds), chunk_size):
             chunk = seeds[start:start + chunk_size]
             random.shuffle(chunk)
             self.redis.sadd(self.start_urls_redis_key, *chunk)
 def make_request_from_data(self, data):
     """Turn a raw Redis entry into a ``Request``.

     Type-0 seeds hold a skuid and produce a comment request handled by
     ``self.parse``; type-3 seeds hold a serialized request that is
     restored with ``Request.deserialize``. Any other type returns None.
     """
     str_seed = bytes_to_str(data, self.redis_encoding)
     seed = Seed.parse_seed(str_seed)
     print(seed)

     if seed.type == 0:
         skuid = seed.value
         url = "https://club.jd.com/comment/getSkuPageFoldComments.action?callback=jQuery2675603&productId={0}&score=0&sortType=6&page=0&pageSize=10".format(
             skuid)
         headers = {
             "Connection": "close",
             "Referer": "https://item.m.jd.com/{0}.html".format(skuid),
         }
         meta = {
             "_seed": str_seed,
             "current_page": 0,
             "headers": headers,
         }
         return Request(url=url, meta=meta, priority=0, callback=self.parse)

     if seed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(seed.value, self)

     return None
Exemple #7
0
 def parse3(self, response):
     """Yield one record per non-empty entry of the JSON payload.

     The payload is the first match of ``self.json_pettern`` in the
     response text. Every entry is printed; truthy entries are turned into
     a dict combining the entry's fields with the seed's ``cate_id`` and
     ``brand_id``.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = seed.value
     payload = json.loads(self.json_pettern.findall(response.text)[0])
     for entry in payload:
         print(entry)
         if not entry:
             continue
         seller = entry.get("seller")
         # "ziying" is 1 when the seller name contains "自营".
         is_ziying = 1 if seller and seller.find("自营") != -1 else 0
         yield {
             "skuid": entry.get("pid"),
             "cate_id": cate_id,
             "brand_id": brand_id,
             "shopid": entry.get("shopId"),
             "venderid": entry.get("venderId", None),
             "shop_name": seller,
             "ziying": is_ziying,
         }
Exemple #8
0
 def make_request_from_data(self, data):
     """Turn a raw Redis entry into a ``Request``.

     Type-0 seeds hold ``(cate_id, brand_id, name)`` and produce a list-page
     request handled by ``self.parse``; type-3 seeds hold a serialized
     request. Any other type returns None.
     """
     str_seed = bytes_to_str(data, self.redis_encoding)
     seed = Seed.parse_seed(str_seed)

     if seed.type == 0:
         cate_id, brand_id, name = seed.value
         if brand_id:
             # The parts are unused, but the unpacking rejects any cate_id
             # that is not made of exactly three comma-separated fields.
             _cid1, _cid2, _cid3 = re.split(',', cate_id)
             cat_query = urllib.parse.urlencode({"cat": cate_id})
             brand_query = urllib.parse.urlencode({"ev": "exbrand_" + name})
             url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                 cat_query, brand_query)
         else:
             cat_query = urllib.parse.urlencode({"cat": cate_id})
             url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                 cat_query)
         meta = {
             "_seed": str_seed,
             "headers": {
                 "Referer": "https://www.jd.com/"
             },
         }
         return Request(url=url, meta=meta, priority=0, callback=self.parse)

     if seed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(seed.value, self)

     return None
Exemple #9
0
 def init_start_urls(self):
     """Reset the Redis keys and seed them with the chaoshi navigation URLs.

     Creates the ``jdchaoshi<current_date>_sep`` collection, downloads
     https://chaoshi.jd.com/, evaluates every ``navThirdN: [...]`` literal
     found in the page and stores the ``URL`` of each child entry (with
     backslashes removed) as a type-0 seed, in chunks of 1024.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     chunk_size = 1024
     with op.DBManger() as m:
         m.create_db_collection(
             db_collection=("jingdong",
                            "jdchaoshi{0}_sep".format(current_date)))
         pending = []
         import requests
         headers = {
             'Connection': 'close',
             "Referer": "https://www.jd.com",
             'User-Agent':
             'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36'
         }
         nav_pattern = re.compile(r'navThird[1-9]: (\[.*\])')
         import time  # NOTE(review): not used inside this method
         from ast import literal_eval
         page_source = requests.get(url="https://chaoshi.jd.com/",
                                    headers=headers).text
         for nav_literal in nav_pattern.findall(page_source):
             for group in literal_eval(nav_literal):
                 for child in group["children"]:
                     target = child["URL"].replace("\\", "")
                     pending.append(str(Seed(value=target, type=0)))
                     if len(pending) == chunk_size:
                         self.redis.sadd(self.start_urls_redis_key, *pending)
                         pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #10
0
 def init_start_urls(self):
     """Reset the Redis keys and re-queue the seeds recorded with ``_status`` 3.

     Distinct ``(_seed, _status)`` rows are read from
     ``self.last_retry_collect`` and each ``_seed`` is wrapped in a type-3
     seed, pushed in chunks of 1024. When no row is found the process
     exits with status 0.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     pending = []
     chunk_size = 1024
     with op.DBManger() as m:
         pipeline = [
             {"$match": {"_status": 3}},
         ]
         rows = collections.DataSet(
             m.read_from(db_collect=("jingdong", self.last_retry_collect),
                         out_field=("_seed", "_status"),
                         pipeline=pipeline))
         found_any = False
         for serialized, _status in rows.distinct():
             found_any = True
             pending.append(str(Seed(value=serialized, type=3)))
             if len(pending) == chunk_size:
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
         if not found_any:
             # Nothing left to retry: stop the whole process.
             import sys
             sys.exit(0)
Exemple #11
0
 def __init__(self, **kwargs):
     """Queue the skuids whose comment count is among the tracked top values.

     Reads ``skuid`` and the ``comment_<suffix>`` field (suffix = last six
     characters of the latest summary collection name) for at most 100
     documents, keeps the first comment count seen per skuid, and queues a
     type-0 seed for each skuid whose count is in ``TopK(1).get_topk()``.
     """
     super(JDPrice, self).__init__(**kwargs)
     self.ua = UserAgent()
     with op.DBManger() as m:
         # Create the temporary collection that marks this month's task boundary.
         m.create_db_collection(db_collection=("jingdong", "jdcommentdetail{0}_sep".format(current_date)))
         comments_by_skuid = {}
         top_counts = TopK(1)
         # skuids in the last result
         last_result = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^summary_201905_20\d\d\d\d$"}})
         comment_field = "comment_{}".format(last_result[-6:])
         pipeline = [
             {
                 "$project": {
                     "skuid": "$skuid",
                     comment_field: "$" + comment_field,
                 }
             },
             {"$limit": 100},
         ]
         rows = m.read_from(db_collect=("jingdong", last_result),
                            out_field=("skuid", comment_field),
                            pipeline=pipeline)
         for raw_skuid, raw_comments in rows:
             skuid = int(raw_skuid)
             if skuid in comments_by_skuid:
                 continue
             comment_count = int(raw_comments)
             top_counts.push(comment_count)
             comments_by_skuid[skuid] = comment_count
         kept_counts = set(top_counts.get_topk())
         for skuid, comment_count in comments_by_skuid.items():
             if comment_count in kept_counts:
                 self.seeds_queue.put(Seed(value=skuid, type=0))
Exemple #12
0
 def init_start_urls(self):
     """Reset the Redis keys and queue (cate_id, brand_id) seeds from retry tables.

     Creates ``jdskuid<current_date>_sep``, then scans every
     ``jdbrand20ddddddretryN`` table that sorts after the latest
     ``jdbrand..._sep`` collection (or all of them when there is none) for
     rows with ``_status`` 0 and ``status`` 0 or -1. The distinct rows are
     pushed as type-0 seeds in chunks of 1024.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     pending = []
     chunk_size = 1024
     with op.DBManger() as m:
         m.create_db_collection(db_collection=("jingdong", "jdskuid{0}_sep".format(current_date)))
         last_sep = m.get_lasted_collection("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\d_sep$"}})
         seed_values = set()
         retry_tables = m.list_tables("jingdong", filter={"name": {"$regex": r"^jdbrand20\d\d\d\d\d\dretry\d*$"}})
         for table in retry_tables:
             if last_sep and not (table > last_sep):
                 continue
             self.logger.info("valid table : {}".format(table))
             # Built per table so every read_from call gets its own list.
             pipeline = [
                 {
                     "$match": {
                         "$and": [{"_status": 0}, {"$or": [{"status": 0}, {"status": -1}]}]
                     }
                 }
             ]
             rows = m.read_from(db_collect=("jingdong", table),
                                out_field=("cate_id", "brand_id"),
                                pipeline=pipeline)
             for row in rows:
                 seed_values.add(row)
         for value in seed_values:
             pending.append(str(Seed(value=value, type=0)))
             if len(pending) == chunk_size:
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #13
0
 def __init__(self, current_date, **kwargs):
     """Split the 0..5800000 range into slices and queue one seed per result page.

     The range is cut into ``kwargs["spider_num"]`` equal slices. For each
     slice the listing page is fetched through ``self.get_request`` and the
     total page count is read from it; a seed
     ``(pageindex, low, high)`` is queued for every page. Slices without a
     page count are only logged.

     Args:
         current_date: Stored on the instance as ``self.current_date``.
         **kwargs: Forwarded to the parent constructor; ``spider_num`` and
             ``retries`` are also read here.
     """
     super(SecooWeekJob, self).__init__(**kwargs)
     self.proxies = ["http://u{}:[email protected]:3128".format(x) for x in range(28)]
     self.ua = UserAgent()
     self.current_date = current_date
     space = np.linspace(0, 5800000, kwargs["spider_num"] + 1)
     bounds = [int(edge) for edge in space]
     ranges = list(zip(bounds[:-1], bounds[1:]))
     totalpages_pattern = re.compile(r'<strong>共<i>(\d+)</i>页,到第 <b>')
     self.block_pattern = re.compile(r'dlProId=[\s\W\w]*?</dl>')
     self.pid_pattern = re.compile(r'ProId="\d+"')
     self.name_pattern = re.compile(r'title=".*?"')
     self.lo_pattern = re.compile(r'"s1"[\s\W\w]*?</span>')
     self.price_pattern = re.compile(r'secoo_price.*?</span>')
     self.br_pattern = re.compile(r'</i>.*?</span')
     for low, high in ranges:
         request = {
             "url": "http://list.secoo.com/all/0-0-0-0-0-7-0-0-0-10-{0}_{1}-0-100-0.shtml".format(low, high),
             "proxies": {"http": random.choice(self.proxies)},
             "headers": {
                 "Connection": "close",
                 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36 Edge/17.17134',
             },
         }
         page = self.get_request(request)
         found = totalpages_pattern.findall(page)
         if not found:
             self.log.info((0, low, high))
             continue
         page_num = int(found[0])
         self.log.info((page_num, low, high))
         for pageindex in range(1, page_num + 1):
             self.seeds_queue.put(Seed((pageindex, low, high), kwargs["retries"]))
Exemple #14
0
 def parse1(self, response):
     """Yield the follow-up list request for the page described by the seed.

     The seed in ``response.meta["_seed"]`` carries ``(cate_id, brand_id,
     name, page, s)``. Titles captured by ``self.sku_pattern1`` are cleaned
     of tags, tabs and newlines and forwarded in ``meta["sku2title"]``. The
     first match of ``self.first_pettern`` is sent as ``show_items``.

     Raises:
         Exception: carrying the request URL, when ``self.first_pettern``
             does not match or its first match is empty.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, name, page, s = seed.value

     # Raw string: the previous non-raw literal contained the invalid
     # escape "\s". The compiled pattern is unchanged.
     tag_or_blank = r"<[\s\S]*?>|\t|\n"
     sku2title = {}
     for sku in self.sku_pattern1.findall(response.text):
         sku2title[sku[0]] = re.sub(tag_or_blank, "", sku[1])

     matches = self.first_pettern.findall(response.text)
     if not matches:
         raise Exception(response.request.url)
     shown_items = matches[0]
     if not shown_items:
         raise Exception(response.request.url)

     # The follow-up request uses the next page index and an offset moved
     # on by 30; the Referer points back at the current page/offset.
     next_page, next_s = page + 1, s + 30
     en_cate_id = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         ename = urllib.parse.urlencode({"ev": "exbrand_" + name})
         url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             en_cate_id, ename, next_page, next_s, shown_items)
         referer = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             en_cate_id, ename, next_page - 1, next_s - 30)
     else:
         url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             en_cate_id, next_page, next_s, shown_items)
         referer = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             en_cate_id, next_page - 1, next_s - 30)

     request = Request(url=url, callback=self.parse2, priority=2)
     request.headers["Referer"] = referer
     # NOTE(review): the forwarded seed has four fields and drops ``name``
     # -- confirm that parse2 expects this shape.
     request.meta["_seed"] = str(
         Seed((cate_id, brand_id, next_page, next_s), type=2))
     request.meta["last_page_pids"] = shown_items
     request.meta["sku2title"] = sku2title
     request.meta["totalpage"] = response.meta["totalpage"]
     request.meta["currentpage"] = next_page - 1
     request.meta["dydmc_delay"] = 2.5
     yield request
Exemple #15
0
 def __init__(self, seeds_file, **kwargs):
     """Queue one seed per line of ``seeds_file`` and compile the brand pattern.

     The first tab-separated field of each line is used, with every ``-``
     replaced by ``,``; lines go through ``DataSet.shuffle(1024)`` before
     being queued.
     """
     super(GetBrands, self).__init__(**kwargs)
     self.ua = UserAgent()

     def first_field_as_csv(line):
         return line.strip('\n').split("\t")[0].replace('-', ',')

     with open(seeds_file) as infile:
         shuffled = collections.DataSet(infile).map(first_field_as_csv).shuffle(1024)
         for value in shuffled:
             self.seeds_queue.put(Seed(value, kwargs["retries"]))
     self.pattern = re.compile(r'<li id="brand-(\d+)[\s\S]*?品牌::([\s\S]*?)\'\)"')
Exemple #16
0
 def parse5(self, response):
     """Yield the merged price, comment-count and info record for one sku.

     Starts from a copy of ``response.meta["prices"]``, adds the first
     match of ``self.allcnt_pattern`` as ``comment``, overlays
     ``response.meta["info"][int(skuid)]`` and finally removes ``id``.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     counts = self.allcnt_pattern.findall(response.text)
     prices = response.meta["prices"]
     skuid = prices["id"]

     record = dict(prices)
     record["comment"] = counts[0]
     record.update(response.meta["info"][int(skuid)])
     record.pop("id")
     yield record
Exemple #17
0
 def parse1(self, response):
     """Yield the next request for the list page described by the seed.

     The seed in ``response.meta["_seed"]`` carries ``(cate_id, brand_id,
     page, s)``. Titles captured by ``self.sku_pattern1`` are cleaned of
     tags, tabs and newlines. When the first match of
     ``self.first_pettern`` lists exactly 30 comma-separated ids, a
     follow-up list request for ``self.parse2`` is yielded; otherwise the
     ids go straight to the checkChat endpoint handled by ``self.parse3``.

     Raises:
         Exception: carrying the request URL, when ``self.first_pettern``
             does not match or its first match is empty.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     cate_id, brand_id, page, s = seed.value

     # Raw string: the previous non-raw literal contained the invalid
     # escape "\s". The compiled pattern is unchanged.
     tag_or_blank = r"<[\s\S]*?>|\t|\n"
     sku2title = {}
     for sku in self.sku_pattern1.findall(response.text):
         sku2title[sku[0]] = re.sub(tag_or_blank, "", sku[1])

     matches = self.first_pettern.findall(response.text)
     if not matches:
         raise Exception(response.request.url)
     pids = matches[0]
     if not pids:
         raise Exception(response.request.url)

     if len(pids.split(",")) != 30:
         # Fewer than a full set of ids means there is no second half page,
         # e.g. "https://chat1.jd.com/api/checkChat?pidList=10020242230938,1999899692,...&callback=jQuery8117083"
         yield Request(url="https://chat1.jd.com/api/checkChat?pidList={0}&callback=jQuery8117083".format(
             pids), callback=self.parse3, meta=response.meta)
         return

     next_page, next_s = page + 1, s + 30
     en_cate_id = urllib.parse.urlencode({"cat": cate_id})
     if brand_id:
         en_brand_id = urllib.parse.urlencode({"ev": "exbrand_" + brand_id})
         url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&page={2}&s={3}&scrolling=y&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={4}'.format(
             en_cate_id, en_brand_id, next_page, next_s, pids)
         request = Request(url=url, callback=self.parse2, priority=2)
         request.headers["Referer"] = "https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1".format(
             en_cate_id, en_brand_id, next_page - 1, next_s - 30)
         self.logger.info("jd_skuids 6")
     else:
         url = 'https://list.jd.com/list.html?{0}&psort=4&page={1}&s={2}&log_id=1596108547754.6591&tpl=1_M&isList=1&show_items={3}'.format(
             en_cate_id, next_page, next_s, pids)
         request = Request(url=url, callback=self.parse2, priority=2)
         request.headers["Referer"] = "https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1".format(
             en_cate_id, next_page - 1, next_s - 30)
         self.logger.info("jd_skuids 7")
     request.meta["_seed"] = str(Seed((cate_id, brand_id, next_page, next_s), type=2))
     request.meta["last_page_pids"] = pids
     request.meta["sku2title"] = sku2title
     request.meta["totalpage"] = response.meta["totalpage"]
     request.meta["currentpage"] = next_page - 1
     yield request
Exemple #18
0
 def parse(self, response):
     """Yield one list-page request per result page reported by the response.

     The page count is the first match of ``self.totalpage_perttern``. For
     the i-th page the request uses ``page = 2*i - 1`` and
     ``s = 60*(i - 1) + 1``; the Referer is the JD home page for the first
     page and the preceding list page otherwise.

     Raises:
         Exception: carrying the request URL, when no page count is found.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     found = self.totalpage_perttern.findall(response.text)
     if not found:
         raise Exception(response.request.url)

     total_text = found[0]
     for i in range(1, int(total_text) + 1):
         page, s = 2 * i - 1, 60 * (i - 1) + 1
         cate_id, brand_id, name = seed.value
         en_cate_id = urllib.parse.urlencode({"cat": cate_id})
         if brand_id:
             ename = urllib.parse.urlencode({"ev": "exbrand_" + name})
             template = 'https://list.jd.com/list.html?{0}&{1}&page={2}&s={3}&psort=4&click=1'
             url = template.format(en_cate_id, ename, page, s)
             if i == 1:
                 refer = "https://www.jd.com/"
             else:
                 refer = template.format(
                     en_cate_id, ename, 2 * (i - 1) - 1, 60 * (i - 2) + 1)
         else:
             template = 'https://list.jd.com/list.html?{0}&page={1}&s={2}&psort=4&click=1'
             url = template.format(en_cate_id, page, s)
             if i == 1:
                 refer = "https://www.jd.com/"
             else:
                 refer = template.format(
                     en_cate_id, 2 * (i - 1) - 1, 60 * (i - 2) + 1)

         next_seed = Seed((cate_id, brand_id, name, page, s), type=1)
         meta = {
             "dydmc_delay": 2.5,
             "totalpage": int(total_text),
             "currentpage": page,
             "_seed": str(next_seed),
             "headers": {
                 "Connection": "close",
                 "Referer": refer
             },
         }
         yield Request(url=url, callback=self.parse1, meta=meta, priority=1)
Exemple #19
0
 def __init__(self, seeds_file, dateindex, **kwargs):
     """Queue one seed per line of ``seeds_file`` and set up comment parsing.

     The first tab-separated field of each line is used; lines go through
     ``DataSet.shuffle(2048)`` before being queued.

     Args:
         seeds_file: Path of a text file with one record per line.
         dateindex: Stored on the instance as ``self.dateindex``.
         **kwargs: Forwarded to the parent constructor; ``retries`` is
             also read here and handed to every ``Seed``.
     """
     super(GetComment, self).__init__(**kwargs)
     self.ua = UserAgent()

     def first_field(line):
         return line.strip('\n').split("\t")[0]

     with open(seeds_file) as infile:
         shuffled = collections.DataSet(infile).map(first_field).shuffle(2048)
         for value in shuffled:
             self.seeds_queue.put(Seed(value, kwargs["retries"]))
     self.allcnt_pattern = re.compile(r'"commentCount":(\d+),')
     self.dateindex = dateindex
Exemple #20
0
 def parse(self, response):
     """Read the total page count from the response.

     The count is the first match of ``self.totalpage_perttern``. Both
     branches on the count are still placeholders, so the method currently
     produces nothing. A response without a page count is ignored instead
     of failing on an empty match list.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     page_strs = self.totalpage_perttern.findall(response.text)
     # Check for a match before indexing; the previous order indexed
     # page_strs[0] first and raised IndexError on an unmatched page.
     if not page_strs:
         return
     page_strs = page_strs[0]
     if int(page_strs) < 100:
         # no need to flip
         pass
     else:
         # need to flip
         pass
Exemple #21
0
 def make_request_from_data(self, data):
     """Turn a raw Redis entry into a ``Request``.

     Type-0 seeds hold the URL to fetch with ``self.parse``; type-3 seeds
     hold a serialized request. Any other type returns None.
     """
     str_seed = bytes_to_str(data, self.redis_encoding)
     seed = Seed.parse_seed(str_seed)

     if seed.type == 0:
         meta = {"_seed": str_seed}
         return Request(url=seed.value,
                        meta=meta,
                        priority=0,
                        callback=self.parse)

     if seed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(seed.value, self)

     return None
Exemple #22
0
 def __init__(self, seeds_file, **kwargs):
     """Queue one seed per line of ``seeds_file`` and compile the brand pattern.

     The first tab-separated field of each line is used, with every ``-``
     replaced by ``,``; lines go through ``DataSet.shuffle(1024)`` before
     being queued. A list of 28 proxy URLs is stored on ``self.proxies``.
     """
     super(GetBrands1, self).__init__(**kwargs)
     self.proxies = [
         "http://u{}:[email protected]:3128".format(x) for x in range(28)
     ]
     self.ua = UserAgent()

     def first_field_as_csv(line):
         return line.strip('\n').split("\t")[0].replace('-', ',')

     with open(seeds_file) as infile:
         shuffled = collections.DataSet(infile).map(first_field_as_csv).shuffle(1024)
         for value in shuffled:
             self.seeds_queue.put(Seed(value, kwargs["retries"]))
     self.pattern = re.compile(r'"id":.*?"name":".*?"')
Exemple #23
0
 def make_request_from_data(self, data):
     """Turn a raw Redis entry into a ``Request``.

     Type-0 seeds hold a phone number and produce a lookup request for
     shouji.xpcha.com handled by ``self.parse``; type-3 seeds hold a
     serialized request. Any other type returns None.
     """
     str_seed = bytes_to_str(data, self.redis_encoding)
     seed = Seed.parse_seed(str_seed)

     if seed.type == 0:
         phonenumber = seed.value.strip()
         lookup_url = "http://shouji.xpcha.com/{0}.html".format(phonenumber)
         meta = {"_seed": str_seed}
         return Request(url=lookup_url,
                        meta=meta,
                        priority=0,
                        callback=self.parse)

     if seed.type == 3:
         # The seed value is itself a serialized request.
         return Request.deserialize(seed.value, self)

     return None
Exemple #24
0
 def __init__(self, seeds_file, dateindex, **kwargs):
     """Queue one seed per line of ``seeds_file`` and set up comment parsing.

     The first tab-separated field of each line is used; lines go through
     ``DataSet.shuffle(2048)`` before being queued. A list of 28 proxy URLs
     is stored on ``self.proxies``.

     Args:
         seeds_file: Path of a text file with one record per line.
         dateindex: Stored on the instance as ``self.dateindex``.
         **kwargs: Forwarded to the parent constructor; ``retries`` is
             also read here and handed to every ``Seed``.
     """
     super(GetComment1, self).__init__(**kwargs)
     self.proxies = [
         "http://u{}:[email protected]:3128".format(x) for x in range(28)
     ]
     self.ua = UserAgent()

     def first_field(line):
         return line.strip('\n').split("\t")[0]

     with open(seeds_file) as infile:
         shuffled = collections.DataSet(infile).map(first_field).shuffle(2048)
         for value in shuffled:
             self.seeds_queue.put(Seed(value, kwargs["retries"]))
     self.allcnt_pattern = re.compile(r'"CommentCount": "(\d+)"')
     self.dateindex = dateindex
Exemple #25
0
 def parse0(self, response):
     """Yield one list-page request per brand found in the response.

     Each match of ``self.pattern`` supplies ``(brand_id, name)``; the
     category id is the seed value. The new seed
     ``(cate_id, brand_id, name)`` travels in the request meta.
     """
     seed = Seed.parse_seed(response.meta["_seed"])
     for match in self.pattern.findall(response.text):
         cate_id = seed.value
         brand_id, name = match[0], match[1]
         cat_query = urllib.parse.urlencode({"cat": cate_id})
         if brand_id:
             brand_query = urllib.parse.urlencode({"ev": "exbrand_" + name})
             url = 'https://list.jd.com/list.html?{0}&{1}&psort=4&click=1'.format(
                 cat_query, brand_query)
         else:
             url = 'https://list.jd.com/list.html?{0}&psort=4&click=1'.format(
                 cat_query)
         meta = {
             "dydmc_delay": 2.5,
             "_seed": str(Seed(value=(cate_id, brand_id, name))),
             "headers": {
                 "Referer": "https://www.jd.com/"
             },
         }
         yield Request(url=url, meta=meta, priority=0, callback=self.parse)
 def init_start_urls(self):
     """Reset the Redis keys and queue the skuids with the highest comment counts.

     Reads ``skuid`` and the ``comment_<suffix>`` field (suffix = last six
     characters of the latest summary collection name), keeps the first
     comment count seen per skuid and queues a type-0 seed for each skuid
     whose count is in ``TopK(1000000).get_topk()``. Seeds are pushed in
     shuffled chunks of 10000.
     """
     self.redis.delete(self.start_urls_redis_key)
     self.redis.delete(self.items_redis_key)
     with op.DBManger() as m:
         # Create the temporary collection that marks this month's task boundary.
         m.create_db_collection(
             db_collection=("jingdong",
                            "jdcommentdetail{0}_sep".format(current_date)))
         comments_by_skuid = {}
         top_counts = TopK(1000000)
         # skuids in the last result
         last_result = m.get_lasted_collection(
             "jingdong",
             filter={"name": {
                 "$regex": r"^summary_201905_20\d\d\d\d$"
             }})
         comment_field = "comment_{}".format(last_result[-6:])
         pipeline = [
             {
                 "$project": {
                     "skuid": "$skuid",
                     comment_field: "$" + comment_field,
                 }
             },
         ]
         rows = m.read_from(db_collect=("jingdong", last_result),
                            out_field=("skuid", comment_field),
                            pipeline=pipeline)
         for raw_skuid, raw_comments in rows:
             skuid = int(raw_skuid)
             if skuid in comments_by_skuid:
                 continue
             comment_count = int(raw_comments)
             top_counts.push(comment_count)
             comments_by_skuid[skuid] = comment_count
         kept_counts = set(top_counts.get_topk())

         pending = []
         chunk_size = 10000
         for skuid, comment_count in comments_by_skuid.items():
             if comment_count not in kept_counts:
                 continue
             pending.append(str(Seed(value=skuid, type=0)))
             if len(pending) == chunk_size:
                 random.shuffle(pending)
                 self.redis.sadd(self.start_urls_redis_key, *pending)
                 pending = []
         if pending:
             random.shuffle(pending)
             self.redis.sadd(self.start_urls_redis_key, *pending)
Exemple #27
0
 def parse3(self, response):
     """Yield one sku/shop record for each non-empty item in the response.

     The first match of ``self.json_pettern`` in ``response.text`` is decoded
     as JSON and iterated.  Titles are looked up in
     ``response.meta["sku2title"]`` by the stringified ``pid``.

     Raises:
         Exception: carrying the request URL when the decoded JSON is empty.
     """
     cate_id, brand_id, _page, _s = Seed.parse_seed(
         response.meta["_seed"]).value
     payload = json.loads(self.json_pettern.findall(response.text)[0])
     if not payload:
         raise Exception(response.request.url)

     for entry in payload:
         if not entry:
             self.logger.info("jd_skuids 2")
             continue
         self.logger.info("jd_skuids 1")
         pid = entry.get("pid")
         shop_id = entry.get("shopId")
         vender_id = entry.get("venderId", None)
         seller = entry.get("seller")
         # 1 when the seller name contains the self-operated marker, else 0.
         ziying = int(bool(seller) and seller.find("自营") != -1)
         title = response.meta["sku2title"][str(pid)]
         yield {
             "skuid": pid,
             "cate_id": cate_id,
             "brand_id": brand_id,
             "shopid": shop_id,
             "venderid": vender_id,
             "shop_name": seller,
             "ziying": ziying,
             "title": title,
             "chaoshi": 1 if "京东超市" in title else 0,
         }
Exemple #28
0
 def __init__(self, seeds_file, **kwargs):
     """Initialise the spider and queue every valid number from ``seeds_file``.

     Args:
         seeds_file: Path of a text file read line by line; each line is a
             candidate 11-digit phone number.
         **kwargs: Forwarded to the parent constructor.  ``kwargs["retries"]``
             is also passed to every ``Seed`` that gets queued.
     """
     super(NewPhone, self).__init__(**kwargs)
     self.ua = UserAgent()
     self.phone_regx = re.compile(r'^\d{11,11}$')
     self.phone_number_checker = stringUtils.check_legality(
         pattern=r'^\d{11,11}$')
     # Read the seeds inside a context manager so the file handle is closed
     # even if queueing a seed raises (the bare open() used to leak it).
     with open(seeds_file) as seeds:
         for seed in seeds:
             seed = seed.strip("\n")
             if self.phone_number_checker(seed):
                 self.seeds_queue.put(Seed(seed, kwargs["retries"]))
             else:
                 # NOTE(review): this is the rejection path, yet the message
                 # says "legal_format" -- confirm the intended wording.
                 self.log.info("legal_format: " + seed)
     self.pro_city_pattern = re.compile(
         r'<dd><span>号码归属地:</span>(.*?) (.*?)</dd>')
     self.telcompany_pattern = re.compile(
         r'<dd><span>手机卡类型:</span>(.*?)</dd>')
Exemple #29
0
 def parse3(self, response):
     """Collect sku/shop info from the response and request the sku prices.

     The first match of ``self.json_pettern`` in ``response.text`` is decoded
     as JSON; every non-empty item becomes a record keyed by its ``pid``.
     The records are stored in ``response.meta["info"]`` and a single price
     request for all keys of ``response.meta["sku2title"]`` is yielded with
     ``self.parse4`` as callback.

     Raises:
         Exception: carrying the request URL when the decoded JSON is empty.
     """
     cate_id, brand_id, _page, _s = Seed.parse_seed(
         response.meta["_seed"]).value
     payload = json.loads(self.json_pettern.findall(response.text)[0])
     if not payload:
         raise Exception(response.request.url)

     info_by_pid = {}
     for entry in payload:
         if not entry:
             continue
         pid = entry.get("pid")
         shop_id = entry.get("shopId")
         vender_id = entry.get("venderId", None)
         seller = entry.get("seller")
         # 1 when the seller name contains the self-operated marker, else 0.
         ziying = int(bool(seller) and seller.find("自营") != -1)
         title = response.meta["sku2title"][str(pid)]
         info_by_pid[pid] = {
             "skuid": pid,
             "cate_id": cate_id,
             "brand_id": brand_id,
             "shopid": shop_id,
             "venderid": vender_id,
             "shop_name": seller,
             "ziying": ziying,
             "title": title,
             "chaoshi": 1 if "京东超市" in title else 0,
         }

     # The follow-up request reuses the response's meta dict, extended with
     # the collected records and the per-request settings below.
     response.meta["info"] = info_by_pid
     response.meta["dydmc_delay"] = 1
     response.meta["headers"] = {
         "Connection": "close",
         "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Mobile Safari/537.36",
     }

     # Iterating the mapping yields its keys, i.e. the sku ids.
     joined_skus = "%2CJ_".join(response.meta["sku2title"])
     pduid = str(random.randint(100000000, 999999999))
     price_url = ("https://p.3.cn/prices/mgets?&type=1&skuIds=J_" +
                  joined_skus + '&pduid=' + pduid)
     yield Request(url=price_url,
                   callback=self.parse4,
                   meta=response.meta,
                   priority=4)
Exemple #30
0
 def make_request_from_data(self, data):
     """Build a Request from one raw seed entry.

     ``data`` is converted to text with ``bytes_to_str`` using
     ``self.redis_encoding`` and parsed with ``Seed.parse_seed``.

     * ``seed.type == 0``: ``seed.value`` is a skuid; a request for the first
       page of that sku's comment list on wq.jd.com is returned, with
       ``self.parse`` as callback.
     * ``seed.type == 3``: ``seed.value`` is passed to ``Request.deserialize``
       and the resulting request is returned.

     No other seed type is handled in this block, so the function falls
     through (returning None) for them.
     """
     str_seed = bytes_to_str(data, self.redis_encoding)
     seed = Seed.parse_seed(str_seed)
     if seed.type == 0:
         skuid = seed.value
         # Earlier variant of the endpoint, kept for reference:
         #url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=fetchJSON_comment98&pagesize=10&sceneval=2&skucomment=1&score=0&sku={0}&sorttype=6&page=0".format(skuid)
         url = "https://wq.jd.com/commodity/comment/getcommentlist?callback=skuJDEvalB&version=v2&pagesize=10&sceneval=2&skucomment=1&score=0&sku={}&sorttype=6&page=1&t=0.5156075450518778".format(
             skuid)
         # Headers imitating a mobile browser; the Referer points at the
         # mobile item page of the same sku.
         # NOTE(review): the cookie below is a hard-coded session string,
         # presumably captured from a browser -- confirm it is still valid.
         headers = {
             'Connection':
             'close',
             'Host':
             'wq.jd.com',
             'accept':
             '*/*',
             'sec-fetch-site':
             'same-site',
             'sec-fetch-mode':
             'no-cors',
             'sec-fetch-dest':
             'script',
             "Referer":
             "https://item.m.jd.com/ware/view.action?wareId={}&sid=null".
             format(skuid),
             'accept-encoding':
             'gzip, deflate, br',
             'accept-language':
             'zh-CN,zh;q=0.9,en-US;q=0.8,en;q=0.7',
             'User-Agent':
             'Mozilla/5.0 (Linux; Android 10; HRY-AL00a; HMSCore 5.1.1.303) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.106 HuaweiBrowser/11.0.7.303 Mobile Safari/537.36',
             "cookie":
             "__jdc=122270672; mba_muid=16087105855231456793479; shshshfpa=b86c237d-b506-9cc9-730d-39db2f5ea48c-1608710586; shshshfpb=aW2xjA0PZevBiTvJrQ6rk4A%3D%3D; retina=1; webp=1; visitkey=31140776387466944; sbx_hot_h=null; deviceVersion=83.0.4103.106; deviceOS=android; deviceOSVersion=10; deviceName=Chrome; rurl=https%3A%2F%2Fwqs.jd.com%2Ffaqs%2Findex.html%3Fsceneval%3D2%26ptag%3D7001.1.124%26productId%3D12991458%26ispg%3D%26_fd%3Djdm%26jxsid%3D16109541564584400343; equipmentId=A75Q6PQS36IHI62HBEUGC44IVLERE7257UWVYTGEXPMR6NOKARSVVF2Q6EBPSVGNR537LK6GQN3ENW47JREOEXNAVI; __jdv=122270672%7Cdirect%7C-%7Cnone%7C-%7C1614224630058; sc_width=360; shshshfp=c6774e911e47825ddd51cefc23f9b157; wxa_level=1; cid=9; jxsid=16145705280303310338; __jda=122270672.16087105855231456793479.1608710585.1614224630.1614570529.10; wq_ug=14; fingerprint=794164a430090764096f40466260c718; mt_xid=V2_52007VwMVU1ReUlsbQB1YBmUDF1ZaXlpYGk8RbFVuBEBVWV9RRkhIGw4ZYlcRWkFQWwlIVR5aAjAAR1BZX1tZHnkaXQZnHxNQQVlSSx9JElgFbAEbYl9oUmoXSB5dDWYKE1BZXlNeF08cVQNvMxJbWV8%3D; wq_logid=1614571192.282863947; wqmnx1=MDEyNjM5M3AuL3d3MiY2NjQ1eGQtTTFBaSBsby8zd3IzZTUyNy00UkghKQ%3D%3D; __jdb=122270672.9.16087105855231456793479|10.1614570529; mba_sid=16145705290954323095988279117.9; __wga=1614571199267.1614570547761.1614225998734.1610954174749.5.6; PPRD_P=UUID.16087105855231456793479-LOGID.1614571199300.300139660; jxsid_s_t=1614571199496; jxsid_s_u=https%3A//item.m.jd.com/ware/view.action; sk_history=70241615154%2C101609%2C615036%2C54761686610%2C1399903%2C10024515889185%2C10381689654%2C12991458%2C100010062010%2C58070892025%2C100007627009%2C; shshshsID=e45b3b58ca53b7ab42489de6ebc02d6b_5_1614571200418"
         }
         # "dydmc_delay" is a random value in [0.15, 0.25); presumably a
         # per-request delay in seconds -- confirm against the downloader.
         return Request(url=url,
                        meta={
                            "_seed": str_seed,
                            "dydmc_delay": 0.15 + random.random() * 0.1,
                            "headers": headers
                        },
                        priority=0,
                        callback=self.parse)
     elif seed.type == 3:
         # The seed value is itself a serialized request; rebuild it as-is.
         str_seed = seed.value
         request = Request.deserialize(str_seed, self)
         return request