def run(cls):
    """Build the monthly industry-report URLs for every category and seller type.

    Returns the full list of formatted URLs; each one is also printed as it
    is produced.
    """
    template = (
        "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&"
        "dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&"
        "indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|"
        "addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|"
        "payItemQty|searchIndex|tradeIndex|payAmtParentRate&seller={seller}"
    )
    # One [first-day-of-month, month-end] pair per calendar month in the range.
    month_spans = [
        [datetime.date(d.year, d.month, 1).strftime('%Y-%m-%d'), d.strftime('%Y-%m-%d')]
        for d in pd.date_range('2015-09-01', '2018-09-01', freq='1M').date
    ]
    links = []
    for cate_id in ConfigData().getFullCate():
        for first_day, last_day in month_spans:
            for seller_type in [1, -1]:
                link = template.format(
                    cateId=cate_id,
                    start_time=first_day,
                    end_time=last_day,
                    dateType='month',
                    seller=seller_type,
                )
                links.append(link)
                print(link)
    return links
def run(cls):
    """Build the monthly brand-rank URLs for every category and seller type.

    Returns the list of formatted URLs, printing each one as a side effect.
    """
    template = (
        "https://sycm.taobao.com/mq/brand/rank.json?"
        "cateId={cateId}&"
        "dateRange={start_time}%7C{end_time}&"
        "dateType={dateType}&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller={seller}"
    )
    # One [first-day-of-month, month-end] pair per calendar month in the range.
    month_spans = [
        [datetime.date(d.year, d.month, 1).strftime('%Y-%m-%d'), d.strftime('%Y-%m-%d')]
        for d in pd.date_range('2017-08-01', '2018-09-01', freq='1M').date
    ]
    links = []
    for cate_id in ConfigData().getFullCate():
        for span in month_spans:
            for seller_type in [-1, 0, 1]:
                link = template.format(
                    cateId=cate_id,
                    start_time=span[0],
                    end_time=span[1],
                    dateType='month',
                    seller=seller_type,
                )
                links.append(link)
                print(link)
    return links
def parse_page(self):
    """Scrape the industry-trend page and its summary tables via the webdriver.

    Loads the request URL, walks every pagination page of the category-trend
    table, persists the combined header/value row as JSON through
    ``self.db.data_process``, then pages through each table listed in
    ``self.total_table``.
    """
    self.driver.get(self.request.url)
    self.driver.refresh()
    self.get_cookie()
    time.sleep(2)  # give the page time to render before scraping
    # Destructure the single expected regex match from the request URL.
    # NOTE(review): 'devcice' is a typo for 'device' — kept as-is (local name).
    [(cateId, start_time, end_time, dateType, devcice, seller)
     ] = self.re_partern.findall(self.request.url)
    print(self.re_partern.findall(self.driver.current_url))
    common_tr, common_tb, data_key = ConfigData.cateField(
        self.cates, cateId, start_time, end_time, dateType, devcice, seller)
    # First page of the category-trend table: headers (tr) and cell values (tb).
    cate_trend_tr = list(
        map(lambda v: v.text, self.find_elements(*self.cate_trend_tr_s)))
    cate_trend_tb = list(
        map(lambda v: v.text, self.find_elements(*self.cate_trend_tb_s)))
    # Keep clicking "next" and appending rows until the button can no longer
    # be found (the lookup raises) — that ends the pagination loop.
    while True:
        try:
            self.quick_find_element(*self.cate_trend_next_s)
            self.click_item(*(By.CSS_SELECTOR, '#cateTrend .right'))
            cate_trend_tr = cate_trend_tr + list(
                map(lambda v: v.text, self.find_elements(*self.cate_trend_tr_s)))
            cate_trend_tb = cate_trend_tb + list(
                map(lambda v: v.text, self.find_elements(*self.cate_trend_tb_s)))
        except Exception as e:
            logger.info(e)  # no next button -> last page reached
            break
    # Best-effort scroll to the bottom of the page before reading totals.
    try:
        self.get_bottom()
        time.sleep(.5)
    except Exception as e:
        logger.info(e)
    value = json.dumps(dict(
        zip(cate_trend_tr + common_tr, cate_trend_tb + common_tb)),
        ensure_ascii=False)
    logger.info(value)
    self.db.data_process('宝洁官方旗舰店', '市场-市场大盘-行业趋势',
                         data_key + [cate_trend_tb[0]], value, '生意参谋',
                         start_time, end_time)
    # Scrape every additional table; skip a table whose container is absent.
    for table in self.total_table:
        try:
            self.quick_find_element(By.CSS_SELECTOR, table[0])
        except Exception as e:
            logger.info(e)
            continue
        self.process(table, common_tb, common_tr, data_key, start_time,
                     end_time)
        # Second-to-last matched element presumably carries the page count —
        # TODO confirm against the live pager markup.
        total = self.find_elements(By.CSS_SELECTOR,
                                   self.total_s.format(table[0]))
        total = int(total[len(total) - 2].text)
        while total > 1:
            self.click_item(By.CSS_SELECTOR, self.next_page_s.format(table[0]))
            self.process(table, common_tb, common_tr, data_key, start_time,
                         end_time)
            total = total - 1
def worker(cls, url):
    """Fetch one rank URL and persist whatever rows it returns."""
    cate_id, begin, end, date_type, seller = cls.re_partern.findall(url)[0]
    commom_tr, common_tb, data_key = ConfigData.cateField(
        cls.cates_items, cate_id, begin, end, date_type, 0, seller)
    time.sleep(3)  # throttle so the endpoint is not hammered
    rows = []
    try:
        rows = cls.request(url)
    except Exception as e:
        print(e)
    if rows:
        cls.parse(rows, commom_tr, common_tb, data_key, begin, end)
def run(self):
    """Crawl monthly brand-rank data for a fixed set of categories.

    Builds one URL per (category, month, seller type), fetches it and feeds
    any returned rows straight into ``self.parse``.
    """
    template = (
        "https://sycm.taobao.com/mq/brand/rank.json?"
        "cateId={cateId}&"
        "dateRange={start_time}%7C{end_time}&"
        "dateType={dateType}&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller={seller}"
    )
    # One [first-day-of-month, month-end] pair per calendar month in the range.
    month_spans = [
        [datetime.date(d.year, d.month, 1).strftime('%Y-%m-%d'), d.strftime('%Y-%m-%d')]
        for d in pd.date_range('2017-08-01', '2018-09-01', freq='1M').date
    ]
    # Keep the full category metadata on the instance, as before.
    self.all = ConfigData().get_all()
    cate_meta = self.all
    for cate_id in [50011991, 50011992, 121366011, 121408009, 125172008]:
        for begin, end in month_spans:
            for seller_type in [-1, 0, 1]:
                link = template.format(cateId=cate_id, start_time=begin,
                                       end_time=end, dateType='month',
                                       seller=seller_type)
                print(link)
                commom_tr, common_tb, data_key = ConfigData.cateField(
                    cate_meta, cate_id, begin, end, 'month', 0, seller_type)
                rows = []
                try:
                    rows = self.request(link)
                except Exception as e:
                    print(e)
                if rows:
                    self.parse(rows, commom_tr, common_tb, data_key, begin, end)
def main():
    """Seed the Redis queue with market-overview crawl URLs.

    Generates day / week / month date windows from 2018-09-05 up to
    yesterday and pushes one URL per (window, device, category, seller)
    combination onto the 'compeletion:start_urls' list.
    """
    start_time = '2018-09-05'
    yesterday = datetime.date.today() - datetime.timedelta(days=1)
    # Parse the ISO string into a date object.
    start_time = datetime.date(*[int(i) for i in start_time.split('-')])
    db = RedisClient('127.0.0.1', '6379')
    cates = ConfigData().getFullCate()
    for dateType in ['day', 'week', 'month']:
        crawl_dates = []
        if dateType == 'day':
            # One single-day [start, end] window per day in the range.
            crawl_dates = [[i.strftime('%Y-%m-%d'), i.strftime('%Y-%m-%d')]
                           for i in pd.date_range(start_time, yesterday).date]
        elif dateType == 'week':
            # Weekly windows run Monday..Sunday; check whether the start
            # date already is a Monday.
            current_week = start_time.isoweekday()
            if current_week != 1:
                # BUGFIX: was `7 - current_week`, which lands on the
                # following *Sunday* (isoweekday w + (7 - w) == 7);
                # `8 - current_week` is the next Monday, matching the
                # variable name and the intent of the original comment.
                monday = start_time + datetime.timedelta(days=8 - current_week)
                # Only keep weeks that have fully elapsed.
                crawl_dates = [[
                    i.strftime('%Y-%m-%d'),
                    (i + datetime.timedelta(days=6)).strftime('%Y-%m-%d')
                ] for i in pd.date_range(monday, yesterday, freq='7D').date
                    if (yesterday - i).days > 7]
            else:
                crawl_dates = [[
                    i.strftime('%Y-%m-%d'),
                    (i + datetime.timedelta(days=6)).strftime('%Y-%m-%d')
                ] for i in pd.date_range(start_time, yesterday, freq='7D').date
                    if (yesterday - i).days > 7]
        elif dateType == 'month':
            # [first-day-of-month, month-end] windows.
            crawl_dates = [[
                datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
                v.strftime('%Y-%m-%d')
            ] for v in pd.date_range(start_time, yesterday, freq='1M').date]
        for crawl_date in crawl_dates:
            for device in [0, 2]:
                for cate in cates:
                    for seller in [-1, 1]:
                        url = "https://sycm.taobao.com/mc/mq/overview?cateFlag=0&cateId={}&dateRange={}&dateType={}&device={}&sellerType={}".format(
                            cate, crawl_date[0] + "%7c" + crawl_date[1],
                            dateType, device, seller)
                        db.add('compeletion:start_urls', url)
                        print(url)
def worker(cls, url):
    """Fetch one report URL (with a special-case rewrite) and store the rows."""
    cate_id, begin, end, date_type, seller = cls.re_partern.findall(url)[0]
    commom_tr, common_tb, data_key = ConfigData.cateField(
        cls.cates_items, cate_id, begin, end, date_type, 0, seller)
    rows = []
    # Categories 25 and 124484008 are requested with a reduced indexCode
    # list (no searchIndex/tradeIndex/payAmtParentRate) — presumably those
    # indices are unavailable for them; TODO confirm against the API.
    if int(cate_id) == 25 or int(cate_id) == 124484008:
        url = (
            "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|payItemQty&seller={seller}"
        ).format(
            cateId=cate_id,
            start_time=begin,
            end_time=end,
            dateType=date_type,
            seller=seller,
        )
    try:
        rows = cls.request(url)
    except Exception as e:
        print(e)
    if len(rows) != 0:
        cls.parse(rows, commom_tr, common_tb, data_key, begin, end)
def run(cls):
    """Build the monthly child-category rank URLs for every category.

    Returns the list of formatted URLs, printing each one as a side effect.

    BUGFIX: the template previously used regex-style ``(.*?)`` placeholders
    (copied from the matching pattern) instead of ``{field}`` format fields,
    so ``str.format(**kwargs)`` returned the template unchanged and every
    generated "URL" was the identical, unusable literal.
    """
    url = ("https://sycm.taobao.com/mq/overview/childCateRank.json?"
           "cateId={cateId}&dateRange={start_time}%7C{end_time}&"
           "dateType={dateType}&device=0&seller={seller}")
    cates = ConfigData().getFullCate()
    dateType = 'month'
    sellers = [1, -1]
    # One [first-day-of-month, month-end] pair per calendar month in the range.
    crawl_dates = [[datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
                    v.strftime('%Y-%m-%d')]
                   for v in pd.date_range('2015-09-01', '2018-09-01',
                                          freq='1M').date]
    arr = []
    for cateId in cates:
        for crawl_date in crawl_dates:
            for seller in sellers:
                new_url = url.format(
                    cateId=cateId,
                    start_time=crawl_date[0],
                    end_time=crawl_date[1],
                    dateType=dateType,
                    seller=seller
                )
                arr.append(new_url)
                print(new_url)
    return arr
class Rank(object):
    """Crawls the sycm monthly 'reportIndex' report and stores rows via Dto."""

    # Captures cateId, start/end dates, dateType and seller from a report URL.
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&indexCode=uv\|pv\|searchUvCnt\|searchPvCnt\|searchClkRate\|favBuyerCnt\|favCnt\|addCartBuyerCnt\|addCartCnt\|payPct\|visitItemCnt\|sellerCnt\|visitSellerCnt\|paySellerCnt\|payItemQty\|searchIndex\|tradeIndex\|payAmtParentRate&seller=(.*?)$")
    # NOTE(review): database credentials are hard-coded; move to config/env.
    db = Dto(sql_address='mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()

    @classmethod
    def run(cls):
        """Build and return the monthly report URLs for every category/seller."""
        url = \
            "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&" \
            "dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&" \
            "indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|" \
            "addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|" \
            "payItemQty|searchIndex|tradeIndex|payAmtParentRate&seller={seller}"
        cates = ConfigData().getFullCate()
        dateType = 'month'
        sellers = [1, -1]
        # One [first-day-of-month, month-end] pair per calendar month.
        crawl_dates = [[datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
                        v.strftime('%Y-%m-%d')]
                       for v in pd.date_range('2015-09-01', '2018-09-01',
                                              freq='1M').date]
        arr = []
        for cateId in cates:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(
                        cateId=cateId,
                        start_time=crawl_date[0],
                        end_time=crawl_date[1],
                        dateType=dateType,
                        seller=seller
                    )
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one report URL (with a special-case rewrite) and store rows."""
        (cateId, start_time, end_time, dateType,
         seller) = cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        data = []
        # Categories 25 and 124484008 use a reduced indexCode list (without
        # searchIndex/tradeIndex/payAmtParentRate) — presumably those indices
        # are unavailable for them; TODO confirm against the API.
        if int(cateId) == 25 or int(cateId) == 124484008:
            url = (
                "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|payItemQty&seller={seller}"
            ).format(
                cateId=cateId, start_time=start_time, end_time=end_time,
                dateType=dateType, seller=seller)
        try:
            data = cls.request(url)
        except Exception as e:
            print(e)
        if len(data) != 0:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET the report endpoint; return the data list, or [] on any failure."""
        rep = requests.get(new_url, headers={
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': '/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'cookie': 'cookie2=34c05924cf1d6c6e07660b3502afa9be; csg=d7ebd851'
        })
        if rep.status_code == 200:
            # BUGFIX: json.loads no longer accepts an 'encoding' kwarg
            # (removed in Python 3.9); requests already decodes .text.
            rep_josn = json.loads(rep.text)
            if rep_josn['hasError'] == False:
                return rep_josn['content']['data']
            else:
                # print(rep.text)
                return []
        # BUGFIX: previously fell through returning None on non-200, which
        # made worker's len(data) raise TypeError.
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Flatten the index rows into one JSON record and persist it once.

        The loop retries the write on the next truthy value if an attempt
        raises; it breaks after the first successful write.
        """
        tr = [ItenEnum[v['indexCode']].value for v in data]
        # Prefer currentValue; fall back to the last entry of 'values'.
        tb = [v.get('currentValue', None) or v.get('values', [0])[-1]
              for v in data]
        for tb_val in tb:
            if tb_val:
                try:
                    json_str = json.dumps(
                        dict(zip(tr + common_tr, tb + common_tb)),
                        ensure_ascii=False)
                    cls.db.data_process(
                        'hasbro孩之宝旗舰店', '市场行情-行业大盘-行业报表',
                        data_key + [start_time], json_str, '生意参谋',
                        start_time, end_time
                    )
                    print(json_str)
                    break
                except Exception as e:
                    print(e)
class Rank(object):
    """Crawls the sycm hot-attribute rank ('hotRank') and stores rows via Dto."""

    # Column headers matched positionally with the fields built in parse().
    tr = ['排名', '属性值', '交易指数', '支付件数', '类别']
    # Captures cateId, start/end dates, dateType and seller from a rank URL.
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&hotAttrType=0&propertyId=122216905&seller=(.*?)$"
    )
    # NOTE(review): database credentials are hard-coded; move to config/env.
    db = Dto(
        sql_address=
        'mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()

    @classmethod
    def run(cls):
        """Build and return the daily hot-rank URLs for a fixed category set."""
        url = "https://sycm.taobao.com/mq/property/hotRank.json?" \
              "cateId={cateId}&dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&hotAttrType=0&propertyId=122216905&seller={seller}"
        dateType = 'day'
        sellers = [-1, 0, 1]
        # Single-day [start, end] windows, one per day in the range.
        crawl_dates = [[v.strftime('%Y-%m-%d'), v.strftime('%Y-%m-%d')]
                       for v in pd.date_range('2018-06-20', '2018-09-16').date]
        arr = []
        for cateId in [
                121484013, 121422013, 121468012, 121368013, 121452007,
                121396013
        ]:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(cateId=cateId,
                                         start_time=crawl_date[0],
                                         end_time=crawl_date[1],
                                         dateType=dateType,
                                         seller=seller)
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one hot-rank URL and persist whatever rows it returns."""
        (cateId, start_time, end_time, dateType,
         seller) = cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        print(commom_tr)
        print(common_tb)
        print(data_key)
        data = []
        try:
            time.sleep(1)  # throttle requests
            data = cls.request(url)
        except Exception as e:
            print(e)
        if data:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET the hot-rank endpoint; return the data list, or [] on failure."""
        rep = requests.get(
            new_url,
            headers={
                'Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'Accept': '/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent':
                'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'cookie': 'cookie2=19b3cf1a61b5e088b1f21ca683995e05; csg=4586600b'
            })
        if rep.status_code == 200:
            # BUGFIX: json.loads no longer accepts an 'encoding' kwarg
            # (removed in Python 3.9); requests already decodes .text.
            rep_josn = json.loads(rep.text)
            if rep_josn['hasError'] == False:
                return rep_josn['content']['data']['data']
            else:
                print(rep.text)
                return []
        # BUGFIX: normalize the non-200 path to [] instead of implicit None.
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Persist one JSON record per rank row.

        Fixed: the loop variable previously shadowed itself (``for tb in
        data: tb = [...]``); renamed for clarity, behavior unchanged.
        """
        for row in data:
            # Field order must match cls.tr: rank, value, index, qty, name.
            tb = [
                row['rank'], row['properties'][0]['value']['name'],
                row['tradeIndex'], row['payItemQty'],
                row['properties'][0]['name']
            ]
            try:
                json_str = json.dumps(dict(
                    zip(cls.tr + common_tr, tb + common_tb)),
                    ensure_ascii=False)
                cls.db.data_process('宝洁官方旗舰', '市场行情-属性分析-属性排行-热销属性榜',
                                    data_key + [str(tb[0])], json_str,
                                    '百雀羚旗舰店', start_time, end_time)
                print(json_str)
            except Exception as e:
                print(e)
class Rank(object):
    """Crawls the sycm monthly brand rank and stores rows via Dto."""

    # Column headers, positionally matched with the JSON keys in `item`.
    tr = ['热销排名', '品牌名称', '交易指数', '交易增长幅度', '支付商品数', '支付转化率', '品牌ID']
    item = ['rankNo', 'brandName', 'tradeIndex', 'tradeIndexPercent',
            'payItemCnt', 'payByrRate', 'brandId']
    # Captures cateId, start/end dates, dateType and seller from a rank URL.
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller=(.*?)$")
    # NOTE(review): database credentials are hard-coded; move to config/env.
    db = Dto(sql_address='mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()

    @classmethod
    def run(cls):
        """Build and return the monthly brand-rank URLs for every category."""
        url = "https://sycm.taobao.com/mq/brand/rank.json?" \
              "cateId={cateId}&" \
              "dateRange={start_time}%7C{end_time}&" \
              "dateType={dateType}&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller={seller}"
        cates = ConfigData().getFullCate()
        dateType = 'month'
        sellers = [-1, 0, 1]
        # One [first-day-of-month, month-end] pair per calendar month.
        crawl_dates = [[datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
                        v.strftime('%Y-%m-%d')]
                       for v in pd.date_range('2017-08-01', '2018-09-01',
                                              freq='1M').date]
        arr = []
        for cateId in cates:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(
                        cateId=cateId,
                        start_time=crawl_date[0],
                        end_time=crawl_date[1],
                        dateType=dateType,
                        seller=seller
                    )
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one brand-rank URL and persist whatever rows it returns."""
        (cateId, start_time, end_time, dateType,
         seller) = cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        time.sleep(3)  # throttle so the endpoint is not hammered
        data = []
        try:
            data = cls.request(url)
        except Exception as e:
            print(e)
        if data:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET the brand-rank endpoint; return the data list, or [] on failure."""
        rep = requests.get(new_url, headers={
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': '/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'cookie': 'cookie2=34c05924cf1d6c6e07660b3502afa9be; csg=d7ebd851'
        })
        if rep.status_code == 200:
            # BUGFIX: json.loads no longer accepts an 'encoding' kwarg
            # (removed in Python 3.9); requests already decodes .text.
            rep_josn = json.loads(rep.text)
            if rep_josn['hasError'] == False:
                return rep_josn['content']['data']['data']
            else:
                print(rep.text)
                return []
        # BUGFIX: normalize the non-200 path to [] instead of implicit None.
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Persist one JSON record per brand row (capped at 50 rows)."""
        count = 0
        for tbs in data:
            count += 1
            if count > 50:
                break
            # Missing keys become None rather than raising.
            tb = [tbs.get(key, None) for key in cls.item]
            try:
                json_str = json.dumps(
                    dict(zip(cls.tr + common_tr, tb + common_tb)),
                    ensure_ascii=False)
                cls.db.data_process(
                    'hasbro孩之宝旗舰店', '品牌分析-品牌排行-热销品牌榜',
                    data_key + [str(tb[0])], json_str, '百雀羚旗舰店',
                    start_time, end_time
                )
                print(json_str)
            except Exception as e:
                print(e)
def main(): db = RedisClient('127.0.0.1', '6379') cates = ConfigData().getFullCate(needJunjor=False) crawl_dates = [['2018-08-12', '2018-09-10']] deo = Dto("mysql+mysqlconnector://root:123456@localhost:3306/test") urls = [ { "url": "sycm.taobao.com/mq/buyerPortrait/getJob.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30" "&device=0&price={}&seller={}&sex=", "name": "职业分布" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getLevel.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&" "dateType=recent30&device=0&price={}&seller={}&sex=", "name": "淘气值分布" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getBuyerArea.json?age=&area=&areaType=province&cateId={}¤tPage=1&" "dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&price={}&seller={}&sex=", "name": "省份分布排行" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getBuyerArea.json?age=&area=&areaType=city&cateId={}" "¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&price={}&seller={}&sex=", "name": "城市分布排行" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getFondnessPropsByOneLevel.json?age=&area=&" "cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "属性偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getKeyWord.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&" "dateType=recent30&device=0&price={}&seller={}&sex=", "name": "搜索词偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getNinetyDaysBuyCnt.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "近90天购买次数" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getNinetyDaysAmt.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "近90天支付金额" }, { "url": 
"sycm.taobao.com/mq/buyerPortrait/getFondnessBrand.json?age=&area=&cateId={}¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&preference=brand&price={}&seller={}&sex=", "name": "买家品牌购买偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getFondnessBrand.json?age=&area=&cateId={}¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&preference=cate&price={}&seller={}&sex=", "name": "买家类目购买偏好" } ] # 数据类型 for url in urls: for cate in [50010788, 1801]: price_list = [] if cate == 50010788: price_list = [('', '全部'), ('0-25', 0.3995), ('25-55', 0.3059), ('55-120', '0.1999'), ('120-235', 0.0597), ('235-360', 0.0247), ('360', 0.0103)] elif cate == 1801: price_list = [('', '全部'), ('0-30', 0.2950), ('30-65', 0.3088), ('65-135', 0.2454), ('135-240', 0.0928), ('240-450', 0.0383), ('405', 0.0197)] for price in price_list: for seller in [-1, 0, 1]: # 终端 new_url = url['url'].format(cate, price[0], seller)