def get_searchwords(self, data_type):
    """Return the search-word list stored for ``data_type``.

    Loads the single ``UserParams`` row whose ``data_type`` matches,
    decodes its JSON ``value`` column and returns the entry found at
    ``['userParams']['搜索词列表']``.

    Any exception raised by the query, the JSON decoding or the key
    lookups propagates to the caller; the session is closed either way.
    """
    session = Dto(self.__sql_address__)
    try:
        conf = session.query(UserParams).filter(
            UserParams.data_type == data_type).one()
        params = json.loads(conf.value)
        return params['userParams']['搜索词列表']
    finally:
        # Previously the session leaked whenever the lookup above raised.
        session.close()
def get_current_date_where_data_type_and_time_dimension(
        self, data_type, time_dimension):
    """Return the ``TaskProgress`` row for a data type / time dimension.

    Returns the row matching both ``data_type`` and ``time_dimension``,
    or ``None`` when the lookup raises (the error is logged, not
    re-raised).
    """
    session = Dto(self.__sql_address__)
    task_progress = None
    try:
        task_progress = session.query(TaskProgress).filter(
            TaskProgress.data_type == data_type,
            TaskProgress.time_dimension == time_dimension).one()
    except Exception as e:
        logger.error('数据库未查询到%s and %s' % (data_type, time_dimension) +
                     str(e))
    finally:
        # The session used to be left open on every path; close it here,
        # matching the other accessors that close before returning rows.
        # NOTE(review): the returned row is used after the session is
        # closed - confirm callers only read already-loaded columns.
        session.close()
    return task_progress
def getThridCate(self, seniorName=('美容护肤/美体/精油', '彩妆/香水/美妆工具')):
    """Return the ids of the categories two levels below ``seniorName``.

    For every ``SycmCategory`` whose name is in ``seniorName``, walks its
    children (rows whose ``parent_id`` is that category's id) and collects
    the ids of each child's own children.

    :param seniorName: iterable of top-level category names to start from.
    :return: flat list of grandchild category ids, in query order.
    """
    session = Dto(self.__sql_address__)
    try:
        chain = []
        top_level = session.query(SycmCategory).filter(
            SycmCategory.name.in_(seniorName))
        for cate in top_level:
            subcates = session.query(SycmCategory).filter(
                SycmCategory.parent_id == cate.id)
            for subcate in subcates:
                junior_cates = session.query(SycmCategory).filter(
                    SycmCategory.parent_id == subcate.id).all()
                # extend() with an empty result is a no-op, so no length
                # check is needed.
                chain.extend(junior.id for junior in junior_cates)
        return chain
    finally:
        # The session was never closed before; release it on every path.
        session.close()
def getFullCate(self, needJunjor=True,
                seniorName=('模玩/动漫/周边/cos/桌游', '玩具/童车/益智/积木/模型')):
    """Return category ids for ``seniorName`` and, optionally, descendants.

    Each ``SycmCategory`` whose name is in ``seniorName`` contributes its
    own id. When ``needJunjor`` is true, the id of every child (rows whose
    ``parent_id`` is the category id) and of every child's children is
    appended right after it, depth-first.

    :param needJunjor: include child and grandchild ids when true.
    :param seniorName: iterable of top-level category names to start from.
    :return: flat list of category ids.
    """
    session = Dto(self.__sql_address__)
    try:
        chain = []
        top_level = session.query(SycmCategory).filter(
            SycmCategory.name.in_(seniorName))
        for cate in top_level:
            chain.append(cate.id)
            if not needJunjor:
                continue
            subcates = session.query(SycmCategory).filter(
                SycmCategory.parent_id == cate.id)
            for subcate in subcates:
                chain.append(subcate.id)
                junior_cates = session.query(SycmCategory).filter(
                    SycmCategory.parent_id == subcate.id).all()
                chain.extend(junior.id for junior in junior_cates)
        return chain
    finally:
        # The session was never closed before; release it on every path.
        session.close()
def updateDate_where_data_type(self, data_type, time_dimension, current_date):
    """Set ``next_time`` on the matching ``TaskProgress`` rows.

    ``current_date`` is parsed as ``%Y-%m-%d`` and written to every row
    matching ``data_type`` and ``time_dimension``, then committed. Any
    failure (including an unparsable date) is logged rather than raised,
    and the session is always closed.
    """
    session = Dto(self.__sql_address__)
    try:
        matching_rows = session.query(TaskProgress).filter(
            TaskProgress.data_type == data_type,
            TaskProgress.time_dimension == time_dimension)
        next_time = datetime.datetime.strptime(current_date, "%Y-%m-%d")
        matching_rows.update({TaskProgress.next_time: next_time})
        session.commit()
    except Exception as err:
        summary = "进度更新失败:类型: %s 时间维度 %s 日期进度:%s" % (
            data_type, time_dimension, current_date)
        logger.error(summary + str(err))
    finally:
        session.close()
def newFindByName(self, data_type):
    """Resolve the configured brand/product category chains to category ids.

    Every ``UserParams`` row for ``data_type`` holds a JSON document whose
    ``['userParams']['品牌类目列表']`` list names up to three category levels
    per entry. Each entry becomes one dict in the result: the level-1 node,
    with the level-2 node under ``'subcate'`` and the level-3 node under
    ``['subcate']['subcate']`` when those levels are configured and can be
    resolved. ``'depth'`` records the deepest level that was configured.

    Node contents depend on ``data_type``: the brand-granularity type gets
    brand fields, the product-granularity type additionally gets product
    fields, and any other value yields no node fields at all.

    Fixes over the previous version:

    * A failed level-1 lookup used to read ``category_detail.id`` in its
      error handler (unbound on the first failure, stale afterwards) and
      then built an entry from that stale id. The entry is now logged and
      skipped.
    * ``productCount`` of level-2/level-3 nodes read ``品牌序号`` while the
      level-1 node read ``产品序号``; all levels now use ``产品序号``.
    * The session is closed even when an exception escapes.

    :param data_type: value matched against ``UserParams.data_type``.
    :return: list of nested category-chain dicts.
    """
    brand_granularity = '商品店铺榜-品牌粒度'
    product_granularity = '商品店铺榜-产品粒度'

    def build_node(item_name, cate_id, cate, conf):
        """Return the node dict for one level, or None for other data types."""
        if data_type not in (brand_granularity, product_granularity):
            return None
        node = {
            'itemName': item_name,
            'cateId': cate_id,
            'bandName': cate['品牌名'],
            'bandCount': cate['品牌序号'],
        }
        if data_type == product_granularity:
            node['productName'] = cate['产品名']
            node['productCount'] = cate['产品序号']
        node['shopName'] = conf.shop
        return node

    session = Dto(self.__sql_address__)
    try:
        conf_list = session.query(UserParams).filter(
            UserParams.data_type == data_type).all()
        chain_list = []
        for conf in conf_list:
            conf_details = json.loads(conf.value)['userParams']['品牌类目列表']
            for cate in conf_details:
                itemName = cate['一级类目']
                try:
                    category_detail = session.query(SycmCategory).filter(
                        SycmCategory.level == 1,
                        SycmCategory.name == itemName).one()
                except Exception as e:
                    # No id is available here, so the message no longer
                    # tries to include one; without a level-1 id the rest
                    # of the chain cannot be resolved either.
                    logger.error("通过类目名%s获取id失败" % itemName + str(e))
                    continue

                cate_chain = build_node(
                    itemName, category_detail.id, cate, conf) or {}
                cate_chain['depth'] = 1

                # NOTE(review): the level-3 branch is nested under level 2
                # because it needs the level-2 node's cateId; the original
                # indentation was lost, so confirm this layout.
                if cate['二级类目'] != "":
                    itemName = cate['二级类目']
                    try:
                        category_detail = session.query(SycmCategory).filter(
                            SycmCategory.level == 2,
                            SycmCategory.name == itemName,
                            SycmCategory.parent_id ==
                            cate_chain['cateId']).one()
                        node = build_node(
                            itemName, category_detail.id, cate, conf)
                        if node is not None:
                            cate_chain['subcate'] = node
                    except Exception as e:
                        # category_detail is still the parent row when the
                        # lookup itself failed, so the logged id is the
                        # parent's id.
                        logger.error("通过品牌名%s获取id失败%s" %
                                     (itemName, category_detail.id) + str(e))
                    cate_chain['depth'] = 2

                    if cate['三级类目'] != "":
                        itemName = cate['三级类目']
                        try:
                            category_detail = session.query(
                                SycmCategory).filter(
                                    SycmCategory.level == 3,
                                    SycmCategory.name == itemName,
                                    SycmCategory.parent_id ==
                                    cate_chain['subcate']['cateId']).one()
                            node = build_node(
                                itemName, category_detail.id, cate, conf)
                            if node is not None:
                                cate_chain['subcate']['subcate'] = node
                        except Exception as e:
                            logger.error(
                                "通过品牌名%s获取id失败%s" %
                                (itemName, category_detail.id) + str(e))
                        cate_chain['depth'] = 3

                chain_list.append(cate_chain)
        return chain_list
    finally:
        session.close()
def get_all(self):
    """Return every ``SycmCategory`` row.

    The session is closed before returning, including when the query
    raises (previously it leaked in that case).
    """
    session = Dto(self.__sql_address__)
    try:
        return session.query(SycmCategory).all()
    finally:
        session.close()
class Rank(object):
    """Crawler for the ``/mq/overview/reportIndex.json`` industry report.

    ``run`` builds the list of URLs to fetch, ``worker`` processes one URL
    (request, then parse and store), ``request`` performs the HTTP call and
    ``parse`` writes one JSON record through ``db.data_process``.
    """

    # Recovers (cateId, start date, end date, dateType, seller) from a URL
    # produced by run().
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&indexCode=uv\|pv\|searchUvCnt\|searchPvCnt\|searchClkRate\|favBuyerCnt\|favCnt\|addCartBuyerCnt\|addCartCnt\|payPct\|visitItemCnt\|sellerCnt\|visitSellerCnt\|paySellerCnt\|payItemQty\|searchIndex\|tradeIndex\|payAmtParentRate&seller=(.*?)$")
    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    db = Dto(sql_address='mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()
    # Seconds passed to requests as the timeout; without one a stalled
    # connection blocks the worker indefinitely.
    request_timeout = 30

    @classmethod
    def run(cls):
        """Return (and print) every report URL to crawl.

        One URL is produced per category from ``ConfigData().getFullCate()``,
        per month between 2015-09 and 2018-08, per seller flag.
        """
        url = \
            "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&" \
            "dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&" \
            "indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|" \
            "addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|" \
            "payItemQty|searchIndex|tradeIndex|payAmtParentRate&seller={seller}"
        cates = ConfigData().getFullCate()
        dateType = 'month'
        sellers = [1, -1]
        # Each entry is [first day of month, last day of month].
        crawl_dates = [
            [datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
             v.strftime('%Y-%m-%d')]
            for v in pd.date_range('2015-09-01', '2018-09-01', freq='1M').date
        ]
        arr = []
        for cateId in cates:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(
                        cateId=cateId,
                        start_time=crawl_date[0],
                        end_time=crawl_date[1],
                        dateType=dateType,
                        seller=seller
                    )
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one URL produced by ``run`` and store its data.

        Request errors are printed and treated as "no data".
        """
        (cateId, start_time, end_time, dateType, seller) = \
            cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        data = []
        if int(cateId) in (25, 124484008):
            # These two categories are requested with a shorter indexCode
            # list (no searchIndex / tradeIndex / payAmtParentRate).
            url = "https://sycm.taobao.com/mq/overview/reportIndex.json?cateId={cateId}&dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&indexCode=uv|pv|searchUvCnt|searchPvCnt|searchClkRate|favBuyerCnt|favCnt|addCartBuyerCnt|addCartCnt|payPct|visitItemCnt|sellerCnt|visitSellerCnt|paySellerCnt|payItemQty&seller={seller}".format(
                cateId=cateId,
                start_time=start_time,
                end_time=end_time,
                dateType=dateType,
                seller=seller
            )
        try:
            data = cls.request(url)
        except Exception as e:
            print(e)
        # Truthiness also covers a None result, where len() used to raise.
        if data:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET ``new_url`` and return ``content.data`` from the JSON reply.

        Returns an empty list when the status is not 200 or the reply
        reports an error (previously a non-200 status returned ``None``).
        """
        rep = requests.get(new_url, headers={
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': '/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'cookie': 'cookie2=34c05924cf1d6c6e07660b3502afa9be; csg=d7ebd851'
        }, timeout=cls.request_timeout)
        if rep.status_code != 200:
            return []
        # json.loads() no longer accepts ``encoding`` (removed in 3.9);
        # rep.text is already a str.
        rep_josn = json.loads(rep.text)
        # Equality kept on purpose so a JSON 0 is treated like false.
        if rep_josn['hasError'] == False:  # noqa: E712
            return rep_josn['content']['data']
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Store one record built from ``data`` if it has any truthy value.

        Column names come from ``ItenEnum[indexCode].value``; each value is
        ``currentValue`` or, failing that, the last element of ``values``.
        The record is written once; if writing raises, the error is printed
        and the write is retried once per remaining truthy value.
        """
        tr = [ItenEnum[v['indexCode']].value for v in data]
        # ``or [0]`` also guards an empty ``values`` list, which used to
        # raise IndexError.
        tb = [v.get('currentValue', None) or (v.get('values') or [0])[-1]
              for v in data]
        for tb_val in tb:
            if not tb_val:
                continue
            try:
                json_str = json.dumps(
                    dict(zip(tr + common_tr, tb + common_tb)),
                    ensure_ascii=False)
                cls.db.data_process(
                    'hasbro孩之宝旗舰店',
                    '市场行情-行业大盘-行业报表',
                    data_key + [start_time],
                    json_str,
                    '生意参谋',
                    start_time,
                    end_time
                )
                print(json_str)
                break
            except Exception as e:
                print(e)
class Rank(object):
    """Crawler for the ``/mq/property/hotRank.json`` hot-property ranking.

    ``run`` builds the URL list, ``worker`` handles one URL, ``request``
    performs the HTTP call and ``parse`` stores one record per ranked row
    through ``db.data_process``.
    """

    # Column names paired with the values extracted in parse().
    tr = ['排名', '属性值', '交易指数', '支付件数', '类别']
    # Recovers (cateId, start date, end date, dateType, seller) from a URL
    # produced by run().
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&hotAttrType=0&propertyId=122216905&seller=(.*?)$"
    )
    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    db = Dto(
        sql_address=
        'mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()
    # Seconds passed to requests as the timeout; without one a stalled
    # connection blocks the worker indefinitely.
    request_timeout = 30

    @classmethod
    def run(cls):
        """Return (and print) one URL per category, day and seller flag."""
        url = "https://sycm.taobao.com/mq/property/hotRank.json?" \
              "cateId={cateId}&dateRange={start_time}%7C{end_time}&dateType={dateType}&device=0&hotAttrType=0&propertyId=122216905&seller={seller}"
        dateType = 'day'
        sellers = [-1, 0, 1]
        # Daily granularity: start and end of each range are the same day.
        crawl_dates = [
            [v.strftime('%Y-%m-%d'), v.strftime('%Y-%m-%d')]
            for v in pd.date_range('2018-06-20', '2018-09-16').date
        ]
        arr = []
        for cateId in [
                121484013, 121422013, 121468012, 121368013, 121452007,
                121396013
        ]:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(cateId=cateId,
                                         start_time=crawl_date[0],
                                         end_time=crawl_date[1],
                                         dateType=dateType,
                                         seller=seller)
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one URL produced by ``run`` and store its rows.

        Sleeps one second before the request; request errors are printed
        and treated as "no data".
        """
        (cateId, start_time, end_time, dateType, seller) = \
            cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        print(commom_tr)
        print(common_tb)
        print(data_key)
        data = []
        try:
            time.sleep(1)
            data = cls.request(url)
        except Exception as e:
            print(e)
        if data:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET ``new_url`` and return ``content.data.data`` from the reply.

        Returns an empty list when the status is not 200 (previously
        ``None``) or when the reply reports an error, in which case the raw
        body is printed.
        """
        rep = requests.get(
            new_url,
            headers={
                'Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'Accept': '/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'cookie': 'cookie2=19b3cf1a61b5e088b1f21ca683995e05; csg=4586600b'
            },
            timeout=cls.request_timeout)
        if rep.status_code != 200:
            return []
        # json.loads() no longer accepts ``encoding`` (removed in 3.9);
        # rep.text is already a str.
        rep_josn = json.loads(rep.text)
        # Equality kept on purpose so a JSON 0 is treated like false.
        if rep_josn['hasError'] == False:  # noqa: E712
            return rep_josn['content']['data']['data']
        print(rep.text)
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Store one record per ranked row in ``data``.

        A row missing one of the expected keys raises out of this method;
        only failures while serialising or storing are caught and printed.
        """
        for row in data:
            first_property = row['properties'][0]
            tb = [
                row['rank'], first_property['value']['name'],
                row['tradeIndex'], row['payItemQty'], first_property['name']
            ]
            try:
                json_str = json.dumps(dict(
                    zip(cls.tr + common_tr, tb + common_tb)),
                                      ensure_ascii=False)
                cls.db.data_process('宝洁官方旗舰', '市场行情-属性分析-属性排行-热销属性榜',
                                    data_key + [str(tb[0])], json_str,
                                    '百雀羚旗舰店', start_time, end_time)
                print(json_str)
            except Exception as e:
                print(e)
def __init__(self):
    """Create the ``Dto`` database handle and keep it on ``self.db``."""
    connection_url = 'mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick'
    self.db = Dto(sql_address=connection_url)
class Rank(object):
    """Crawler for the ``/mq/brand/rank.json`` hot-selling brand ranking.

    ``run`` fetches and stores every category / month / seller combination
    itself; ``request`` performs the HTTP call and ``parse`` stores up to 50
    ranked rows through ``db.data_process``.
    """

    # Column names, positionally paired with the JSON keys in ``item``.
    tr = ['热销排名', '品牌名称', '交易指数', '交易增长幅度', '支付商品数', '支付转化率', '品牌ID']
    item = [
        'rankNo', 'brandName', 'tradeIndex', 'tradeIndexPercent', 'payItemCnt',
        'payByrRate', 'brandId'
    ]
    # Seconds passed to requests as the timeout; without one a stalled
    # connection blocks the crawl indefinitely.
    request_timeout = 30

    def __init__(self):
        # NOTE(review): connection credentials are hard-coded; consider
        # moving them to configuration.
        self.db = Dto(
            sql_address=
            'mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick'
        )

    def run(self):
        """Fetch and store the ranking for each category, month and seller.

        Request errors are printed and the combination is skipped.
        """
        url = "https://sycm.taobao.com/mq/brand/rank.json?" \
              "cateId={cateId}&" \
              "dateRange={start_time}%7C{end_time}&" \
              "dateType={dateType}&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller={seller}"
        dateType = 'month'
        sellers = [-1, 0, 1]
        # Each entry is [first day of month, last day of month].
        crawl_dates = [[
            datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
            v.strftime('%Y-%m-%d')
        ] for v in pd.date_range('2017-08-01', '2018-09-01', freq='1M').date]
        # Fetch all categories; also kept on the instance as ``self.all``.
        self.all = ConfigData().get_all()
        cates_items = self.all
        cates = [50011991, 50011992, 121366011, 121408009, 125172008]
        for cateId in cates:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(cateId=cateId,
                                         start_time=crawl_date[0],
                                         end_time=crawl_date[1],
                                         dateType=dateType,
                                         seller=seller)
                    print(new_url)
                    commom_tr, common_tb, data_key = ConfigData.cateField(
                        cates_items, cateId, crawl_date[0], crawl_date[1],
                        dateType, 0, seller)
                    data = []
                    try:
                        data = self.request(new_url)
                    except Exception as e:
                        print(e)
                    if data:
                        self.parse(data, commom_tr, common_tb, data_key,
                                   crawl_date[0], crawl_date[1])

    def request(self, new_url):
        """GET ``new_url`` and return ``content.data.data`` from the reply.

        Returns an empty list when the status is not 200 (previously
        ``None``) or when the reply reports an error, in which case the raw
        body is printed.
        """
        rep = requests.get(
            new_url,
            headers={
                'Connection': 'keep-alive',
                'Cache-Control': 'max-age=0',
                'Accept': '/',
                'Upgrade-Insecure-Requests': '1',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
                'Accept-Encoding': 'gzip, deflate',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'cookie': 'cookie2=11242a80535c6ef9915c6475bdb86496; csg=993f274e'
            },
            timeout=self.request_timeout)
        if rep.status_code != 200:
            return []
        # json.loads() no longer accepts ``encoding`` (removed in 3.9);
        # rep.text is already a str.
        rep_josn = json.loads(rep.text)
        # Equality kept on purpose so a JSON 0 is treated like false.
        if rep_josn['hasError'] == False:  # noqa: E712
            return rep_josn['content']['data']['data']
        print(rep.text)
        return []

    def parse(self, data, common_tr, common_tb, data_key, start_time,
              end_time):
        """Store at most the first 50 rows of ``data``.

        Missing keys become ``None``; failures while serialising or storing
        a row are printed and the next row is processed.
        """
        for position, tbs in enumerate(data, start=1):
            if position > 50:
                break
            tb = [tbs.get(key, None) for key in self.item]
            try:
                json_str = json.dumps(dict(
                    zip(self.tr + common_tr, tb + common_tb)),
                                      ensure_ascii=False)
                self.db.data_process('宝洁官方旗舰', '品牌分析-品牌排行-热销品牌榜',
                                     data_key + [str(tb[0])], json_str,
                                     '百雀羚旗舰店', start_time, end_time)
                print(json_str)
            except Exception as e:
                print(e)
class Rank(object):
    """Crawler for the ``/mq/brand/rank.json`` hot-selling brand ranking.

    ``run`` builds the URL list, ``worker`` handles one URL, ``request``
    performs the HTTP call and ``parse`` stores up to 50 ranked rows through
    ``db.data_process``.
    """

    # Column names, positionally paired with the JSON keys in ``item``.
    tr = ['热销排名', '品牌名称', '交易指数', '交易增长幅度', '支付商品数', '支付转化率', '品牌ID']
    item = ['rankNo', 'brandName', 'tradeIndex', 'tradeIndexPercent',
            'payItemCnt', 'payByrRate', 'brandId']
    # Recovers (cateId, start date, end date, dateType, seller) from a URL
    # produced by run().
    re_partern = re.compile(
        r"cateId=(.*?)&dateRange=(\d{4}-\d{2}-\d{2})%7C(\d{4}-\d{2}-\d{2})&dateType=(.*?)&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller=(.*?)$")
    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    db = Dto(sql_address='mysql+mysqlconnector://py_sycm:Kdi*[email protected]:3306/toothpick')
    cates_items = ConfigData().get_all()
    # Seconds passed to requests as the timeout; without one a stalled
    # connection blocks the worker indefinitely.
    request_timeout = 30

    @classmethod
    def run(cls):
        """Return (and print) one URL per category, month and seller flag.

        Categories come from ``ConfigData().getFullCate()``.
        """
        url = "https://sycm.taobao.com/mq/brand/rank.json?" \
              "cateId={cateId}&" \
              "dateRange={start_time}%7C{end_time}&" \
              "dateType={dateType}&device=0&orderField=tradeIndex&orderType=desc&page=1&pageSize=10&rankType=0&search=&seller={seller}"
        cates = ConfigData().getFullCate()
        dateType = 'month'
        sellers = [-1, 0, 1]
        # Each entry is [first day of month, last day of month].
        crawl_dates = [
            [datetime.date(v.year, v.month, 1).strftime('%Y-%m-%d'),
             v.strftime('%Y-%m-%d')]
            for v in pd.date_range('2017-08-01', '2018-09-01', freq='1M').date
        ]
        arr = []
        for cateId in cates:
            for crawl_date in crawl_dates:
                for seller in sellers:
                    new_url = url.format(
                        cateId=cateId,
                        start_time=crawl_date[0],
                        end_time=crawl_date[1],
                        dateType=dateType,
                        seller=seller
                    )
                    arr.append(new_url)
                    print(new_url)
        return arr

    @classmethod
    def worker(cls, url):
        """Fetch one URL produced by ``run`` and store its rows.

        Sleeps three seconds before the request; request errors are printed
        and treated as "no data".
        """
        (cateId, start_time, end_time, dateType, seller) = \
            cls.re_partern.findall(url)[0]
        commom_tr, common_tb, data_key = ConfigData.cateField(
            cls.cates_items, cateId, start_time, end_time, dateType, 0, seller)
        time.sleep(3)
        data = []
        try:
            data = cls.request(url)
        except Exception as e:
            print(e)
        if data:
            cls.parse(data, commom_tr, common_tb, data_key, start_time,
                      end_time)

    @classmethod
    def request(cls, new_url):
        """GET ``new_url`` and return ``content.data.data`` from the reply.

        Returns an empty list when the status is not 200 (previously
        ``None``) or when the reply reports an error, in which case the raw
        body is printed.
        """
        rep = requests.get(new_url, headers={
            'Connection': 'keep-alive',
            'Cache-Control': 'max-age=0',
            'Accept': '/',
            'Upgrade-Insecure-Requests': '1',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.81 Safari/537.36',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.8',
            'cookie': 'cookie2=34c05924cf1d6c6e07660b3502afa9be; csg=d7ebd851'
        }, timeout=cls.request_timeout)
        if rep.status_code != 200:
            return []
        # json.loads() no longer accepts ``encoding`` (removed in 3.9);
        # rep.text is already a str.
        rep_josn = json.loads(rep.text)
        # Equality kept on purpose so a JSON 0 is treated like false.
        if rep_josn['hasError'] == False:  # noqa: E712
            return rep_josn['content']['data']['data']
        print(rep.text)
        return []

    @classmethod
    def parse(cls, data, common_tr, common_tb, data_key, start_time, end_time):
        """Store at most the first 50 rows of ``data``.

        Missing keys become ``None``; failures while serialising or storing
        a row are printed and the next row is processed.
        """
        for position, tbs in enumerate(data, start=1):
            if position > 50:
                break
            tb = [tbs.get(key, None) for key in cls.item]
            try:
                json_str = json.dumps(
                    dict(zip(cls.tr + common_tr, tb + common_tb)),
                    ensure_ascii=False)
                cls.db.data_process(
                    'hasbro孩之宝旗舰店',
                    '品牌分析-品牌排行-热销品牌榜',
                    data_key + [str(tb[0])],
                    json_str,
                    '百雀羚旗舰店',
                    start_time,
                    end_time
                )
                print(json_str)
            except Exception as e:
                print(e)
def main(): db = RedisClient('127.0.0.1', '6379') cates = ConfigData().getFullCate(needJunjor=False) crawl_dates = [['2018-08-12', '2018-09-10']] deo = Dto("mysql+mysqlconnector://root:123456@localhost:3306/test") urls = [ { "url": "sycm.taobao.com/mq/buyerPortrait/getJob.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30" "&device=0&price={}&seller={}&sex=", "name": "职业分布" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getLevel.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&" "dateType=recent30&device=0&price={}&seller={}&sex=", "name": "淘气值分布" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getBuyerArea.json?age=&area=&areaType=province&cateId={}¤tPage=1&" "dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&price={}&seller={}&sex=", "name": "省份分布排行" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getBuyerArea.json?age=&area=&areaType=city&cateId={}" "¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&price={}&seller={}&sex=", "name": "城市分布排行" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getFondnessPropsByOneLevel.json?age=&area=&" "cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "属性偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getKeyWord.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&" "dateType=recent30&device=0&price={}&seller={}&sex=", "name": "搜索词偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getNinetyDaysBuyCnt.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "近90天购买次数" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getNinetyDaysAmt.json?age=&area=&cateId={}&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&price={}&seller={}&sex=", "name": "近90天支付金额" }, { "url": 
"sycm.taobao.com/mq/buyerPortrait/getFondnessBrand.json?age=&area=&cateId={}¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&preference=brand&price={}&seller={}&sex=", "name": "买家品牌购买偏好" }, { "url": "sycm.taobao.com/mq/buyerPortrait/getFondnessBrand.json?age=&area=&cateId={}¤tPage=1&dateRange=2018-08-12|2018-09-10&dateType=recent30&device=0&pageSize=10&preference=cate&price={}&seller={}&sex=", "name": "买家类目购买偏好" } ] # 数据类型 for url in urls: for cate in [50010788, 1801]: price_list = [] if cate == 50010788: price_list = [('', '全部'), ('0-25', 0.3995), ('25-55', 0.3059), ('55-120', '0.1999'), ('120-235', 0.0597), ('235-360', 0.0247), ('360', 0.0103)] elif cate == 1801: price_list = [('', '全部'), ('0-30', 0.2950), ('30-65', 0.3088), ('65-135', 0.2454), ('135-240', 0.0928), ('240-450', 0.0383), ('405', 0.0197)] for price in price_list: for seller in [-1, 0, 1]: # 终端 new_url = url['url'].format(cate, price[0], seller)