def __init__(self, db_type='mssql'): if db_type == 'mssql': self.data_base = MsSQL() else: self.data_base = MySQL() super(Crawler, self).__init__()
# coding:utf-8 # @Time: 2019-12-31 17:07 # Author: turpure import asyncio import re import time from functools import partial from common.logger import logger from common.tools import BaseCrawler, MsSQL, MySQL from common.color import get_color_dict from pyppeteer import launch color_dict = get_color_dict() my_sql = MySQL() async def fetch(product_id, job_id): browser = await launch({ 'headless': True, 'args': [ '--disable-extensions', '--hide-scrollbars', '--disable-bundled-ppapi-flash', '--mute-audio', '--no-sandbox', '--disable-setuid-sandbox', '--disable-gpu', ]})
class Crawler(BaseCrawler): def __init__(self, db_type='mssql'): if db_type == 'mssql': self.data_base = MsSQL() else: self.data_base = MySQL() super(Crawler, self).__init__() def get_token(self): sql = 'select token, bearerToken,x_version from urTools.sys_joom_token limit 1' con = self.data_base.connection() cur = con.cursor(pymysql.cursors.DictCursor) cur.execute(sql) ret = cur.fetchone() if ret: return ret return None def fetch(self, pro_id): if not pro_id: raise Exception('Invalid pro ID', pro_id) api = 'https://api.joom.com/1.1/products/{}?currency=USD&language=en-US' base_url = api.format(pro_id) token = self.get_token() api_token = token['token'] bearer_token = token['bearerToken'] x_version = token['x_version'] headers = { 'authorization': bearer_token, 'user-agent': ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) ' 'AppleWebKit/537.36 (KHTML, like Gecko) ' 'Chrome/64.0.3282.186 Safari/537.36'), # 'referer': "https://www.joom.com", # 'origin': "https://www.joom.com", # 'Host': "api.joom.com", # 'User-Agent': ("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/" # "537.36 (KHTML, like Gecko) Chrome/67.0.3396.79 Safari/537.36"), # 'Cache-Control': "no-cache", # 'x-version': x_version, # 'x-ostype ': 'Windows', 'x-api-token': api_token } session = requests.Session() r = session.get(base_url, headers=headers, verify=False) return r.json() @staticmethod def parse(data): wanted_info = dict() main_info = dict() if data is None: yield wanted_info return pro_info = data['payload'] try: tags_info = pro_info['nameExt']['tags'] tags = ','.join([name['nameEng'] for name in tags_info]) except: tags = '' main_info['tags'] = tags extra_images = pro_info.get('gallery', '') main_info['proId'] = pro_info.get('id', '') main_info['description'] = pro_info.get('engDescription', '') main_info['proName'] = pro_info.get('engName', '') main_info['categoryId'] = pro_info.get('categoryId', '') main_info['mainImage'] = pro_info['lite']['mainImage']['images'][-1][ 'url'] for image in extra_images: main_info['extra_image' + str(extra_images.index( image))] = image['payload']['images'][-1]['url'] for i in range(0, 11 - len(extra_images)): main_info['extra_image' + str(11 - i - 1)] = '' pro_variants = pro_info.get('variants', '') for var in pro_variants: variants = dict() try: try: variants['color'] = color_dict['#' + var['colors'][0]['rgb']] except: variants['color'] = var['colors'][0]['rgb'] except: variants['color'] = '' variants['proSize'] = var.get('size', '') variants['msrPrice'] = var.get('msrPrice', 0) variants['shipping'] = var['shipping']['price'] variants['shippingTime'] = '-'.join([ str(var['shipping']['minDays']), str(var['shipping']['maxDays']) ]) variants['price'] = var['price'] variants['shippingWeight'] = var.get('shippingWeight', 0) try: variants['varMainImage'] = var['mainImage']['images'][-1][ 'url'] except: variants['varMainImage'] = '' # variants['quantity'] = var['inventory'] variants['quantity'] = 100000 wanted_info = dict(main_info, **variants) yield wanted_info def get_task(self, queue_name, block=True): if block: task = self.redis.blpop(queue_name, timeout=10) task = task[1] else: task = self.redis.lpop(queue_name) if not task: time.sleep(1) return task def run(self): while 1: try: job = self.get_task('job_list', block=True) if job: job_info = job.split(',') job_id, pro_id = job_info raw_data = self.fetch(pro_id) rows = self.parse(raw_data) self.data_base.insert(rows, job_id) except Exception as why: pass