Example #1
0
 def get_collection(self):
     """Return the newest ``collection`` rows stored for the configured user.

     The rows whose ``name`` equals ``config.USERNAME`` are selected, newest
     ``id`` first, limited to five, and whatever ``MysqlClient.find_all``
     returns is handed back unchanged.
     """
     # NOTE(review): the user name is interpolated straight into the SQL text.
     query = "select * from collection where(name='%s') order by id DESC limit 5" % (
         config.USERNAME)
     return MysqlClient().find_all(query)
Example #2
0
 def get_history(self):
     """Return the newest ``content`` rows stored for the configured user.

     The rows whose ``name`` equals ``config.USERNAME`` are selected, newest
     ``id`` first, limited to ten, and whatever ``MysqlClient.find_all``
     returns is handed back unchanged.
     """
     # NOTE(review): the user name is interpolated straight into the SQL text.
     query = "select * from content where(name='%s') order by id DESC limit 10" % (
         config.USERNAME)
     return MysqlClient().find_all(query)
Example #3
0
class Scheduler(object):
    def __init__(self):
        """Create the collaborators shared by all crawl steps."""
        # Fetcher; the crawl methods call its get_html(url).
        self.download = Download()
        # Database client; used through find_all / find_one / save.
        self.db = MysqlClient()
        # Queue client; used through pop / push on the 'employment' key.
        self.redisClient = RedisClient()

    def run(self):
        """Run the crawl; currently only the position step is enabled."""
        # The other steps are switched off by commenting them out:
        #self.get_qu()
        #self.get_zhen()
        # self.push_url_to_redis()
        self.get_position()

    def get_qu(self):
        sql = 'select * from shi'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[2]
            url = SHI_URL.format(shi_id='c' + shi_id)
            print(url)
            html = self.download.get_html(url)
            if html.status_code == 200 and html is not None:
                html = HTML(html.text)
                qu_id_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/@href'
                )
                qu_name_list = html.xpath(
                    '//dl[@class="condition-district show-condition-district"]/dd/a/text()'
                )
                for qu_id, name in zip(qu_id_list[1:], qu_name_list[1:]):
                    qu_id = qu_id.split('/')
                    qu_id = qu_id[2]
                    sql = '''insert into qu(pid,qu_id,name) VALUES ('{pid}','{qu_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_zhen(self):
        sql = 'select * from qu'
        results = self.db.find_all(sql)
        for res in results:
            shi_id = res[1]
            qu_id = res[2]
            url = QU_URL.format(shi_id='c' + shi_id, qu_id=qu_id)
            print(url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)
                zhen_id_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/@href'
                )
                zhen_name_list = html.xpath(
                    '//dl[@class="condition-area show-condition-area"]/dd/a/text()'
                )
                for zhen_id, name in zip(zhen_id_list[1:], zhen_name_list[1:]):
                    zhen_id = zhen_id.split('/')
                    zhen_id = zhen_id[2]
                    sql = '''insert into zhen(pid,qu_id, zhen_id,name) VALUES ('{pid}','{qu_id}','{zhen_id}','{name}')'''\
                        .format(pid=shi_id,qu_id=qu_id,zhen_id=zhen_id, name=name)
                    print(sql)
                    self.db.save(sql)
            else:
                print('该url无数据')

    def get_position(self):
        redis_results = self.redisClient.pop('employment')
        try:
            json_obj = json.loads(redis_results[1].decode('utf8'))
        except:
            return None

        if json_obj:
            flag = True
            pageToken = 1

            #处理翻页问题
            while flag:
                detail_url_list = []
                url = json_obj['url']
                pre_page = re.search('\/\?page=(.*?)&', url).group(1)
                if int(pageToken) > 10:
                    break
                url = url.replace(
                    'page=' + pre_page + '&sort=2&ka=page-' + pre_page,
                    'page=' + str(pageToken) + '&sort=2&ka=page-' +
                    str(pageToken))
                cityId = json_obj['cityId']
                zhiweiId = json_obj['zhiweiId']
                print(url)
                html = self.download.get_html(url)

                if html is not None and html.status_code == 200:
                    html = HTML(html.text)

                    #判断是否是当天发布,是的话请求详情页, 判断数据库是否有这条数据,有的话不请求(暂时)
                    li_xpath = html.xpath('//div[@class="job-list"]/ul/li')
                    for li in li_xpath:
                        content = etree.tostring(li)
                        content = HT.unescape(content.decode())
                        content = HTML(content)
                        li_time = content.xpath(
                            'string(//div[@class="info-publis"]/p)')
                        href_url = content.xpath(
                            'string(//div[@class="info-primary"]//h3/a/@href)')
                        try:
                            last_str = li_time.split('发布于')[1]
                            minute = last_str.split(':')[1]
                            #判断是否当天发布
                            if minute:
                                #判断数据库存不存在:
                                try:
                                    cid = re.match('^/job_detail/(.*?)\.html',
                                                   href_url).group(1)
                                    sql = "select * from positions where cid='%s'" % (
                                        cid)
                                    find_one_res = self.db.find_one(sql)
                                    if find_one_res is None:
                                        #先把cid插入,避免重复抓取
                                        sql = "insert into positions(cid) values ('%s')" % (
                                            cid)
                                        self.db.save(sql)
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    elif find_one_res[2] is None:
                                        detail_url_list.append(
                                            config.HOST_URL + href_url)
                                    else:
                                        print('数据库存在该记录:' + str(cid))
                                except:
                                    print('查询数据库出错:' + str(cid))
                        except:
                            print('该URL发布日期小于当天:' + config.HOST_URL + href_url)

                    results = self.get_detail(detail_url_list, cityId,
                                              zhiweiId)

                    #判断是否翻页
                    try:
                        last_li = html.xpath(
                            'string(//div[@class="job-list"]/ul/li[last()]//div[@class="info-publis"]/p)'
                        )
                        last_str = last_li.split('发布于')[1]
                        minute = last_str.split(':')[1]
                        if minute:
                            pageToken = str(int(pageToken) + 1)
                    except:
                        flag = False

                else:
                    print('该url无数据')

    def get_detail(self, detail_url_list, cityId, zhiweiId):
        for url in detail_url_list:
            print('下载该详情页:' + url)
            html = self.download.get_html(url)
            if html is not None and html.status_code == 200:
                html = HTML(html.text)

                try:
                    cid = re.match(
                        '^https://www.zhipin.com/job_detail/(.*?)\.html',
                        url).group(1)
                except:
                    print('获取cid失败')
                    continue

                title = html.xpath('string(//h1)')
                url = url
                try:
                    publishDateStr = html.xpath(
                        'string(//span[@class="time"])').split('发布于')[1]
                    publishDate = int(
                        time.mktime(
                            time.strptime(publishDateStr, "%Y-%m-%d %H:%M")))
                except:
                    publishDateStr = None
                    publishDate = None

                try:
                    info = html.xpath(
                        'string(//div[@class="job-banner"]//div[@class="info-primary"]/p)'
                    )
                    info = info.split(':')
                    city = info[1][:-2]
                    jingyan = info[2][:-2]
                    xueli = info[3]
                except:
                    city = None
                    jingyan = None
                    xueli = None
                price = html.xpath(
                    'string(//div[@class="info-primary"]//span[@class="badge"])'
                )
                posterName = html.xpath('string(//h2)')
                posterId = None
                posterUrl = html.xpath(
                    'string(//div[@class="detail-figure"]/img/@src)')
                content = html.xpath(
                    'string(//div[@class="job-sec"]/div[@class="text"])'
                ).strip()

                try:
                    company_text = html.xpath(
                        'string(//a[@ka="job-cominfo"]/@href)')
                    companyID = re.match('/gongsi/(.*?)\.html',
                                         company_text).group(1)
                except:
                    companyID = None
                createDate = int(time.time())

                #判断是否是当天发布
                temp_time = time.localtime(int(time.time()))
                now_DateStr = time.strftime("%Y-%m-%d", temp_time)
                lt = time.strptime(now_DateStr, "%Y-%m-%d")
                now_timestamp = int(time.mktime(lt))
                if publishDate == None or publishDate < now_timestamp or publishDate >= (
                        now_timestamp + 86400):
                    print('特例.该url不是当天发布:' + str(url))
                    continue

                res_obj = {
                    'cid': cid,
                    'title': title,
                    'url': url,
                    'publishDateStr': publishDateStr,
                    'publishDate': publishDate,
                    'city': city,
                    'jingyan': jingyan,
                    'xueli': xueli,
                    'price': price,
                    'posterName': posterName,
                    'posterId': posterId,
                    'posterUrl': posterUrl,
                    'content': content,
                    'companyID': companyID,
                    'createDate': createDate,
                    'cityId': cityId,
                    'zhiweiId': zhiweiId
                }
                print(res_obj)
                sql = "insert into positions(cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)" \
                      " VALUES ('%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s', '%s')" \
                      % (cid,title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)\
                      + "ON DUPLICATE KEY UPDATE title='%s', url='%s', publishDate='%s', publishDateStr='%s', city='%s', jingyan='%s', xueli='%s', price='%s', posterName='%s', posterId='%s', posterUrl='%s', content='%s', companyID='%s', createDate='%s',cityId='%s', zhiweiId='%s'" \
                      %(title,url,publishDate,publishDateStr,city,jingyan,xueli,price,posterName,posterId,posterUrl,content,companyID,createDate,cityId, zhiweiId)
                self.db.save(sql)
            else:
                print('请求详情页失败:' + str(url))

    def push_url_to_redis(self):
        # zhiwei_list = []
        # zhiwei_sql = 'select * from zhiwei'
        # zhiwei_results = self.db.find_all(zhiwei_sql)
        # for zhiwei in zhiwei_results:
        #     zhiwei_list.append(zhiwei[2])
        #
        # zhen_sql = 'select * from zhen'
        # zhen_results = self.db.find_all(zhen_sql)
        #
        # for res in zhen_results:
        #     pid = res[1]
        #     zhen_id = res[2]
        #     for zhiwei_id in zhiwei_list:
        #         url = POSITION_URL.format(pid=pid, zhen_id=zhen_id, zhiwei_id=zhiwei_id, pageToken='1')
        #         self.redisClient.push('employment',url)

        zhiwei_list = []
        zhiwei_sql = 'select * from zhiwei'
        zhiwei_results = self.db.find_all(zhiwei_sql)
        for zhiwei in zhiwei_results:
            zhiwei_list.append(zhiwei[2])

        shi_sql = 'select * from shi'
        shi_results = self.db.find_all(shi_sql)

        for res in shi_results:
            pid = res[2]
            for zhiwei_id in zhiwei_list:
                url = NEW_POSITION_URL.format(pid=pid,
                                              zhiwei_id=zhiwei_id,
                                              pageToken='1')
                url_obj = {"url": url, "cityId": pid, "zhiweiId": zhiwei_id}
                self.redisClient.push('employment', json.dumps(url_obj))
class Bestseller(object):
    def __init__(self):
        """Create the collaborators used by the crawl."""
        # Fetcher; the crawl methods call its get_html(url).
        self.download = Download()
        # Database client; used through find_all / save.
        self.mysql = MysqlClient()

    def start(self):
        # res1 = self.get_url('onedepa')
        # res2 = self.get_url('twodepa')
        res3 = self.get_url('threedepa')

        # self.get_html(res1)
        # self.get_html(res2)
        self.get_html(res3[4317:4319])

    def get_url(self, typename):
        sql = "select * from %s" % (typename)
        results = self.mysql.find_all(sql)
        return results

    def get_html(self, results):
        for res in results:
            url = res[5]
            typeid = res[1]
            temp_url_lit = []
            url_one = url
            page2_replace = re.search(
                'https://www.amazon.com.*?ref=zg_bs_(.*?/\d+-\d+-\d+)',
                url).group(1)
            url_tow = url.replace(page2_replace, 'pg_2?&pg=2')
            temp_url_lit.append(url_one)
            temp_url_lit.append(url_tow)
            for url in temp_url_lit:
                response = self.download.get_html(url)
                if response:
                    html = HTML(response.text)
                    url_list = html.xpath(
                        '//div[@id="zg-center-div"]/ol/li//a[@class="a-link-normal a-text-normal"]/@href'
                    )
                    for detail_url in url_list:
                        spider_url = 'https://www.amazon.com' + detail_url
                        detail_response = self.download.get_html(spider_url)
                        if detail_response:
                            detail_html = HTML(detail_response.text)
                            sellrank = re.search(
                                'https://www.amazon.com/.*?/dp/.*?ref=.*?_(\d+)/\d+-\d+-\d+\?',
                                spider_url).group(1)
                            print('sellrank:' + sellrank)

                            product_id = hashlib.md5(
                                detail_url.encode()).hexdigest()
                            title = detail_html.xpath(
                                'string(//h1[@id="title"])').strip()
                            price = detail_html.xpath(
                                'string(//span[@id="priceblock_ourprice"])'
                            ).replace(',', '').replace('$', '')
                            if price == '':
                                price = 0
                            color = detail_html.xpath(
                                'string(//div[@id="variation_color_name"]//span)'
                            ).strip()
                            size = detail_html.xpath(
                                'string(//div[@id="variation_size_name"]//span)'
                            ).strip()
                            commentCount = detail_html.xpath(
                                'string(//span[@id="acrCustomerReviewText"])'
                            ).split(' ')[0].replace(',', '')
                            if commentCount == '':
                                commentCount = 0
                            commentRating = detail_html.xpath(
                                'string(//a[@class="a-popover-trigger a-declarative"]/i/span)'
                            ).split(' ')[0]
                            if commentRating == '':
                                commentRating = 0
                            crawled_timestamp = int(time.time())
                            crawled_time = time.strftime(
                                "%Y-%m-%d %H:%M:%S", time.localtime())
                            crawled_date = time.strftime(
                                "%Y-%m-%d", time.localtime())
                            # 编号
                            try:
                                asin = re.search(
                                    'https://www.amazon.com/.*?/dp/(.*?)/ref=.*?',
                                    spider_url).group(1)
                            except:
                                asin = None

                            # 类目排名
                            rank1 = None
                            rank2 = None
                            # try:
                            #     category_res1 = re.search('.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?(<span>.*?</span>)',detail_response.text, re.S)
                            #     category_res2 = re.search('.*?productDetails_detailBullets_sections1.*?Best Sellers Rank.*?<span>.*?<span>.*?</span>.*?(<span>.*?</span>).*?</span>', detail_response.text, re.S)
                            #     if category_res1:
                            #         # rank_search = re.search('.*?#(.*?)in.*?', category_res1.group(1))
                            #         # if rank_search:
                            #         #     rank1 = rank_search.group(1)
                            #         # else:
                            #         #     rank1 = None
                            #         # print(rank1)
                            #         html = HTML(category_res1.group(1))
                            #         list_res = html.xpath('//text()')
                            #         rank1 = ''.join(list_res)
                            #     if category_res2:
                            #         html = HTML(category_res2.group(1))
                            #         list_res = html.xpath('//text()')
                            #         rank2 = ''.join(list_res)
                            # except:
                            #     rank1 = None
                            #     rank2 = None
                            # 图片信息入库
                            try:
                                imageUrls = []
                                img_res = re.search(
                                    "var data = {};.*?var obj = jQuery.parseJSON\('(.*?)'\);",
                                    detail_response.text, re.S)
                                img_obj = json.loads(img_res.group(1))
                                key_one = list(
                                    img_obj['colorImages'].keys())[0]
                                for data in img_obj['colorImages'][key_one]:
                                    imageUrls.append(data['large'])
                                for img in imageUrls:
                                    img_id = hashlib.md5(
                                        img.encode()).hexdigest()
                                    img_url = img
                                    sql = "insert into image(product_id,img_id,img_url,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s')" \
                                          % (asin, img_id, img_url, crawled_timestamp, crawled_time) \
                                          + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                                    print(sql)
                                    self.mysql.save(sql)
                            except:
                                pass

                            # 跟卖信息入库
                            have_follow_sale = '0'
                            follow_sale_num = 0
                            follow_sale_str = detail_html.xpath(
                                'string(//div[@id="olpPocs_feature_div"]/div/span)'
                            )
                            if follow_sale_str != '':
                                have_follow_sale = '1'
                                follow_sale_num = re.search(
                                    '\((\d+)\)', follow_sale_str).group(1)

                            follow_sale_url = detail_html.xpath(
                                'string(//div[@id="olpPocs_feature_div"]/div/span/a/@href)'
                            )
                            if follow_sale_url[0:4] == 'http':
                                follow_sale_url = follow_sale_url
                            else:
                                follow_sale_url = 'https://www.amazon.com' + follow_sale_url + '&startIndex={startIndex}'
                            follow_response = self.get_follow_sale(
                                follow_sale_url, follow_sale_num)
                            for item in follow_response:
                                follow_sale_id = item['follow_sale_id']
                                price = item['price']
                                seller = item['seller']
                                type = item['type']
                                sql = "insert into follow_sale(product_id,follow_sale_id,price,seller,type,crawled_timestamp,crawled_time) values ('%s','%s','%s','%s','%s','%s','%s')" \
                                      % (asin, follow_sale_id, price, seller, type, crawled_timestamp, crawled_time) \
                                      + "ON DUPLICATE KEY UPDATE crawled_timestamp='%s',crawled_time='%s'" % (crawled_timestamp, crawled_time)
                                print(sql)
                                self.mysql.save(sql)

                            # 商品信息入库
                            sql = "insert into bestseller(typeid,sellrank,product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" \
                                  % (typeid,sellrank,product_id,title,url,price,color,size,commentCount,commentRating,have_follow_sale,follow_sale_num,asin,rank1,rank2,crawled_timestamp,crawled_time,crawled_date) \
                                  + "ON DUPLICATE KEY UPDATE sellrank='%s',title='%s', url='%s', price='%s',commentCount='%s',crawled_timestamp='%s',crawled_time='%s',crawled_date='%s',follow_sale_num='%s'" % (
                                    sellrank, title, spider_url, price, commentCount, crawled_timestamp, crawled_time,crawled_date,follow_sale_num)
                            print(sql)
                            self.mysql.save(sql)

    def get_follow_sale(self, url, follow_sale_num):
        if follow_sale_num == 0:
            return []
        if int(follow_sale_num) > 10:
            pageNum = math.ceil(int(follow_sale_num) / 10)
        else:
            pageNum = 1

        item_list = []
        for page in range(0, pageNum):
            startIndex = page * 10
            url = url.format(startIndex=startIndex)
            print(url)
            follow_response = self.download.get_html(url)
            if follow_response is None:
                return []
            follow_html = HTML(follow_response.text)

            html_list = follow_html.xpath(
                '//div[@class="a-row a-spacing-mini olpOffer"]')
            for html in html_list:
                html = etree.tostring(html).decode()
                html = HTML(html)
                price = html.xpath(
                    'string(//div[@class="a-column a-span2 olpPriceColumn"]/span)'
                ).strip().replace('$', '')
                seller = html.xpath('string(//h3/span)').strip()
                FBA = html.xpath('string(//div[@class="olpBadge"])')
                type = 'FBM'
                if FBA != '':
                    type = 'FBA'
                follow_sale_id = hashlib.md5(
                    (seller + price + type).encode()).hexdigest()
                obj = {
                    'follow_sale_id': follow_sale_id,
                    'price': price,
                    'seller': seller,
                    'type': type
                }
                print(obj)
                item_list.append(obj)
        return item_list
Example #5
0
 def get_all_message(self):
     """Return the fifteen newest ``content`` rows, newest ``id`` first.

     Whatever ``MysqlClient.find_all`` returns is handed back unchanged.
     """
     query = "select * from content order by id DESC limit 15"
     return MysqlClient().find_all(query)