def parse(self, response):
    """Parse the couponsock.com listing page and yield one CouponItem per offer.

    Offers live in ``div.index_top_box > div.media``; any card that fails to
    parse is logged and skipped (best-effort scraping).
    """
    html = response.body.decode()
    soup = BeautifulSoup(html, 'lxml')
    coupon_infos = soup.find('div', class_='index_top_box').find_all(
        'div', class_='media')
    for coupon_info in coupon_infos:
        try:
            # Hoist repeated lookups: the code-button anchor and the store
            # link each feed several fields below.
            code_anchor = coupon_info.find('div', class_='code_button').find('a')
            store_anchor = coupon_info.find('p', class_='more_p_a').find('a')

            coupon = CouponItem()
            coupon['type'] = 'coupon'
            coupon['name'] = coupon_info.find(
                'h3', class_='each_box_header').text.strip()
            coupon['site'] = 'www.couponsock.com'
            coupon['description'] = coupon_info.find('p').text.strip()
            coupon['verify'] = False
            coupon['link'] = ''
            coupon['coupon_type'] = 'CODE'
            coupon['expire_at'] = ''
            coupon['code'] = code_anchor.get('code')
            coupon['final_website'] = get_real_url(
                self.base_url + code_anchor.get('href'))
            coupon['store'] = store_anchor.get('href').replace(
                '/store-coupons/', '')
            coupon['store_url_name'] = self.base_url + store_anchor.get('href')
            coupon['store_description'] = ''
            coupon['store_category'] = ''
            coupon['store_website'] = get_domain_url(coupon['final_website'])
            coupon['store_country'] = 'US'
            coupon['store_picture'] = coupon_info.find('img').get('src')
            coupon['created_at'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            yield coupon
        except Exception as e:
            # Best-effort: one malformed card should not abort the page.
            print(e)
def parse(self, response):
    """Parse the thevape.guide coupon table and yield one CouponItem per row.

    The coupon table sits in the third ``wp-block-column``; the first ``<tr>``
    is the header row and is skipped.
    """
    html = response.body
    soup = BeautifulSoup(html, 'lxml')
    coupon_infos = soup.find_all(
        'div', class_='wp-block-column')[2].find_all('tr')[1:]
    # One timestamp for the whole page instead of one strftime per row.
    created_at = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    for coupon_info in coupon_infos:
        # Hoist the per-row lookups: the anchor and the <td> list were each
        # re-queried several times in the original.
        anchor = coupon_info.find('a')
        cells = coupon_info.find_all('td')

        coupon = CouponItem()
        coupon['type'] = 'coupon'
        coupon['name'] = anchor.text.strip()
        coupon['site'] = 'thevape.guide'
        coupon['description'] = cells[1].text.strip()
        coupon['verify'] = False
        coupon['link'] = ''
        coupon['expire_at'] = ''
        coupon['coupon_type'] = 'CODE'
        coupon['code'] = cells[2].text.strip()
        coupon['final_website'] = get_real_url(anchor.get('href'))
        coupon['store'] = anchor.text.strip()
        coupon['store_url_name'] = anchor.get('href')
        coupon['store_description'] = ''
        coupon['store_category'] = ''
        coupon['store_website'] = get_domain_url(coupon['final_website'])
        coupon['store_country'] = 'US'
        coupon['store_picture'] = ''
        coupon['created_at'] = created_at
        yield coupon
def coupon_parse(self, response):
    """Build one CouponItem from a theseedlingtruck.com coupon page.

    All offer details come off the reveal ``<button>``'s data-* attributes;
    the store name is taken from the page title when present.
    """
    soup = BeautifulSoup(response.body, 'lxml')
    button = soup.find('button')

    coupon = CouponItem()
    coupon['type'] = 'coupon'
    coupon['name'] = button.get('title')
    coupon['site'] = 'theseedlingtruck.com'
    coupon['description'] = button.get('data-description')
    coupon['verify'] = False
    coupon['link'] = ''
    coupon['expire_at'] = ''
    # The button's class list distinguishes revealable codes from deals.
    if 'code' in button.get('data-classes'):
        coupon['coupon_type'] = 'CODE'
    else:
        coupon['coupon_type'] = 'DEAL'
    coupon['code'] = button.get('data-code')
    coupon['final_website'] = get_real_url(
        self.base_url + button.get('data-url'))
    # Prefer the page title (minus its " Coupon Codes" suffix) as the store
    # name; fall back to the button's URL slug when the title span is absent.
    title_span = soup.find('div', class_='post-header-title').find(
        'span', class_='post-title')
    if title_span:
        coupon['store'] = title_span.text.replace(' Coupon Codes', '')
    else:
        coupon['store'] = button.get('data-url')
    coupon['store_url_name'] = self.base_url + button.get('data-url')
    coupon['store_description'] = ''
    coupon['store_category'] = 'CDB DEALS'
    coupon['store_website'] = get_domain_url(coupon['final_website'])
    coupon['store_country'] = 'US'
    coupon['store_picture'] = button.get('data-image')
    coupon['created_at'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    yield coupon
def parse(self, response):
    """Parse a 420.deals listing page.

    Follows the next page while the result counter is non-zero, then yields
    one CouponItem per non-expired offer on this page.
    """
    html = response.body
    # Category slug from URLs like .../discount-codes-for/<category>/...
    category_match = re.findall(r'discount-codes-for/(.+?)/', response.url)
    category = category_match[0] if category_match else ''
    soup = BeautifulSoup(html, 'lxml')

    exit_count = soup.find('b', class_='num').text
    if exit_count != '0':
        # BUG FIX: the old pattern r'fwp_paged=(.+?)' lazily captured only a
        # SINGLE character (nothing followed the lazy group), so page 12 was
        # read as "1"; str.replace on that bare digit could then also hit an
        # unrelated occurrence elsewhere in the URL. Capture the full number
        # and replace the whole query parameter instead.
        current_page = re.findall(r'fwp_paged=(\d+)', response.url)[0]
        next_url = response.url.replace(
            'fwp_paged=' + current_page,
            'fwp_paged=' + str(int(current_page) + 1))
        yield scrapy.Request(url=next_url, callback=self.parse)

    for offer in soup.find_all('div', class_='itemdata'):
        expired = offer.find('span', class_='wlt_shortcodes_expiry_date').text
        if 'expired' in expired:
            continue
        coupon = CouponItem()
        coupon['type'] = 'coupon'
        coupon['name'] = offer.find(
            'div', class_='titletext').find('span').text.strip()
        coupon['site'] = '420.deals'
        coupon['description'] = offer.find(
            'div', class_='excerpttext').find('p').text.strip()
        coupon['verify'] = False
        button = offer.find('div', class_='clicktoreveal')
        coupon['link'] = ''
        coupon['expire_at'] = ''
        coupon['coupon_type'] = 'DEAL' if 'Deal' in button.text else 'CODE'
        coupon['code'] = button.find(
            'div', class_='code').text.strip() if coupon['coupon_type'] != 'DEAL' else ''
        # Deals link directly; codes hide the destination in an inline script
        # two siblings after the reveal button.
        if coupon['coupon_type'] == 'DEAL':
            link = button.find('a').get('href')
        else:
            link = re.findall(
                r"href='(.+?)';", button.next_sibling.next_sibling.text)[0]
        coupon['final_website'] = get_real_url(link)
        store_info = offer.find('span', class_='wlt_shortcode_store')
        coupon['store'] = store_info.find('a').text.strip()
        coupon['store_url_name'] = store_info.find('a').get('href')
        coupon['store_description'] = ''
        coupon['store_category'] = category
        coupon['store_website'] = get_domain_url(coupon['final_website'])
        coupon['store_country'] = 'US'
        coupon['store_picture'] = offer.find('img').get('src')
        coupon['created_at'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        yield coupon
def coupon_parse(self, response):
    """Parse a cannabiscouponcodes.com listing page.

    For each non-expired coupon box, fill in what can be read from the
    listing and then request the code-reveal endpoint, carrying the
    partially-built item along in the request meta.
    """
    html = response.body
    soup = BeautifulSoup(html, 'lxml')
    coupon_infos = soup.find('div', class_='facetwp-template').find_all(
        'div', class_='coupon-box')
    # Category slug from .../discount-category/<category>/...; computed once
    # here instead of twice per item inside the loop.
    category_match = re.findall(r'discount-category/(.+?)/', response.url)
    category = category_match[0] if category_match else ''
    for coupon_info in coupon_infos:
        expired = coupon_info.find('div', class_='listingexpiry').text.strip()
        if 'expired' in expired:
            continue
        coupon = CouponItem()
        coupon['type'] = 'coupon'
        try:
            coupon['name'] = coupon_info.find(
                'div', class_='listingtitle').find('a').text.strip()
        except AttributeError:
            # Some boxes lack a title link (was a bare `except`).
            coupon['name'] = ''
        coupon['site'] = 'cannabiscouponcodes.com'
        coupon['description'] = coupon_info.find(
            'div', class_='listingsexcerpt').find('span').text.strip()
        coupon['verify'] = False
        coupon['link'] = ''
        if 'unknown' in expired:
            coupon['expire_at'] = ''
        else:
            # The countdown timer embeds its target date in an inline script.
            script = coupon_info.find(
                'div', class_='countdowntimer').find('script')
            coupon['expire_at'] = re.findall(r'var dateStr \=\t"(.+?)";',
                                             str(script))[0]
        # Hoisted: the deal button and store anchor each feed two fields.
        deal_button = coupon_info.find(
            'div', class_='main-deal-button').find('a')
        coupon['coupon_type'] = 'CODE' if 'Coupon' in deal_button.text else 'DEAL'
        coupon['created_at'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        coupon['store_country'] = 'US'
        coupon['store_picture'] = coupon_info.find(
            'div', class_='coupon-box-logo').find('img').get('src')
        coupon['store_category'] = category
        store_anchor = coupon_info.find(
            'div', class_='listingsstore').find('a')
        coupon['store'] = store_anchor.text.strip()
        coupon['store_url_name'] = store_anchor.get('href')
        coupon['store_description'] = ''
        coupon_id = deal_button.get('data-couponid')
        code_get_url = self.base_code_url % coupon_id
        yield scrapy.Request(url=code_get_url, callback=self.code_parse,
                             meta={'item': coupon})
def parse(self, response):
    """Parse www.cbdoilusers.com coupon columns into CouponItems.

    Each column's last ``h4.vc_custom_heading`` reads like
    "STORE: CODE - description". Malformed columns are logged and skipped.
    """
    html = response.body
    soup = BeautifulSoup(html, 'lxml')
    coupon_infos = soup.find_all('div', class_='vc_column-inner ')
    for coupon_info in coupon_infos:
        try:
            coupon = CouponItem()
            coupon['type'] = 'coupon'
            code_info = coupon_info.find_all(
                'h4', class_='vc_custom_heading')[-1].text.replace('\n', '')
            coupon['name'] = code_info
            coupon['site'] = 'www.cbdoilusers.com'
            code = re.findall(r':(.+?) - ', code_info)
            description = re.findall(r' - (.+?)$', code_info)
            # BUG FIX: the old code did `re.findall(...)[0]` and then indexed
            # `[0]` again, storing only the FIRST CHARACTER of the
            # description. Index the match list exactly once. The fallback
            # pattern was also an unanchored lazy `(.+?)` that matched one
            # character; anchor it so the whole tail is captured.
            coupon['description'] = description[0] if description else \
                re.findall(r':(.+?)$', code_info)[0]
            coupon['verify'] = False
            coupon['link'] = ''
            coupon['expire_at'] = ''
            coupon['code'] = code[0] if code else ''
            coupon['coupon_type'] = 'CODE' if code else 'DEAL'
            coupon['final_website'] = get_real_url(
                coupon_info.find('a').get('href'))
            store_anchor = coupon_info.find(
                'h3', class_='vc_custom_heading').find('a')
            coupon['store'] = store_anchor.text.strip()
            coupon['store_url_name'] = store_anchor.get('href')
            coupon['store_description'] = ''
            coupon['store_category'] = 'CBD OIL'
            coupon['store_website'] = get_domain_url(coupon['final_website'])
            coupon['store_country'] = 'US'
            coupon['store_picture'] = coupon_info.find('img').get('src')
            coupon['created_at'] = datetime.datetime.now().strftime(
                '%Y-%m-%d %H:%M:%S')
            yield coupon
        except Exception as e:
            # Best-effort: skip columns that do not match the expected layout.
            print(e)
def coupon_parse(self, response):
    """Yield the featured offer of a saveoncannabis.com store page as a
    CouponItem.
    """
    soup = BeautifulSoup(response.body, 'lxml')
    featured = soup.find('div', class_='store-offer-featured')
    reveal = featured.find('div', class_='featured-coupon-button').find('a')

    item = CouponItem()
    item['type'] = 'coupon'
    item['name'] = featured.find('h2').text.strip()
    item['site'] = 'saveoncannabis.com'
    item['description'] = ''
    item['verify'] = True
    item['link'] = ''
    expires = featured.find(
        'div', class_='deal-countdown-info').text.strip().replace(
            'Expires in: ', '')
    # "Unlimited Time" means no expiry; normalise it to an empty string.
    item['expire_at'] = '' if 'Unlimited Time' in expires else expires
    item['coupon_type'] = 'CODE'
    # Kept for the (currently disabled) code-reveal AJAX request below.
    offer_id = reveal.get('data-offer_id')
    item['final_website'] = get_real_url(reveal.get('data-affiliate'))
    item['store'] = soup.find(
        'section', class_='page-title').find('h1').text.strip()
    item['store_description'] = ''
    item['store_category'] = featured.find(
        'div', class_='featured-coupon-meta').find('a').text.strip()
    item['store_website'] = get_domain_url(item['final_website'])
    item['store_country'] = 'US'
    shop_logo = soup.find('div', class_='shop-logo')
    item['store_picture'] = shop_logo.find('img').get('src')
    item['store_url_name'] = shop_logo.find('a').get('href')
    item['created_at'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # yield scrapy.FormRequest(url='https://www.saveoncannabis.com/wp-admin/admin-ajax.php',
    #                          formdata={'action': 'show_code', 'offer_id': offer_id},
    #                          callback=self.code_paese, dont_filter=True,
    #                          meta={'item': item})
    yield item
def coupon_parse(self, response):
    """Yield a CouponItem for each non-expired coupon on a vaping.coupons
    store page.
    """
    soup = BeautifulSoup(response.body, 'lxml')
    # Store-level details are identical for every coupon on the page.
    store_info = soup.find('div', class_='store')
    store_name = store_info.find('h1').text.strip()
    store_desc = store_info.find('div', class_='desc').text.strip()
    store_img = store_info.find('img').get('src')

    for entry in soup.find_all('div', class_='type-coupon'):
        # Skip coupons flagged as expired.
        if entry.find('p', class_='expired_msg'):
            continue
        coupon = CouponItem()
        coupon['type'] = 'coupon'
        coupon['name'] = entry.find(
            'h3', class_='entry-title').find('a').text.strip()
        coupon['site'] = 'vaping.coupons'
        coupon['description'] = ''
        coupon['verify'] = False
        coupon['link'] = ''
        coupon['expire_at'] = entry.find('li', class_='expire').get('datetime')
        button = entry.find('div', class_='link-holder').find('a')
        # "Redeem" in the clipboard text marks a plain deal; anything else is
        # a copyable code.
        if 'Redeem' in button.get('data-clipboard-text'):
            coupon['coupon_type'] = 'DEAL'
            coupon['code'] = ''
        else:
            coupon['coupon_type'] = 'CODE'
            coupon['code'] = button.get('data-clipboard-text')
        coupon['final_website'] = get_real_url(button.get('href'))
        coupon['store'] = store_name
        coupon['store_url_name'] = button.get('href')
        coupon['store_description'] = store_desc
        coupon['store_category'] = entry.find(
            'p', class_='tag').text.replace('Tags:', '').strip()
        coupon['store_website'] = get_domain_url(coupon['final_website'])
        coupon['store_country'] = 'US'
        coupon['store_picture'] = store_img
        coupon['created_at'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        yield coupon
def store_page_parse(self, response):
    """Parse an offers.com merchant page.

    Yields one CouponItem per non-expired offer strip, then one StoreItem
    for the merchant itself.
    """
    html = response.body
    soup = BeautifulSoup(html, 'lxml')

    # ---- store fields ----
    store_item = StoreItem()
    store_item['type'] = 'store'
    store_item['logo_url'] = 'https:' + soup.find(
        'div', id='company-identity').a.img.get('src')
    store_item['title'] = soup.find(
        'div', id='offer-section').find('strong').text.strip()
    store_item['name'] = store_item['title']
    store_item['site'] = 'offers'
    store_item['url_name'] = response.url.split('/')[-2]
    store_item['description'] = soup.find(
        'div', id='company-information').find('p').text
    store_item['category'] = soup.find_all(
        'a', itemprop='item')[-1].find('span').text
    store_item['website'] = get_real_url(
        self.base_url +
        soup.find('div', id='company-identity').a.get('href'))
    store_item['country'] = "US"
    # BUG FIX: this used to assign `scrapy.Field()` — the field *declaration*
    # object, not a value. Use the logo URL, matching store_picture below.
    store_item['picture'] = store_item['logo_url']
    store_item['coupon_count'] = soup.find(
        'div', id='merchant-stats').find('tr').find('span').text
    store_item['created_at'] = datetime.datetime.now().strftime(
        '%Y-%m-%d %H:%M:%S')
    # NOTE(review): `get_domin_url` looks like a typo for the `get_domain_url`
    # helper used by the other spiders — kept as-is in case the helper really
    # has this name in this module; confirm against the imports.
    store_item['final_website'] = get_domin_url(store_item['website'])
    # Debug trace for merchants whose resolved domain is unusable.
    if store_item['final_website'] in ('', None, '#', 'https://www.offers.com'):
        print(store_item['final_website'])

    # ---- coupons ----
    for offer in soup.find_all('div', class_='offerstrip'):
        if 'expired' in offer.parent.get('class'):
            continue
        coupon_item = CouponItem()
        coupon_item['type'] = 'coupon'
        coupon_item['name'] = offer.find('h3', class_='name').text.strip()
        coupon_item['site'] = 'offers'
        description = offer.find('div', class_='more-details')
        coupon_item['description'] = description.find(
            'p').text.strip() if description else ""
        try:
            coupon_item['verify'] = 'Y' if offer.find(
                'span', class_='verified').find(
                    'strong').text == "Verified" else "N"
        except AttributeError:
            # No "verified" badge on this strip (was a bare `except`).
            coupon_item['verify'] = 'N'
        coupon_item['link'] = self.base_url + offer.find('a').get('href')
        coupon_item['expire_at'] = None
        # BUG FIX: coupon_type must exist even when the badge lookup below
        # fails; previously the except path left it undefined and the
        # `'code' in coupon_type` test raised NameError.
        coupon_type = ''
        try:
            div = offer.find('div', class_='badge-text')
            span = offer.find('span', class_='dolphin flag')
            coupon_type = div.text if div else ''
            coupon_type += span.text if span else ''
        except Exception:
            coupon_item['coupon_type'] = "DEAL"
        if 'code' in coupon_type:
            # Reveal the actual code via the dedicated endpoint.
            data_offer_id = offer.get('data-offer-id')
            long_id = coupon_item['link'].split('/')[-2]
            code_get_url = self.code_url.replace(
                'code_id', data_offer_id).replace('long_id', long_id)
            res = requests.get(code_get_url, headers=get_header())
            code = re.findall(r'<div class="coupon-code">(.+?)</div>',
                              res.content.decode())
            coupon_item['code'] = code[0] if code else ''
            coupon_item['coupon_type'] = "CODE"
        else:
            coupon_item['coupon_type'] = "DEAL"
            coupon_item['code'] = ''
        coupon_item['final_website'] = store_item['final_website']
        coupon_item['store'] = store_item['title']
        coupon_item['store_url_name'] = store_item['url_name']
        coupon_item['store_description'] = store_item['description']
        coupon_item['store_category'] = store_item['category']
        coupon_item['store_website'] = store_item['website']
        coupon_item['store_country'] = "US"
        coupon_item['store_picture'] = store_item['logo_url']
        coupon_item['created_at'] = datetime.datetime.now().strftime(
            '%Y-%m-%d %H:%M:%S')
        coupon_item['status'] = '0'
        yield coupon_item

    yield store_item