def load_amazon(self, url):
    """Fetch *url*; if Amazon serves its captcha page, back off and retry
    once with a brand-new LRequests session."""
    self.lr.load(url)
    if self.lr.body.find('Enter the characters you see below') > 0:
        logger.error("Captcha!!!")
        # give Amazon a moment, then start over with a fresh session
        time.sleep(10)
        self.lr = LRequests()
        self.lr.load(url)
def __init__(self, q, profile_dir=None):
    """Set up scraping helpers and an undetected-chromedriver browser.

    q: queue of keywords to process.
    profile_dir: optional Chrome user-data directory for a persistent profile.
    """
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
    self.q = q
    options = uc.ChromeOptions()
    if profile_dir is not None:
        options.add_argument('--user-data-dir=%s' % profile_dir)
    # speed up page loads: no images, force English UI
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('--lang=en')
    self.browser = uc.Chrome(
        options=options,
        driver_executable_path=os.path.join(os.getcwd(), 'chromedriver.exe'),
        use_subprocess=True)
    self.wait = WebDriverWait(self.browser, 120)
def __init__(self, list_queue, product_queue, record_queue, csv_queue):
    """Wire up the four pipeline queues plus request/captcha helpers."""
    self.list_queue = list_queue
    self.product_queue = product_queue
    self.record_queue = record_queue
    self.csv_queue = csv_queue
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
def __init__(self):
    """Prepare an httplib2 client with browser-like headers plus helpers."""
    self.lr = LRequests()
    self.h = httplib2.Http(".cache")
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
    # Headers mimicking Chrome 102 on Windows so Amazon serves normal pages.
    self.headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en;q=0.9',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    }
    self.cookies = {}  # cookie name -> value, accumulated across responses
class SpiderAmazon():
    """Selenium (undetected-chromedriver) Amazon scraper.

    Pulls keywords from a queue; for each keyword it collects ASINs from
    search results, fetches product title/brand, looks the brand up on
    TRADEMARKIA and USPTO, and writes a per-keyword CSV summary.

    Fixes vs. the original: all ``open(...).write/read`` calls now use
    ``with`` so file handles are closed, and re-raising preserves the
    traceback (bare ``raise``).
    """

    def __init__(self, q, profile_dir=None):
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs
        self.q = q           # queue of keywords to process
        options = uc.ChromeOptions()
        if profile_dir is not None:
            options.add_argument('--user-data-dir=%s' % profile_dir)
        # speed up page loads: no images, force English UI
        options.add_argument('--blink-settings=imagesEnabled=false')
        options.add_argument('--lang=en')
        self.browser = uc.Chrome(
            options=options,
            driver_executable_path=os.path.join(os.getcwd(), 'chromedriver.exe'),
            use_subprocess=True)
        self.wait = WebDriverWait(self.browser, 120)

    def load_amazon(self, url):
        """Navigate the browser to *url*, retrying on a known transient
        chromedriver error ('unexpected command response')."""
        while 1:
            try:
                logger.info('load url %s' % url)
                self.browser.get(url)
                break
            except WebDriverException as ex:
                if ex.msg.find('unknown error: unexpected command response') > -1:
                    logger.info('unknown error: unexpected command response')
                    time.sleep(1)
                else:
                    raise  # bare raise keeps the original traceback

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt.

        Skips keywords whose ASIN file already exists.
        """
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.browser.find_elements_by_xpath(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get_attribute('data-asin'))
            self.key_asins[keyword].append(product_ele.get_attribute('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True when a new page loaded.

        Retries up to 3 times; absence of the link (NoSuchElementException)
        ends pagination with False.
        """
        r = False
        for i in range(3):
            try:
                # scroll to the bottom so the pagination bar is rendered
                self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                next_ele = self.browser.find_element_by_xpath(
                    '//div[contains(@class, "s-pagination-container")]//a[contains(@class, "s-pagination-next")]')
                if next_ele is not None:
                    url = urljoin(self.browser.current_url, next_ele.get_attribute('href'))
                    logger.info('load url %s' % url)
                    self.browser.get(url)
                    r = True
                break
            except Exception as ex:
                logger.info('error agent %s' % str(i + 1))
                logger.error(ex, exc_info=True)
        return r

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.browser.find_element_by_xpath('//span[@id="productTitle"]')
            title = title_ele.text.strip()
            brand = ''
            try:
                brand_ele = self.browser.find_element_by_xpath('//a[@id="bylineInfo"]')
                if brand_ele is not None:
                    text = brand_ele.text.strip().lower()
                    if text.startswith('visit'):
                        # "visit the <brand> store" -> keep the middle part
                        brand = text[9:-5].strip()
                    elif text.startswith('brand'):
                        # "brand: <brand>"
                        brand = text[6:].strip()
            except NoSuchElementException as ex:
                logger.info('not brand %s' % asin)
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)

    def do(self):
        """Worker loop: take keywords off the queue and run the full pipeline."""
        try:
            while 1:
                try:
                    keyword = self.q.get(timeout=5)
                    self.fetch_list(keyword)
                    self.fetch_products(keyword)
                    self.fetch_trademarkia(keyword)
                    self.fetch_uspto(keyword)
                    self.output_csv(keyword)
                except queue.Empty:
                    logger.info('Empty')
                    time.sleep(5)
                except Exception as ex:
                    logger.error(ex, exc_info=True)
        except Exception as ex:
            logger.error(ex, exc_info=True)
def __init__(self, keyword):
    """Bind the target keyword and initialise the request session."""
    self.lr = LRequests()
    self.asins = []       # ASINs discovered for this keyword
    self.keyword = keyword
class SpiderAmazon():
    """Single-keyword Amazon scraper built on LRequests.

    Fixes vs. the original: ``fetch_list`` referenced ``self.key_asins[keyword]``
    and a bare ``keyword`` — neither exists on this class (only ``self.asins``
    and ``self.keyword`` do), so the method raised NameError/AttributeError.
    ``output_csv`` now guards against a missing product dir (consistent with
    the other spider variants), and file handles are closed via ``with``.
    """

    def __init__(self, keyword):
        self.lr = LRequests()
        self.asins = []       # ASINs discovered for this keyword
        self.keyword = keyword

    def load_amazon(self, url):
        """Fetch *url*; on the Amazon captcha page, wait and retry once
        with a fresh session."""
        self.lr.load(url)
        if self.lr.body.find('Enter the characters you see below') > 0:
            logger.error("Captcha!!!")
            time.sleep(10)
            self.lr = LRequests()
            self.lr.load(url)

    def fetch_list(self):
        """Walk all search-result pages and store the ASIN list on disk."""
        file = os.path.join(ASIN_DIR, '%s.txt' % self.keyword)
        if not os.path.exists(file):
            self.lr.load('https://www.amazon.com/s?k=%s' % quote(self.keyword))
            while 1:
                self.fetch_asin()
                if not self.next_page():
                    break
            # FIX: was self.key_asins[keyword] (undefined on this class)
            with open(file, 'w', encoding='utf-8') as f:
                f.write('\n'.join(self.asins))
        else:
            # FIX: was bare `keyword` (undefined name)
            logger.info('pass keyword: %s' % self.keyword)

    def fetch_asin(self):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.lr.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            self.asins.append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.lr.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.lr.current_url, next_ele.get('href'))
            self.lr.load(next_url)
            return True

    def fetch_asins(self):
        """Process every collected ASIN: product details plus both
        trademark lookups; skips ASINs already saved to disk."""
        if not os.path.exists(os.path.join(PRODUCTS_DIR, self.keyword)):
            os.makedirs(os.path.join(PRODUCTS_DIR, self.keyword))
        for asin in self.asins:
            try:
                if not os.path.exists(os.path.join(PRODUCTS_DIR, self.keyword, '%s.txt' % asin)):
                    self.fetch_product(asin)
                    self.fetch_trademarkia(asin)
                    self.fetch_uspto(asin)
                else:
                    logger.info('pass %s' % asin)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_product(self, asin):
        """Scrape title/brand for *asin* into ``self.product_info``
        as [asin, brand, title]."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            self.product_info = [asin, brand, title]
        except Exception as ex:
            # dump the failing page for offline debugging
            with open('xx\\%s.html' % time.time(), 'w', encoding='utf-8') as f:
                f.write(str(self.lr.body))
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, asin):
        """Search the current product's brand on tmsearch.uspto.gov and
        store the hit count in ``self.uspto_count``."""
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        try:
            asin, brand, title = self.product_info
            if brand:
                state_ele = self.lr.xpath('//input[@name="state"]')
                state = state_ele.get('value')
                payload = {'f': 'toc',
                           'state': state,
                           'p_search': 'search',
                           'p_s_All': '',
                           'p_s_ALL': brand,
                           'a_default': 'search',
                           'a_search': 'Submit', }
                self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                             method='POST', data=payload)
                eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                if eles is not None and len(eles) > 1:
                    logger.info('Brand %s: %s' % (brand, len(eles)))
                    self.uspto_count = len(eles)
                else:
                    logger.info('Brand %s: None' % brand)
                    self.uspto_count = 0
            else:
                logger.info('Pass Empty Brand %s' % asin)
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, asin):
        """Search the current product's brand on trademarkia.com and
        store the hit count in ``self.trademarkia_count``."""
        try:
            asin, brand, title = self.product_info
            if brand:
                self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                             % quote(brand))
                eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                if eles is not None and len(eles) > 1:
                    logger.info('Brand %s: %s' % (brand, len(eles)))
                    self.trademarkia_count = len(eles)
                else:
                    logger.info('Brand %s: None' % brand)
                    self.trademarkia_count = 0
            else:
                logger.info('Pass Empty Brand %s' % asin)
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge saved product and trademark data into CSV_DIR/<keyword>.csv."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        # FIX: guard against a missing product dir, like the sibling variants
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)

    def start(self):
        """Entry point: collect the ASIN list for the bound keyword."""
        self.fetch_list()
def __init__(self):
    """Initialise the request session, captcha solver and ASIN store."""
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
class SpiderAmazon():
    """LRequests-based Amazon scraper with GSA captcha solving.

    For each keyword: collect ASINs from search pages, scrape product
    title/brand, look the brand up on TRADEMARKIA and USPTO, and write a
    per-keyword CSV. Fixes vs. the original: file handles are closed via
    ``with`` and large blocks of dead commented-out code were removed.
    """

    def __init__(self):
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs

    def load_amazon(self, url):
        """Fetch *url*, working around Amazon 503 error pages and solving
        captcha challenges via the GSA decoder."""
        self.lr.load(url)
        if self.lr.body.find('Something went wrong on our end') > 0:
            # 503 page: dump its forms for debugging, bounce through the
            # retry link, then reload the original url and dump again.
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======-----')
                print('1111111 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
                print('2222222 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("select"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
            self.lr.load('https://www.amazon.com/ref=cs_503_link')
            time.sleep(1)
            self.lr.load(url)
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======')
                print('444444 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
            with open('xxx\\%s.html' % time.time(), 'w', encoding='utf-8') as dump:
                dump.write(self.lr.body)
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get('src'))
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                self.lr.load(captcha_url, method='GET')
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.lr.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.lr.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.lr.current_url, next_ele.get('href'))
            self.lr.load(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """httplib2-based Amazon scraper with manual cookie handling and GSA
    captcha solving.

    Fixes vs. the original: the cookie-exclusion filter in ``load_cookies``
    used ``any`` over the exclusion prefixes, which kept excluded cookies
    whenever more than one prefix was configured — now ``all``. File handles
    are closed via ``with``.
    """

    def __init__(self):
        self.lr = LRequests()
        self.h = httplib2.Http(".cache")
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs
        # Headers mimicking Chrome 102 on Windows so Amazon serves normal pages.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en;q=0.9',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '******',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
        }
        self.cookies = {}  # cookie name -> value, accumulated across responses

    def set_body(self, body, resp_headers):
        """Decode the response *body*, remember the final URL and build an
        lxml tree (via BeautifulSoup for tag-soup tolerance)."""
        if (isinstance(body, bytes)):
            body = body.decode('utf-8')
        self.current_url = resp_headers['content-location']
        self.body = body
        self.tree = html.fromstring(str(BeautifulSoup(self.body, 'lxml')))

    def xpath(self, xpath):
        """Return the first element matching *xpath*, or None."""
        eles = self.tree.xpath(xpath)
        if eles and len(eles) > 0:
            return eles[0]
        return None

    def xpaths(self, xpath):
        """Return all elements matching *xpath*."""
        return self.tree.xpath(xpath)

    def load_cookies(self, resp_headers):
        """Accumulate Set-Cookie values into self.cookies and refresh the
        'cookie' request header.

        NOTE(review): splitting on ',' also splits 'expires=...' dates;
        preserved from the original, verify against real responses.
        """
        if 'set-cookie' in resp_headers:
            cookies_str = resp_headers['set-cookie']
            for cookie in re.split(';|,', cookies_str):
                cookie = cookie.strip().lower()
                # FIX: must not match ANY exclusion prefix (original used
                # any(), which kept excluded cookies when len(exclude_cookie) > 1)
                if cookie.find('=') > 0 and all(
                        not cookie.startswith(e) for e in exclude_cookie):
                    name, value = cookie.split('=', 1)
                    self.cookies[name] = value
        if len(self.cookies.keys()) > 0:
            cookies = []
            for k, v in self.cookies.items():
                cookies.append('%s=%s' % (k, v))
            self.headers['cookie'] = '; '.join(cookies)

    def load_amazon(self, url):
        """GET *url* through httplib2, updating cookies and solving any
        captcha challenge via the GSA decoder."""
        time.sleep(1)  # crude rate limit
        logger.info('load url: %s' % url)
        (resp_headers, body) = self.h.request(url, method='GET', headers=self.headers)
        self.set_body(body, resp_headers)
        self.load_cookies(resp_headers)
        while self.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                img_url = self.xpath('//img[contains(@src, "captcha")]').get('src')
                logger.info('load img: %s' % img_url)
                (resp_headers, body) = self.h.request(img_url, method='GET',
                                                      headers=self.headers)
                with open(captcha_path, 'wb') as f:
                    f.write(body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                logger.info('load url: %s' % captcha_url)
                (resp_headers, body) = self.h.request(captcha_url, method='GET',
                                                      headers=self.headers)
                self.load_cookies(resp_headers)
                # re-fetch the originally requested page after validation
                (resp_headers, body) = self.h.request(url, method='GET',
                                                      headers=self.headers)
                self.set_body(body, resp_headers)
                self.load_cookies(resp_headers)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.current_url, next_ele.get('href'))
            self.load_amazon(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """Queue-driven worker that scrapes Amazon product pages and checks each
    brand against USPTO and trademarkia.com, emitting per-keyword CSV files.
    """

    def __init__(self, list_queue, product_queue, record_queue, csv_queue):
        self.list_queue = list_queue        # keywords awaiting list scraping
        self.product_queue = product_queue  # keywords awaiting product scraping
        self.record_queue = record_queue    # 'keyword|||asin|||brand' records
        self.csv_queue = csv_queue          # 'keyword|||asin' ready for CSV output
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}

    def load_amazon(self, url):
        """Load *url*, solving Amazon's text captcha via GSA until the page clears."""
        self.lr.load(url)
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get('src'))
                # BUG FIX: write the captcha image through a context manager so
                # the handle is closed before GSA opens the file.
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                self.lr.load(captcha_url, method='GET')
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_products(self, keyword):
        """Fetch a product page for every cached ASIN that has no product file yet."""
        if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
            os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
        with open(os.path.join(ASIN_DIR, '%s.txt' % keyword), 'r',
                  encoding='utf-8') as f:
            asins = f.readlines()
        for asin in asins:
            try:
                asin = asin.strip()
                if not os.path.exists(
                        os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                    self.fetch_product(keyword, asin)
                else:
                    logger.info('pass %s' % asin)
            except KeyboardInterrupt:
                return
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title/brand for *asin*, persist it, and enqueue a lookup record."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                # Byline reads either "Visit the X Store" or "Brand: X".
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
            # BUG FIX: the original referenced the bare name `record_queue`,
            # which is undefined here — the queue lives on the instance.
            self.record_queue.put('%s|||%s|||%s' % (keyword, asin, brand))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword, brand, asin):
        """Count USPTO trademark hits for *brand* and cache the count per *asin*."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # Entering the search page issues the session "state" token used below.
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        try:
            uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
            if not os.path.exists(uspto_file):
                if brand:
                    state_ele = self.lr.xpath('//input[@name="state"]')
                    state = state_ele.get('value')
                    payload = {
                        'f': 'toc',
                        'state': state,
                        'p_search': 'search',
                        'p_s_All': '',
                        'p_s_ALL': brand,
                        'a_default': 'search',
                        'a_search': 'Submit',
                    }
                    self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                 method='POST', data=payload)
                    eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                    # First row is the table header, so > 1 means real hits.
                    if eles is not None and len(eles) > 1:
                        logger.info('Brand %s: %s' % (brand, len(eles)))
                        with open(uspto_file, 'w', encoding='utf-8') as f:
                            f.write(str(len(eles)))
                    else:
                        logger.info('Brand %s: None' % brand)
                        with open(uspto_file, 'w', encoding='utf-8') as f:
                            f.write("0")
                else:
                    logger.info('Pass Empty Brand %s' % asin)
            else:
                logger.info('Pass Empty Uspto %s' % asin)
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword, brand, asin):
        """Count trademarkia.com hits for *brand* and cache the count per *asin*."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        try:
            trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
            if not os.path.exists(trademarkia_file):
                if brand:
                    self.lr.load(
                        'https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                        % quote(brand))
                    eles = self.lr.xpaths(
                        '//table[contains(@class, "tablesaw")]//tr')
                    if eles is not None and len(eles) > 1:
                        logger.info('Brand %s: %s' % (brand, len(eles)))
                        with open(trademarkia_file, 'w', encoding='utf-8') as f:
                            f.write(str(len(eles)))
                    else:
                        logger.info('Brand %s: None' % brand)
                        with open(trademarkia_file, 'w', encoding='utf-8') as f:
                            f.write("0")
                else:
                    logger.info('Pass Empty Brand %s' % asin)
            else:
                logger.info('Pass Trademarkia %s' % asin)
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def do_products(self):
        """Worker loop: drain the product queue, scraping each keyword's products."""
        while True:
            try:
                keyword = self.product_queue.get(timeout=5)
                self.fetch_products(keyword)
            except queues.Empty:
                logger.info('products empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_records(self):
        """Worker loop: run both trademark lookups for each scraped product."""
        while True:
            try:
                keyword, asin, brand = self.record_queue.get(
                    timeout=5).split('|||')
                # BUG FIX: records are 'keyword|||asin|||brand' (see
                # fetch_product), but both lookup methods are declared
                # (keyword, brand, asin) — the original call swapped
                # brand and asin.
                self.fetch_trademarkia(keyword, brand, asin)
                self.fetch_uspto(keyword, brand, asin)
                self.csv_queue.put('%s|||%s' % (keyword, asin))
            except queues.Empty:
                logger.info('records empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_csv(self):
        """Worker loop: regenerate a keyword's CSV whenever a record completes."""
        while True:
            try:
                keyword, asin = self.csv_queue.get(timeout=5).split('|||')
                self.output_csv(keyword, asin)
            except queues.Empty:
                logger.info('csv empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def output_csv(self, keyword, asin):
        """Rewrite CSV_DIR/<keyword>.csv from every product file on disk.

        The whole product directory is rescanned on each call, so the file is
        always rewritten in full (header + rows). BUG FIX: the original branched
        on os.path.exists(csv_path) and, in the append branch, used a csv writer
        that was never created (NameError) while also omitting the header and
        duplicating previously written rows.
        """
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        products = []
        for file in os.listdir(product_dir):
            with open(os.path.join(product_dir, file), 'r',
                      encoding='utf-8') as f:
                asin, brand, title = f.read().strip().split('|||')
            t_m = '否'
            t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(t_dir):
                with open(t_dir, 'r', encoding='utf-8') as f:
                    if int(f.read().strip()) > 0:
                        t_m = '是'
            u_m = '否'
            u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(u_dir):
                with open(u_dir, 'r', encoding='utf-8') as f:
                    if int(f.read().strip()) > 0:
                        u_m = '是'
            products.append([title, asin, brand, t_m, u_m])
        fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
        csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
        with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(fields)
            csvwriter.writerows(products)