def load_amazon(self, url):
    """Fetch *url*; if Amazon serves its captcha page, back off and retry
    once with a brand-new LRequests session."""
    self.lr.load(url)
    if self.lr.body.find('Enter the characters you see below') > 0:
        logger.error("Captcha!!!")
        # give Amazon a moment, then start over with a fresh session
        time.sleep(10)
        self.lr = LRequests()
        self.lr.load(url)
def __init__(self, q, profile_dir=None):
    """Set up scraping helpers and an undetected-chromedriver browser.

    q: queue of keywords to process.
    profile_dir: optional Chrome user-data directory for a persistent profile.
    """
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
    self.q = q
    options = uc.ChromeOptions()
    if profile_dir is not None:
        options.add_argument('--user-data-dir=%s' % profile_dir)
    # speed up page loads: no images, force English UI
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('--lang=en')
    self.browser = uc.Chrome(
        options=options,
        driver_executable_path=os.path.join(os.getcwd(), 'chromedriver.exe'),
        use_subprocess=True)
    self.wait = WebDriverWait(self.browser, 120)
def __init__(self, list_queue, product_queue, record_queue, csv_queue):
    """Wire up the four pipeline queues plus request/captcha helpers."""
    self.list_queue = list_queue
    self.product_queue = product_queue
    self.record_queue = record_queue
    self.csv_queue = csv_queue
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
def __init__(self):
    """Prepare an httplib2 client with browser-like headers plus helpers."""
    self.lr = LRequests()
    self.h = httplib2.Http(".cache")
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
    # Headers mimicking Chrome 102 on Windows so Amazon serves normal pages.
    self.headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'en;q=0.9',
        'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'none',
        'sec-fetch-user': '******',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
    }
    self.cookies = {}  # cookie name -> value, accumulated across responses
class SpiderAmazon():
    """Selenium (undetected-chromedriver) Amazon scraper.

    Pulls keywords from a queue; for each keyword it collects ASINs from
    search results, fetches product title/brand, looks the brand up on
    TRADEMARKIA and USPTO, and writes a per-keyword CSV summary.

    Fixes vs. the original: all ``open(...).write/read`` calls now use
    ``with`` so file handles are closed, and re-raising preserves the
    traceback (bare ``raise``).
    """

    def __init__(self, q, profile_dir=None):
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs
        self.q = q           # queue of keywords to process
        options = uc.ChromeOptions()
        if profile_dir is not None:
            options.add_argument('--user-data-dir=%s' % profile_dir)
        # speed up page loads: no images, force English UI
        options.add_argument('--blink-settings=imagesEnabled=false')
        options.add_argument('--lang=en')
        self.browser = uc.Chrome(
            options=options,
            driver_executable_path=os.path.join(os.getcwd(), 'chromedriver.exe'),
            use_subprocess=True)
        self.wait = WebDriverWait(self.browser, 120)

    def load_amazon(self, url):
        """Navigate the browser to *url*, retrying on a known transient
        chromedriver error ('unexpected command response')."""
        while 1:
            try:
                logger.info('load url %s' % url)
                self.browser.get(url)
                break
            except WebDriverException as ex:
                if ex.msg.find('unknown error: unexpected command response') > -1:
                    logger.info('unknown error: unexpected command response')
                    time.sleep(1)
                else:
                    raise  # bare raise keeps the original traceback

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt.

        Skips keywords whose ASIN file already exists.
        """
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.browser.find_elements_by_xpath(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get_attribute('data-asin'))
            self.key_asins[keyword].append(product_ele.get_attribute('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True when a new page loaded.

        Retries up to 3 times; absence of the link (NoSuchElementException)
        ends pagination with False.
        """
        r = False
        for i in range(3):
            try:
                # scroll to the bottom so the pagination bar is rendered
                self.browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)
                next_ele = self.browser.find_element_by_xpath(
                    '//div[contains(@class, "s-pagination-container")]//a[contains(@class, "s-pagination-next")]')
                if next_ele is not None:
                    url = urljoin(self.browser.current_url, next_ele.get_attribute('href'))
                    logger.info('load url %s' % url)
                    self.browser.get(url)
                    r = True
                break
            except Exception as ex:
                logger.info('error agent %s' % str(i + 1))
                logger.error(ex, exc_info=True)
        return r

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.browser.find_element_by_xpath('//span[@id="productTitle"]')
            title = title_ele.text.strip()
            brand = ''
            try:
                brand_ele = self.browser.find_element_by_xpath('//a[@id="bylineInfo"]')
                if brand_ele is not None:
                    text = brand_ele.text.strip().lower()
                    if text.startswith('visit'):
                        # "visit the <brand> store" -> keep the middle part
                        brand = text[9:-5].strip()
                    elif text.startswith('brand'):
                        # "brand: <brand>"
                        brand = text[6:].strip()
            except NoSuchElementException as ex:
                logger.info('not brand %s' % asin)
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)

    def do(self):
        """Worker loop: take keywords off the queue and run the full pipeline."""
        try:
            while 1:
                try:
                    keyword = self.q.get(timeout=5)
                    self.fetch_list(keyword)
                    self.fetch_products(keyword)
                    self.fetch_trademarkia(keyword)
                    self.fetch_uspto(keyword)
                    self.output_csv(keyword)
                except queue.Empty:
                    logger.info('Empty')
                    time.sleep(5)
                except Exception as ex:
                    logger.error(ex, exc_info=True)
        except Exception as ex:
            logger.error(ex, exc_info=True)
def __init__(self, keyword):
    """Bind the target keyword and initialise the request session."""
    self.lr = LRequests()
    self.asins = []       # ASINs discovered for this keyword
    self.keyword = keyword
class SpiderAmazon():
    """Single-keyword Amazon scraper built on LRequests.

    Fixes vs. the original: ``fetch_list`` referenced ``self.key_asins[keyword]``
    and a bare ``keyword`` — neither exists on this class (only ``self.asins``
    and ``self.keyword`` do), so the method raised NameError/AttributeError.
    ``output_csv`` now guards against a missing product dir (consistent with
    the other spider variants), and file handles are closed via ``with``.
    """

    def __init__(self, keyword):
        self.lr = LRequests()
        self.asins = []       # ASINs discovered for this keyword
        self.keyword = keyword

    def load_amazon(self, url):
        """Fetch *url*; on the Amazon captcha page, wait and retry once
        with a fresh session."""
        self.lr.load(url)
        if self.lr.body.find('Enter the characters you see below') > 0:
            logger.error("Captcha!!!")
            time.sleep(10)
            self.lr = LRequests()
            self.lr.load(url)

    def fetch_list(self):
        """Walk all search-result pages and store the ASIN list on disk."""
        file = os.path.join(ASIN_DIR, '%s.txt' % self.keyword)
        if not os.path.exists(file):
            self.lr.load('https://www.amazon.com/s?k=%s' % quote(self.keyword))
            while 1:
                self.fetch_asin()
                if not self.next_page():
                    break
            # FIX: was self.key_asins[keyword] (undefined on this class)
            with open(file, 'w', encoding='utf-8') as f:
                f.write('\n'.join(self.asins))
        else:
            # FIX: was bare `keyword` (undefined name)
            logger.info('pass keyword: %s' % self.keyword)

    def fetch_asin(self):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.lr.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            self.asins.append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.lr.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.lr.current_url, next_ele.get('href'))
            self.lr.load(next_url)
            return True

    def fetch_asins(self):
        """Process every collected ASIN: product details plus both
        trademark lookups; skips ASINs already saved to disk."""
        if not os.path.exists(os.path.join(PRODUCTS_DIR, self.keyword)):
            os.makedirs(os.path.join(PRODUCTS_DIR, self.keyword))
        for asin in self.asins:
            try:
                if not os.path.exists(os.path.join(PRODUCTS_DIR, self.keyword, '%s.txt' % asin)):
                    self.fetch_product(asin)
                    self.fetch_trademarkia(asin)
                    self.fetch_uspto(asin)
                else:
                    logger.info('pass %s' % asin)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_product(self, asin):
        """Scrape title/brand for *asin* into ``self.product_info``
        as [asin, brand, title]."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            self.product_info = [asin, brand, title]
        except Exception as ex:
            # dump the failing page for offline debugging
            with open('xx\\%s.html' % time.time(), 'w', encoding='utf-8') as f:
                f.write(str(self.lr.body))
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, asin):
        """Search the current product's brand on tmsearch.uspto.gov and
        store the hit count in ``self.uspto_count``."""
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        try:
            asin, brand, title = self.product_info
            if brand:
                state_ele = self.lr.xpath('//input[@name="state"]')
                state = state_ele.get('value')
                payload = {'f': 'toc',
                           'state': state,
                           'p_search': 'search',
                           'p_s_All': '',
                           'p_s_ALL': brand,
                           'a_default': 'search',
                           'a_search': 'Submit', }
                self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                             method='POST', data=payload)
                eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                if eles is not None and len(eles) > 1:
                    logger.info('Brand %s: %s' % (brand, len(eles)))
                    self.uspto_count = len(eles)
                else:
                    logger.info('Brand %s: None' % brand)
                    self.uspto_count = 0
            else:
                logger.info('Pass Empty Brand %s' % asin)
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, asin):
        """Search the current product's brand on trademarkia.com and
        store the hit count in ``self.trademarkia_count``."""
        try:
            asin, brand, title = self.product_info
            if brand:
                self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                             % quote(brand))
                eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                if eles is not None and len(eles) > 1:
                    logger.info('Brand %s: %s' % (brand, len(eles)))
                    self.trademarkia_count = len(eles)
                else:
                    logger.info('Brand %s: None' % brand)
                    self.trademarkia_count = 0
            else:
                logger.info('Pass Empty Brand %s' % asin)
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge saved product and trademark data into CSV_DIR/<keyword>.csv."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        # FIX: guard against a missing product dir, like the sibling variants
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)

    def start(self):
        """Entry point: collect the ASIN list for the bound keyword."""
        self.fetch_list()
def __init__(self):
    """Initialise the request session, captcha solver and ASIN store."""
    self.lr = LRequests()
    self.gsa = GsaCaptcha()
    self.key_asins = {}  # keyword -> list of collected ASINs
class SpiderAmazon():
    """LRequests-based Amazon scraper with GSA captcha solving.

    For each keyword: collect ASINs from search pages, scrape product
    title/brand, look the brand up on TRADEMARKIA and USPTO, and write a
    per-keyword CSV. Fixes vs. the original: file handles are closed via
    ``with`` and large blocks of dead commented-out code were removed.
    """

    def __init__(self):
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs

    def load_amazon(self, url):
        """Fetch *url*, working around Amazon 503 error pages and solving
        captcha challenges via the GSA decoder."""
        self.lr.load(url)
        if self.lr.body.find('Something went wrong on our end') > 0:
            # 503 page: dump its forms for debugging, bounce through the
            # retry link, then reload the original url and dump again.
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======-----')
                print('1111111 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
                print('2222222 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("select"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
            self.lr.load('https://www.amazon.com/ref=cs_503_link')
            time.sleep(1)
            self.lr.load(url)
            forms = BeautifulSoup(self.lr.body).find_all('form')
            for f in forms:
                print('=======')
                print('444444 %s' % f.attrs.get('action'))
                for input_tag in f.find_all("input"):
                    print('%s - %s' % (input_tag.attrs.get('name'), input_tag.attrs.get('value')))
            with open('xxx\\%s.html' % time.time(), 'w', encoding='utf-8') as dump:
                dump.write(self.lr.body)
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get('src'))
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                self.lr.load(captcha_url, method='GET')
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.lr.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.lr.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.lr.current_url, next_ele.get('href'))
            self.lr.load(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """httplib2-based Amazon scraper with manual cookie handling and GSA
    captcha solving.

    Fixes vs. the original: the cookie-exclusion filter in ``load_cookies``
    used ``any`` over the exclusion prefixes, which kept excluded cookies
    whenever more than one prefix was configured — now ``all``. File handles
    are closed via ``with``.
    """

    def __init__(self):
        self.lr = LRequests()
        self.h = httplib2.Http(".cache")
        self.gsa = GsaCaptcha()
        self.key_asins = {}  # keyword -> list of collected ASINs
        # Headers mimicking Chrome 102 on Windows so Amazon serves normal pages.
        self.headers = {
            'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'accept-encoding': 'gzip, deflate, br',
            'accept-language': 'en;q=0.9',
            'sec-ch-ua': '".Not/A)Brand";v="99", "Google Chrome";v="103", "Chromium";v="103"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"Windows"',
            'sec-fetch-dest': 'document',
            'sec-fetch-mode': 'navigate',
            'sec-fetch-site': 'none',
            'sec-fetch-user': '******',
            'upgrade-insecure-requests': '1',
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/537.36',
        }
        self.cookies = {}  # cookie name -> value, accumulated across responses

    def set_body(self, body, resp_headers):
        """Decode the response *body*, remember the final URL and build an
        lxml tree (via BeautifulSoup for tag-soup tolerance)."""
        if (isinstance(body, bytes)):
            body = body.decode('utf-8')
        self.current_url = resp_headers['content-location']
        self.body = body
        self.tree = html.fromstring(str(BeautifulSoup(self.body, 'lxml')))

    def xpath(self, xpath):
        """Return the first element matching *xpath*, or None."""
        eles = self.tree.xpath(xpath)
        if eles and len(eles) > 0:
            return eles[0]
        return None

    def xpaths(self, xpath):
        """Return all elements matching *xpath*."""
        return self.tree.xpath(xpath)

    def load_cookies(self, resp_headers):
        """Accumulate Set-Cookie values into self.cookies and refresh the
        'cookie' request header.

        NOTE(review): splitting on ',' also splits 'expires=...' dates;
        preserved from the original, verify against real responses.
        """
        if 'set-cookie' in resp_headers:
            cookies_str = resp_headers['set-cookie']
            for cookie in re.split(';|,', cookies_str):
                cookie = cookie.strip().lower()
                # FIX: must not match ANY exclusion prefix (original used
                # any(), which kept excluded cookies when len(exclude_cookie) > 1)
                if cookie.find('=') > 0 and all(
                        not cookie.startswith(e) for e in exclude_cookie):
                    name, value = cookie.split('=', 1)
                    self.cookies[name] = value
        if len(self.cookies.keys()) > 0:
            cookies = []
            for k, v in self.cookies.items():
                cookies.append('%s=%s' % (k, v))
            self.headers['cookie'] = '; '.join(cookies)

    def load_amazon(self, url):
        """GET *url* through httplib2, updating cookies and solving any
        captcha challenge via the GSA decoder."""
        time.sleep(1)  # crude rate limit
        logger.info('load url: %s' % url)
        (resp_headers, body) = self.h.request(url, method='GET', headers=self.headers)
        self.set_body(body, resp_headers)
        self.load_cookies(resp_headers)
        while self.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                img_url = self.xpath('//img[contains(@src, "captcha")]').get('src')
                logger.info('load img: %s' % img_url)
                (resp_headers, body) = self.h.request(img_url, method='GET',
                                                      headers=self.headers)
                with open(captcha_path, 'wb') as f:
                    f.write(body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                logger.info('load url: %s' % captcha_url)
                (resp_headers, body) = self.h.request(captcha_url, method='GET',
                                                      headers=self.headers)
                self.load_cookies(resp_headers)
                # re-fetch the originally requested page after validation
                (resp_headers, body) = self.h.request(url, method='GET',
                                                      headers=self.headers)
                self.set_body(body, resp_headers)
                self.load_cookies(resp_headers)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_list(self, keyword):
        """Collect every result-page ASIN for *keyword* into ASIN_DIR/<keyword>.txt."""
        file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if not os.path.exists(file):
            self.key_asins[keyword] = []
            self.load_amazon('https://www.amazon.com/s?k=%s' % quote(keyword))
            while 1:
                self.fetch_asin(keyword)
                if not self.next_page():
                    break
            if len(self.key_asins[keyword]) > 0:
                with open(file, 'w', encoding='utf-8') as f:
                    f.write('\n'.join(self.key_asins[keyword]))
            else:
                logger.info("empty asin: %s" % keyword)
        else:
            logger.info('pass keyword: %s' % keyword)

    def fetch_asin(self, keyword):
        """Append the ASINs visible on the current search-result page."""
        product_eles = self.xpaths(
            '//div[contains(@class, "s-result-list")]/div[contains(@data-component-type, "s-search-result")]')
        for product_ele in product_eles:
            logger.info('asin: %s' % product_ele.get('data-asin'))
            self.key_asins[keyword].append(product_ele.get('data-asin'))

    def next_page(self):
        """Follow the pagination 'next' link; return True if a page loaded."""
        next_ele = self.xpath(
            '//div[contains(@class, "s-pagination-container")]//a[contains(@aria-label, "next page")]')
        if next_ele is None:
            return False
        else:
            next_url = urljoin(self.current_url, next_ele.get('href'))
            self.load_amazon(next_url)
            return True

    def fetch_products(self, keyword):
        """Fetch product details for each ASIN stored for *keyword*."""
        asins_file = os.path.join(ASIN_DIR, '%s.txt' % keyword)
        if os.path.exists(asins_file):
            if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
                os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
            with open(asins_file, 'r', encoding='utf-8') as f:
                asins = f.readlines()
            for asin in asins:
                try:
                    asin = asin.strip()
                    if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                        self.fetch_product(keyword, asin)
                    else:
                        logger.info('pass %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title and brand of one product page and persist them as
        'asin|||brand|||title' under PRODUCTS_DIR/<keyword>/<asin>.txt."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    # "visit the <brand> store" -> keep the middle part
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    # "brand: <brand>"
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword):
        """For each stored product of *keyword*, search the brand on
        tmsearch.uspto.gov and record the result-row count per ASIN."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # entering the search page establishes the session 'state' token
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
                    if not os.path.exists(uspto_file):
                        if brand:
                            state_ele = self.lr.xpath('//input[@name="state"]')
                            state = state_ele.get('value')
                            payload = {'f': 'toc',
                                       'state': state,
                                       'p_search': 'search',
                                       'p_s_All': '',
                                       'p_s_ALL': brand,
                                       'a_default': 'search',
                                       'a_search': 'Submit', }
                            self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                         method='POST', data=payload)
                            eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(uspto_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Empty Uspto %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword):
        """For each stored product of *keyword*, search the brand on
        trademarkia.com and record the result-row count per ASIN."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        key_path = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.isdir(key_path):
            for file in os.listdir(key_path):
                try:
                    with open(os.path.join(key_path, file), 'r', encoding='utf-8') as f:
                        asin, brand, title = f.read().strip().split('|||')
                    trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
                    if not os.path.exists(trademarkia_file):
                        if brand:
                            self.lr.load('https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                                         % quote(brand))
                            eles = self.lr.xpaths('//table[contains(@class, "tablesaw")]//tr')
                            if eles is not None and len(eles) > 1:
                                logger.info('Brand %s: %s' % (brand, len(eles)))
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write(str(len(eles)))
                            else:
                                logger.info('Brand %s: None' % brand)
                                with open(trademarkia_file, 'w', encoding='utf-8') as f:
                                    f.write("0")
                        else:
                            logger.info('Pass Empty Brand %s' % asin)
                    else:
                        logger.info('Pass Trademarkia %s' % asin)
                except KeyboardInterrupt:
                    return
                except Exception as ex:
                    logger.error(ex, exc_info=True)

    def output_csv(self, keyword):
        """Merge product, TRADEMARKIA and USPTO data for *keyword* into
        CSV_DIR/<keyword>.csv ('是'/'否' = trademark hit yes/no)."""
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        if os.path.exists(product_dir):
            products = []
            for file in os.listdir(product_dir):
                with open(os.path.join(product_dir, file), 'r', encoding='utf-8') as f:
                    asin, brand, title = f.read().strip().split('|||')
                t_m = '否'
                t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(t_dir):
                    with open(t_dir) as f:
                        if int(f.read().strip()) > 0:
                            t_m = '是'
                u_m = '否'
                u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
                if os.path.exists(u_dir):
                    with open(u_dir) as f:
                        if int(f.read().strip()) > 0:
                            u_m = '是'
                products.append([title, asin, brand, t_m, u_m])
            fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
            csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
            with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
                csvwriter = csv.writer(csvfile)
                csvwriter.writerow(fields)
                csvwriter.writerows(products)
class SpiderAmazon():
    """Queue-driven worker that scrapes Amazon product pages and checks each
    brand against USPTO and trademarkia.com, emitting per-keyword CSV files.
    """

    def __init__(self, list_queue, product_queue, record_queue, csv_queue):
        self.list_queue = list_queue        # keywords awaiting list scraping
        self.product_queue = product_queue  # keywords awaiting product scraping
        self.record_queue = record_queue    # 'keyword|||asin|||brand' records
        self.csv_queue = csv_queue          # 'keyword|||asin' ready for CSV output
        self.lr = LRequests()
        self.gsa = GsaCaptcha()
        self.key_asins = {}

    def load_amazon(self, url):
        """Load *url*, solving Amazon's text captcha via GSA until the page clears."""
        self.lr.load(url)
        while self.lr.body.find('Enter the characters you see below') > 0:
            try:
                logger.error("Captcha!!!")
                captcha_path = os.path.join(CAPTCHA_DIR, '%s.jpg' % time.time())
                self.lr.load_img(
                    self.lr.xpath('//img[contains(@src, "captcha")]').get('src'))
                # BUG FIX: write the captcha image through a context manager so
                # the handle is closed before GSA opens the file.
                with open(captcha_path, 'wb') as f:
                    f.write(self.lr.body)
                code = self.gsa.decode(captcha_path)
                logger.info('Decode Captcha: %s' % code)
                amzn = self.lr.xpath('//input[@name="amzn"]').get('value')
                amzn_r = self.lr.xpath('//input[@name="amzn-r"]').get('value')
                captcha_url = 'https://www.amazon.com/errors/validateCaptcha?amzn=%s&amzn-r=%s&field-keywords=%s' % (
                    quote_plus(amzn), quote_plus(amzn_r), code)
                self.lr.load(captcha_url, method='GET')
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_products(self, keyword):
        """Fetch a product page for every cached ASIN that has no product file yet."""
        if not os.path.exists(os.path.join(PRODUCTS_DIR, keyword)):
            os.makedirs(os.path.join(PRODUCTS_DIR, keyword))
        with open(os.path.join(ASIN_DIR, '%s.txt' % keyword), 'r',
                  encoding='utf-8') as f:
            asins = f.readlines()
        for asin in asins:
            try:
                asin = asin.strip()
                if not os.path.exists(
                        os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin)):
                    self.fetch_product(keyword, asin)
                else:
                    logger.info('pass %s' % asin)
            except KeyboardInterrupt:
                return
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def fetch_product(self, keyword, asin):
        """Scrape title/brand for *asin*, persist it, and enqueue a lookup record."""
        try:
            self.load_amazon('https://www.amazon.com/dp/%s' % asin)
            title_ele = self.lr.xpath('//span[@id="productTitle"]')
            title = ''.join(title_ele.itertext()).strip()
            brand = ''
            brand_ele = self.lr.xpath('//a[@id="bylineInfo"]')
            if brand_ele is not None:
                # Byline reads either "Visit the X Store" or "Brand: X".
                text = brand_ele.text.strip().lower()
                if text.startswith('visit'):
                    brand = text[9:-5].strip()
                elif text.startswith('brand'):
                    brand = text[6:].strip()
            with open(os.path.join(PRODUCTS_DIR, keyword, '%s.txt' % asin), 'w',
                      encoding='utf-8') as f:
                f.write('|||'.join([asin, brand, title]))
            # BUG FIX: the original referenced the bare name `record_queue`,
            # which is undefined here — the queue lives on the instance.
            self.record_queue.put('%s|||%s|||%s' % (keyword, asin, brand))
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_uspto(self, keyword, brand, asin):
        """Count USPTO trademark hits for *brand* and cache the count per *asin*."""
        uspth_path = os.path.join(USPTO_DIR, keyword)
        if not os.path.exists(uspth_path):
            os.makedirs(uspth_path)
        self.lr.load('https://tmsearch.uspto.gov/')
        search_ele = self.lr.xpath('//a[contains(@href, "searchss")]')
        if search_ele is not None:
            # Entering the search page issues the session "state" token used below.
            self.lr.load(urljoin(self.lr.current_url, search_ele.get('href')))
        try:
            uspto_file = os.path.join(uspth_path, '%s.txt' % asin)
            if not os.path.exists(uspto_file):
                if brand:
                    state_ele = self.lr.xpath('//input[@name="state"]')
                    state = state_ele.get('value')
                    payload = {
                        'f': 'toc',
                        'state': state,
                        'p_search': 'search',
                        'p_s_All': '',
                        'p_s_ALL': brand,
                        'a_default': 'search',
                        'a_search': 'Submit',
                    }
                    self.lr.load('https://tmsearch.uspto.gov/bin/showfield',
                                 method='POST', data=payload)
                    eles = self.lr.xpaths('//table[@id="searchResultTable"]//tr')
                    # First row is the table header, so > 1 means real hits.
                    if eles is not None and len(eles) > 1:
                        logger.info('Brand %s: %s' % (brand, len(eles)))
                        with open(uspto_file, 'w', encoding='utf-8') as f:
                            f.write(str(len(eles)))
                    else:
                        logger.info('Brand %s: None' % brand)
                        with open(uspto_file, 'w', encoding='utf-8') as f:
                            f.write("0")
                else:
                    logger.info('Pass Empty Brand %s' % asin)
            else:
                logger.info('Pass Empty Uspto %s' % asin)
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def fetch_trademarkia(self, keyword, brand, asin):
        """Count trademarkia.com hits for *brand* and cache the count per *asin*."""
        trademarkia_path = os.path.join(TRADEMARKIA_DIR, keyword)
        if not os.path.exists(trademarkia_path):
            os.makedirs(trademarkia_path)
        try:
            trademarkia_file = os.path.join(trademarkia_path, '%s.txt' % asin)
            if not os.path.exists(trademarkia_file):
                if brand:
                    self.lr.load(
                        'https://www.trademarkia.com/trademarks-search.aspx?tn=%s'
                        % quote(brand))
                    eles = self.lr.xpaths(
                        '//table[contains(@class, "tablesaw")]//tr')
                    if eles is not None and len(eles) > 1:
                        logger.info('Brand %s: %s' % (brand, len(eles)))
                        with open(trademarkia_file, 'w', encoding='utf-8') as f:
                            f.write(str(len(eles)))
                    else:
                        logger.info('Brand %s: None' % brand)
                        with open(trademarkia_file, 'w', encoding='utf-8') as f:
                            f.write("0")
                else:
                    logger.info('Pass Empty Brand %s' % asin)
            else:
                logger.info('Pass Trademarkia %s' % asin)
        except KeyboardInterrupt:
            return
        except Exception as ex:
            logger.error(ex, exc_info=True)

    def do_products(self):
        """Worker loop: drain the product queue, scraping each keyword's products."""
        while True:
            try:
                keyword = self.product_queue.get(timeout=5)
                self.fetch_products(keyword)
            except queues.Empty:
                logger.info('products empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_records(self):
        """Worker loop: run both trademark lookups for each scraped product."""
        while True:
            try:
                keyword, asin, brand = self.record_queue.get(
                    timeout=5).split('|||')
                # BUG FIX: records are 'keyword|||asin|||brand' (see
                # fetch_product), but both lookup methods are declared
                # (keyword, brand, asin) — the original call swapped
                # brand and asin.
                self.fetch_trademarkia(keyword, brand, asin)
                self.fetch_uspto(keyword, brand, asin)
                self.csv_queue.put('%s|||%s' % (keyword, asin))
            except queues.Empty:
                logger.info('records empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def do_csv(self):
        """Worker loop: regenerate a keyword's CSV whenever a record completes."""
        while True:
            try:
                keyword, asin = self.csv_queue.get(timeout=5).split('|||')
                self.output_csv(keyword, asin)
            except queues.Empty:
                logger.info('csv empty')
                time.sleep(5)
            except Exception as ex:
                logger.error(ex, exc_info=True)

    def output_csv(self, keyword, asin):
        """Rewrite CSV_DIR/<keyword>.csv from every product file on disk.

        The whole product directory is rescanned on each call, so the file is
        always rewritten in full (header + rows). BUG FIX: the original branched
        on os.path.exists(csv_path) and, in the append branch, used a csv writer
        that was never created (NameError) while also omitting the header and
        duplicating previously written rows.
        """
        product_dir = os.path.join(PRODUCTS_DIR, keyword)
        products = []
        for file in os.listdir(product_dir):
            with open(os.path.join(product_dir, file), 'r',
                      encoding='utf-8') as f:
                asin, brand, title = f.read().strip().split('|||')
            t_m = '否'
            t_dir = os.path.join(TRADEMARKIA_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(t_dir):
                with open(t_dir, 'r', encoding='utf-8') as f:
                    if int(f.read().strip()) > 0:
                        t_m = '是'
            u_m = '否'
            u_dir = os.path.join(USPTO_DIR, keyword, '%s.txt' % asin)
            if os.path.exists(u_dir):
                with open(u_dir, 'r', encoding='utf-8') as f:
                    if int(f.read().strip()) > 0:
                        u_m = '是'
            products.append([title, asin, brand, t_m, u_m])
        fields = ['标题', 'asin', '品牌', 'TRADEMARKIA', 'USPTO']
        csv_path = os.path.join(CSV_DIR, '%s.csv' % keyword)
        with open(csv_path, 'w', encoding='utf-8', newline="") as csvfile:
            csvwriter = csv.writer(csvfile)
            csvwriter.writerow(fields)
            csvwriter.writerows(products)