コード例 #1
0
ファイル: site_linear.py プロジェクト: Gzigithub/workspace
    def parse_detail(self, resp):
        """Parse a LINEAR product detail page and complete the crawl item.

        The partially-filled item must arrive in
        ``resp.request.meta['item']``; returns the completed item dict,
        or ``None`` when no item was passed along.
        """
        if 'item' not in resp.request.meta:
            return None
        root = lxml.html.fromstring(resp.text.encode('utf-8'))
        item = resp.request.meta.get('item')
        # goods_desc: strip layout whitespace from the description cell
        goods_desc = root.xpath('//td[@class="txt11"]/text()')
        item['goods_desc'] = goods_desc[0].replace('\n', '').replace(
            '\t', '') if goods_desc else ''
        # goods_name
        goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
        item['goods_name'] = util.clear_text(
            goods_name[0].text) if goods_name else ''
        # goods_sn: extracted from the product URL
        match = self.goods_sn_pattern.search(resp.url)
        item['goods_sn'] = match.group(1) if match else ''
        # tiered prices: table cells alternate quantity / unit price
        tiered = []
        price_list = root.xpath('//td[@class="texttable"]')
        for x in range(0, len(price_list), 2):
            qty = util.intval(price_list[x].text_content())
            price = util.floatval(price_list[x + 1].text_content())
            if qty and price:
                tiered.append([qty, price])
            else:
                # malformed pair -> give up on the whole table
                tiered = [[0, 0.00]]
                break
        if not tiered:
            # no tier table: fall back to the single displayed price.
            # BUGFIX: wrap as one [qty, price] pair so the shape matches
            # the [[qty, price], ...] structure used everywhere else
            # (previously a flat [1, price] was stored).
            price = root.xpath('//td[@class="txt18b-red"]/text()')
            price = util.floatval(price[0]) if price else 0
            tiered = [[1, price]] if price else []

        item['tiered'] = tiered if tiered else [[0, 0.00]]
        # stock: [available quantity, minimum order qty]
        qty = root.xpath('//input[@id="qty"]/@value')
        qty = util.intval(qty[0]) if qty else 1
        stock = root.xpath('//input[@id="custcol7"]/@value')
        stock = util.intval(stock[0]) if stock else 0
        item['stock'] = [stock, qty]
        # url
        item['url'] = resp.url
        # provider_name
        item['provider_name'] = 'LINEAR'
        item['provider_url'] = ''
        # attr: not available on this page
        item['attr'] = []
        # rohs: unknown -> -1
        item['rohs'] = -1
        item['goods_other_name'] = ''
        # increment
        item['increment'] = 1
        # img: not available on the detail page
        item['goods_img'] = ''
        item['goods_thumb'] = ''
        return item
コード例 #2
0
ファイル: supchip.py プロジェクト: Gzigithub/workspace
def get_pages(url=None):
    try:
        # response = requests.get(url=url, headers=default_headers)
        response = fetcher(url=url, return_response=True)
        soup = BeautifulSoup(response.content, 'lxml')
    except:
        print 'Failed'
    search_num_div = soup.find('div', class_='search_num')
    search_num = box.intval(search_num_div.get_text(
        strip=True)) if search_num_div else 0
    if search_num:
        pages = int(math.ceil(search_num / 10.0))
        return pages
    else:
        return 0
コード例 #3
0
ファイル: finally.py プロジェクト: Gzigithub/workspace
 def parse_stock(self, resp):
     """Yield detail-page requests for every product row on a listing page.

     Builds on the partially-filled dict in ``resp.request.meta['data']``;
     each matching row yields a Request to ``parse_detail`` carrying a
     deep-copied item and stock count.
     """
     root = lxml.html.fromstring(resp.text.encode('utf-8'))
     product_list = root.xpath('//tr[@valign="top"][@height=85]')
     data = resp.request.meta.get('data', {})
     for product in product_list:
         detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
         # goods_name
         goods_name = detail[0].text_content() if detail else ''
         detail_url = util.urljoin(
             resp.url, detail[0].xpath('./@href')[0]) if detail else ''
         # goods_sn: extracted from the detail URL
         goods_sn = self.goods_sn_pattern.search(detail_url)
         goods_sn = goods_sn.group(1) if goods_sn else ''
         # stock: scraped from the row's plain text
         stock = self.stock_pattern.search(
             util.cleartext(remove_tags(product.text_content())))
         stock = util.intval(stock.group(1)) if stock else 0
         # copy so the shared self.headers dict is not mutated
         headers = copy.copy(self.headers)
         headers.update({
             'Host': 'shopping.netsuite.com',
             'Referer': '',
         })
         if goods_name and goods_sn:
             data['goods_name'] = goods_name
             data['goods_sn'] = goods_sn
             # deep copies so later loop iterations don't mutate the meta
             yield Request(url=detail_url,
                           headers=headers,
                           meta={
                               'item': copy.deepcopy(data),
                               'stock': copy.deepcopy(stock)
                           },
                           callback=self.parse_detail)
         else:
             yield Request(url=resp.url, headers=headers)
         # pagination
         # NOTE(review): this pagination block is inside the per-product
         # loop, so the same search links are yielded once per product --
         # presumably it was meant to run once per page; confirm intent
         if 'range=' not in resp.url:
             links = LinkExtractor(allow=r'search=').extract_links(resp)
             for link in links:
                 yield Request(url=link.url,
                               headers=headers,
                               meta={
                                   'data': copy.deepcopy(data),
                                   'stock': copy.deepcopy(stock)
                               },
                               callback=self.parse_stock)
コード例 #4
0
    def parse_detail(self, resp):
        """Parse a series/model detail page into a GoodsItem.

        Returns the populated item, a retry Request when BS4 fails to
        initialise, or None when no product id can be read from the URL.
        """
        item = GoodsItem()
        try:
            soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
        except Exception as e:
            logger.debug(u"初始化BS4对象失败,重试一次 URL:{url}".format(url=resp.url))
            # retry once
            return Request(url=resp.url,
                           headers=self.headers,
                           cookies=self.cookies)
        # goods_sn

        product_id = self.product_id_pattern_1.search(
            resp.url) or self.product_id_pattern_2.search(resp.url)
        goods_sn = product_id.group(1) if product_id else ''
        item['goods_sn'] = goods_sn
        if not goods_sn:
            logger.debug(u"获取goods_sn失败 URL:{url}".format(url=resp.url))
            return None
        try:
            # goods_name: manufacturer part number inside <p class="ref">
            product_ref = soup.find('p', class_='ref')
            goods_name = ''
            if product_ref:
                goods_name_pattern = re.compile(
                    ur'<b>制造商零件编号:</b>\s*([^\"\'<>/]+)')
                product_ref_list = unicode(product_ref).split('<br/>')
                for x in product_ref_list:
                    match = goods_name_pattern.search(x)
                    if match:
                        goods_name = match.group(1)
                        break
            item['goods_name'] = goods_name
            # goods_other_name
            item['goods_other_name'] = ''
        except:
            logger.debug(u"获取goods_name失败 URL:{url}".format(url=resp.url))
            item['goods_name'] = ''
            item['goods_other_name'] = ''

        # goods_desc
        goods_desc = soup.find('p', class_='desc')
        if not goods_desc:
            logger.debug(u"获取goods_desc失败 URL:{url}".format(url=resp.url))
        item['goods_desc'] = goods_desc.get_text(
            strip=True) if goods_desc else ''

        # provider_name and provider_url
        provider_name = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
        item['provider_name'] = provider_name.get('title',
                                                  '') if provider_name else ''
        # if provider_name can't be read from the brand logo, try product-desc
        if not provider_name:
            # NOTE(review): desc_div may be None here -> AttributeError on
            # .find(); confirm the div always exists when the logo is absent
            desc_div = soup.find('div', id='product-desc')
            provider_name = desc_div.find('h2')
            provider_name = provider_name.get_text(
                strip=True) if provider_name else ''
            item['provider_name'] = provider_name
        item['provider_url'] = ''
        # url
        item['url'] = resp.url
        # doc: datasheet download link
        doc = soup.find(
            'a',
            id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText'
        )
        item['doc'] = doc.get('href', '') if doc else ''
        # goods_img and goods_thumb
        goods_img = soup.find('img', id='previewedMEDImage')
        item['goods_img'] = goods_img.get('src', '') if goods_img else ''
        goods_thumb = soup.find('img', id='thumbnail-1')
        item['goods_thumb'] = goods_thumb.get('src', '') if goods_thumb else ''
        # catlog: breadcrumb trail as [name, absolute url] pairs
        item['catlog'] = []
        # NOTE(review): catlog may be None -> AttributeError on find_all;
        # confirm the breadcrumb list is always present on detail pages
        catlog = soup.find('ul', id='breadcrumb-navigation')
        catlog_list = catlog.find_all('a')
        for a in catlog_list:
            breadcrumb_name = a.get_text(strip=True)
            breadcrumb_url = urlparse.urljoin(resp.url, a.get('href', ''))
            item['catlog'].append([breadcrumb_name, breadcrumb_url])
        # attr: "name:value" bullet list
        item['attr'] = []
        product_attr_div = soup.find('div',
                                     id='product-details-overview-highlights')
        product_attr_list = product_attr_div.find_all(
            'li') if product_attr_div else []
        for li in product_attr_list:
            # NOTE(review): unpack raises ValueError when the text does not
            # contain exactly one ':' -- confirm the markup guarantees it
            attr_name, attr_value = li.get_text(strip=True).split(':')
            item['attr'].append([attr_name, attr_value])
        # tiered: [qty, price] rows from the price-break table
        try:
            item['tiered'] = []
            price_table = soup.find('table', class_='product-prices')
            price_tr_list = price_table.find_all('tr', class_='price-break')
            for tr in price_tr_list:
                qty_th = tr.find('th')
                qty = qty_th.get_text(strip=True) if qty_th else 0
                qty = box.intval(qty)
                price_span = tr.find('span')
                price = price_span.get_text(strip=True) if price_span else 0.00
                price = box.floatval(price)
                # print qty, price
                if qty and price:
                    item['tiered'].append([qty, price])
                else:
                    # NOTE(review): flat [0, 0.00] fallback here while sibling
                    # spiders use nested [[0, 0.00]] -- confirm intended shape
                    item['tiered'] = [0, 0.00]
        except:
            logger.debug(u"获取tiered失败 URL:{url}".format(url=resp.url))
            item['tiered'] = [0, 0.00]
        # stock, increment, min_qty
        try:
            stock_div = soup.find('div', id='product-qty-content')
            stock_tr = stock_div.find('tr', class_='qtyInStock')
            increment_tr = stock_div.find('tr', class_='multipleOf')
            min_qty_tr = stock_div.find('tr', class_='minOrderQty')
            stock = stock_tr.find('td', class_='qty').get_text(
                strip=True) if stock_tr else 0
            stock = box.intval(stock)
            increment = increment_tr.find('td', class_='qty').get_text(
                strip=True) if increment_tr else 1
            increment = box.intval(increment)
            min_qty = min_qty_tr.find('td', class_='qty').get_text(
                strip=True) if min_qty_tr else 1
            min_qty = box.intval(min_qty)
            item['stock'] = [stock, min_qty]
            item['increment'] = increment
        except:
            logger.debug(u"获取stock失败 URL:{url}".format(url=resp.url))
            item['stock'] = [0, 1]
            item['increment'] = 1
        # rohs
        rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
        item['rohs'] = 1 if rohs_div else -1
        return item
コード例 #5
0
ファイル: supchip.py プロジェクト: Gzigithub/workspace
def get_page_detail(url=None, category=None):
    try:
        # response = requests.get(url=url, headers=default_headers)
        response = fetcher(url=url, return_response=True)
        soup = BeautifulSoup(response.content, 'lxml')
    except:
        print 'Failed'

    goods_div = soup.find('div', class_='shengpin')
    if goods_div:
        goods_list = goods_div.find_all('li')
        for goods in goods_list:
            data = {}
            # goods_sn
            if not goods:
                continue
            gid = goods.attrs['data-gdsid']
            if gid:
                _sn = ('%s-%s' % (gid, PN2)).encode('utf-8')
                data['goods_sn'] = hashlib.md5(_sn).hexdigest()
            else:
                continue
            # category
            data['category'] = category if category else []

            # goods_other_name
            other_name_span = goods.find('span', class_='top_1')
            other_name = other_name_span.get_text(
                strip=True) if other_name_span else ''
            data['goods_other_name'] = other_name

            # goods_name
            goods_name_span = goods.find_all('span', class_='p_21')
            if goods_name_span:
                goods_name = goods_name_span[1].get_text(strip=True)
                data['goods_name'] = goods_name.split(u':')[1]
                print data['goods_name']

            # goods_desc
            goods_type_span = goods.find('span', class_='p_21')
            data['goods_desc'] = goods_type_span.get_text(
                strip=True) if goods_type_span else ''

            # tiered cn_price
            price_span = goods.find('span', class_='p_23')
            hk_price = 0.0
            oversea_price = 0.0
            cn_price = box.floatval(price_span.get_text(
                strip=True)) if price_span else 0.00
            data['tiered'] = [[1, hk_price, cn_price, oversea_price]]

            # stock
            stock_span = goods.find('span', class_='p_24')
            data['stock'] = box.intval(stock_span.get_text(
                strip=True)) if stock_span else 0

            # url
            data['url'] = url

            # img
            data['goods_img'] = ''

            yield data
コード例 #6
0
    def parse_detail(self, resp):
        """Parse a part detail page into a GoodsItem.

        The page comes in two layouts, distinguished by how many
        ``table.partdetail`` elements it contains (or by the presence of
        the "Product Change Notice" marker): a single combined table, or
        a separate stock table plus info table.  Returns the populated
        item, or None when no part number can be extracted.
        """
        item = GoodsItem()
        root = lxml.html.fromstring(resp.text.encode('utf-8'))
        # defaults for fields this page never provides
        item.update({
            'goods_img': '',
            'goods_thumb': '',
            'provider_url': '',
            'attr': [],
            'catlog': [],
            'rohs': -1,
        })
        _table = root.xpath('//table[@class="partdetail"]')
        select_parse_mode = len(_table)
        flag = 'Product Change Notice' in resp.text.encode('utf-8')
        if select_parse_mode == 1 or flag:
            detail_table = _table[0]
            info_table = detail_table.xpath('//table[@id="partinfo"]')
            goods_sn = info_table[0].xpath(
                './/td[@class="txtleft"]/h4/text()') if info_table else None
            if not goods_sn:
                return
            item['goods_sn'] = goods_sn[0].strip()
            item['goods_name'] = item['goods_sn']

            # goods_other_name
            goods_other_name = info_table[0].xpath('.//tr[2]/td[2]/text()')
            item['goods_other_name'] = goods_other_name[0].strip(
            ) if goods_other_name else ''

            # provider_name
            provider_name = info_table[0].xpath('.//tr[3]/td[2]/text()')
            item['provider_name'] = provider_name[0].strip(
            ) if provider_name else ''

            # goods_desc
            goods_desc = info_table[0].xpath('.//tr[4]/td[2]/text()')
            item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''

            # doc: datasheet link made absolute against the page URL
            doc = info_table[0].xpath('.//tr[5]//h4/a/@href')
            item['doc'] = urlparse.urljoin(resp.url, doc[0]) if doc else ''

            # url
            item['url'] = resp.url

            # increment
            item['increment'] = 1

            # tiered: rows of [qty, unit price]
            price_table = detail_table.xpath('.//table[@class="price-break"]')
            if not price_table:
                item['tiered'] = [[0, 0.00]]
            else:
                tiered = []
                price_tr = price_table[0].findall('tr')
                for tr in price_tr:
                    tds = tr.findall('td')
                    qty = util.intval(tds[0].text)
                    price = util.floatval(tds[1].text, places=5)
                    if price == 0 or qty == 0:
                        break
                    tiered.append([qty, price])
                item['tiered'] = tiered if tiered else [[0, 0.00]]

            # stock
            item['stock'] = [0, 1]
            available = detail_table.xpath('./tr[2]/td[2]/text()')
            stock = util.intval(available[0].strip()) if available else 0
            # qty
            quantity = detail_table.xpath('./tr[2]/td[4]')
            input_box = quantity[0].findall('input') if quantity else None
            if input_box:
                # BUGFIX: reduce the xpath result to an int exactly as the
                # two-table branch below does; previously the raw list of
                # attribute strings was stored inside item['stock']
                input_value = quantity[0].xpath(
                    '//input[@class="textbox"]/@value')
                quantity = util.intval(
                    input_value[0]) if len(input_value) else 1
            else:
                quantity = util.intval(quantity[0].text) if quantity else 1
            item['stock'] = [stock, quantity]
        elif select_parse_mode == 2:
            stock_table = _table[0].xpath('./tr[2]/td')
            info_table = _table[1]
            goods_sn = stock_table[0].text_content()
            if not goods_sn:
                return
            item['goods_sn'] = goods_sn.strip()
            item['goods_name'] = item['goods_sn']

            # url
            item['url'] = resp.url

            # tiered
            price_table = stock_table[5].xpath(
                './/table[@class="price-break"]')
            if not price_table:
                item['tiered'] = [[0, 0.00]]
            else:
                tiered = []
                price_tr = price_table[0].findall('tr')
                for tr in price_tr:
                    tds = tr.findall('td')
                    qty = util.intval(tds[0].text)
                    price = util.floatval(tds[1].text, places=5)
                    if price == 0 or qty == 0:
                        break
                    tiered.append([qty, price])
                item['tiered'] = tiered if tiered else [[0, 0.00]]

            # stock
            item['stock'] = [0, 1]
            available = stock_table[1].text_content()
            stock = util.intval(available) if available.strip() else 0
            # qty
            quantity = stock_table[6]
            input_box = quantity.findall(
                'input') if quantity is not None else None
            if input_box:
                input_value = quantity.xpath(
                    '//input[@class="textbox"]/@value')
                quantity = util.intval(
                    input_value[0]) if len(input_value) else 1
            else:
                # fall back to the smallest tier quantity
                quantity = item['tiered'][0][
                    0] if item['tiered'][0][0] != 0 else 1
            item['stock'] = [stock, quantity]

            # increment
            increment = stock_table[4].text_content()
            item['increment'] = util.intval(increment, index=999)

            # goods_other_name
            goods_other_name = info_table.xpath('./tr[3]/td[2]/text()')
            item['goods_other_name'] = goods_other_name[0].strip() if len(
                goods_other_name) else ''

            # provider_name
            provider_name = info_table.xpath('./tr[4]/td[2]/text()')
            item['provider_name'] = provider_name[0].strip(
            ) if provider_name else ''

            # goods_desc
            goods_desc = info_table.xpath('./tr[5]/td[2]/text()')
            item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''

            # doc
            doc = info_table.xpath('./tr[7]//a/@href')
            item['doc'] = urlparse.urljoin(resp.url, doc[0]) if doc else ''

            # rohs
            rohs = info_table.xpath('./tr[8]//img')
            item['rohs'] = 1 if len(rohs) else -1

        return item
コード例 #7
0
    def parse_detail(self, resp):
        """Parse a LINEAR detail page, then hand off to linear.com.cn.

        Builds the item from the page, then issues a follow-up request to
        the linear.com.cn search (some fields only exist there) whose 302
        redirect is handled manually by ``manual_handle_of_redirects``.
        Returns a retry Request (once) or None when name/sn are missing.
        """
        item = GoodsItem()
        root = lxml.html.fromstring(resp.text.encode('utf-8'))
        # goods_name
        goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
        item['goods_name'] = util.clear_text(
            goods_name[0].text) if goods_name else ''
        # goods_sn: extracted from the product URL
        match = self.goods_sn_pattern.search(resp.url)
        item['goods_sn'] = match.group(1) if match else ''
        if not item['goods_name'] or not item['goods_sn']:
            logger.debug(
                "无法解析goods_name和goods_sn URL:{url}".format(url=resp.url))
            # retry once, then give up
            if not resp.request.meta.get('retry', None):
                return Request(url=resp.url,
                               headers=self.headers,
                               meta={'retry': 1})
            else:
                return None
        # goods_desc: strip layout whitespace from the description cell
        goods_desc = root.xpath('//td[@class="txt11"]/text()')
        item['goods_desc'] = goods_desc[0].replace('\n', '').replace(
            '\t', '') if goods_desc else ''
        # tiered prices: table cells alternate quantity / unit price
        tiered = []
        price_list = root.xpath('//td[@class="texttable"]')
        for x in range(0, len(price_list), 2):
            qty = util.intval(price_list[x].text_content())
            price = util.floatval(price_list[x + 1].text_content())
            if qty and price:
                tiered.append([qty, price])
            else:
                # malformed pair -> give up on the whole table
                tiered = [[0, 0.00]]
                break
        if not tiered:
            # no tier table: fall back to the single displayed price.
            # BUGFIX: wrap as one [qty, price] pair so the shape matches
            # the [[qty, price], ...] structure used everywhere else
            # (previously a flat [1, price] was stored).
            price = root.xpath('//td[@class="txt18b-red"]/text()')
            price = util.floatval(price[0]) if price else 0
            tiered = [[1, price]] if price else []

        item['tiered'] = tiered if tiered else [[0, 0.00]]
        # stock: [available quantity, minimum order qty]
        qty = root.xpath('//input[@id="qty"]/@value')
        qty = util.intval(qty[0]) if qty else 1
        stock = root.xpath('//input[@id="custcol7"]/@value')
        stock = util.intval(stock[0]) if stock else 0
        item['stock'] = [stock, qty]
        # url
        item['url'] = resp.url
        # provider_name
        item['provider_name'] = 'LINEAR'
        item['provider_url'] = ''
        # doc catlog
        item['doc'] = ''
        item['catlog'] = ''
        # attr
        item['attr'] = []
        # rohs
        item['rohs'] = -1
        item['goods_other_name'] = ''
        # increment
        item['increment'] = 1
        # img
        item['goods_img'] = ''
        item['goods_thumb'] = ''
        # some fields must be fetched from linear.com.cn
        search_url = 'http://www.linear.com.cn/search/index.php?q={search}'.format(
            search=item['goods_name'])
        # BUGFIX: copy before update so the shared self.headers dict is
        # not mutated with the linear.com.cn Host for unrelated requests
        _headers = dict(self.headers)
        _headers.update({'Host': 'www.linear.com.cn'})
        return Request(url=search_url,
                       headers=_headers,
                       meta={
                           'item': item,
                           'dont_redirect': True,
                           'handle_httpstatus_list': [302]
                       },
                       callback=self.manual_handle_of_redirects)
コード例 #8
0
ファイル: search_page.py プロジェクト: Gzigithub/workspace
def get_detail():
    """Parse a saved Heilind part detail page (local fixture) into a dict.

    Works on ``html/detail3.html`` previously saved from ``target``; the
    page has two layouts distinguished by the number of
    ``table.partdetail`` elements.  Returns the item dict, or None when
    no part number can be found.
    """
    target = 'https://estore.heilind.com/2BA-AL-36/POM2BA-AL-36.html'
    # rs = requests.get(url=target, headers=_headers, proxies=proxies)
    # html = rs.text.encode('utf-8')
    # with open(r'html/detail3.html', 'w') as fp:
    #     fp.write(html)
    with open('html/detail3.html', 'r') as fp:
        html = fp.read()
    root = lxml.html.fromstring(html)
    # defaults for fields this page never provides
    item = {
        'goods_img': '',
        'goods_thumb': '',
        'provider_url': '',
        'attr': [],
        'catlog': [],
        'rohs': -1,
    }
    _table = root.xpath('//table[@class="partdetail"]')
    select_parse_mode = len(_table)
    if select_parse_mode == 1:
        detail_table = _table[0]
        info_table = detail_table.xpath('//table[@id="partinfo"]')
        goods_sn = info_table[0].xpath(
            './/td[@class="txtleft"]/h4/text()') if info_table else None
        if not goods_sn:
            return
        item['goods_sn'] = goods_sn[0].strip()
        item['goods_name'] = item['goods_sn']

        # goods_other_name
        goods_other_name = info_table[0].xpath('.//tr[2]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip(
        ) if goods_other_name else ''

        # provider_name
        provider_name = info_table[0].xpath('.//tr[3]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''

        # goods_desc
        goods_desc = info_table[0].xpath('.//tr[4]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''

        # doc: datasheet link made absolute against the target URL
        doc = info_table[0].xpath('.//tr[5]//h4/a/@href')
        item['doc'] = urlparse.urljoin(target, doc[0]) if doc else ''

        # url
        item['url'] = ''

        # increment
        item['increment'] = 1

        # tiered: rows of [qty, unit price]
        price_table = detail_table.xpath('.//table[@class="price-break"]')
        if not price_table:
            item['tiered'] = [[0, 0.00]]
        else:
            tiered = []
            price_tr = price_table[0].findall('tr')
            for tr in price_tr:
                tds = tr.findall('td')
                qty = util.intval(tds[0].text)
                price = util.floatval(tds[1].text, places=5)
                if price == 0 or qty == 0:
                    break
                tiered.append([qty, price])
            item['tiered'] = tiered if tiered else [[0, 0.00]]

        # stock
        item['stock'] = [0, 1]
        available = detail_table.xpath('./tr[2]/td[2]/text()')
        stock = util.intval(available[0].strip()) if available else 0
        # qty
        quantity = detail_table.xpath('./tr[2]/td[4]')
        input_box = quantity[0].findall('input') if quantity else None
        if input_box:
            # BUGFIX: reduce the xpath result to an int exactly as the
            # two-table branch below does; previously the raw list of
            # attribute strings was stored inside item['stock']
            input_value = quantity[0].xpath('//input[@class="textbox"]/@value')
            quantity = util.intval(input_value[0]) if len(input_value) else 1
        else:
            quantity = util.intval(quantity[0].text) if quantity else 1
        item['stock'] = [stock, quantity]
    elif select_parse_mode == 2:
        stock_table = _table[0].xpath('./tr[2]/td')
        info_table = _table[1]
        goods_sn = stock_table[0].text_content()
        if not goods_sn:
            return
        item['goods_sn'] = goods_sn.strip()
        item['goods_name'] = item['goods_sn']

        # url
        item['url'] = ''

        # tiered
        price_table = stock_table[5].xpath('.//table[@class="price-break"]')
        if not price_table:
            item['tiered'] = [[0, 0.00]]
        else:
            tiered = []
            price_tr = price_table[0].findall('tr')
            for tr in price_tr:
                tds = tr.findall('td')
                qty = util.intval(tds[0].text)
                price = util.floatval(tds[1].text, places=5)
                if price == 0 or qty == 0:
                    break
                tiered.append([qty, price])
            item['tiered'] = tiered if tiered else [[0, 0.00]]

        # stock
        item['stock'] = [0, 1]
        available = stock_table[1].text_content()
        stock = util.intval(available) if available.strip() else 0
        # qty
        quantity = stock_table[6]
        input_box = quantity.findall('input') if quantity is not None else None
        if input_box:
            input_value = quantity.xpath('//input[@class="textbox"]/@value')
            quantity = util.intval(input_value[0]) if len(input_value) else 1
        else:
            # fall back to the smallest tier quantity
            quantity = item['tiered'][0][0] if item['tiered'][0][0] != 0 else 1
        item['stock'] = [stock, quantity]

        # increment
        increment = stock_table[4].text_content()
        item['increment'] = util.intval(increment, index=999)

        # goods_other_name
        goods_other_name = info_table.xpath('./tr[3]/td[2]/text()')
        item['goods_other_name'] = goods_other_name[0].strip() if len(
            goods_other_name) else ''

        # provider_name
        provider_name = info_table.xpath('./tr[4]/td[2]/text()')
        item['provider_name'] = provider_name[0].strip(
        ) if provider_name else ''

        # goods_desc
        goods_desc = info_table.xpath('./tr[5]/td[2]/text()')
        item['goods_desc'] = goods_desc[0].strip() if goods_desc else ''

        # doc
        doc = info_table.xpath('./tr[7]//a/@href')
        item['doc'] = urlparse.urljoin(target, doc[0]) if doc else ''

        # rohs
        rohs = info_table.xpath('./tr[8]//img')
        item['rohs'] = 1 if len(rohs) else -1

    return item
コード例 #9
0
    def parse_detail(self, resp):
        """Parse a product detail page into a GoodsItem.

        Returns the item, a retry Request when the page looks truncated
        or the title is missing, or None when the product id cannot be
        recovered (or the page stays incomplete after one retry).
        """
        item = GoodsItem()
        root = lxml.html.fromstring(resp.text.encode('utf-8'))
        # goods_sn: numeric productId taken from the URL
        goods_sn_match = re.search(r'productId=(\d+)', resp.url)
        if goods_sn_match:
            item['goods_sn'] = goods_sn_match.group(1)
        else:
            logger.debug(u"解析 goods_sn 失败,重试URL:{url}".format(url=resp.url))
            return None
        # goods_name, provider_name, goods_desc
        try:
            title = root.xpath('//span[@class="ContentTitle"]')[0]
            item['goods_name'] = util.cleartext(title.text)
            provider_name = title.xpath('a')
            item['goods_desc'] = title.text_content().strip(' ')
            item['provider_name'] = util.cleartext(
                provider_name[0].text) if provider_name else ''
            item['provider_url'] = ''
        except IndexError:
            logger.debug(u"解析 goods_name 失败,重试URL:{url}".format(url=resp.url))
            return Request(url=resp.url, headers=self.headers)

        # goods_other_name: scan the bold spans for "MFG Part Number".
        # BUGFIX: stop at the first match -- previously a later span
        # without a match overwrote an earlier hit with ''
        item['goods_other_name'] = ''
        goods_other_name = root.xpath('//span[@style="font-weight:bold;"]')
        for x in goods_other_name:
            match = re.search('MFG\s*Part\s*Number:\s*([^\s]+)', x.text,
                              re.IGNORECASE)
            if match:
                item['goods_other_name'] = match.group(1)
                break

        # url
        item['url'] = resp.url

        # catlog: breadcrumb trail minus the home / "all products" links
        item['catlog'] = []
        catlog_div = root.xpath('//div[@class="breadcrumb"]//a')
        for catlog in catlog_div:
            catlog_name = util.cleartext(catlog.text)
            catlog_url = util.urljoin(resp.url, catlog.xpath('./@href')[0])
            if catlog_name and catlog_url:
                if '/Pages/Home.aspx' in catlog_url or 'productCategory=All' in catlog_url:
                    continue
                item['catlog'].append([catlog_name, catlog_url])

        # attr and tiered div
        div = root.xpath('//div[@id="div2"]')
        if not div:
            # incomplete page: retry once
            if not resp.request.meta.get('retry'):
                logger.debug(u'网页加载不完整。重试一次 URL:{url}'.format(url=resp.url))
                return Request(url=resp.url,
                               headers=self.headers,
                               meta={'retry': 1})
            # BUGFIX: previously fell through to div[0] after a failed
            # retry and raised IndexError
            return None

        # rohs
        rohs_img = div[0].xpath('.//img[contains(@title, "ROHS")]/@src')
        item['rohs'] = 1 if rohs_img else -1

        # img
        img_thumb = div[0].xpath('.//table[@align="Right"]//img/@src')
        item['goods_thumb'] = util.urljoin(resp.url,
                                           img_thumb[0]) if img_thumb else ''
        img_large = div[0].xpath(
            './/table[@align="Right"]//a[@id="imgFull"]/@href')
        item['goods_img'] = util.urljoin(resp.url,
                                         img_large[0]) if img_large else ''

        # attr: flat td list of alternating key / value cells
        item['attr'] = []
        try:
            attr_table = div[0].xpath(
                './/td[@align="left"]//table[@class="PDTable"]//td')
            for x in range(0, len(attr_table), 2):
                attr_key = attr_table[x].text
                attr_value = attr_table[x + 1].text
                if attr_key:
                    attr_key = attr_key.strip(' ')
                    attr_value = attr_value.strip(' ') if attr_value else ''
                    if attr_value:
                        item['attr'].append([attr_key, attr_value])
                else:
                    break
        except IndexError:
            logger.debug(u"无法查找到属性列表 URL:{url}".format(url=resp.url))

        # tiered: single-cell rows carry stock counts, two-cell rows a tier
        item['tiered'] = []
        try:
            price_table = div[0].xpath(
                './/td[@align="center"]//table[@class="PDTable"]/tr')
            stock = []
            for tr in price_table:
                td = tr.findall('td')
                if len(td) == 1:
                    if "Quote Required" in td[0].text:
                        item['tiered'] = [[0, 0.00]]
                        break
                    else:
                        stock.append(util.intval(td[0].text))
                elif len(td) == 2:
                    qty = util.intval(td[0].text)
                    price = util.floatval(td[1].text)
                    if price:
                        item['tiered'].append([qty, price])
                else:
                    continue
            # Manufacturer Stock may be missing from the table; pad with 0
            if len(stock) == 1:
                stock.append(0)
            # minimum order qty comes from the first price tier
            min_qty = item['tiered'][0][0] if item['tiered'][0][0] else 1
            stock.insert(1, min_qty)
            item['stock'] = stock
        except IndexError:
            logger.debug(u"无法正确解析价格列表 URL:{url}".format(url=resp.url))
            item['stock'] = [0, 1, 0]
            item['tiered'] = [[0, 0.00]]

        # doc
        doc_link = root.xpath('//a[@id="docDown"]/@href')
        item['doc'] = doc_link[0] if doc_link else ''

        # increment
        item['increment'] = 1

        return item
コード例 #10
0
    def parse_resp(self, resp):
        """Dispatch a crawled response by URL type.

        Detail pages ('Product-Details' in the URL) are handed to
        ``parse_detail``.  Category listing pages ('productCategory=')
        are inspected for the total result count; when more than one
        page exists, an ASP.NET post-back ``FormRequest`` is yielded
        for every additional page (2..N).

        :param resp: response object (must provide ``.url`` and ``.text``)
        :yields: an item dict (from ``parse_detail``), ``FormRequest``
                 objects for pagination, or ``None`` for a single page
        """
        global request_list
        request_list.append(resp.url)
        if 'Product-Details' in resp.url:
            yield self.parse_detail(resp)
        elif 'productCategory=' in resp.url:
            html = resp.text.encode('utf-8')
            root = lxml.html.fromstring(html)
            # Total hit count -> page count.  Force float division:
            # under Python 2, `count / self.limit` truncates, so ceil()
            # silently dropped the last partial page.
            search_result = root.xpath('//span[@class="SearchResult"]/text()')
            count = util.intval(search_result[0]) if search_result else 0
            pages = int(math.ceil(count / float(self.limit)))
            logger.debug(u'pages={pages} URL:{url}'.format(
                pages=pages, url=resp.url))
            if pages <= 1:
                yield None
                return
            # Post-back target from the first paging link, e.g.
            # javascript:__doPostBack('ctl00$...','Page$2').
            page_list = root.xpath('//tr[@class="Paging"]//a/@href')
            post_back_pattern = re.compile(r'\'([^\']+)\',\'([^\']+)\'')
            match = post_back_pattern.search(
                page_list[0]) if page_list else None
            if not match:
                # Guard: the previous code called match.group(1)
                # unconditionally and raised AttributeError here.
                logger.debug(u'paging post-back target not found '
                             u'URL:{url}'.format(url=resp.url))
                return
            post_data = match.group(1)

            # Event parameter ctl00$scr (async post-back panel id).
            match = re.search(r'(ctl00[^\"\',]+outerPanelPanel)', html)
            src = match.group() + '|' if match else ''

            form_data = {}
            # Hidden ASP.NET state fields required by the post-back.
            field1 = root.xpath('//input[@id="__VIEWSTATE"]/@value')
            form_data['__VIEWSTATE'] = field1[0] if field1 else ''
            field2 = root.xpath('//input[@id="__VIEWSTATEGENERATOR"]/@value')
            form_data['__VIEWSTATEGENERATOR'] = field2[0] if field2 else ''
            # Must be present (even if empty) or the server rejects the post.
            form_data['__VIEWSTATEENCRYPTED'] = ''
            field3 = root.xpath('//input[@id="__EVENTVALIDATION"]/@value')
            form_data['__EVENTVALIDATION'] = field3[0] if field3 else ''

            # The extra headers are loop-invariant, so build them once.
            # Copy self.headers: the previous code aliased and mutated
            # the shared dict in place (and then didn't even pass the
            # alias to FormRequest).
            _headers = dict(self.headers)
            _headers.update({
                'Content-Type':
                'application/x-www-form-urlencoded; charset=UTF-8',
                'X-MicrosoftAjax': 'Delta=true',
                'Accept': '*/*'
            })

            # Build one post-back form request per remaining page.
            for x in xrange(2, pages + 1):
                form_data.update({
                    'ctl00$scr': src + post_data,
                    '__EVENTTARGET': post_data,
                    '__EVENTARGUMENT': 'Page${page_num}'.format(page_num=x),
                })
                yield FormRequest(url=resp.url,
                                  headers=_headers,
                                  formdata=copy.deepcopy(form_data),
                                  meta={
                                      'next_page': True,
                                      'page': x
                                  })