Ejemplo n.º 1
0
 def parse_price(self, resp):
     """Parse tiered stock-price JSON and attach prices to pending items.

     ``resp.meta['items']`` carries the item dicts produced by an earlier
     callback; each one whose ``goods_sn`` matches a ``productId`` in the
     response body gets its ``tiered`` price ladder filled in, then every
     item is yielded (with or without prices).
     """
     items = resp.meta.get('items')
     if not items:
         logger.error('request meta data error, url: %s', resp.url)
         return
     prices = {}
     try:
         data = json.loads(resp.body)
         for entprice in data['EntitledPrice']:
             tiered = []
             # 'RangePrice' may be missing entirely; iterate an empty
             # list instead of mutating the parsed payload.
             for vo in entprice.get('RangePrice', []):
                 qty = util.intval(vo['minimumQuantity']['value']) if 'minimumQuantity' in vo else 1
                 price = util.floatval(vo['priceInRange']['value']) if 'priceInRange' in vo else 0
                 # skip zero quantities and tiers that go backwards
                 if not qty or (tiered and qty < tiered[-1][0]):
                     continue
                 tiered.append([qty, price])
             if not tiered:
                 tiered.append([0, 0.0])
             prices[entprice['productId']] = tiered
     except Exception:
         # narrowed from a bare ``except:`` so SystemExit/KeyboardInterrupt
         # are no longer swallowed; the traceback is still logged, and the
         # message is built lazily by the logging module
         logger.exception('parse stock price error, url: %s---price_Json_error---%s',
                          resp.url, resp.body)
     for item in items:
         if item['goods_sn'] in prices:
             item['tiered'] = prices[item['goods_sn']]
         yield item
Ejemplo n.º 2
0
 def parse_resp(self, resp):
     search_match = self.product_url_pattern_0.search(
         urllib.unquote(resp.url))
     detail_match = self.product_url_pattern_1.search(
         urllib.unquote(resp.url)) or self.product_url_pattern_2.search(
             urllib.unquote(resp.url))
     print "=" * 30
     print resp.url
     print urllib.unquote(resp.url)
     print detail_match
     print search_match
     if detail_match:
         yield self.parse_detail(resp)
     elif search_match:
         soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
         # 获取搜索数目
         try:
             total = soup.find('h3', class_='results')
             total = util.intval(total.get_text(strip=True)) if total else 0
         except:
             total = 0
         pages = int(math.ceil(total / self.limit_num))
         if pages <= 1:
             return
         for x in range(1, pages + 1):
             page_url = "http://cn.futureelectronics.com/zh/search.aspx?dsNav=Ro:%d,Aro:%d" % (
                 x * 10, x * 10)
             search_id = search_match.group(1)
             page_url = page_url + ',N:{search_id}'.format(
                 search_id=search_id)
             yield Request(url=page_url,
                           headers=self.headers,
                           cookies=self.cookies)
Ejemplo n.º 3
0
 def parse_detail(self, data, category=None):
     """Build a GoodsItem from one raw model record of a series listing.

     ``data`` is one product dict from the upstream JSON; ``category``
     maps category ids to names for the catlog breadcrumb. Returns None
     when the record has no usable part number.
     """
     if category is None:
         category = {}
     item = GoodsItem()
     item['url'] = urlparse.urljoin(self.base_url, data['avn_pdp_seo_path'])
     item['goods_sn'] = data['uniqueID']
     item['goods_name'] = data['mfPartNumber_ntk'].upper()
     if not item['goods_name']:
         return None
     # BUG FIX: the package-type code lives in ``data`` (the raw record),
     # not in ``item`` -- the old ``'packageTypeCode' in item`` check was
     # always False, so goods_other_name was never populated.
     if 'packageTypeCode' in data:
         item['goods_other_name'] = '{0}/{1}'.format(
             item['goods_name'], data['packageTypeCode']).upper()
     else:
         item['goods_other_name'] = ''
     item['provider_name'] = data['manufacturer']
     item['provider_url'] = ''
     item['goods_desc'] = data['shortDescription'] if 'shortDescription' in data else ''
     if 'avn_thumbnail' in data and data['avn_thumbnail']:
         item['goods_thumb'] = util.urljoin(self.base_url, data['avn_thumbnail'])
     else:
         item['goods_thumb'] = ''
     # full-size image shares the thumb path with a different size token
     item['goods_img'] = item['goods_thumb'].replace('icon_thumb', 'icon_web')
     item['doc'] = data['auxDescription2'] if data.get('auxDescription2') else ''
     min_qty = int(data['xcatField1']) if 'xcatField1' in data else 1
     increment = int(data['multQuantity']) if 'multQuantity' in data else 1
     stock_qty = util.intval(data['inv_strlocqty']) if 'inv_strlocqty' in data else 0
     item['rohs'] = 1 if 'ROHSComplianceCode' in data and data['ROHSComplianceCode'] == 'Y' else 0
     # prices are filled in later by the price callback
     item['tiered'] = [[0, 0.0]]
     item['stock'] = [stock_qty, min_qty]  # stock
     item['increment'] = increment
     # attributes
     item['attr'] = []
     for vo in data.get('attributes', []):
         try:
             item['attr'].append([vo['name'], vo['values'][0]['value']])
         except (KeyError, IndexError):
             pass
     # categories: last path segment holds colon-separated category ids
     item['catlog'] = []
     catelogs = data['parentCatgroup_id_path'].split('_')[-1].split(':')
     for vo in catelogs:
         if vo not in category:
             continue
         item['catlog'].append((category[vo], vo))
     item['region'] = 'AMERICAS'
     item['id'] = 16
     return item
Ejemplo n.º 4
0
 def parse(self, resp):
     """Read the search result count and schedule one POST per result page.

     The page count comes from ``totalMatchCountString`` in the JSON
     body; each page request carries the systemsCatalog through meta.
     """
     systems_catalog = 0
     try:
         product_dict = json.loads(resp.text.encode('utf-8'))
         systems_catalog = resp.meta.get('systemsCatalog')
         total_match_count = util.intval(
             product_dict.get('totalMatchCountString'))
         # BUG FIX: force true division -- under Python 2 the int/int
         # floor division ran before ceil(), dropping the final page.
         pages = int(math.ceil(total_match_count / float(self.limit_num)))
         for page_num in xrange(1, pages + 1):
             self.form_data['pageNum'] = str(page_num)
             yield Request(url=self.processData_url,
                           method='POST',
                           headers=self.headers,
                           body=json.dumps(self.form_data),
                           meta={'systemsCatalog': systems_catalog},
                           callback=self.parse_detail)
     except Exception:
         # narrowed from a bare ``except:``; traceback is logged
         logger.exception('Parse error, systemsCatalog: %s',
                          systems_catalog)
Ejemplo n.º 5
0
    def parse_model_detail(self, response):
        """Parse a ti.com.cn product-detail page into GoodsItem objects.

        Product data comes from the embedded ld+json script; attributes
        and breadcrumbs are scraped from the custom <ti-*> elements.
        One item is yielded per offer in the JSON ``offers`` list.
        """
        json_html = re.findall(
            r'<script type="application/ld\+json">(.*?)</script>',
            response.body, re.S)
        if not json_html:
            raise DropItem('匹配源码内容异常 请检查:{0}'.format(response.url))
        json_data = json.loads(json_html[0])
        product_list = json_data['offers']
        pre_url = 'https://www.ti.com.cn/product/cn/{}'.format(
            json_data['mpn'])
        description = json_data['description']
        doc_url = urljoin(
            self.base_url,
            response.xpath(
                '//div/a[@data-navtitle="data sheet"]/@href').extract_first())
        attrs_items = response.xpath(
            '//ti-multicolumn-list/ti-multicolumn-list-row')
        attr_list = []
        # collect (name, value) attribute pairs
        for attrs_item in attrs_items:
            attr = attrs_item.xpath(
                './ti-multicolumn-list-cell/span/text()').extract()
            if not attr:
                continue
            key = util.cleartext(attr[0])
            val = util.cleartext(attr[1])
            if key and val:
                attr_list.append((key, val))
        # collect breadcrumb categories (skip the leading home crumb)
        cat_list = []
        cat_items = response.xpath(
            '//ti-breadcrumb/ti-breadcrumb-section/a')[1:]
        for cat_item in cat_items:
            ckey = util.cleartext(cat_item.xpath('./text()').extract_first())
            cval = urljoin(self.base_url,
                           cat_item.xpath('./@href').extract_first())
            cat_list.append((ckey, cval))

        for data in product_list:
            item = GoodsItem()
            data = data['itemOffered']
            item['url'] = pre_url
            item['goods_sn'] = data['sku']
            item['goods_other_name'] = item['goods_name'] = data['mpn']
            item['provider_name'] = data['brand']
            item['provider_url'] = ''
            item['goods_desc'] = description
            item['goods_img'] = item['goods_thumb'] = ''
            item['doc'] = doc_url
            item['rohs'] = 0
            shop_price = data['offers'].get('price')
            item['tiered'] = []
            if not shop_price:
                item['stock'] = [0, 1]  # stock
                item['increment'] = 1
            else:
                # stock level
                if not data['offers'].get('inventoryLevel'):
                    item['stock'] = [0, 1]
                else:
                    item['stock'] = [
                        util.intval(data['offers']['inventoryLevel']), 1
                    ]  # stock
                for price_item in data['offers']['priceSpecification']:
                    pnum = price_item['eligibleQuantity']['minValue']
                    pval = price_item['price']
                    item['tiered'].append(
                        (util.intval(pnum), util.floatval(pval)))
                # BUG FIX: guard against an empty priceSpecification list,
                # which previously raised IndexError on [0][0]
                item['increment'] = item['tiered'][0][0] if item['tiered'] else 1
            if not item['tiered']:
                item['tiered'] = [[0, 0.00]]
            # attributes
            item['attr'] = attr_list
            # categories
            item['catlog'] = cat_list
            yield item
Ejemplo n.º 6
0
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        print '正在获取 richardsonrfpd 中关键词:%s 的相关数据' % keyword
        url = 'http://www.richardsonrfpd.com/Pages/AdvanceSearch.aspx'
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        response = requests.get(url,
                                headers=_headers,
                                timeout=30,
                                proxies=proxies)
        resp = do_search(response, keyword)
        if isinstance(resp, int):
            raise ValueError
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:无效产品; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:请求错误,网页响应码 %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # 开始解析resp
    # 获取搜索的数量
    if 'Search-Results.aspx' in resp.url:
        product_list = analyse_product_url(resp)
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    product_list = root.xpath('//tr[@valign="top"][@height=85]')
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    for product in product_list:
        detail = product.xpath('.//a[@class="lnk12b-blackOff"]')
        detail_url = util.urljoin(
            resp.url, detail[0].xpath('./@href')[0]) if detail else ''
        match = goods_sn_pattern.search(detail_url)
        if not match and detail_url:
            logger.debug(u"无法匹配链接中的goods_sn URL{url}".format(url=detail_url))
            return -404
        goods_sn = match.group(1)
        goods_name = detail[0].text_content() if detail else ''
        data_dict['url'].append({
            'id': id,
            'url': detail_url,
            'goods_sn': goods_sn,
            'goods_name': goods_name,
        })
    if 'showMore=true' in url:
        return 200
    count = root.xpath('//td[@class="medtext"]')
    count = util.number_format(count[0].text, places=0, index=999,
                               smart=True) if count else 0
    page_num = int(math.ceil(count / 10.0))
    if page_num <= 1:
        return 200
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    page_list = root.xpath('//td[@class="medtext"]/a/@href')
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'http://shopping.netsuite.com/s.nl/c.402442/sc.2/.f?search={search}&range={start}%2C{end}%2C{total}'.format(
            search=keyword, start=x * 10 + 1, end=(x + 1) * 10, total=count)
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
Ejemplo n.º 7
0
def _parse_detail_data(resp, headers=None, **kwargs):
    """Parse a product detail page into an item dict.

    @param  resp     the HTTP response for the detail page
    @param  headers  unused here; kept for interface compatibility
    @param  kwargs   extension parameters

    Returns -404 when neither goods_name nor goods_sn can be extracted;
    otherwise hands the partially-filled item to handle_of_redirects()
    (some fields are only available on linear.com.cn).
    """
    item = {}
    root = lxml.html.fromstring(resp.text.encode('utf-8'))
    # goods_name
    goods_name = root.xpath('//td[@class="lnk11b-colorOff"]')
    item['goods_name'] = util.cleartext(
        goods_name[0].text) if goods_name else ''
    # goods_sn comes from the URL itself
    match = goods_sn_pattern.search(resp.url)
    item['goods_sn'] = match.group(1) if match else ''
    if not item['goods_name'] or not item['goods_sn']:
        logger.debug("无法解析goods_name和goods_sn URL:{url}".format(url=resp.url))
        return -404
    # goods_desc
    goods_desc = root.xpath('//td[@class="txt11"]/text()')
    item['desc'] = util.cleartext(goods_desc[0], '\n',
                                  '\t') if goods_desc else ''
    # tiered prices: qty cells and price cells alternate
    tiered = []
    price_list = root.xpath('//td[@class="texttable"]')
    for x in range(0, len(price_list), 2):
        qty = util.intval(price_list[x].text_content())
        price = util.floatval(price_list[x + 1].text_content())
        if qty and price:
            tiered.append([qty, price])
        else:
            # any malformed pair invalidates the whole table
            tiered = [[0, 0.00]]
            break
    if not tiered:
        # fall back to the single displayed price
        price = root.xpath('//td[@class="txt18b-red"]/text()')
        price = util.floatval(price[0]) if price else 0
        if price:
            # BUG FIX: keep the list-of-[qty, price] shape; the old code
            # produced a flat [1, price] pair here
            tiered = [[1, price]]
        else:
            tiered = []

    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock
    qty = root.xpath('//input[@id="qty"]/@value')
    qty = util.intval(qty[0]) if qty else 1
    stock = root.xpath('//input[@id="custcol7"]/@value')
    stock = util.intval(stock[0]) if stock else 0
    item['stock'] = [stock, qty]
    # url
    item['url'] = resp.url
    # provider_name
    item['provider_name'] = 'LINEAR'
    item['provider_url'] = ''
    # doc catlog
    item['doc'] = ''
    item['catlog'] = ''
    # attr
    item['attr'] = []
    # rohs
    item['rohs'] = -1
    item['goods_other_name'] = ''
    # increment
    item['increment'] = 1
    # img
    item['goods_img'] = ''
    item['goods_thumb'] = ''
    # remaining fields are filled from linear.com.cn
    return handle_of_redirects(item)
Ejemplo n.º 8
0
    def parse_detail(self, resp):
        """解析系列型号数据"""
        item = GoodsItem()
        try:
            soup = BeautifulSoup(resp.text.encode('utf-8'), 'lxml')
        except Exception as e:
            logger.debug(u"初始化BS4对象失败,重试一次 URL:{url}".format(url=resp.url))
            # 重试一次
            return Request(url=resp.url,
                           headers=self.headers,
                           cookies=self.cookies)
        # goods_sn

        product_id = self.product_id_pattern_1.search(
            resp.url) or self.product_id_pattern_2.search(resp.url)
        goods_sn = product_id.group(1) if product_id else ''
        item['goods_sn'] = goods_sn
        if not goods_sn:
            logger.debug(u"获取goods_sn失败 URL:{url}".format(url=resp.url))
            return None
        try:
            # goods_name
            product_ref = soup.find('p', class_='ref')
            goods_name = ''
            if product_ref:
                goods_name_pattern = re.compile(
                    ur'<b>制造商零件编号:</b>\s*([^\"\'<>/]+)')
                product_ref_list = unicode(product_ref).split('<br/>')
                for x in product_ref_list:
                    match = goods_name_pattern.search(x)
                    if match:
                        goods_name = match.group(1)
                        break
            item['goods_name'] = goods_name
            # goods_other_name
            item['goods_other_name'] = ''
        except:
            logger.debug(u"获取goods_name失败 URL:{url}".format(url=resp.url))
            item['goods_name'] = ''
            item['goods_other_name'] = ''

        # goods_desc
        goods_desc = soup.find('p', class_='desc')
        if not goods_desc:
            logger.debug(u"获取goods_desc失败 URL:{url}".format(url=resp.url))
        item['goods_desc'] = goods_desc.get_text(
            strip=True) if goods_desc else ''

        # provider_name and provider_url
        provider_name = soup.find('img', id='ctl00_PlaceHolderMain_mfrLogo')
        item['provider_name'] = provider_name.get('title', '')
        # 如果在商标图片中无法获取 provider_name ,尝试从 product-desc 中获取
        if not provider_name:
            desc_div = soup.find('div', id='product-desc')
            provider_name = desc_div.find('h2')
            provider_name = provider_name.get_text(
                strip=True) if provider_name else ''
            item['provider_name'] = provider_name
        item['provider_url'] = ''
        # url
        item['url'] = resp.url
        # doc
        doc = soup.find(
            'a',
            id='ctl00_PlaceHolderMain_csDownloadCenter_linkDatasheetUrlJustText'
        )
        item['doc'] = doc.get('href', '')
        # goods_img and goods_thumb
        goods_img = soup.find('img', id='previewedMEDImage')
        item['goods_img'] = goods_img.get('src', '')
        goods_thumb = soup.find('img', id='thumbnail-1')
        item['goods_thumb'] = goods_thumb.get('src', '')
        # catlog
        item['catlog'] = []
        catlog = soup.find('ul', id='breadcrumb-navigation')
        catlog_list = catlog.find_all('a')
        for a in catlog_list:
            breadcrumb_name = a.get_text(strip=True)
            breadcrumb_url = util.urljoin(resp.url, a.get('href', ''))
            item['catlog'].append([breadcrumb_name, breadcrumb_url])
        # attr
        item['attr'] = []
        product_attr_div = soup.find('div',
                                     id='product-details-overview-highlights')
        product_attr_list = product_attr_div.find_all(
            'li') if product_attr_div else []
        for li in product_attr_list:
            attr_name, attr_value = li.get_text(strip=True).split(':')
            item['attr'].append([attr_name, attr_value])
        # tiered
        try:
            item['tiered'] = []
            price_table = soup.find('table', class_='product-prices')
            price_tr_list = price_table.find_all('tr', class_='price-break')
            for tr in price_tr_list:
                qty_th = tr.find('th')
                qty = qty_th.get_text(strip=True) if qty_th else 0
                qty = util.intval(qty)
                price_span = tr.find('span')
                price = price_span.get_text(strip=True) if price_span else 0.00
                price = util.floatval(price)
                # print qty, price
                if qty and price:
                    item['tiered'].append([qty, price])
                else:
                    item['tiered'] = [0, 0.00]
        except:
            logger.debug(u"获取tiered失败 URL:{url}".format(url=resp.url))
            item['tiered'] = [0, 0.00]
        # stock、increment、 min_qty
        try:
            stock_div = soup.find('div', id='product-qty-content')
            stock_tr = stock_div.find('tr', class_='qtyInStock')
            increment_tr = stock_div.find('tr', class_='multipleOf')
            min_qty_tr = stock_div.find('tr', class_='minOrderQty')
            stock = stock_tr.find('td', class_='qty').get_text(
                strip=True) if stock_tr else 0
            stock = util.intval(stock)
            increment = increment_tr.find('td', class_='qty').get_text(
                strip=True) if increment_tr else 1
            increment = util.intval(increment)
            min_qty = min_qty_tr.find('td', class_='qty').get_text(
                strip=True) if min_qty_tr else 1
            min_qty = util.intval(min_qty)
            item['stock'] = [stock, min_qty]
            item['increment'] = increment
        except:
            logger.debug(u"获取stock失败 URL:{url}".format(url=resp.url))
            item['stock'] = [0, 1]
            item['increment'] = 1
        # rohs
        rohs_div = soup.find('div', id='ctl00_PlaceHolderMain_imgRoHS')
        item['rohs'] = 1 if rohs_div else -1
        return item
Ejemplo n.º 9
0
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """获取搜索数据"""
    if keyword:
        if not kwargs.get('other_usage', False):
            print '正在获取 ti.com 中关键词:%s 的相关数据' % keyword
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&filter=p&sortBy=pstatus&searchTerm=%s' % keyword
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404

    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        proxies = None
        if proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {'http': 'http://' + proxy[1][i]}
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400

    if resp.status_code != 200:
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:无效产品; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:请求错误,网页响应码 %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    resp_json = {}
    try:
        resp_json = json.loads(resp.content)
        product = resp_json.get('response', {}).get('searchResults',
                                                    {}).get('PartNoArray', [])
        # print len(product)
    except:
        product = []
        logger.debug('STATUS:-404 ; INFO:数据异常 ; URL:%s' % url)
    if len(product) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    links = product
    for vo in links:
        pn = vo.get('PartNumber', '')
        tn = vo.get('PartType', '')
        if pn:
            link = 'http://www.ti.com/product/%s' % pn
            if 'tool' in tn:
                link = 'http://www.ti.com/tool/%s' % pn
            data_dict['url'].append({'id': id, 'url': link, 'goods_sn': pn})
    if 'startNum=' in resp.url:
        return 200
    page_num = 0
    count = 0
    try:
        count = resp_json.get('response',
                              {}).get('searchResults',
                                      {}).get('filter',
                                              {}).get('MaxRecordCount', '')
        count = util.intval(count)
    except:
        count = 0
    page_num = int(math.ceil(count / 25.0))
    if page_num <= 1:
        return 200
    # 翻页的form_data
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(1, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        url = 'http://www.ti.com/sitesearch/docs/partnumsearch.tsp?sort=asc&linkId=2&startNum=%d&filter=p&sortBy=pstatus&searchTerm=%s' % (
            25 * x, keyword)
        page_url = url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
Ejemplo n.º 10
0
def get_detail(gpn=None, **kwargs):
    """Yield one detail dict per part listed on a TI sample/buy page.

    ``gpn`` is the generic part number; for each row of the buy table a
    dict with goods_sn, tiered prices, stock, url and provider_name is
    yielded. With no gpn, a single empty dict is yielded.
    """
    data = dict()
    if not gpn:
        yield data
        # BUG FIX: previously fell through and issued a request to
        # .../product/None/samplebuy; stop here instead.
        return
    url = "http://www.ti.com/product/%s/samplebuy" % gpn
    # NOTE: plain ``return`` ends a generator; ``raise StopIteration``
    # is an error inside generators on Python 3.7+ (PEP 479).
    try:
        proxies = kwargs.get('proxies')
        html = requests.get(url=url,
                            headers=default_headers,
                            timeout=30,
                            proxies=proxies)
    except Exception:
        return
    if html.status_code != 200 or 'Page not found' in html.content:
        return
    soup = BeautifulSoup(html.content, "lxml")
    # category breadcrumb (skip the "TI Home" crumb)
    breadcrumb_div = soup.find('div', class_='breadcrumb')
    breadcrumb_div = breadcrumb_div.find_all('a') if breadcrumb_div else []
    cat_log = []
    for a in breadcrumb_div:
        if 'TI Home' in a.get_text(strip=True):
            continue
        cat_log.append([a.get_text(strip=True), a['href']])
    data['catlog'] = cat_log if cat_log else []
    # goods_img, goods_thumb
    img_div = soup.find('div', class_='image')
    img = img_div.img['src'] if img_div else ''
    data['goods_img'] = img
    data['goods_thumb'] = img
    # buy table; some GPN groups have no part list -- yield defaults
    table = soup.find('table', id='tblBuy')
    if not table:
        data['goods_sn'] = gpn
        data['tiered'] = [[0, 0.00]]
        data['stock'] = [0, 1]
        yield data
        return
    body_div = table.tbody
    if not body_div:
        return
    # map header labels to their column indexes
    ths = table.find_all('th')
    th_td = dict()
    for idx, th in enumerate(ths):
        label = th.get_text(strip=True)
        if 'Part' in label:
            th_td['PartNum'] = idx
        if 'Price' in label:
            th_td['Price'] = idx
        if 'Inventory' in label:
            th_td['Inventory'] = idx
    tds = body_div.find_all('td')
    step = len(ths)
    # regroup the flat td list into rows of ``step`` cells
    rows = [tds[x:x + step] for x in range(0, len(tds), step)]
    total_parts = len(rows)
    for row_no, td in enumerate(rows, 1):
        logger.info("GPN:%s 共有%d个商品需要抓取,正在抓取第%d个。" %
                    (gpn.encode('utf-8'), total_parts, row_no))
        # tiered: e.g. "1.23 | 1ku" => price 1.23 at qty 1000
        price = th_td.get('Price')
        pattern_price = re.compile(r'\s*(\d+.\d+)\s*\|\s*(\d+)ku\s*')
        if td[price].script:
            td[price].script.extract()
        tiered = pattern_price.search(td[price].get_text())
        if tiered:
            price = tiered.group(1)
            qty = int(tiered.group(2)) * 1000
            data['tiered'] = [[util.intval(qty), util.floatval(price)]]
        else:
            data['tiered'] = [[0, 0.00]]
        # goods_sn from the part-number cell
        part_num = th_td.get('PartNum')
        data['goods_sn'] = ''
        for x in td[part_num].contents:
            if x.name == 'script':
                continue
            elif x.name == 'a':
                # orderable part: fetch live stock and prices
                data['goods_sn'] = str(x.string).strip()
                stock, tiered = get_stock(data['goods_sn'], x['href'],
                                          **kwargs)
                data['tiered'] = tiered
                data['url'] = x['href']
                data['provider_name'] = 'TI'
                data['stock'] = [util.intval(stock), 1]
            elif x.string and str(x.string).strip():
                # plain-text part: no store link, default stock
                data['goods_sn'] = str(x.string).strip()
                data['stock'] = [0, 1]
                data['provider_name'] = ''
                data['url'] = "http://www.ti.com/product/%s" % gpn
        yield data
Ejemplo n.º 11
0
def add_to_cart(url, only_session, **kwargs):
    """Simulate the ASP.NET "Buy" postback and return the tiered prices.

    Loads the product page to harvest __VIEWSTATE et al., reads the
    pricing-tier table, then posts the buy form with the same session.
    Returns the tiered list on success, 0 on any request failure.
    """
    form_data = {
        "ctl00$ctl00$ScriptManager1":
        "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$BuyProductPanel|ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid",
        "__EVENTTARGET":
        "",
        "__EVENTARGUMENT":
        "",
        "__VIEWSTATE":
        "",
        "__VIEWSTATEGENERATOR":
        "",
        "__VIEWSTATEENCRYPTED":
        "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$SearchPhrase":
        "",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiLastHeaderAction":
        "none",
        "ctl00$ctl00$NestedMaster$PageHeader$StoreHeader_H$hiSearchFilterValue":
        "none",
        "__ASYNCPOST":
        "true",
        "ctl00$ctl00$NestedMaster$PageContent$ctl00$BuyProductDialog1$btnBuyPaid":
        "Buy",
    }
    proxies = kwargs.get('proxies')
    try:
        stock_page = only_session.get(url=url, proxies=proxies)
    except Exception:
        return 0
    if stock_page.status_code != 200:
        return 0
    soup = BeautifulSoup(stock_page.content, 'lxml')
    # BUG FIX: ``tag.value`` on a BS4 Tag looks for a child <value>
    # element (always None for <input>); the hidden field's value is an
    # HTML *attribute*, read with tag.get('value').
    view_state = soup.find('input', id="__VIEWSTATE")
    form_data['__VIEWSTATE'] = view_state.get('value', '') if view_state else ''
    view_state_generator = soup.find('input', id="__VIEWSTATEGENERATOR")
    form_data['__VIEWSTATEGENERATOR'] = view_state_generator.get(
        'value', '') if view_state_generator else ''
    # tiered prices from the pricing-tier table (skip the header row)
    tiered = []
    table = soup.find(
        'table',
        id=
        'ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_PricingTierList'
    )
    if table:
        for tr in table.find_all('tr')[1:]:
            tds = tr.find_all('td')
            qty = tds[0].get_text(strip=True)
            price = tds[1].get_text(strip=True)
            tiered.append([util.intval(qty), util.floatval(price)])
    else:
        tiered = [[0, 0.00]]
    # post the buy form (side effect: adds the item to the cart)
    try:
        only_session.post(url=url, data=form_data, proxies=proxies)
    except Exception:
        return 0
    return tiered
Ejemplo n.º 12
0
        if html.status_code == 200:
            item = {}
            _desc = soup.find(
                'tr',
                id=
                'ctl00_ctl00_NestedMaster_PageContent_ctl00_BuyProductDialog1_trSku'
            )
            item['goods_name'] = _desc.find('h1').get_text(
                strip=True) if _desc else ''
            item['goods_sn'] = item['goods_name']
            item['desc'] = _desc.get_text(strip=True) if _desc else ''
            _img = soup.find('img', id='ProductImage')
            item['goods_img'] = util.urljoin(url, _img.get('src'))
            stock_info = get_stock(goods_sn=goods_sn, url=url)
            if stock_info:
                item['stock'] = [util.intval(stock_info[0]), 1]
                item['tiered'] = stock_info[1]
            else:
                item['stock'] = [0, 1]
                item['tiered'] = [[0, 0.00]]
            item['provider_name'] = 'TI'
            item["increment"] = 1
            item['url'] = url
            return item


# def _parse_store_ti_com(url, **kwargs):
#     """
#     更新数据仅需要
#         id          此处为GoodsId
#         tiered      价格阶梯
Ejemplo n.º 13
0
def fetch_search_data(keyword=None,
                      id=None,
                      data_dict=None,
                      headers=None,
                      proxy=None,
                      **kwargs):
    """Fetch one page of Avnet keyword-search results (WCS JSON API).

    Builds the search URL from ``keyword`` (or takes ``kwargs['url']``
    verbatim), performs a GET through an optional random proxy, then on
    success queues one entry per product into ``data_dict['url']`` and
    one entry per remaining result page into ``data_dict['list']``.

    @param keyword    search term; when falsy, ``kwargs['url']`` is required
    @param id         caller task id, echoed into every queued entry
    @param data_dict  shared dict with 'list'/'url' queues; mutated in place
    @param headers    extra HTTP headers merged over the module defaults
    @param proxy      (count, [host:port, ...]) pool, used only when kwargs
                      carries no explicit 'proxies' mapping
    @param kwargs     may contain 'url', 'proxies', 'count', 'max_list_num'
    @return 200 ok; 404 missing url / invalid product / no results;
            -400 request exception; -405 unexpected HTTP status
    """
    if keyword:
        print '正在获取 avnet 中关键词:%s 的相关数据' % keyword
        url = "https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber=1&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag%3ANPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&wt=json".format(
            keyword=keyword)
    elif 'url' in kwargs:
        url = kwargs['url']
    else:
        return 404
    _headers = copy.copy(default_headers)
    if isinstance(headers, dict):
        _headers.update(util.rfc_headers(headers))
    try:
        # Prefer an explicit proxies mapping from kwargs; otherwise pick
        # one proxy at random from the (count, list) pool.
        proxies = kwargs.get('proxies')
        if proxies is None and proxy:
            i = random.randint(0, proxy[0] - 1)
            proxies = {
                'http': 'http://' + proxy[1][i],
                'https': 'https://' + proxy[1][i]
            }
        resp = requests.get(url, headers=_headers, timeout=30, proxies=proxies)
    except Exception as e:
        logger.debug('STATUS:-400 ; INFO:数据请求异常, %s ; URL:%s' %
                     (util.traceback_info(e), url))
        # Re-queue for retry unless the URL itself is malformed.
        if 'Invalid URL' not in str(e):
            data_dict['list'].append({
                'status': -400,
                'url': url,
                'id': id,
                'count': kwargs.get('count', 1)
            })
        return -400
    if resp.status_code != 200:
        # A 404 that redirected to the site's 404 page is a dead product,
        # not a transient failure -- do not re-queue it.
        if resp.status_code == 404 and '404.html' in resp.url:
            logger.info('STATUS:404; INFO:无效产品; URL: %s' % url)
            return 404
        logger.debug('STATUS:-405 ; INFO:请求错误,网页响应码 %s ; PROXY:%s ; URL:%s' %
                     (resp.status_code, proxies['http'] if proxy else '', url))
        data_dict['list'].append({
            'status': -405,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return -405
    resp.encoding = 'utf-8'
    # Parse the JSON response and extract the matched-product list.
    search_dict = {}
    try:
        search_dict = json.loads(resp.text.encode('utf-8'))
        product_list = search_dict.get('catalogEntryView', [])
    except:
        product_list = []
        logger.debug('STATUS:-404 ; INFO:数据异常 ; URL:%s' % url)
    if len(product_list) <= 0:
        data_dict['list'].append({
            'status': 404,
            'url': url,
            'id': id,
            'count': kwargs.get('count', 1)
        })
        return 404
    # sn = product.xpath('.//td[@class="partColHeader"]//span[@class="defaultSearchText"]')
    for product in product_list:
        # Queue each product's detail-page URL for the detail crawler.
        goods_sn = product.get('seo_token_ntk', '')
        base_url = 'https://www.avnet.com/shop/apac/'
        product_url = product.get('avn_pdp_seo_path', '')
        data_dict['url'].append({
            'id': id,
            'url': util.urljoin(base_url, product_url),
            'goods_sn': goods_sn
        })
    # "showMore" requests are follow-up pages themselves; don't paginate again.
    if 'showMore=true' in url:
        return 200
    count = search_dict.get('recordSetTotal', 0)
    page_num = int(math.ceil(count / 20.0))
    if page_num <= 1:
        return 200
    # Queue the remaining result pages, capped by max_list_num (default 5).
    max_list_num = util.intval(kwargs.get('max_list_num', 5))
    for x in xrange(2, page_num + 1):
        if max_list_num and x > max_list_num:
            break
        page_url = 'https://www.avnet.com/search/resources/store/715839038/productview/bySearchTerm/select?searchType=102&profileName=Avn_findProductsBySearchTermCatNav_More_Ajax&searchSource=Q&landingPage=true&storeId=715839038&catalogId=10001&langId=-1&currency=USD&orgEntityId=-2000&responseFormat=json&pageSize=20&pageNumber={next_page}&_wcf.search.internal.boostquery=price_USD:{{0.00001+TO+*}}^499999.0+inStock:%22true%22^9000.0+topSellerFlag:%22Yes%22^0.085+newProductFlag:%22Yes%22^0.080+packageTypeCode:%22BKN%22^0.075&_wcf.search.internal.filterquery=-newProductFlag:NPI&q={keyword}&intentSearchTerm={keyword}&searchTerm={keyword}&showMore=true&wt=json'.format(
            next_page=x, keyword=keyword)
        # print page_url
        data_dict['list'].append({
            'id': id,
            'url': page_url,
        })
    return 200
Ejemplo n.º 14
0
def _parse_detail_data(resp, headers=None, **kwargs):
    """Parse an Avnet product detail page into an item dict.

    @param  resp     response object for the detail page (.text / .url used)
    @param  headers  unused here; kept for interface compatibility
    @param  kwargs   extension parameters (unused)
    @return item dict on success,
            -404 when the page cannot be parsed,
            -400 when goods_sn cannot be derived from the URL
    """
    item = {}
    try:
        soup = BeautifulSoup(resp.text, 'lxml')
        if soup is None:
            logger.debug('初始化商品详情页面失败 URL: %s', resp.url)
            return -404
    except Exception as e:
        # BUGFIX: pass the format args separately -- the original passed a
        # single tuple for two %s placeholders, raising a logging format
        # error instead of producing the message.
        logger.debug('初始化商品详情页面失败 URL: %s ERROR: %s',
                     resp.url, util.traceback_info(e))
        return -404
    # goods_sn: last URL path segment shaped like "<name>-<19 digits>"
    goods_sn_pattern = re.compile(r'.*-\d{19}')
    for path in resp.url.split('/')[::-1]:
        if goods_sn_pattern.findall(path):
            item['goods_sn'] = path
            break
    if not item.get('goods_sn', False):
        logger.debug("无法从链接中解析goods_sn URL: {url} ".format(url=resp.url))
        return -400
    # goods_name (falls back to the SKU when the header div is absent)
    goods_info_div = soup.find('div', class_='section-left')
    item['goods_name'] = goods_info_div.find('h1').get_text(
        strip=True) if goods_info_div else item['goods_sn']
    # url
    item['url'] = resp.url
    # goods_img / goods_thumb
    img_div = soup.find('div', id="outer-div1")
    img = img_div.find('img') if img_div else None
    item['goods_img'] = util.urljoin(resp.url, img.get('src')) if img else ''
    item['goods_thumb'] = item['goods_img']
    # desc
    desc_p = soup.find('p', class_='RB-pdp_short_Desc')
    item['desc'] = desc_p.get_text(strip=True) if desc_p else ''
    # provider
    item['provider_name'] = "AVNET"
    item['provider_url'] = ''
    # attr: [[name, value], ...]
    # BUGFIX: guard the techAttr container -- the original dereferenced it
    # unconditionally (AttributeError when the div is missing) and then
    # tested find_all() against None, which find_all() never returns.
    attr = []
    attr_body = soup.find('div', id="techAttr")
    attr_divs = attr_body.find_all(
        'div', class_='pdpDescriptionsBodyContent') if attr_body else []
    for content in attr_divs:
        att_name = content.find('div', class_='pdpDescriptionColumn')
        attr_value = content.find('div', class_='pdpValueColumn')
        if att_name and attr_value:
            attr.append([
                att_name.get_text(strip=True),
                attr_value.get_text(strip=True)
            ])
    item['attr'] = attr
    # tiered: [[qty, price], ...]; one malformed row voids the whole ladder
    tiered = []
    for span in soup.find_all('span', class_='usdpart1'):
        qty_span = span.find('span', class_='pdpTierMinQty')
        qty = qty_span.get_text(strip=True) if qty_span else 0
        price_p = span.find('p')
        price = price_p.get_text(strip=True) if price_p else 0.00
        if qty and price:
            tiered.append([util.intval(qty), util.floatval(price)])
        else:
            tiered = [[0, 0.00]]
            break
    item['tiered'] = tiered if tiered else [[0, 0.00]]
    # stock: [stock, min_qty]
    stock_input = soup.find('input', id='inStock')
    stock = util.intval(stock_input.get('value') if stock_input else 0)
    min_qty_input = soup.find('input', attrs={'name': 'min'})
    min_qty = util.intval(min_qty_input.get('value') if min_qty_input else 1)
    # BUGFIX: numeric fallback -- the original fell back to the string pair
    # ['0', '1'], inconsistent with the int pair [0, 1] used elsewhere.
    item['stock'] = [stock, min_qty] if stock else [0, 1]
    # increment
    multi_input = soup.find('input', attrs={'name': 'mult'})
    item['increment'] = util.intval(
        multi_input.get('value')) if multi_input else 1
    # doc (datasheet link, when present)
    doc_div = soup.find('div', class_='pdfcontent')
    doc_url = doc_div.find('a', class_='datasheet_align') if doc_div else None
    item['doc'] = doc_url.get('href') if doc_url else ''
    # rohs: 1 when the RoHS leaf marker is present, else -1
    item['rohs'] = 1 if soup.find('div', class_='leafcontent') else -1
    # catlog: [[name, url], ...]
    # BUGFIX: the original dereferenced the breadcrumb <nav> (building an
    # unused nav_ul) *before* its None-check, crashing when it was absent.
    catlog = []
    nav = soup.find('nav', class_='breadcrumb')
    if nav is not None:
        for a in nav.find_all('a'):
            cat_name = a.get_text(strip=True)
            cat_url = util.urljoin(resp.url, a.get('href'))
            if cat_name and cat_url:
                catlog.append([cat_name, cat_url])
    item['catlog'] = catlog
    # goods_other_name
    item['goods_other_name'] = ''
    # product_id
    # family_sn
    return item
Ejemplo n.º 15
0
    def import_goods(self, data, put_xs_list=None):
        """Import one crawled product into MySQL and MongoDB.

        Upserts the goods row (keyed by goods_sn + PN2), rewrites its
        sharded price-tier row, and mirrors the record into the
        'supplier' Mongo collection.

        @param data         crawled item dict; NOTE: mutated in place
                            ('category' re-encoded, 'mpq' forced to 1)
        @param put_xs_list  optional list collecting newly inserted goods
                            for downstream indexing; appended to on insert
        @return 1 on success, 0 when data is empty or no goods_id obtained
        """
        put_xs_list = put_xs_list if put_xs_list else []
        if not data:
            return 0
        # Resolve the crawled category path to internal ids (top 2 levels).
        data['category'] = [x.encode('utf-8') for x in data['category']]
        cids = self.get_ic_category(data['category'])
        try:
            cat_id1 = cids[0]
        except IndexError:
            cat_id1 = 0
        try:
            cat_id2 = cids[1]
        except IndexError:
            cat_id2 = 0
        goods_sn = data['goods_sn']
        min_buynum = data['min_buynum']
        data['mpq'] = 1
        increment = data['increment']
        url = data['url']
        goods_desc = util.binary_type(data['goods_desc']) if data['goods_desc'] else ''
        goods_img = data['goods_img'] if 'goods_img' in data else ''
        _unix_time = int(time.time())
        # NOTE(review): timestamps below subtract 8 hours, presumably a
        # UTC+8 adjustment -- confirm the intended timezone handling.
        goods_data = {
            'cat_id1': cat_id1,
            'cat_id2': cat_id2,
            'cat_id3': 0,
            'PN2': PN2,
            'goods_name': util.binary_type(data['goods_name']),
            'goods_other_name': util.binary_type(data['goods_other_name']),
            'provider_name': util.binary_type(data['brand']),
            'batch_number': '',
            'encap': '',
            'goods_desc': goods_desc,
            'SPQ': data['mpq'],
            'goods_number_hk': data['hk_stock'] if 'hk_stock' in data else 0,
            'goods_number': data['cn_stock'] if 'cn_stock' in data else 0,
            'DT_HK': HDT,
            'DT': CDT,
            'CDT': CDT,
            'HDT': HDT,
            'increment': increment,
            'min_buynum': min_buynum,
            'goods_sn': goods_sn,
            'brand_goods_id': 0,
            'doc_url': '',
            'digikey_url': url,
            'series': '',
            'source_type': 0,
            'user_id': 0,
            'log_id': 0,
            'to_china': 1,
            'to_hongkong': 0,
            'goods_weight': 0.0,
            'goods_img': goods_img,
            'goods_thumb': goods_img,
            'last_update': _unix_time - 8 * 3600,
        }
        if goods_data['provider_name']:
            brand_id = self.get_ic_brand(goods_data['provider_name'])
            goods_data['brand_id'] = brand_id

        # Upsert: update the existing (goods_sn, PN2) row, else insert anew.
        info = self.supplier.select('goods', condition={'goods_sn': goods_sn, 'PN2': PN2},
                                    fields=('goods_id',), limit=1)
        if info:
            self.supplier.update('goods', condition={'goods_id': info['goods_id']}, data=goods_data)
            goods_id = info['goods_id']
            print('更新mysql成功,GoodsId:%s' % (goods_id,))
        else:
            goods_data['add_time'] = _unix_time - 8 * 3600
            goods_id = self.supplier.insert('goods', data=goods_data, return_insert_id=1)
            # Newly inserted goods are reported back for search indexing.
            put_xs_list.append({
                'goods_id': goods_id,
                'goods_name': util.binary_type(goods_data['goods_name']),
                'goods_other_name': util.binary_type(goods_data['goods_other_name'])
            })
            print('保存mysql成功,GoodsId:%s' % (goods_id,))

        if not goods_id:
            return 0
        # Price rows are sharded into goods_price_<last digit of goods_id>;
        # clear the old row before re-inserting on update.
        table_id = str(goods_id)[-1]
        if info:
            self.supplier.delete('goods_price_%s' % (table_id,), condition={'goods_id': goods_id})

        # Price tiers; fall back to one zero-priced tier at min_buynum.
        price_tiered = data['tiered']
        if not price_tiered:
            price_tiered.append((goods_data['min_buynum'], 0.0, 0.0))

        goods_price = []
        for p in price_tiered:
            qty = util.intval(p[0])
            if qty <= 0:
                continue
            # NOTE(review): 'purchases' stores the raw p[0] rather than the
            # normalized qty above, and p[2] assumes a (qty, price,
            # price_cn) row shape -- confirm the upstream tier format.
            goods_price.append({
                "purchases": p[0],
                "price": 0,
                "price_cn": p[2] * PRICE_PROPORTION,
            })

        self.supplier.insert('goods_price_%s' % (table_id,), data={
            'goods_id': goods_id,
            'price': json.dumps(goods_price),
        })

        # Rebuild the tier list in the [purchases, price, price_cn] shape
        # expected by the Mongo mirror below.
        tiered = []
        for p in goods_price:
            tiered.append([
                p["purchases"],
                p['price'],
                p['price_cn'],
            ])

        mongo_data = {
            'ModelName': goods_data['goods_name'],
            'OtherModelName': goods_data['goods_other_name'],
            'BrandName': goods_data['provider_name'],
            'DT': (goods_data['HDT'], goods_data['CDT']),
            'Desc': goods_data['goods_desc'],
            'GoodsId': goods_id,
            'GoodsSn': goods_data['goods_sn'],
            'Stock': (goods_data['goods_number'], goods_data['min_buynum'], 0),
            'Tiered': tiered,
            'error': 0,
            'time': int(time.time()),
            'url': goods_data['digikey_url'],
            'DocUrl': '',
            'increment': goods_data['increment']
        }

        # Mirror into MongoDB (update when the GoodsId already exists).
        collect = getattr(self.mongo, 'supplier')
        info = collect.find_one({'GoodsId': goods_id})
        if info:
            collect.update({'GoodsId': goods_id}, {"$set": mongo_data})
            print('更新mongodb成功,GoodsId:%s' % (goods_id,))
        else:
            collect.insert(mongo_data)
            print('保存mongodb成功,GoodsId:%s' % (goods_id,))
        # print('成功导入立创商城产品 %s 数据' % (data[4].encode('utf-8'),))
        return 1