Example #1
    def parse_item(self, response):
        hxs = Selector(response)
        # Product titles from the search-result grid.
        item_titles = extract(hxs, "//div[@class='gl-i-wrap j-sku-item']//a/em/text()")
        # Breadcrumb bar: the top-level category link plus the selected filter labels.
        top_id = extract_one(hxs, '//*[@id="J_crumbsBar"]/div/div/div/div[1]/a/text()')
        trigger_texts = extract(hxs, '//*[@id="J_crumbsBar"]//div[@class="trigger"]/span/text()')
        type_id1 = trigger_texts[0]
        type_id2 = trigger_texts[-1]

        if type_id1 != type_id2:
            # Only the first 20 grid items count towards the ranking.
            for i, t in enumerate(item_titles[:20]):
                good = {
                    'mall': '2',
                    'rank': str(i + 1),
                    'title': t,
                    'price': '0',
                    'turnover_index': '0',
                    'top_id': top_id,
                    'type_id1': type_id1,
                    'type_id2': type_id2,
                    'url': response.url
                }

                yield Good(good)

        # Follow ordinary pagination/category links through Splash so the
        # JavaScript-rendered content is loaded before parsing.
        for link in self.normal_url_extractor.extract_links(response):
            yield SplashRequest(link.url, callback=self.parse_url, args={'wait': 0.5, 'html': 1})

        for link in self.needed_url_extractor.extract_links(response):
            if 'ev' not in link.url:
                # Normalise the query string so every listing is fetched with the
                # same paging and filter settings; [^&]* keeps each substitution
                # from greedily eating the parameters that follow.
                url = re.sub(r'page=[^&]*&', 'page=1&', link.url)
                url = re.sub(r'stock=[^&]*&', 'stock=0&', url)
                url = re.sub(r'delivery_daofu=[^&]*&', 'delivery_daofu=0&', url)
                url = re.sub(r'delivery=[^&]*&', 'delivery=0&', url)
                yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1})
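
All of the examples on this page lean on two small project helpers, extract and extract_one, whose source is not shown here. Below is a minimal sketch of what they would need to do for the calls above to work; the bodies are inferred from usage, not taken from the project. (Selector comes from scrapy.selector, SplashRequest from the scrapy_splash package, and Good is the project's item class, sketched after the last example.)

    def extract(hxs, xpath):
        # All matching text nodes, as a list of strings (possibly empty).
        return hxs.xpath(xpath).extract()

    def extract_one(hxs, xpath):
        # First match as a string, or None when the XPath matches nothing.
        results = hxs.xpath(xpath).extract()
        return results[0] if results else None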
Example #2
    def parse_item(self, response):
        hxs = Selector(response)
        # The top-level category id is carried in the listing URL itself.
        top_id = re.findall(r'.*&topId=(\S+_\S+)&type.*', response.url)[0]
        #        type_id=re.findall(r'.*leafId=(\d+)&rank=.*',response.url)[0]
        type_id1 = extract_one(
            hxs,
            "//div[@class='block-body ']/div[@class='params-cont']/a[@class='param-item icon-tag param-item-selected']/text()"
        )
        # The rank number carries one of three CSS variants, hence the XPath union.
        ranks_tuple = extract(
            hxs,
            '//*[@class="rank-num rank-focus"]/text()|//*[@class="rank-num rank-important"]/text()|//*[@class="rank-num rank-"]/text()'
        )
        ranks = [r for r in ranks_tuple if r.strip()]

        titles = extract(hxs, '//*[@class="title"]/a/text()')
        prices = extract(hxs, '//*[@class="col3 col"]/text()')[1:]
        turnover_indexs = extract(hxs, '//*[@class="focus-bar"]/span/text()')

        # zip() pairs the four column lists row by row, stopping at the shortest.
        for r, t, p, i in zip(ranks, titles, prices, turnover_indexs):
            good = {
                'mall': '0',
                'rank': r.strip(),
                'title': t.strip(),
                'price': p.split('¥')[-1].strip(),
                'turnover_index': i.strip(),
                'top_id': top_id.strip(),
                'type_id1': type_id1.strip(),
                'type_id2': '',
                'url': response.url
            }
            yield Good(good)
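
The price cell comes back with surrounding whitespace and a currency sign, which the split('¥')[-1].strip() chain removes. A standalone check with a made-up sample value:

    sample_cell = '\n  ¥ 129.00\n'  # hypothetical raw text() value
    assert sample_cell.split('¥')[-1].strip() == '129.00'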
Example #3
    def parse_item(self, response):
        hxs = Selector(response)
        # Current search keyword, product titles and the breadcrumb categories.
        search_condition = extract_one(hxs, '//*[@id="J_CrumbSearchInuput"]/@value')
        item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
        top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
        type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
        if type_id1 is not None and search_condition is not None:
            # Keep only the first line of the breadcrumb label.
            type_id1 = type_id1.split('\n')[0]
            # A title may be split across several text nodes; a node ending in a
            # newline closes the current title.
            titles = []
            title = ''
            for t in item_titles:
                title += t.strip()
                if t.endswith('\n'):
                    if len(title) > 5:
                        titles.append(title.strip())
                    title = ''

            # Only emit when at least 20 titles were parsed and the search box
            # does not simply repeat the category name.
            if len(titles) > 19 and search_condition != type_id1:
                for i, t in enumerate(titles[:20]):
                    good = {
                        'mall': '1',
                        'rank': str(i + 1),
                        'title': t.strip(),
                        'price': '0',
                        'turnover_index': '0',
                        'top_id': top_id.strip(),
                        'type_id1': type_id1.strip(),
                        'type_id2': search_condition.strip(),
                        'url': response.url
                    }

                    yield Good(good)

        for link in self.needed_url_extractor.extract_links(response):
            # Follow only category listings: both 'industryCatId' and 'cat' must
            # appear in the URL, and neither 'post_fee' nor 'brand' may.
            if ('industryCatId' in link.url and 'cat' in link.url
                    and 'post_fee' not in link.url and 'brand' not in link.url):
                url = re.sub(r'sort=[^&]*&', 'sort=d&', link.url)
                url = re.sub(r'search_condition=[^&]*&', 'search_condition=7&', url)
                url = re.sub(r'miaosha=[^&]*&', 'miaosha=0&', url)
                url = re.sub(r'wwonline=[^&]*&', 'wwonline=0&', url)
                yield SplashRequest(url, callback=self.parse_item, args={'wait': 0.5, 'html': 1})
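
Chained re.sub calls only behave while every parameter they touch is followed by '&'. The query string can be rewritten more robustly with the standard library; the helper below is an illustrative sketch, not part of the project:

    from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit

    def set_query_params(url, **overrides):
        # Rewrite selected query parameters, leaving the rest of the URL intact.
        parts = urlsplit(url)
        params = dict(parse_qsl(parts.query, keep_blank_values=True))
        params.update({k: str(v) for k, v in overrides.items()})
        return urlunsplit(parts._replace(query=urlencode(params)))

    # e.g. set_query_params(link.url, sort='d', search_condition=7, miaosha=0, wwonline=0)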
Example #4
File: tm.py Project: lzj3278/spider
    def parse_item(self, response):
        hxs = Selector(response)
        item_titles = extract(hxs, "//div[@id='J_ItemList']//p[@class='productTitle']/a/text()")
        top_id = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]/li[2]/a/text()')
        type_id1 = extract_one(hxs, '//*[@id="J_CrumbSlideCon"]//div[@class="crumbDrop j_CrumbDrop"]/a/text()')
        if type_id1 is not None:
            # The breadcrumb label may hold two category levels separated by a newline.
            parts = type_id1.split('\n')
            type_id2 = parts[-1] if len(parts) > 1 else ''
            type_id1 = parts[0]
            # Reassemble titles that are split across several text nodes; a node
            # ending in a newline closes the current title.
            titles = []
            title = ''
            for t in item_titles:
                title += t.strip()
                if t.endswith('\n'):
                    if len(title) > 5:
                        titles.append(title.strip())
                    title = ''

            # Only emit when at least 20 titles were parsed.
            if len(titles) > 19:
                for i, t in enumerate(titles[:20]):
                    good = {
                        'mall': '1',
                        'rank': str(i + 1),
                        'title': t.strip(),
                        'price': '0',
                        'turnover_index': '0',
                        'top_id': top_id.strip(),
                        'type_id1': type_id1.strip(),
                        'type_id2': type_id2.strip(),
                        'url': response.url
                    }

                    yield Good(good)

        for link in self.normal_url_extractor.extract_links(response):
            yield SplashRequest(link.url, callback=self.parse, args={'wait': 0.5, 'html': 1})
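
Every example yields Good(good), so Good must be a scrapy.Item declaring the nine keys the dicts populate. The class body below is inferred from those keys, not copied from the project:

    import scrapy

    class Good(scrapy.Item):
        # One Field per key populated by the parse_item methods above.
        mall = scrapy.Field()
        rank = scrapy.Field()
        title = scrapy.Field()
        price = scrapy.Field()
        turnover_index = scrapy.Field()
        top_id = scrapy.Field()
        type_id1 = scrapy.Field()
        type_id2 = scrapy.Field()
        url = scrapy.Field()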