Example 1
 def parse_view(self, response):
     sel = Selector(response)
     place = Place()
     # match the row by its id, then take the text of the value cell
     place['area'] = sel.xpath(
         "//*[@id='places_area__row']//*[@class='w2p_fw']/text()").extract_first()
     place['country'] = sel.xpath(
         "//*[@id='places_iso__row']//*[@class='w2p_fw']/text()").extract_first()
     yield place
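`Place` is not defined in the snippet; presumably it is a Scrapy `Item`. A minimal sketch of what its definition might look like, assuming only the two fields the snippet populates:

    import scrapy

    # Hypothetical items.py entry for the example above; the field names
    # come from the snippet, everything else is assumed.
    class Place(scrapy.Item):
        area = scrapy.Field()
        country = scrapy.Field()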
Example 3
 def parse_items(self, response):
     item = TwitterVizItem()
     self.browser.get(response.url)
     # give the JavaScript time to load
     time.sleep(3)
     # scrape the dynamically generated HTML rendered by the browser
     hxs = Selector(text=self.browser.page_source)
     item['Follower'] = hxs.xpath(
         '(//span[@class="u-linkComplex-target"])[position()>2]'
     ).extract()
     item['Follows'] = hxs.xpath(
         '(//span[@class="u-linkComplex-target"])[position()=1]'
     ).extract()
     print(item)
     return item
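The snippet assumes a `self.browser` Selenium WebDriver that it never creates. A common pattern, sketched here as an assumption (the original only shows `parse_items`), is to start the driver in the spider's constructor and quit it when the spider closes:

    import scrapy
    from selenium import webdriver

    class TwitterVizSpider(scrapy.Spider):
        name = 'twitterviz'  # hypothetical name

        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)
            # one browser per spider; parse_items drives it via self.browser
            self.browser = webdriver.Chrome()

        def closed(self, reason):
            # Scrapy calls this hook when the spider finishes
            self.browser.quit()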
    def parse(self, response):

        # Open the file in append mode so we keep adding rows to it
        csv_file = open('amazon_search.csv', 'a')

        # Extract every result container div
        hxs = Selector(response)

        divs = hxs.xpath('//div[@class="sg-row"]')

        for div in divs:
            name = div.xpath(
                './/span[contains(@class,"a-text-normal")]/text()').get()
            price = div.xpath(
                './/span[contains(@class, "a-price")]/span[@class="a-offscreen"]/text()'
            ).get()
            stars = div.xpath(
                './/i[contains(@class, "a-icon-star-small")]/span/text()').get()

            csv_file.write('%s;%s;%s\n' % (name, price, stars))

        csv_file.close()

        # Extract the URL of the next page
        next_url = response.xpath('//li[@class="a-last"]/a/@href').get()

        if next_url is not None:
            yield response.follow(next_url, callback=self.parse)
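Writing rows with `'%s;%s;%s\n'` breaks as soon as a product name contains a semicolon. A safer variant of the same row writing, sketched with Python's csv module (same file name and field names as in the loop above):

    import csv

    with open('amazon_search.csv', 'a', newline='') as f:
        writer = csv.writer(f, delimiter=';')
        # quoting is handled automatically if a field contains ';'
        writer.writerow([name, price, stars])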
    def parse(self, response):
        # Open the file in write mode to create it (or overwrite an old run)
        csv_file = open('amazon_games.csv', 'w')

        # Extract every result container div
        hxs = Selector(response)

        divs = hxs.xpath('//div[@class="s-item-container"]')

        for div in divs:
            name = div.xpath('.//h2/text()').get()
            price = div.xpath(
                './/span[contains(@class, "s-price")]/text()').get()
            stars = div.xpath(
                './/i[contains(@class, "a-icon-star")]/span/text()').get()

            csv_file.write('%s;%s;%s\n' % (name, price, stars))

        csv_file.close()

        # Extract the next-page URL and follow it with a new request
        next_url = response.xpath('//a[@id="pagnNextLink"]/@href').get()

        if next_url is not None:
            yield response.follow(next_url, callback=self.parseNextPage)
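`parseNextPage` is referenced but never shown in this snippet. Assuming it mirrors the method above with the file reopened in append mode (so later pages do not overwrite the first), it might look like this; all details beyond the selectors above are assumptions:

    def parseNextPage(self, response):
        # append mode: the first page already created the file in parse()
        csv_file = open('amazon_games.csv', 'a')
        for div in Selector(response).xpath('//div[@class="s-item-container"]'):
            name = div.xpath('.//h2/text()').get()
            price = div.xpath('.//span[contains(@class, "s-price")]/text()').get()
            stars = div.xpath('.//i[contains(@class, "a-icon-star")]/span/text()').get()
            csv_file.write('%s;%s;%s\n' % (name, price, stars))
        csv_file.close()

        next_url = response.xpath('//a[@id="pagnNextLink"]/@href').get()
        if next_url is not None:
            yield response.follow(next_url, callback=self.parseNextPage)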
Example 6
    def parse5(self, response):
        hxs = Selector(response)
        items = []
        item = self.generate_item_dict()

        item["file_urls"] = []
        item["title"] = hxs.xpath("//div[@class='breadcrumbs']/b/text()").extract_first()
        item["image_urls"] = hxs.xpath("//div[@class='scroller-view']/div/div/a/@href").extract()

        # PDF links are hidden inside onclick handlers; take the first
        # quoted argument, e.g. somehandler('<url>','<name>')
        try:
            for onclick in hxs.xpath("//div[@class='itemPDF']/div/a/@onclick").extract():
                url = onclick.split("('")[1].split("','")[0]
                item["file_urls"].append(url)
        except IndexError:
            item["file_urls"] = []
            print("No PDF there")

        item["name"] = hxs.xpath("//h1[@id='itemName']/text()").extract_first()
        item["price"] = hxs.xpath("//div[@id='mss-top-price']/em/text()").extract_first()

        # The brand sits in a tab labelled "<brand> info"; strip the
        # trailing " info" (5 characters) to recover the brand itself
        brand = ""
        try:
            for tab in hxs.xpath("//div[@class='ytInfoTab']/text()").extract():
                tab = tab.lower()
                if "info" in tab.split(" "):
                    brand = tab.capitalize()[:-5]
                    item["pbrand"] = brand
            if not brand.strip():
                # fall back to the last "view more" link
                item["pbrand"] = hxs.xpath("//div[@class='viewMoreLinks']/a/text()").extract()[-1]
        except Exception:
            print("Brand not found")

        category = hxs.xpath("//div[@id='breadcrumbs']//a/text()").extract()[-1]
        items.append(item)
        return items
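The split-based onclick parsing above is brittle: it assumes exactly one `('` in the attribute. A regex makes the same extraction more tolerant; a sketch reusing `hxs` and `item` from the snippet, assuming onclick values like `somehandler('<url>','<name>')`:

    import re

    for onclick in hxs.xpath("//div[@class='itemPDF']/div/a/@onclick").extract():
        m = re.search(r"\('([^']+)'", onclick)  # first quoted argument
        if m:
            item["file_urls"].append(m.group(1))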
Example 7
    def parse(self, response):
        hxs = Selector(response)
        items = []
        item = AndrewItem()
        item["file_urls"] = []
        item["title"] = hxs.xpath("//div[@class='breadcrumbs']/b/text()").extract_first()
        item["image_urls"] = hxs.xpath("//div[@class='scroller-view']/div/div/a/@href").extract()
        # PDF links are hidden inside onclick handlers; take the first
        # quoted argument, e.g. somehandler('<url>','<name>')
        try:
            for onclick in hxs.xpath("//div[@class='itemPDF']/div/a/@onclick").extract():
                url = onclick.split("('")[1].split("','")[0]
                item["file_urls"].append(url)
            print(item["file_urls"])
        except IndexError:
            item["file_urls"] = []
            print("No PDF there")

        items.append(item)
        return items
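For the `file_urls` and `image_urls` fields to trigger any downloads, Scrapy's stock media pipelines must be enabled in the project settings; the store paths below are placeholders:

    # settings.py
    ITEM_PIPELINES = {
        'scrapy.pipelines.files.FilesPipeline': 1,
        'scrapy.pipelines.images.ImagesPipeline': 2,
    }
    FILES_STORE = '/path/to/files'    # placeholder
    IMAGES_STORE = '/path/to/images'  # placeholder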
    def parse(self, response):
        page = response.url.split("/")[-2]
        filename = '../../bundestag-%s.html' % page
        response_html = response.body

        hxs = Selector(response)

        # Iterate over every <a> element, using relative XPaths so each
        # query is scoped to the current link instead of the whole page
        all_links = hxs.xpath('//a')
        for link_sel in all_links:
            title = link_sel.xpath('text()').extract()
            link = link_sel.xpath('@href').extract()
            print(title)
            print(link)
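The loop above is scoped deliberately: inside a per-element loop, an XPath starting with `//` escapes back to the whole document, while a relative path stays inside the current node. A small illustration:

    for a in response.xpath('//a'):
        text = a.xpath('text()').get()   # text of this <a> only
        href = a.xpath('@href').get()    # href of this <a> only
        # a.xpath('//title/text()') would match the page <title> every time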
    def parse_page(self, response):
        # Given a page response, parse all the images and grab the title
        sel = Selector(response=response)

        title = response.css('title::text').extract()
        text = ''.join(sel.xpath("//body//text()").extract()).strip()
        url = response.request.url
        base = get_base_url(response)

        image_urls = [
            urljoin(response.url, src)
            for src in response.xpath('//img/@src').extract()
        ]
        # map sha1(image URL) -> image URL
        image_hashes = {}
        for img in image_urls:
            image_hashes[hashlib.sha1(img.encode()).hexdigest()] = img

        yield ElectionItem(title=title,
                           base_url=base,
                           url=url,
                           text=text,
                           image_hash_ids=image_hashes,
                           image_urls=image_urls)
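Note that `hashlib.sha1(img.encode())` fingerprints the image URL string, not the image content; two different URLs serving identical bytes hash differently. Hashing the content would require fetching each image, e.g. via a follow-up request (a sketch, not part of the original spider):

    import hashlib

    def parse_image(self, response):
        # response.body holds the raw image bytes
        digest = hashlib.sha1(response.body).hexdigest()
        yield {'url': response.url, 'sha1': digest}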