Example #1
    def parse_items(self, response):
        item = ItemLoader(PhoneHouseItem(), response)
        item.add_xpath(
            'nombre',
            '//*[@id="top-contenedor-principal"]/main/section[1]/div[1]/div/h1/text()'
        )
        item.add_xpath(
            'sistemaOperativo',
            '//*[@id="modulo-caracteristicas"]/div/div/div[5]/ul/li[1]/div[2]/text()'
        )
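        # the RAM spec is text like "4 GB"; strip the unit to keep the number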
        ram = item.get_xpath(
            '//*[@id="modulo-caracteristicas"]/div/div/div[3]/ul/li[1]/div[2]/text()'
        )
        ram = str(ram[0])
        contenido = ram.rstrip(' GB')
        item.add_value('ram', contenido)

        alm = item.get_xpath(
            '//*[@id="modulo-caracteristicas"]/div/div/div[3]/ul/li[2]/div[2]/text()'
        )
        alm = str(alm[0])
        contenido = alm.rstrip(' GB')
        item.add_value('almacenamiento', contenido)
        item.add_value('url', response.url)
        item.add_xpath('precio',
                       '//*[@id="precios"]/div[2]/div[1]/h3/span[2]/text()')
        img = item.get_xpath(
            '//*[@id="top-contenedor-principal"]/main/section[1]/div[2]/div/div[1]/div[2]/div[1]/div[1]/img/@src'
        )
        contenido = 'https:' + str(img[0])
        item.add_value('imagen', contenido)

        # TODO: remove the item attributes that pile up each time the page data is downloaded
        # TODO: fix the "IndexError: list index out of range" problem
        yield item.load_item()
Example #2
    def parse_item(self, response):
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath('name', '//title/text()',
                        MapCompose(lambda x: x.split(' | ')[0], str.strip))
        manga.add_value('source', response.url)
        manga.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
        manga.add_xpath('description', '//*[@class="content"]//text()',
                        MapCompose(str.strip), Join('\n'),
                        MapCompose(str.strip))
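        # total_chap is the highest chapter number found among the chapter links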
        manga.add_value(
            'total_chap',
            max([
                int(i) for i in manga.get_xpath(
                    '//*[@id="list-chapters"]/p/span/a/text()',
                    MapCompose(lambda x: re.findall(r'\d+', x)))
            ]))

        chapter_source = manga.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/@href')
        chapter_name = manga.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/text()')
        chapters = zip(chapter_name, chapter_source)

        manga.add_value('chapters', chapters)

        return manga.load_item()
Example #3
    def parse_item(self, response):
        """
        @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
        @returns items 1
        @scrapes name source total_chap chapters
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath('name', '//h3[@class="__name"]/text()', MapCompose(str.strip))
        manga.add_value('source', response.url)
        manga.add_xpath('image_src', '//*[@class="__image"]/img/@src')
        manga.add_value(
            'total_chap',
            max(
                [int(i) for i in
                    manga.get_xpath(
                        '//*[@class="table table-hover"]/tbody//tr//td//a//text()',
                        MapCompose(lambda x: re.findall(r'\d+', x)))]
            )
        )

        chapter_source = manga.get_xpath('//*[@class="table table-hover"]/tbody//tr//td//a/@href')
        chapter_name = manga.get_xpath('//*[@class="table table-hover"]/tbody//tr//td//a//text()')
        chapters = zip(chapter_name, chapter_source)

        manga.add_value('chapters', chapters)

        return manga.load_item()
Example #4
    def parse_item(self, response):
        sel = Selector(response)
        il = ItemLoader(item=Product(), response=response)

        cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
        availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
        price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
        sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')
       
        """If the xpath doesn't retunr a category, the product belongs to the Bundle category"""
        if not cat:
            il.add_value("category", "Bundle")
        else:
            il.add_value("category", cat)
       
        il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
        il.add_value("url",response.url)
       
        """If a product can be added to the cart, the product is available online, if not, the product is not available online"""
        if "ADD TO CART" in availability:
            il.add_value("availability", "Product is available online")
        else:
            il.add_value("availability", "Product is not available online")

        """If there's a sale price present but not a regular price present, it switches the sale price for the regular price as shown in the website"""
        if not price:
            il.add_value("regPrice",sale)
            il.add_value("salePrice", None)
        else:
            il.add_value("regPrice", price)
            il.add_value("salePrice",sale)
        return il.load_item()
Example #5
    def parse_item(self, response):
        """
        @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date
        """
        l = ItemLoader(item=BooksItem(), response=response)

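        # the last itemprop="title" text node holds the book's own title (earlier matches are likely breadcrumbs)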
        l.add_value('name', l.get_xpath('//*[@itemprop="title"]/text()')[-1])
        l.add_value(
            'name_unidecode',
            unidecode(l.get_xpath('//*[@itemprop="title"]/text()')[-1]))
        l.add_xpath('price',
                    '//*[contains(@id, "discounted_price")]/span/text()',
                    TakeFirst())
        l.add_xpath('author', '//*[@itemprop="author"]/text()')
        l.add_value(
            'description',
            filter(None, [
                re.sub('<[^<]+?>', '', i)
                for i in l.get_xpath('//*[@class="full-description"]/p')
            ]), Join('\n'))
        l.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

        # Information fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Example #6
    def parse_item(self, response):
        """
        @url http://www.lazada.vn/tony-buoi-sang-tren-duong-bang-1540897.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date
        """
        l = ItemLoader(item=BooksItem(), response=response)

        l.add_xpath('name', '//*[@id="prod_title"]/text()',
                    MapCompose(unicode.strip, unicode.title))
        l.add_xpath('name_unidecode', '//*[@id="prod_title"]/text()',
                    MapCompose(unidecode, str.strip, str.title))
        l.add_xpath('price', '//*[@id="special_price_box"]/text()')
        l.add_value(
            'description',
            re.sub('<[^<]+?>', '',
                   l.get_xpath('//*[@class="product-description__block"]')
                   [0]).strip())
        l.add_value('image_uri',
                    l.get_xpath('//*[@itemprop="image"]/@content')[1])

        # Information fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Example #7
    def parse_item(self, response):
        """
        @url http://www.fahasa.com/luat-im-lang-mario-puzo.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date
        """
        l = ItemLoader(item=BooksItem(), response=response)

        l.add_value('name',
                    l.get_xpath('//*[@class="product-name"]/h1/text()')[-1])
        l.add_value(
            'name_unidecode',
            unidecode(l.get_xpath('//*[@class="product-name"]/h1/text()')[-1]))
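        # price: take the second price text node and keep only the numeric part (e.g. "123.000")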
        l.add_value('price',
                    l.get_xpath('//*[@class="price"]/text()')[1].strip(),
                    TakeFirst(),
                    re=r'\d+\.\d+')
        l.add_value(
            'description',
            filter(None, [
                re.sub('<[^<]+?>', '', i)
                for i in l.get_xpath('//*[@class="std"]')
            ]), Join('\n'))
        l.add_xpath('image_uri', '//*[@id="image"]/@src')

        # Information fields
        l.add_value('url', response.url[response.url.find('cache:') + 6:])
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Example #8
def load_document(response, document):
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'], Join())
    l.add_xpath('title', document['title'], Join())

    # need to fix: the hard-coded -3 index is fragile
    submission_page = l.get_xpath(document['submission_path'])
    submission = submission_page[-3].strip()
    l.add_value('submission_path', submission)

    # handle dates
    dates = l.get_xpath(document['date'])[0].split(', ')
    try:
        l.add_value('accepted_date', parse(dates[-1]))
        l.add_value('revision_date', parse(dates[-2]))
        l.add_value('online_date', parse(dates[-3]))
    except:
        pass

    # handle pages
    try:
        pages = submission_page[-1]
        p = pages.split()[-1].split('–', 1)

        l.add_value('fpage', int(p[0]))
        l.add_value('lpage', int(p[1]))
        l.add_value('pages', int(p[1])-int(p[0])+1)
    except:
        pass

    return l
Example #9
    def parse_item(self, response):
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath('name', '//h1[@class="SeriesName"]/text()')
        manga.add_value('source', response.url)
        manga.add_xpath('image_src', '//meta[@property="og:image"]/@content')
        manga.add_xpath('description', '//*[@class="description"]/text()',
                        Join('\n'))

        if 'Complete (Publish)' in manga.get_xpath(
                '//*[@class="PublishStatus"]/text()'):
            manga.add_value('full', True)
        else:
            manga.add_value('full', False)

        chapter_xpath = '//*[@class="list chapter-list"]/a'

        manga.add_value(
            'total_chap',
            manga.get_xpath(chapter_xpath + '/span/text()',
                            MapCompose(lambda x: re.findall(r'\d+', x)))[0])

        chapter_source = manga.get_xpath(chapter_xpath + '/@href',
                                         MapCompose(make_full_url))
        chapter_name = manga.get_xpath(chapter_xpath + '/span/text()')
        chapters = zip(chapter_name, chapter_source)
        manga.add_value('chapters', chapters)
        manga.add_value('web_source', 'mangaseeonline')

        return manga.load_item()
Example #10
    def parse(self, response):
        sel = Selector(response)
        articulos = sel.xpath(
            '//*[@id="tt-pageContent"]/div/div/div/div[2]/div/div[2]/div')

        # iterate over every scraped article
        for i, art in enumerate(articulos):
            loader = ItemLoader(item=Articulo(), selector=art)

            loader.add_xpath('nombre', './/div/div[2]/h2/a/text()')

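            # strip the '$' sign and the '.' thousands separator from the price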
            precio = loader.get_xpath('.//div/div[2]/div[2]/text()')[0]
            precio = precio.split('$')[1].split('.')
            pre = precio[0] + precio[1]
            loader.add_value('precio', pre)

            finalUrl = loader.get_xpath('.//div/div[2]/h2/a/@href')[0]
            finalUrl = finalUrl.split('id=')[-1]
            fullUrl = 'https://www.foxinsumospc.com.ar/?p=home&m=detalleproducto&id=' + finalUrl
            loader.add_value('url', fullUrl)

            # map the site's numeric category id (?cat=NN) to a category name
            categorias = {
                '45': 'Gabinetes', '60': 'Monitores',
                '30': 'Almacenamiento', '31': 'Almacenamiento',
                '32': 'Almacenamiento', '50': 'Impresoras',
                '51': 'Joysticks', '69': 'Parlantes',
                '23': 'Refrigeración', '24': 'Refrigeración',
                '25': 'Refrigeración', '62': 'Mouses',
                '76': 'Placas de Video', '42': 'Fuentes',
                '11': 'Cables', '56': 'Memorias',
                '61': 'Motherboards', '59': 'Procesadores',
                '105': 'Webcams',
            }
            cat = response.url.split('cat=')[-1]
            if cat in categorias:
                loader.add_value('categoria', categorias[cat])
            yield loader.load_item()  # emit the scraped item
Example #11
    def parse_item(self, response):
        """
        @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1
        @scrapes name source image_src total_chap description chapters web_source full
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src",
                        '//*[@class="col-xs-4 col-image"]/img/@src')
        manga.add_xpath("description",
                        '//*[@class="detail-content"]/p//text()', Join("\n"))
        chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Hoàn thành" in manga.get_xpath(
                '//*[@class="status row"]/p[2]/text()'):
            manga.add_value("full", True)
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    chapter_xpath + "/text()",
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(int),
                )[0],
            )
        else:
            manga.add_value("full", False)
            manga.add_value(
                "total_chap",
                manga.get_xpath(
                    "//title/text()",
                    MapCompose(
                        lambda x: re.findall(r" Chapter \d+| Chap \d+", x)),
                    MapCompose(lambda x: re.findall(r"\d+", x)),
                    MapCompose(float),
                    MapCompose(int),
                    TakeFirst(),
                ),
            )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "nettruyen")

        return manga.load_item()
Example #12
def load_document(response, document):
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()

    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_xpath('title', document['title'])
    try:
        meta = l.get_xpath(document['meta'])
        l.add_value('submission_path', normalize('NFKD', meta[1] + meta[2]))
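        # the page range may be separated by an en dash or a plain hyphen, e.g. "123–145"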
        pages = meta[-1].split(' ')[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except:
        pass

    l.add_value('publication_date', parse(response.xpath(document['publication_date']).extract()[0]))

    # mark it down, with source's publication_title
    return l
Example #13
    def parse_item(self, response):
        """
        @url https://tiki.vn/hieu-ng-canh-buom-p146105.html
        @returns items 1
        @scrapes name name_unidecode price description
        @scrapes url project spider server date
        """
        l = ItemLoader(item=BooksItem(), response=response)

        l.add_xpath('name', '//*[@class="item-name"]/text()',
                    MapCompose(unicode.strip, unicode.title))
        l.add_xpath('name_unidecode', '//*[@class="item-name"]/text()',
                    MapCompose(unidecode, str.strip, str.title))
        l.add_xpath('author', '//*[@class="item-brand"]/p/a/text()')
        l.add_xpath('price',
                    '//*[@id="span-price"]/text()',
                    TakeFirst(),
                    re=r'\d+\.\d+')
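        # strip HTML tags from each description paragraph and join them with newlines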
        l.add_value('description', [
            re.sub('<[^<]+?>', '', i)
            for i in l.get_xpath('//*[@id="gioi-thieu"]/p')
        ], Join('\n'))
        l.add_xpath('image_uri', '//*[@itemprop="image"]/@src')

        # Information fields
        l.add_value('url', response.url)
        l.add_value('project', self.settings.get('BOT_NAME'))
        l.add_value('spider', self.name)
        l.add_value('server', socket.gethostname())
        l.add_value('date', datetime.datetime.now())

        return l.load_item()
Example #14
    def renxuan_2(self, response):
        trs = response.xpath(
            r'//table[@id="OutSpeltyEP1_gridMain"]/tbody/tr[re:test(@class,"tdcolour\d$")]'
        )
        for tr in trs:
            loader = ItemLoader(response.meta['item'], selector=tr)
            cid = loader.get_xpath('./td[3]/text()')
            loader.add_xpath('name', './td[2]/text()')
            loader.add_xpath('cid', './td[3]/text()')
            loader.add_xpath('credit', './td[6]/text()')
            # yield Request(url=TEST_LESSON_URL,
            #         dont_filter=True,
            #         meta= {'item':loader.load_item()},
            #         callback = self.lesson_parser
            # )
            new_meta = {'item': loader.load_item()}
            new_meta.update(response.meta)
            yield FormRequest.from_response(
                response,
                dont_filter=True,
                formdata={
                    'OutSpeltyEP1$dpYx': response.meta['item']['course_type'],
                    'OutSpeltyEP1$dpNj': response.meta['item']['grade'],
                    'myradiogroup': cid,
                    'OutSpeltyEP1$lessonArrange': '课程安排'  # form value meaning "course schedule"
                },
                meta=new_meta,
                callback=self.lesson_parser)
Example #15
    def parse_item(self, response):
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath(
            "name", "//title/text()", MapCompose(lambda x: x.split(" | ")[0], str.strip)
        )
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//*[@class="thumbnail"]/img/@src')
        manga.add_xpath(
            "description",
            '//*[@class="content"]//text()',
            MapCompose(str.strip),
            Join("\n"),
            MapCompose(str.strip),
        )
        manga.add_value(
            "total_chap",
            max(
                [
                    int(i)
                    for i in manga.get_xpath(
                        '//*[@id="list-chapters"]/p/span/a/text()',
                        MapCompose(lambda x: re.findall(r"\d+", x)),
                    )
                ]
            ),
        )

        get_chapter_source = manga.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/@href', MapCompose(mc)
        )
        chapter_source = [
            chap for chap in get_chapter_source if "mediafire" not in chap
        ]
        chapter_name = manga.get_xpath('//*[@id="list-chapters"]/p/span/a/text()')
        chapters = zip(chapter_name, chapter_source)

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "blogtruyen")

        if "Đã hoàn thành" in manga.get_xpath('//*[@class="description"]//text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        return manga.load_item()
Example #16
def load_source(response, source):
    website_url = 'https://us.sagepub.com'
    l = ItemLoader(item=SourceItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value("issn",
                response.xpath(source['issn']).extract()[1].split()[-1])
    l.add_value('chief_editor',
                response.xpath(source['chief_editor']).extract()[0])
    l.add_xpath('publication_title', source['publication_title'])
    l.add_value('coverimage',
                website_url + l.get_xpath(source['coverimage'])[0])
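    # clean the raw HTML description and collapse the leftover whitespace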
    l.add_xpath('description', './/div[@class="field-item even"]', Join(),
                cleanhtml,
                lambda x: x.replace('\n', '').replace('  ', '').strip())
    l.add_value('home_url', response.url)
    publication_title = l.get_xpath(source['publication_title'])
    return l
Example #17
    def parse_item(self, response):
        """
        @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77
        @scrapes name source image_src total_chap description chapters web_source full unicode_name
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        category = manga.get_xpath("//*[@class='category row']/p[2]//text()")
        categories = re.sub(r'\s+', '', "".join(category))
        if any(i in unidecode(categories).lower() for i in ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']):
            return
        manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()')
        manga.add_value("name", unidecode(
            manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath(
            "image_src", '//*[@class="image-comic"]/@src')
        manga.add_xpath(
            "description", '//*[@class="detail-summary"]/text()'
        )
        chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

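        # the newest chapter is listed first; take its number as total_chap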
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                MapCompose(float),
                MapCompose(int),
                TakeFirst(),
            ),
        )

        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "doctruyen3q")

        return manga.load_item()
Example #18
def load_source(response, source):
    l = ItemLoader(item=SourceItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("issn", source['issn'])
    l.add_xpath('publication_title', source['publication_title'])
    l.add_xpath('coverimage', source['coverimage'])
    l.add_xpath('description', source['description'], Join())
    l.add_value('home_url', response.url)
    publication_title = l.get_xpath(source['publication_title'])
    return l
Example #19
def load_document(response, document):
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()

    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_value('title',
                (l.get_xpath(document['title'])[0]).replace('\n', '').strip())
    l.add_value(
        'submission_path',
        l.get_xpath(document['submission_path'])[0].replace('\n', '').strip())

    # handle dates
    try:
        dates = [
            i.replace('\n', '').replace(';', '').strip()
            for i in response.xpath(document['dates']).extract()[-2:]
        ]
        d = [parse(i) for i in dates]
        l.add_value('online_date', d[0])
        l.add_value('publication_date', d[1])
    except:
        pass

    # handle pages
    try:
        pages = response.xpath(document['pages']).extract()[0].strip().split(
            '\n')[-1].strip().split(':')[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except:
        pass

    # mark it down, with source's publication_title
    return l
Example #20
    def parse(self, response):
        sel = Selector(response)
        articulos = sel.xpath('//div[@class="padder"]/ul/li')

        # iterate over every scraped article
        for i, art in enumerate(articulos):
            loader = ItemLoader(item=Articulo(), selector=art)

            loader.add_xpath('nombre', './/div/h3/a/text()')
            #loader.add_xpath('precio', './/div/a/text()')
            precio = loader.get_xpath('.//div/a/text()')[0]
            precio = precio.split('AR$')[-1].strip()
            loader.add_value('precio', precio)

            cat = response.url.split("/")[-2]
            finalUrl = loader.get_xpath('.//a/@href')
            fullUrl = 'http://www.starcomputacion.com.ar/' + finalUrl[0]

            loader.add_value('url', fullUrl)

            if cat == "teclados-45":
                loader.add_value('categoria', 'Teclados')
            elif cat == "monitores-23":
                loader.add_value('categoria', 'Monitores')
            elif cat == "discos-rigidos-67":
                loader.add_value('categoria', 'Almacenamiento')
            elif cat == "impresoras-24":
                loader.add_value('categoria', 'Impresoras')
            elif cat == "mouses-y-pads-41":
                loader.add_value('categoria', 'Mouses')
            elif cat == "parlantes-42":
                loader.add_value('categoria', 'Parlantes')
            elif cat == "webcams-46":
                loader.add_value('categoria', 'Webcams')
            elif cat == "estabilizador-de-tension-39":
                loader.add_value('categoria', 'Estabilizadores')
            elif cat == "auriculares-gamers-141":
                loader.add_value('categoria', 'Auriculares')
            elif cat == "pendrives-13":
                loader.add_value('categoria', 'Pendrives')
            yield loader.load_item()  # emit the scraped item
Example #21
    def parse_prof(self, response):
        departxpath = "//*[@id='mainContent']//div[@class='result-title']/text()"
        univerxpath = "//*[@id='mainContent']//div[@class='result-title']//a[@class='school']/text()"
        difficxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']//div[@class='breakdown-header']/div[2]/div[@class='grade']/text()"
        kxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']/div[2]/div[@class='tag-box']/span[@class='tag-box-choosetags']/text()"
        vxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']/div[2]/div[@class='tag-box']/span[@class='tag-box-choosetags']/b/text()"

        l = ItemLoader(item=ProfessorItem(), response=response)
        l.default_output_processor = TakeFirst()

        l.add_value('tid', response.meta['tid'])
        l.add_value('sid', response.meta['sid'])
        l.add_value('pfname', response.meta['pfname'])
        l.add_value('plname', response.meta['plname'])
        l.add_value('pname', response.meta['pfname'])
        l.add_value('pname', response.meta['plname'])
        l.add_value('quality', response.meta['quality'])
        l.add_value('n_rating', response.meta['n_rating'])

        l.add_xpath('department',
                    departxpath,
                    re='Professor in the (.+) department')
        l.add_xpath('university', univerxpath)
        l.add_xpath('difficulty', difficxpath, MapCompose(self.diff2float))

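        # pair each rating tag with its count and store the mapping as JSON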
        keys = l.get_xpath(kxpath, MapCompose(lambda p: p.replace(' ', '')))
        values = l.get_xpath(
            vxpath, MapCompose(lambda p: int(p.strip('(').strip(')'))))
        l.add_value('tags', json.dumps(dict(zip(keys, values))))

        yield l.load_item()
        n_rating = response.meta['n_rating']
        if n_rating != 0:
            tid = response.meta['tid']
            for pn in range(math.ceil(n_rating / 20)):
                url = 'http://www.ratemyprofessors.com/paginate/professors/ratings?tid=%d&page=%d' % (
                    tid, pn + 1)  # an int is needed here
                yield Request(url,
                              meta=response.meta,
                              callback=self.parse_rating)
Example #22
    def parse_items(self, response):

        item = ItemLoader(MediamarktItem(), response)

        name = item.get_xpath('//*[@id="product-details"]/div[1]/h1/text()')
        # keep only the part of the product name before the first comma
        result = str(name[0]).split(',')[0]

        item.add_value('nombre', result)

        ram = item.get_xpath('//*[@id="features"]/section[1]/dl/dd[7]/text()')
        ram = str(ram[0])
        contenido = ram.rstrip(' GB')
        item.add_value('ram', contenido)

        item.add_xpath('sistemaOperativo',
                       '//*[@id="features"]/section[1]/dl/dd[2]/text()')

        alm = item.get_xpath('//*[@id="features"]/section[1]/dl/dd[5]/text()')
        alm = str(alm[0])
        contenido = alm.rstrip(' GB')
        item.add_value('almacenamiento', contenido)

        item.add_value('url', response.url)

        item.add_xpath(
            'precio',
            '//*[@id="product-details"]/div[2]/div[1]/meta[2]/@content')

        img = item.get_xpath('//*[@id="product-sidebar"]/div[1]/a/img/@src')
        contenido = 'https:' + str(img[0])
        item.add_value('imagen', contenido)

        # TODO: remove the item attributes that pile up each time the page data is downloaded

        yield item.load_item()
Example #23
    def parse_item(self, response):
        sel = Selector(response)
        il = ItemLoader(item=Product(), response=response)

        cat = il.get_xpath(
            '//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
        availability = il.get_xpath(
            '//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()'
        )
        price = il.get_css(
            'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text'
        )
        sale = il.get_css(
            'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text'
        )
        """If the xpath doesn't retunr a category, the product belongs to the Bundle category"""
        if not cat:
            il.add_value("category", "Bundle")
        else:
            il.add_value("category", cat)

        il.add_css(
            "title",
            "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text"
        )
        il.add_value("url", response.url)
        """If a product can be added to the cart, the product is available online, if not, the product is not available online"""
        if "ADD TO CART" in availability:
            il.add_value("availability", "Product is available online")
        else:
            il.add_value("availability", "Product is not available online")
        """If there's a sale price present but not a regular price present, it switches the sale price for the regular price as shown in the website"""
        if not price:
            il.add_value("regPrice", sale)
            il.add_value("salePrice", None)
        else:
            il.add_value("regPrice", price)
            il.add_value("salePrice", sale)
        return il.load_item()
Example #24
def load_document(response, document):
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_xpath('title', document['title'])
    l.add_xpath('submission_path', document['submission_path'])

    # handle dates
    dates = l.get_xpath(document['date'])[0].split(', ')
    d = getdate(dates)
    l.add_value('submission_date', d['submission_date'])
    l.add_value('revision_date', d['revision_date'])
    l.add_value('accepted_date', d['accepted_date'])
    l.add_value('online_date', d['online_date'])

    date_page = l.get_xpath(document['dp'])[0].split(', ')
    try:
        l.add_value('publication_date', parse(date_page[-2]))
    except:
        pass

    # handle pages
    try:
        pages = date_page[-1].split()[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except:
        pass

    # mark it down, with source's publication_title
    return l
Example #25
    def parse(self, response):
        sel = Selector(response)
        articulos = sel.xpath('//*[@id="main"]/ul/li')

        # iterate over every scraped article
        for i, art in enumerate(articulos):
            loader = ItemLoader(item=Articulo(), selector=art)

            loader.add_xpath('nombre', './/div/div/div[1]/a/h2/text()')
            pre = loader.get_xpath(
                './/div/div/div[3]/div[1]/span/span/ins/span/text()')[0]
            # join the pieces around the comma separator, then drop the
            # trailing character and surrounding whitespace
            precio = ''.join(pre.split(','))[:-1].strip()
            loader.add_value('precio', precio)

            cat = response.url.split("/")[-2]
            loader.add_xpath('url', './/div/div/div[1]/a/@href')

            if cat == "procesadores":
                loader.add_value('categoria', 'Procesadores')
            elif cat == "motherboards":
                loader.add_value('categoria', 'Motherboards')
            elif cat == "memorias":
                loader.add_value('categoria', 'Memorias')
            elif cat == "almacenamiento":
                loader.add_value('categoria', 'Almacenamiento')
            elif cat == "impresoras":
                loader.add_value('categoria', 'Impresoras')
            elif cat == "placas-de-video":
                loader.add_value('categoria', 'Placas de Video')
            elif cat == "mouses-y-teclados":
                loader.add_value('categoria', 'Mouses Y Teclados')
            elif cat == "fuentes":
                loader.add_value('categoria', 'Fuentes')
            elif cat == "gabinetes":
                loader.add_value('categoria', 'Gabinetes')
            elif cat == "monitores":
                loader.add_value('categoria', 'Monitores')
            elif cat == "webcams":
                loader.add_value('categoria', 'Webcams')
            elif cat == "auriculares":
                loader.add_value('categoria', 'Auriculares')
            elif cat == "parlantes-pc":
                loader.add_value('categoria', 'Parlantes')
            elif cat == "refrigeracion":
                loader.add_value('categoria', 'Refrigeración')
            yield loader.load_item()  #imprimir salida
Example #26
    def parse_item(self, response):
        """
        @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
        @returns items 1
        @scrapes name source total_chap chapters description
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath("name", '//h3[@class="__name"]/text()', MapCompose(str.strip))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//*[@class="__image"]/img/@src')
        manga.add_xpath(
            "description", '//*[@class="__description"]//p/text()', Join("\n")
        )
        manga.add_value(
            "total_chap",
            max(
                [
                    int(i)
                    for i in manga.get_xpath(
                        '//*[@class="table table-hover"]/tbody//tr//td//a//text()',
                        MapCompose(lambda x: re.findall(r"\d+", x)),
                    )
                ]
            ),
        )

        chapter_source = manga.get_xpath(
            '//*[@class="table table-hover"]/tbody//tr//td//a/@href'
        )
        chapter_name = manga.get_xpath(
            '//*[@class="table table-hover"]/tbody//tr//td//a//text()'
        )
        chapters = zip(chapter_name, chapter_source)

        manga.add_value("chapters", chapters)

        return manga.load_item()
Example #27
    def parse_item(self, response):
        """
        @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
        @scrapes name unicode_name source image_src total_chap description chapters web_source full
        """

        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath("unicode_name",
                        '//h1[@class="title-commic-detail"]/text()')
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        manga.add_xpath("description",
                        '//*[@class="desc-commic-detail"]/text()', Join("\n"))
        chapter_xpath = '//*[@class="ul-list-chaper-detail-commic"]/li/a'
        chapter_source = manga.get_xpath(chapter_xpath + "/@href")
        chapter_name = manga.get_xpath(chapter_xpath + "/h3/text()")
        chapters = zip(chapter_name, chapter_source)

        if "Đã hoàn thành" in manga.get_xpath(
                '//*[@class="manga-status"]/p/text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

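        # total_chap comes from the newest (first-listed) chapter title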
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()',
                MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
                TakeFirst(),
            ),
        )
        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "vlogtruyen")

        return manga.load_item()
Example #28
def load_author(response, author):
    auths = response.xpath(author['auth'])
    for auth in auths:
        l = ItemLoader(item=AuthorItem(), response=response)
        l.default_output_processor = TakeFirst()

        # author's first name and last name
        fn = auth.xpath(author['fn']).extract()[0]
        ln = auth.xpath(author['ln']).extract()[0]
        l.add_value('fname', fn)
        l.add_value('lname', ln)

        # author's email
        try:
            email = auth.xpath(author['email']).extract()[0][7:]
            l.add_value('email', email)
        except:
            pass

        # author's address and institution
        try:
            fid = auth.xpath(author['fid']).extract()[0][1:]
            address = l.get_xpath(author['address'] % fid)

            for i in address[0].split(', '):
                if 'niversity' in i:  # matches both 'University' and 'university'
                    institution = i
                    break
            l.add_value('address', address)
            l.add_value('institution', institution)
        except:
            pass

        # author's vitae
        try:
            href = auth.xpath(author['href']).extract()[0][1:]
            vitae = response.xpath(author['vitae'] % href).extract()[0]
            l.add_value('vitae', fn + ' ' + ln + vitae)
        except:
            pass

        # author's avatar
        try:
            href = auth.xpath(author['href']).extract()[0][1:]
            avatar = response.xpath(author['avatar'] % href).extract()[0]
            l.add_value('avatar', avatar)
        except:
            pass

        yield l
Example #29
    def parse_item(self, response):
        """
        @url https://mangasee123.com/manga/Kingdom
        @scrapes name source image_src total_chap description chapters web_source full
        """
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)
        manga.add_xpath(
            "unicode_name",
            "//div[@class='container MainContainer']//li[1]/h1/text()")
        manga.add_value("name",
                        unidecode(manga.get_output_value("unicode_name")[0]))
        manga.add_value("source", response.url)
        manga.add_xpath("image_src", '//meta[@property="og:image"]/@content')
        manga.add_xpath("description", "//div[@class='top-5 Content']/text()",
                        Join("\n"))

        if "Complete (Publish)" in manga.get_xpath(
                '//*[@class="PublishStatus"]/text()'):
            manga.add_value("full", True)
        else:
            manga.add_value("full", False)

        rss = manga.get_xpath("//a[normalize-space()='RSS Feed']/@href")
        rss_url = BASE_URL + rss[0]

        feed = feedparser.parse(rss_url, agent="Mozilla/5.0")

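        # the feed's first entry is the newest chapter; its title carries the chapter number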
        manga.add_value(
            "total_chap",
            re.findall(r"\d+", feed['entries'][0]['title'])[0],
        )

        chapters = [(i['title'], i['link']) for i in feed['entries']]
        manga.add_value("chapters", chapters)
        manga.add_value("web_source", "mangaseeonline")

        return manga.load_item()
Example #30
    def parse_item(self, response):
        manga = ItemLoader(item=MangaCrawlerItem(), response=response)

        manga.add_xpath('name', '//title/text()',
                        MapCompose(lambda x: x.split(' | ')[0], str.strip))
        manga.add_value('source', response.url)
        manga.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
        manga.add_xpath('description', '//*[@class="content"]//text()',
                        MapCompose(str.strip), Join('\n'),
                        MapCompose(str.strip))
        manga.add_value(
            'total_chap',
            max([
                int(i) for i in manga.get_xpath(
                    '//*[@id="list-chapters"]/p/span/a/text()',
                    MapCompose(lambda x: re.findall(r'\d+', x)))
            ]))

        get_chapter_source = manga.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/@href', MapCompose(mc))
        chapter_source = [
            chap for chap in get_chapter_source if 'mediafire' not in chap
        ]
        chapter_name = manga.get_xpath(
            '//*[@id="list-chapters"]/p/span/a/text()')
        chapters = zip(chapter_name, chapter_source)

        manga.add_value('chapters', chapters)
        manga.add_value('web_source', 'blogtruyen')

        if 'Đã hoàn thành' in manga.get_xpath(
                '//*[@class="description"]//text()'):
            manga.add_value('full', True)
        else:
            manga.add_value('full', False)

        return manga.load_item()
Example #31
    def parse_item(self, response):
        # Items provide the container that holds the scraped data, while the Item
        # Loader provides the mechanism for populating that container.
        # Assigning and reading values directly leads to a few problems:
        # - as the code grows, CSS and XPath selectors flood the whole logic with
        #   no structure, hurting readability and maintainability
        # - per-field preprocessing is implicit and should not live in the main logic

        l = ItemLoader(item=SpiderScrapyBcyItem(), response=response)
        l.add_xpath('name', "//h1[@class='js-post-title']/text()")
        l.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
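        # drop the '/w650' size suffix to get the full-resolution image URLs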
        urls = l.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        l.add_value('image_urls', urls)
        l.add_value('url', response.url)

        return l.load_item()