def parse_items(self, response):
    """Scrape a PhoneHouse product page into a PhoneHouseItem.

    Extracts name, OS, RAM, storage, price, image URL and the page URL.
    """
    item = ItemLoader(PhoneHouseItem(), response)
    item.add_xpath(
        'nombre',
        '//*[@id="top-contenedor-principal"]/main/section[1]/div[1]/div/h1/text()'
    )
    item.add_xpath(
        'sistemaOperativo',
        '//*[@id="modulo-caracteristicas"]/div/div/div[5]/ul/li[1]/div[2]/text()'
    )
    ram = item.get_xpath(
        '//*[@id="modulo-caracteristicas"]/div/div/div[3]/ul/li[1]/div[2]/text()'
    )
    ram = str(ram[0])
    # BUG FIX: the original stripped 'GB' from ram[0] — the first *character*
    # of the string — so 'ram' was always a single digit.  Strip the unit
    # from the whole string instead, as done for 'almacenamiento' below.
    contenido = ram.rstrip(' GB')
    item.add_value('ram', contenido)
    alm = item.get_xpath(
        '//*[@id="modulo-caracteristicas"]/div/div/div[3]/ul/li[2]/div[2]/text()'
    )
    alm = str(alm[0])
    contenido = alm.rstrip(' GB')
    item.add_value('almacenamiento', contenido)
    item.add_value('url', response.url)
    item.add_xpath('precio', '//*[@id="precios"]/div[2]/div[1]/h3/span[2]/text()')
    img = item.get_xpath(
        '//*[@id="top-contenedor-principal"]/main/section[1]/div[2]/div/div[1]/div[2]/div[1]/div[1]/img/@src'
    )
    contenido = 'https:' + str(img[0])
    item.add_value('imagen', contenido)
    # TODO: clear stale attributes that accumulate across downloads.
    # TODO: guard the positional XPaths against IndexError (list index out
    # of range) on layout variants.
    yield item.load_item()
def parse_item(self, response):
    """Build a MangaCrawlerItem from a manga detail page."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Page <title> is "<name> | <site>"; keep just the name part.
    loader.add_xpath('name', '//title/text()',
                     MapCompose(lambda t: t.split(' | ')[0], str.strip))
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
    loader.add_xpath('description', '//*[@class="content"]//text()',
                     MapCompose(str.strip), Join('\n'), MapCompose(str.strip))
    # The largest number found in any chapter link is the chapter count.
    numbers = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/text()',
                               MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', max(int(num) for num in numbers))
    links = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/@href')
    titles = loader.get_xpath('//*[@id="list-chapters"]/p/span/a/text()')
    loader.add_value('chapters', zip(titles, links))
    return loader.load_item()
def parse_item(self, response):
    """
    @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
    @returns items 1
    @scrapes name source total_chap chapters
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath('name', '//h3[@class="__name"]/text()', MapCompose(str.strip))
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//*[@class="__image"]/img/@src')
    link_text = '//*[@class="table table-hover"]/tbody//tr//td//a//text()'
    # The largest number in any chapter link text is the chapter count.
    numbers = loader.get_xpath(link_text,
                               MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', max(int(n) for n in numbers))
    urls = loader.get_xpath('//*[@class="table table-hover"]/tbody//tr//td//a/@href')
    titles = loader.get_xpath(link_text)
    loader.add_value('chapters', zip(titles, urls))
    return loader.load_item()
def parse_item(self, response):
    """Scrape a product page into a Product item.

    Populates category, title, url, availability and the regular/sale
    prices, falling back to 'Bundle' when no breadcrumb category exists.
    """
    # FIX: dropped an unused 'sel = Selector(response)' local; the loader
    # already wraps the response.
    il = ItemLoader(item=Product(), response=response)
    cat = il.get_xpath('//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath('//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()')
    price = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text')
    sale = il.get_css('span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text')
    # If the xpath doesn't return a category, the product belongs to the
    # Bundle category.
    if not cat:
        il.add_value("category", "Bundle")
    else:
        il.add_value("category", cat)
    il.add_css("title", "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text")
    il.add_value("url", response.url)
    # If a product can be added to the cart it is available online;
    # otherwise it is not.
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")
    # If there's a sale price but no regular price, show the sale price as
    # the regular price, mirroring the website.
    if not price:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)
    else:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    return il.load_item()
def parse_item(self, response):
    """
    @url https://www.vinabook.com/lam-quen-thong-ke-hoc-qua-biem-hoa-p71348.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    l = ItemLoader(item=BooksItem(), response=response)
    # The last matching text node holds the title.
    title_nodes = l.get_xpath('//*[@itemprop="title"]/text()')
    l.add_value('name', title_nodes[-1])
    l.add_value('name_unidecode', unidecode(title_nodes[-1]))
    l.add_xpath('price', '//*[contains(@id, "discounted_price")]/span/text()',
                TakeFirst())
    l.add_xpath('author', '//*[@itemprop="author"]/text()')
    # Strip HTML tags from each description paragraph and drop empty ones.
    paragraphs = [re.sub('<[^<]+?>', '', p)
                  for p in l.get_xpath('//*[@class="full-description"]/p')]
    l.add_value('description', [p for p in paragraphs if p], Join('\n'))
    l.add_xpath('image_uri', '//*[@itemprop="image"]/@src')
    # Information fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def parse_item(self, response):
    """
    @url http://www.lazada.vn/tony-buoi-sang-tren-duong-bang-1540897.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    # Scrape a Lazada book page into a BooksItem.
    l = ItemLoader(item=BooksItem(), response=response)
    # NOTE(review): 'unicode' here is Python-2-only while 'name_unidecode'
    # below uses str.strip/str.title — confirm which Python version this
    # spider targets.
    l.add_xpath('name', '//*[@id="prod_title"]/text()',
                MapCompose(unicode.strip, unicode.title))
    l.add_xpath('name_unidecode', '//*[@id="prod_title"]/text()',
                MapCompose(unidecode, str.strip, str.title))
    l.add_xpath('price', '//*[@id="special_price_box"]/text()')
    # Strip HTML tags from the first description block.
    l.add_value(
        'description',
        re.sub('<[^<]+?>', '',
               l.get_xpath('//*[@class="product-description__block"]')
               [0]).strip())
    # Index [1]: presumably the second itemprop="image" meta holds the real
    # product image — TODO confirm against the page markup.
    l.add_value('image_uri',
                l.get_xpath('//*[@itemprop="image"]/@content')[1])
    # Information fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def parse_item(self, response):
    """
    @url http://www.fahasa.com/luat-im-lang-mario-puzo.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    # Scrape a Fahasa book page into a BooksItem.
    l = ItemLoader(item=BooksItem(), response=response)
    # The last matching text node holds the title.
    l.add_value('name',
                l.get_xpath('//*[@class="product-name"]/h1/text()')[-1])
    l.add_value(
        'name_unidecode',
        unidecode(l.get_xpath('//*[@class="product-name"]/h1/text()')[-1]))
    # Keep only the numeric part (e.g. "123.000") of the second price node.
    l.add_value('price',
                l.get_xpath('//*[@class="price"]/text()')[1].strip(),
                TakeFirst(),
                re=r'\d+\.\d+')
    # Strip HTML tags from each description block and drop empty ones.
    l.add_value(
        'description',
        filter(None, [
            re.sub('<[^<]+?>', '', i)
            for i in l.get_xpath('//*[@class="std"]')
        ]), Join('\n'))
    l.add_xpath('image_uri', '//*[@id="image"]/@src')
    # Information fields
    # Drop a leading Google-cache prefix ("...cache:") from the URL.
    # NOTE(review): when 'cache:' is absent, find() returns -1 and this
    # slices from index 5 — confirm the crawled URLs always carry the
    # prefix.
    l.add_value('url', response.url[response.url.find('cache:') + 6:])
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def load_document(response, document):
    """Populate a DocumentItem from an article page.

    `document` maps logical field names to XPath expressions.  Returns the
    loader (not the item) so the caller can keep adding fields.
    """
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'], Join())
    l.add_xpath('title', document['title'], Join())
    # need to fix: position -3 is layout-dependent
    submission = l.get_xpath(document['submission_path'])[-3].strip()
    l.add_value('submission_path', submission)
    # handle dates: comma-separated history string, newest entries last.
    dates = [i for i in l.get_xpath(document['date'])[0].split(', ')]
    try:
        l.add_value('accepted_date', parse(dates[-1]))
        l.add_value('revision_date', parse(dates[-2]))
        l.add_value('online_date', parse(dates[-3]))
    except Exception:  # FIX: was a bare except; best-effort on short/odd lists
        pass
    # handle pages
    # BUG(review): 'submission_page' is undefined, so this block always
    # raises NameError and is silently skipped — fpage/lpage/pages are never
    # populated.  Kept best-effort pending the intended source of the
    # page-range string.
    try:
        pages = submission_page[-1]
        p = pages.split()[-1].split('–', 1)
        l.add_value('fpage', int(p[0]))
        l.add_value('lpage', int(p[1]))
        l.add_value('pages', int(p[1]) - int(p[0]) + 1)
    except Exception:  # FIX: was a bare except
        pass
    return l
def parse_item(self, response):
    """Parse a MangaSee series page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath('name', '//h1[@class="SeriesName"]/text()')
    loader.add_value('source', response.url)
    loader.add_xpath('image_src', '//meta[@property="og:image"]/@content')
    loader.add_xpath('description', '//*[@class="description"]/text()', Join('\n'))
    # Finished series carry a "Complete (Publish)" status label.
    status = loader.get_xpath('//*[@class="PublishStatus"]/text()')
    loader.add_value('full', 'Complete (Publish)' in status)
    row_xpath = '//*[@class="list chapter-list"]/a'
    # The first (newest) chapter row carries the chapter count.
    chapter_numbers = loader.get_xpath(
        row_xpath + '/span/text()',
        MapCompose(lambda t: re.findall(r'\d+', t)))
    loader.add_value('total_chap', chapter_numbers[0])
    urls = loader.get_xpath(row_xpath + '/@href', MapCompose(make_full_url))
    titles = loader.get_xpath(row_xpath + '/span/text()')
    loader.add_value('chapters', zip(titles, urls))
    loader.add_value('web_source', 'mangaseeonline')
    return loader.load_item()
def parse(self, response):
    """Yield one Articulo per product card on a listing page.

    The category is derived from the 'cat=' query parameter of the listing
    URL via a lookup table; unknown codes simply leave 'categoria' unset.
    """
    CATEGORIAS = {
        '45': 'Gabinetes', '60': 'Monitores',
        '30': 'Almacenamiento', '31': 'Almacenamiento', '32': 'Almacenamiento',
        '50': 'Impresoras', '51': 'Joysticks', '69': 'Parlantes',
        '23': 'Refrigeración', '24': 'Refrigeración', '25': 'Refrigeración',
        '62': 'Mouses', '76': 'Placas de Video', '42': 'Fuentes',
        '11': 'Cables', '56': 'Memorias', '61': 'Motherboards',
        '59': 'Procesadores', '105': 'Webcams',
    }
    sel = Selector(response)
    articulos = sel.xpath(
        '//*[@id="tt-pageContent"]/div/div/div/div[2]/div/div[2]/div')
    cat = response.url.split('cat=')[-1]
    # Iterate over every product card.
    for art in articulos:
        loader = ItemLoader(item=Articulo(), selector=art)
        loader.add_xpath('nombre', './/div/div[2]/h2/a/text()')
        precio = loader.get_xpath('.//div/div[2]/div[2]/text()')[0]
        # Drop the currency sign and the '.' thousands separators:
        # "$1.234" -> "1234".  BUG FIX: the original assumed exactly one
        # separator (precio[0] + precio[1]) and raised IndexError on prices
        # below 1000 or above 999999; join all the pieces instead.
        loader.add_value('precio', ''.join(precio.split('$')[1].split('.')))
        product_id = loader.get_xpath(
            './/div/div[2]/h2/a/@href')[0].split('id=')[-1]
        loader.add_value(
            'url',
            'https://www.foxinsumospc.com.ar/?p=home&m=detalleproducto&id='
            + product_id)
        if cat in CATEGORIAS:
            loader.add_value('categoria', CATEGORIAS[cat])
        yield loader.load_item()
def parse_item(self, response):
    """
    @url http://splash:8050/render.html?&url=http://www.nettruyenco.com/truyen-tranh/boyfriend-17550&wait=1
    @scrapes name source image_src total_chap description chapters web_source full
    """
    manga = ItemLoader(item=MangaCrawlerItem(), response=response)
    manga.add_xpath("unicode_name", '//h1[@class="title-detail"]/text()')
    # ASCII-fold the already-collected unicode title.
    manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0]))
    manga.add_value("source", response.url)
    manga.add_xpath("image_src", '//*[@class="col-xs-4 col-image"]/img/@src')
    manga.add_xpath("description", '//*[@class="detail-content"]/p//text()', Join("\n"))
    chapter_xpath = '//*[@id="nt_listchapter"]/nav/ul/li[not(contains (@class, "row heading"))]/div[1]/a'
    chapter_source = manga.get_xpath(chapter_xpath + "/@href")
    chapter_name = manga.get_xpath(chapter_xpath + "/text()")
    chapters = zip(chapter_name, chapter_source)
    # "Hoàn thành" ("completed"): take total_chap from the newest chapter
    # link; otherwise fall back to the chapter number in the page <title>.
    if "Hoàn thành" in manga.get_xpath('//*[@class="status row"]/p[2]/text()'):
        manga.add_value("full", True)
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                chapter_xpath + "/text()",
                MapCompose(lambda x: re.findall(r"\d+", x)),
                MapCompose(int),
            )[0],
        )
    else:
        manga.add_value("full", False)
        manga.add_value(
            "total_chap",
            manga.get_xpath(
                "//title/text()",
                MapCompose(lambda x: re.findall(r" Chapter \d+| Chap \d+", x)),
                MapCompose(lambda x: re.findall(r"\d+", x)),
                MapCompose(float),
                MapCompose(int),
                TakeFirst(),
            ),
        )
    manga.add_value("chapters", chapters)
    manga.add_value("web_source", "nettruyen")
    # BUG FIX: removed a leftover debug print() that forced a second,
    # redundant load_item() call.
    return manga.load_item()
def load_document(response, document):
    """Populate a DocumentItem (abstract, title, submission path, page
    range, publication date) from an article page; returns the loader.

    `document` maps logical field names to XPath expressions.  The meta
    block is parsed best-effort: a missing or malformed block leaves the
    related fields unset instead of failing the whole document.
    """
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_xpath('title', document['title'])
    try:
        meta = l.get_xpath(document['meta'])
        l.add_value('submission_path', normalize('NFKD', meta[1] + meta[2]))
        # The last meta entry ends with a page range; both en-dash and
        # plain hyphen separators occur in the wild.
        pages = meta[-1].split(' ')[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except Exception:  # FIX: was a bare except; don't trap SystemExit etc.
        pass
    l.add_value('publication_date',
                parse(response.xpath(document['publication_date']).extract()[0]))
    # mark it down, with source's publication_title
    return l
def parse_item(self, response):
    """
    @url https://tiki.vn/hieu-ng-canh-buom-p146105.html
    @returns items 1
    @scrapes name name_unidecode price description
    @scrapes url project spider server date
    """
    # Scrape a Tiki book page into a BooksItem.
    l = ItemLoader(item=BooksItem(), response=response)
    # NOTE(review): 'unicode' is Python-2-only while 'name_unidecode' below
    # uses str.strip/str.title — confirm the targeted Python version.
    l.add_xpath('name', '//*[@class="item-name"]/text()',
                MapCompose(unicode.strip, unicode.title))
    l.add_xpath('name_unidecode', '//*[@class="item-name"]/text()',
                MapCompose(unidecode, str.strip, str.title))
    l.add_xpath('author', '//*[@class="item-brand"]/p/a/text()')
    # Keep only the numeric part of the price, e.g. "123.000".
    l.add_xpath('price', '//*[@id="span-price"]/text()', TakeFirst(),
                re=r'\d+\.\d+')
    # Strip HTML tags from each intro paragraph and join them.
    l.add_value('description', [
        re.sub('<[^<]+?>', '', i)
        for i in l.get_xpath('//*[@id="gioi-thieu"]/p')
    ], Join('\n'))
    l.add_xpath('image_uri', '//*[@itemprop="image"]/@src')
    # Information fields
    l.add_value('url', response.url)
    l.add_value('project', self.settings.get('BOT_NAME'))
    l.add_value('spider', self.name)
    l.add_value('server', socket.gethostname())
    l.add_value('date', datetime.datetime.now())
    return l.load_item()
def renxuan_2(self, response):
    """Walk the elective-course grid and re-submit the arrangement form
    once per course row.

    Each row's name/cid/credit are merged into the item carried in
    response.meta['item']; the resulting FormRequest asks the server for
    that course's lesson arrangement and is handled by self.lesson_parser.
    """
    # Rows whose class matches "tdcolour<digit>" are actual course entries.
    trs = response.xpath(
        '//table[@id="OutSpeltyEP1_gridMain"]/tbody/tr[re:test(@class,"tdcolour\d$")]'
    )
    for tr in trs:
        # Keep filling the item accumulated by earlier steps.
        loader = ItemLoader(response.meta['item'], selector=tr)
        # Course id, reused as the radio-group value in the form below.
        cid = loader.get_xpath('./td[3]/text()')
        loader.add_xpath('name', './td[2]/text()')
        loader.add_xpath('cid', './td[3]/text()')
        loader.add_xpath('credit', './td[6]/text()')
        # Merge the updated item back into the outgoing request meta.
        new_meta = {'item': loader.load_item()}
        new_meta.update(response.meta)
        yield FormRequest.from_response(
            response,
            dont_filter=True,
            formdata={
                'OutSpeltyEP1$dpYx': response.meta['item']['course_type'],
                'OutSpeltyEP1$dpNj': response.meta['item']['grade'],
                'myradiogroup': cid,
                # Button label "lesson arrangement" — a runtime form value,
                # must stay in Chinese.
                'OutSpeltyEP1$lessonArrange': '课程安排'
            },
            meta=new_meta,
            callback=self.lesson_parser)
def parse_item(self, response):
    """Parse a blogtruyen manga page into a MangaCrawlerItem."""
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Page <title> is "<name> | <site>"; keep just the name part.
    loader.add_xpath("name", "//title/text()",
                     MapCompose(lambda t: t.split(" | ")[0], str.strip))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//*[@class="thumbnail"]/img/@src')
    loader.add_xpath("description", '//*[@class="content"]//text()',
                     MapCompose(str.strip), Join("\n"), MapCompose(str.strip))
    link_xpath = '//*[@id="list-chapters"]/p/span/a'
    # The largest number in any chapter link is the chapter count.
    numbers = loader.get_xpath(link_xpath + '/text()',
                               MapCompose(lambda t: re.findall(r"\d+", t)))
    loader.add_value("total_chap", max(int(n) for n in numbers))
    all_links = loader.get_xpath(link_xpath + '/@href', MapCompose(mc))
    # Skip external mediafire mirror links.
    links = [href for href in all_links if "mediafire" not in href]
    titles = loader.get_xpath(link_xpath + '/text()')
    loader.add_value("chapters", zip(titles, links))
    loader.add_value("web_source", "blogtruyen")
    # "Đã hoàn thành" ("completed") in the description marks finished series.
    loader.add_value(
        "full",
        "Đã hoàn thành" in loader.get_xpath('//*[@class="description"]//text()'))
    return loader.load_item()
def load_source(response, source):
    """Populate a SourceItem (issn, chief editor, title, cover image,
    description, home url) from a SAGE journal page; returns the loader."""
    website_url = 'https://us.sagepub.com'
    l = ItemLoader(item=SourceItem(), response=response)
    l.default_output_processor = TakeFirst()
    # The second match ends with the ISSN proper, e.g. "ISSN: 1234-5678".
    l.add_value("issn", response.xpath(source['issn']).extract()[1].split()[-1])
    l.add_value('chief_editor',
                response.xpath(source['chief_editor']).extract()[0])
    l.add_xpath('publication_title', source['publication_title'])
    # Cover image paths are site-relative; prefix the host.
    l.add_value('coverimage', website_url + l.get_xpath(source['coverimage'])[0])
    l.add_xpath('description', './/div[@class="field-item even"]', Join(),
                cleanhtml,
                lambda x: x.replace('\n', '').replace('  ', '').strip())
    l.add_value('home_url', response.url)
    # FIX: dropped an unused trailing 'publication_title' lookup.
    return l
def parse_item(self, response):
    """
    @url https://doctruyen3q.info/truyen-tranh/dao-hai-tac/77
    @scrapes name source image_src total_chap description chapters web_source full unicode_name
    """
    manga = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Skip adult-tagged series entirely (no item yielded).
    category = manga.get_xpath("//*[@class='category row']/p[2]//text()")
    categories = re.sub(r'\s+', '', "".join(category))
    if any(i in unidecode(categories).lower()
           for i in ["18+", "smut", "yaoi", "ntr", "yuri", 'adult', 'dammy']):
        return
    manga.add_xpath("unicode_name", '//h1[@class="title-manga"]/text()')
    # ASCII-fold the already-collected unicode title.
    manga.add_value("name", unidecode(manga.get_output_value("unicode_name")[0]))
    manga.add_value("source", response.url)
    manga.add_xpath("image_src", '//*[@class="image-comic"]/@src')
    manga.add_xpath("description", '//*[@class="detail-summary"]/text()')
    chapter_xpath = '//*[@id="list-chapter-dt"]/nav/ul/li/div[1]/a'
    chapter_source = manga.get_xpath(chapter_xpath + "/@href")
    chapter_name = manga.get_xpath(chapter_xpath + "/text()")
    chapters = zip(chapter_name, chapter_source)
    # "Đã hoàn thành" ("completed") marks finished series.
    if "Đã hoàn thành" in manga.get_xpath('//*[@class="status row"]//text()'):
        manga.add_value("full", True)
    else:
        manga.add_value("full", False)
    # The newest (first) chapter link carries the chapter count.
    manga.add_value(
        "total_chap",
        manga.get_xpath(
            '//*[@id="list-chapter-dt"]/nav/ul/li[1]/div[1]/a/text()',
            MapCompose(lambda x: re.findall(r"(\d+(?:\.\d+)?)", x)),
            MapCompose(float),
            MapCompose(int),
            TakeFirst(),
        ),
    )
    manga.add_value("chapters", chapters)
    manga.add_value("web_source", "doctruyen3q")
    # BUG FIX: removed a leftover debug print() that forced a second,
    # redundant load_item() call.
    return manga.load_item()
def load_source(response, source):
    """Populate a SourceItem (issn, title, cover image, description,
    home url) from a journal page; returns the loader."""
    l = ItemLoader(item=SourceItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("issn", source['issn'])
    l.add_xpath('publication_title', source['publication_title'])
    l.add_xpath('coverimage', source['coverimage'])
    l.add_xpath('description', source['description'], Join())
    l.add_value('home_url', response.url)
    # FIX: dropped an unused trailing 'publication_title' lookup.
    return l
def load_document(response, document):
    """Populate a DocumentItem (abstract, title, submission path, dates,
    page range) from an article page; returns the loader.

    `document` maps logical field names to XPath expressions.  Date and
    page extraction are best-effort: a malformed page leaves those fields
    unset instead of failing the whole document.
    """
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_value('title',
                (l.get_xpath(document['title'])[0]).replace('\n', '').strip())
    l.add_value(
        'submission_path',
        l.get_xpath(document['submission_path'])[0].replace('\n', '').strip())
    # handle dates: the last two entries are the online / publication dates.
    try:
        dates = [
            i.replace('\n', '').replace(';', '').strip()
            for i in response.xpath(document['dates']).extract()[-2:]
        ]
        d = [parse(i) for i in dates]
        l.add_value('online_date', d[0])
        l.add_value('publication_date', d[1])
    except Exception:  # FIX: was a bare except; don't trap SystemExit etc.
        pass
    # handle pages: range like "12–34" (en-dash) or "12-34" (hyphen).
    try:
        pages = response.xpath(document['pages']).extract()[0].strip().split(
            '\n')[-1].strip().split(':')[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except Exception:  # FIX: was a bare except
        pass
    # mark it down, with source's publication_title
    return l
def parse(self, response):
    """Yield one Articulo per listing entry; category comes from the URL."""
    categoria_por_slug = {
        "teclados-45": 'Teclados',
        "monitores-23": 'Monitores',
        "discos-rigidos-67": 'Almacenamiento',
        "impresoras-24": 'Impresoras',
        "mouses-y-pads-41": 'Mouses',
        "parlantes-42": 'Parlantes',
        "webcams-46": 'Webcams',
        "estabilizador-de-tension-39": 'Estabilizadores',
        "auriculares-gamers-141": 'Auriculares',
        "pendrives-13": 'Pendrives',
    }
    slug = response.url.split("/")[-2]
    # Iterate over every product entry in the listing.
    for producto in Selector(response).xpath('//div[@class="padder"]/ul/li'):
        loader = ItemLoader(item=Articulo(), selector=producto)
        loader.add_xpath('nombre', './/div/h3/a/text()')
        # Price text looks like "AR$ 1.234"; keep only the amount.
        texto_precio = loader.get_xpath('.//div/a/text()')[0]
        loader.add_value('precio', texto_precio.split('AR$')[-1].strip())
        relativo = loader.get_xpath('.//a/@href')
        loader.add_value('url',
                         'http://www.starcomputacion.com.ar/' + relativo[0])
        if slug in categoria_por_slug:
            loader.add_value('categoria', categoria_por_slug[slug])
        yield loader.load_item()
def parse_prof(self, response):
    """Build a ProfessorItem from a RateMyProfessors profile page, then fan
    out one paginated ratings request per 20 ratings.

    Identity fields (tid, sid, names, quality, n_rating) arrive via
    response.meta from the search step; department, university, difficulty
    and tags are scraped from the page itself.
    """
    departxpath = "//*[@id='mainContent']//div[@class='result-title']/text()"
    univerxpath = "//*[@id='mainContent']//div[@class='result-title']//a[@class='school']/text()"
    difficxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']//div[@class='breakdown-header']/div[2]/div[@class='grade']/text()"
    # Tag labels (kxpath) and their "(count)" markers (vxpath) live in the
    # same tag-box container.
    kxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']/div[2]/div[@class='tag-box']/span[@class='tag-box-choosetags']/text()"
    vxpath = "//*[@id='mainContent']//div[@class='rating-breakdown']/div[2]/div[@class='tag-box']/span[@class='tag-box-choosetags']/b/text()"
    l = ItemLoader(item=ProfessorItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('tid', response.meta['tid'])
    l.add_value('sid', response.meta['sid'])
    l.add_value('pfname', response.meta['pfname'])
    l.add_value('plname', response.meta['plname'])
    # 'pname' collects first then last name; with TakeFirst as the output
    # processor only the first collected value survives.
    l.add_value('pname', response.meta['pfname'])
    l.add_value('pname', response.meta['plname'])
    l.add_value('quality', response.meta['quality'])
    l.add_value('n_rating', response.meta['n_rating'])
    l.add_xpath('department', departxpath,
                re='Professor in the (.+) department')
    l.add_xpath('university', univerxpath)
    l.add_xpath('difficulty', difficxpath, MapCompose(self.diff2float))
    # Pair each tag label (spaces removed) with its integer count and store
    # the mapping as a JSON string.
    keys = l.get_xpath(kxpath, MapCompose(lambda p: p.replace(' ', '')))
    values = l.get_xpath(
        vxpath, MapCompose(lambda p: int(p.strip('(').strip(')'))))
    l.add_value('tags', json.dumps(dict(zip(keys, values))))
    yield l.load_item()
    # One ratings page holds 20 ratings; request every page.
    n_rating = response.meta['n_rating']
    if n_rating != 0:
        tid = response.meta['tid']
        for pn in range(math.ceil(n_rating / 20)):
            url = 'http://www.ratemyprofessors.com/paginate/professors/ratings?tid=%d&page=%d' % (
                tid, pn + 1)  # a int is needed here
            yield Request(url, meta=response.meta, callback=self.parse_rating)
def parse_items(self, response):
    """Scrape a MediaMarkt product page into a MediamarktItem.

    Extracts name (title up to the first comma), OS, RAM, storage, price,
    image URL and the page URL.
    """
    item = ItemLoader(MediamarktItem(), response)
    name = item.get_xpath('//*[@id="product-details"]/div[1]/h1/text()')
    name = str(name[0])
    # Keep the title up to the first comma (the rest is spec noise).
    result = ' '
    for a in name:
        if (a == ','):
            break
        result += a
    item.add_value('nombre', result)
    ram = item.get_xpath('//*[@id="features"]/section[1]/dl/dd[7]/text()')
    ram = str(ram[0])
    # BUG FIX: the original stripped ' GB' from ram[0] — the first
    # *character* of the string — so 'ram' was always a single digit.
    # Strip the unit from the whole string, as done for 'almacenamiento'.
    contenido = ram.rstrip(' GB')
    item.add_value('ram', contenido)
    item.add_xpath('sistemaOperativo',
                   '//*[@id="features"]/section[1]/dl/dd[2]/text()')
    alm = item.get_xpath('//*[@id="features"]/section[1]/dl/dd[5]/text()')
    alm = str(alm[0])
    contenido = alm.rstrip(' GB')
    item.add_value('almacenamiento', contenido)
    item.add_value('url', response.url)
    item.add_xpath(
        'precio',
        '//*[@id="product-details"]/div[2]/div[1]/meta[2]/@content')
    img = item.get_xpath('//*[@id="product-sidebar"]/div[1]/a/img/@src')
    contenido = 'https:' + str(img[0])
    item.add_value('imagen', contenido)
    # TODO: clear stale attributes that accumulate across downloads.
    yield item.load_item()
def parse_item(self, response):
    """Scrape a product page into a Product item.

    Populates category, title, url, availability and the regular/sale
    prices, falling back to 'Bundle' when no breadcrumb category exists.
    """
    # FIX: dropped an unused 'sel = Selector(response)' local; the loader
    # already wraps the response.
    il = ItemLoader(item=Product(), response=response)
    cat = il.get_xpath(
        '//div[contains(@id, "ctl00_pnlBreadCrumbs")]/a[last()]/text()')
    availability = il.get_xpath(
        '//a[contains(@id,"hplddToCart") or contains(@class,"addToCart")]/text()'
    )
    price = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblRegprice > font::text'
    )
    sale = il.get_css(
        'span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblSaleprice > font::text'
    )
    # If the xpath doesn't return a category, the product belongs to the
    # Bundle category.
    if not cat:
        il.add_value("category", "Bundle")
    else:
        il.add_value("category", cat)
    il.add_css(
        "title",
        "span#ctl00_ContentPlaceHolder1_ctrlProdDetailUC_lblProdTitle::text"
    )
    il.add_value("url", response.url)
    # If a product can be added to the cart it is available online;
    # otherwise it is not.
    if "ADD TO CART" in availability:
        il.add_value("availability", "Product is available online")
    else:
        il.add_value("availability", "Product is not available online")
    # If there's a sale price but no regular price, show the sale price as
    # the regular price, mirroring the website.
    if not price:
        il.add_value("regPrice", sale)
        il.add_value("salePrice", None)
    else:
        il.add_value("regPrice", price)
        il.add_value("salePrice", sale)
    return il.load_item()
def load_document(response, document):
    """Populate a DocumentItem (abstract, title, submission path, workflow
    dates, publication date, page range); returns the loader.

    `document` maps logical field names to XPath expressions.  Publication
    date and page extraction are best-effort.
    """
    l = ItemLoader(item=DocumentItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value('coverpage_url', response.url)
    l.add_xpath('abstract', document['abstract'])
    l.add_xpath('title', document['title'])
    l.add_xpath('submission_path', document['submission_path'])
    # handle dates: getdate() maps the comma-separated history string to
    # the submission/revision/accepted/online dates.
    dates = [i for i in l.get_xpath(document['date'])[0].split(', ')]
    d = getdate(dates)
    l.add_value('submission_date', d['submission_date'])
    l.add_value('revision_date', d['revision_date'])
    l.add_value('accepted_date', d['accepted_date'])
    l.add_value('online_date', d['online_date'])
    date_page = l.get_xpath(document['dp'])[0].split(', ')
    try:
        l.add_value('publication_date', parse(date_page[-2]))
    except Exception:  # FIX: was a bare except; best-effort parse
        pass
    # handle pages: range like "12–34" (en-dash) or "12-34" (hyphen).
    try:
        pages = date_page[-1].split()[-1]
        if '–' in pages:
            fp = int(pages.split('–')[0])
            lp = int(pages.split('–')[1])
        elif '-' in pages:
            fp = int(pages.split('-')[0])
            lp = int(pages.split('-')[1])
        l.add_value('fpage', fp)
        l.add_value('lpage', lp)
        l.add_value('pages', lp - fp + 1)
    except Exception:  # FIX: was a bare except
        pass
    # mark it down, with source's publication_title
    return l
def parse(self, response):
    """Yield one Articulo per listing card; category comes from the URL."""
    categorias = {
        "procesadores": 'Procesadores',
        "motherboards": 'Motherboards',
        "memorias": 'Memorias',
        "almacenamiento": 'Almacenamiento',
        "impresoras": 'Impresoras',
        "placas-de-video": 'Placas de Video',
        "mouses-y-teclados": 'Mouses Y Teclados',
        "fuentes": 'Fuentes',
        "gabinetes": 'Gabinetes',
        "monitores": 'Monitores',
        "webcams": 'Webcams',
        "auriculares": 'Auriculares',
        "parlantes-pc": 'Parlantes',
        "refrigeracion": 'Refrigeración',
    }
    slug = response.url.split("/")[-2]
    # Iterate over every product card in the listing.
    for card in Selector(response).xpath('//*[@id="main"]/ul/li'):
        loader = ItemLoader(item=Articulo(), selector=card)
        loader.add_xpath('nombre', './/div/div/div[1]/a/h2/text()')
        crudo = loader.get_xpath(
            './/div/div/div[3]/div[1]/span/span/ins/span/text()')[0]
        # Remove the comma separators, drop the trailing character and any
        # surrounding whitespace (matches the site's price formatting).
        loader.add_value('precio', "".join(crudo.split(','))[:-1].strip())
        loader.add_xpath('url', './/div/div/div[1]/a/@href')
        if slug in categorias:
            loader.add_value('categoria', categorias[slug])
        yield loader.load_item()
def parse_item(self, response):
    """
    @url http://hocvientruyentranh.com/manga/2/shokugeki-no-souma-
    @returns items 1
    @scrapes name source total_chap chapters description
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath("name", '//h3[@class="__name"]/text()', MapCompose(str.strip))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//*[@class="__image"]/img/@src')
    loader.add_xpath("description", '//*[@class="__description"]//p/text()',
                     Join("\n"))
    link_text = '//*[@class="table table-hover"]/tbody//tr//td//a//text()'
    # The largest number in any chapter link text is the chapter count.
    numbers = loader.get_xpath(link_text,
                               MapCompose(lambda t: re.findall(r"\d+", t)))
    loader.add_value("total_chap", max(int(n) for n in numbers))
    urls = loader.get_xpath(
        '//*[@class="table table-hover"]/tbody//tr//td//a/@href')
    titles = loader.get_xpath(link_text)
    loader.add_value("chapters", zip(titles, urls))
    return loader.load_item()
def parse_item(self, response):
    """
    @url http://splash:8050/render.html?&url=https://vlogtruyen.net/bokutachi-wa-hanshoku-wo-yameta.html&wait=1
    @scrapes name unicode_name source image_src total_chap description chapters web_source full
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath("unicode_name", '//h1[@class="title-commic-detail"]/text()')
    # ASCII-fold the already-collected unicode title.
    loader.add_value("name",
                     unidecode(loader.get_output_value("unicode_name")[0]))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
    loader.add_xpath("description", '//*[@class="desc-commic-detail"]/text()',
                     Join("\n"))
    row = '//*[@class="ul-list-chaper-detail-commic"]/li/a'
    urls = loader.get_xpath(row + "/@href")
    titles = loader.get_xpath(row + "/h3/text()")
    # "Đã hoàn thành" ("completed") marks finished series.
    status = loader.get_xpath('//*[@class="manga-status"]/p/text()')
    loader.add_value("full", "Đã hoàn thành" in status)
    # The newest (first) chapter heading carries the chapter count.
    loader.add_value(
        "total_chap",
        loader.get_xpath(
            '//*[@class="ul-list-chaper-detail-commic"]/li[1]/a/h3/text()',
            MapCompose(lambda t: re.findall(r"(\d+(?:\.\d+)?)", t)),
            TakeFirst(),
        ),
    )
    loader.add_value("chapters", zip(titles, urls))
    loader.add_value("web_source", "vlogtruyen")
    return loader.load_item()
def load_author(response, author):
    """Yield one AuthorItem loader per author block on the page.

    `author` maps field names to XPath expressions; the address/vitae/avatar
    templates receive the per-author fragment id via %-formatting.  Every
    optional field is extracted best-effort.
    """
    auths = response.xpath(author['auth'])
    for auth in auths:
        l = ItemLoader(item=AuthorItem(), response=response)
        # BUG FIX: 'default_onput_processor' was a typo, so TakeFirst was
        # never actually installed as the output processor.
        l.default_output_processor = TakeFirst()
        # author's first name and last name
        fn = auth.xpath(author['fn']).extract()[0]
        ln = auth.xpath(author['ln']).extract()[0]
        l.add_value('fname', fn)
        l.add_value('lname', ln)
        # author's email ("mailto:" prefix stripped)
        try:
            email = auth.xpath(author['email']).extract()[0][7:]
            l.add_value('email', email)
        except Exception:  # FIX: was a bare except
            pass
        # author's address and institution
        try:
            fid = auth.xpath(author['fid']).extract()[0][1:]
            address = l.get_xpath(author['address'] % fid)
            # BUG FIX: 'institution' was unbound when no "...niversity..."
            # part existed, and the resulting NameError silently skipped the
            # field; add_value(None) is a no-op, so output is unchanged.
            institution = None
            for part in address[0].split(', '):
                if 'niversity' in part:
                    institution = part
                    break
            l.add_value('address', address)
            l.add_value('institution', institution)
        except Exception:  # FIX: was a bare except
            pass
        # author's vitae
        try:
            href = auth.xpath(author['href']).extract()[0][1:]
            vitae = response.xpath(author['vitae'] % href).extract()[0]
            l.add_value('vitae', fn + ' ' + ln + vitae)
        except Exception:  # FIX: was a bare except
            pass
        # author's avatar
        try:
            href = auth.xpath(author['href']).extract()[0][1:]
            avatar = response.xpath(author['avatar'] % href).extract()[0]
            l.add_value('avatar', avatar)
        except Exception:  # FIX: was a bare except
            pass
        yield l
def parse_item(self, response):
    """
    @url https://mangasee123.com/manga/Kingdom
    @scrapes name source image_src total_chap description chapters web_source full
    """
    loader = ItemLoader(item=MangaCrawlerItem(), response=response)
    loader.add_xpath(
        "unicode_name",
        "//div[@class='container MainContainer']//li[1]/h1/text()")
    # ASCII-fold the already-collected unicode title.
    loader.add_value("name",
                     unidecode(loader.get_output_value("unicode_name")[0]))
    loader.add_value("source", response.url)
    loader.add_xpath("image_src", '//meta[@property="og:image"]/@content')
    loader.add_xpath("description", "//div[@class='top-5 Content']/text()",
                     Join("\n"))
    # Finished series carry a "Complete (Publish)" status label.
    status = loader.get_xpath('//*[@class="PublishStatus"]/text()')
    loader.add_value("full", "Complete (Publish)" in status)
    # The RSS feed lists chapters newest-first; entry 0 carries the count.
    rss_path = loader.get_xpath("//a[normalize-space()='RSS Feed']/@href")[0]
    feed = feedparser.parse(BASE_URL + rss_path, agent="Mozilla/5.0")
    loader.add_value("total_chap",
                     re.findall(r"\d+", feed['entries'][0]['title'])[0])
    loader.add_value("chapters",
                     [(entry['title'], entry['link']) for entry in feed['entries']])
    loader.add_value("web_source", "mangaseeonline")
    return loader.load_item()
def parse_item(self, response):
    """Parse a blogtruyen manga detail page into a MangaCrawlerItem."""
    manga = ItemLoader(item=MangaCrawlerItem(), response=response)
    # Page <title> is "<name> | <site>"; keep just the name part.
    manga.add_xpath('name', '//title/text()',
                    MapCompose(lambda x: x.split(' | ')[0], str.strip))
    manga.add_value('source', response.url)
    manga.add_xpath('image_src', '//*[@class="thumbnail"]/img/@src')
    manga.add_xpath('description', '//*[@class="content"]//text()',
                    MapCompose(str.strip), Join('\n'), MapCompose(str.strip))
    # The largest number in any chapter link text is the chapter count.
    manga.add_value(
        'total_chap',
        max([
            int(i) for i in manga.get_xpath(
                '//*[@id="list-chapters"]/p/span/a/text()',
                MapCompose(lambda x: re.findall(r'\d+', x)))
        ]))
    get_chapter_source = manga.get_xpath(
        '//*[@id="list-chapters"]/p/span/a/@href', MapCompose(mc))
    # Skip external mediafire mirror links.
    chapter_source = [
        chap for chap in get_chapter_source if 'mediafire' not in chap
    ]
    chapter_name = manga.get_xpath(
        '//*[@id="list-chapters"]/p/span/a/text()')
    chapters = zip(chapter_name, chapter_source)
    manga.add_value('chapters', chapters)
    manga.add_value('web_source', 'blogtruyen')
    # "Đã hoàn thành" ("completed") in the description marks finished series.
    if 'Đã hoàn thành' in manga.get_xpath(
            '//*[@class="description"]//text()'):
        manga.add_value('full', True)
    else:
        manga.add_value('full', False)
    return manga.load_item()
def parse_item(self, response):
    """Load a SpiderScrapyBcyItem from a post page.

    ItemLoader centralizes the extraction rules instead of scattering
    direct css/xpath assignments through the parse logic, which keeps the
    per-field preprocessing explicit and maintainable.
    """
    loader = ItemLoader(item=SpiderScrapyBcyItem(), response=response)
    loader.add_xpath('name', "//h1[@class='js-post-title']/text()")
    loader.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
    # Strip the '/w650' thumbnail suffix to get the full-size image URLs.
    image_urls = [
        src.replace('/w650', '')
        for src in loader.get_xpath(
            '//img[@class="detail_std detail_clickable"]/@src')
    ]
    loader.add_value('image_urls', image_urls)
    loader.add_value('url', response.url)
    return loader.load_item()