def get_player_info(self, response):
    loader = ItemLoader(item=NFL_Player_2015(), response=response)
    loader.default_input_processor = MapCompose(str.strip)
    loader.default_output_processor = Join()
    # extract() always returns a list; it is empty when the node is missing
    number_and_position = response.xpath('//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[1]/text()').extract()
    if number_and_position:
        parts = number_and_position[0].split()
        number = parts[0]
        position = parts[1]
    else:
        number = ''
        position = ''
    loader.add_value('number', number)
    loader.add_value('position', position)
    loader.add_xpath('name', '//*[@id="content"]/div[3]/div[2]/h1/text()')
    loader.add_xpath('team', '//*[@id="content"]/div[3]/div[2]/div[3]/ul[1]/li[3]/a/text()')
    yield loader.load_item()
def parse_datasets(self, response):
    il = ItemLoader(item=Dataset(), response=response)
    timestamp = datetime.datetime.now().isoformat()
    il.default_output_processor = MapCompose(lambda v: v.strip())
    il.add_value('scraped_url', response.url)
    il.add_xpath('desc', '//a[contains(., "Description")]/following-sibling::div/p/text()')
    il.add_value('soc_id', response.url[-15:-6])
    il.add_xpath('name', '//*[@id="datasetName"]/text()')
    il.add_xpath('tags', '//dt[text()="Tags"]/following-sibling::dd/span/text()')
    il.add_xpath('permalink', '//dt[text()="Permalink"]/following-sibling::dd[1]/span/a/text()')
    il.add_xpath('dept', '//dt[text()="Department"]/following-sibling::dd[1]/span/text()')
    il.add_xpath('provided_by', '//dt[text()="Data Provided By"]/following-sibling::dd[1]/text()')
    il.add_xpath('category', '//dt[text()="Category"]/following-sibling::dd[1]/text()')
    il.add_xpath('soc_owner', '//a[@class="aboutAuthorName"]/text()')
    # il.add_xpath('contact_email', )
    il.add_xpath('date_created', '//span[@class="aboutCreateDate"]/span[@class="dateReplace"]/text()')
    il.add_xpath('date_updated', '//span[@class="aboutUpdateDate"]/span[@class="dateReplace"]/text()')
    il.add_xpath('num_ratings', '//dd[@class="totalTimesRated"]/text()')
    il.add_xpath('num_visits', '//dt[text()="Visits"]/following-sibling::dd[1]/text()')
    il.add_xpath('num_downloads', '//dt[text()="Downloads"]/following-sibling::dd[1]/text()')
    il.add_xpath('num_comments', '//dd[@class="numberOfComments"]/text()')
    # presumably a "Contributors" dt, matching the pattern of the other fields
    # (the original reused the "Downloads" xpath here)
    il.add_xpath('num_contributors', '//dt[text()="Contributors"]/following-sibling::dd[1]/text()')
    il.add_xpath('permissions', '//dt[text()="Permissions"]/following-sibling::dd[1]/text()')
    il.add_xpath('row_count', '//span[contains(@class, "row_count")]/text()')
    il.add_xpath('update_freq', '//dt[text()="Frequency"]/following-sibling::dd[1]/span/text()')
    il.add_value('timestamp', timestamp)
    il.add_xpath('set_type', '//span[@class="icon currentViewName"]/text()')
    return il.load_item()
def parse(self, response):
    match = re.search('/displaySeminarList/', response.url)
    if match:
        urls = response.xpath('//div[@class="internList splitEntry"]//@href').extract()
        for url in urls:
            url = response.urljoin(url)
            yield scrapy.Request(url, self.parse)
    else:
        table = response.xpath(self.seminar_list_xpath)
        corpId = parse_qs(urlparse(response.url).query)['corpId']
        for index, semi in enumerate(table):
            loader = ItemLoader(SeminarItem(), semi)
            loader.default_input_processor = MapCompose(str.strip)
            loader.default_output_processor = Join()
            loader.add_value('companyid', corpId)
            loader.add_xpath('name', '//div[@id="headerWrap"]//h3/text()')
            loader.add_xpath('date', './/td[@class="date"]/text()', re=r'\d+/\d+/\d+')
            loader.add_xpath('time', './/td[@class="time"]/text()')
            loader.add_xpath('area', './/td[@class="area"]/text()')
            loader.add_xpath('place', './/td[@class="place"]/text()')
            loader.add_xpath('loc_n', './/td[@class="place"]//a',
                             re=r'mycom_loc\|(\d+/\d+/\d+\.\d+),\d+/\d+/\d+\.\d+')
            loader.add_xpath('loc_e', './/td[@class="place"]//a',
                             re=r'mycom_loc\|\d+/\d+/\d+\.\d+,(\d+/\d+/\d+\.\d+)')
            loader.add_xpath('target', './/td[@class="target"]/text()')
            yield loader.load_item()
def parse_depth_chart(self, response):
    loader = ItemLoader(item=NFL_Team_2015(), response=response)
    loader.default_input_processor = MapCompose(str.strip)
    loader.default_output_processor = Join()
    loader.add_xpath("division", '//*[@id="sub-branding"]/div[2]/text()')
    loader.add_xpath("name", '//*[@id="sub-branding"]/h2/a/b/text()')
    yield loader.load_item()
def parse_article(self, response):
    # Initialize some I/O processors
    join_all = Join('')
    take_first = TakeFirst()
    identity = Identity()
    prepend_url = PrependResponseUrl(response.url)
    strip_all, strip_one = StripAll(), StripOne()
    add_space_after_punct = AddSpaceAfterPunct()
    # Load PersonItem
    person_loader = ItemLoader(item=PersonItem(), response=response)
    person_loader.default_output_processor = take_first
    person_loader.add_css('name', 'h3.p-name::text', strip_all)
    person_loader.add_value('article_url', response.url)
    person_loader.add_css('pub_date', 'time.dt-published::attr(datetime)')
    person_loader.add_css('title', 'p.summary.p-summary::text', strip_all)
    person_loader.add_css('img_src', 'img.portrait::attr(src)', prepend_url)
    person_loader.add_xpath('bio', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=1]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('hardware', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=2]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('software', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=3]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_loader.add_xpath('dream', '//div[@class="e-content"]/p[count(preceding-sibling::h4)=4]/descendant-or-self::*/text()', join_all, add_space_after_punct)
    person_item = person_loader.load_item()
    # @gbrener 8/16/2015: The following line causes a NotImplementedError
    #object.__setattr__(person_item, 'export_empty_fields', True)
    person_item.fill_empty_fields()
    # Load a list of ToolItems
    tool_items = []
    for tool_selector in response.css('div.e-content p a'):
        tool_loader = ItemLoader(item=ToolItem(), selector=tool_selector, response=response)
        tool_loader.default_output_processor = take_first
        tool_loader.add_xpath('tool_name', './descendant-or-self::*/text()', join_all, strip_one)
        tool_loader.add_xpath('tool_url', './@href')
        tool_item = tool_loader.load_item()
        # @gbrener 8/16/2015: The following line causes a NotImplementedError
        #object.__setattr__(tool_item, 'export_empty_fields', True)
        tool_item.fill_empty_fields()
        tool_items.append(tool_item)
    yield dict(person=person_item, tools=tool_items)
def parse_item(self, response):
    l = ItemLoader(item=GetEmailsItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    emails = response.xpath('//text()').re(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,4}")
    l.add_value('email', emails)
    l.add_value('url', response.url)
    return l.load_item()
def parse_auction_item(self, response):
    loader = ItemLoader(AuctionItems(), response=response)
    loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
    loader.default_output_processor = Join()
    for field, xpath in auction_item_fields.items():
        loader.add_xpath(field, xpath)
    yield loader.load_item()
def parse_item(self, response):
    """Returns scraped data from each individual job link."""
    l = ItemLoader(item=GumtreeItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    l.add_xpath('name', '//h1/text()')
    l.add_xpath('location', './/*[@id="ad-map"]/span[2]/text()')
    l.add_xpath('sender', '//*[@class="name"]/text()')
    l.add_xpath('description', '//*[@id="job-description"]/p/text()')
    l.add_value('job_type', response.meta['job_type'])
    return l.load_item()
def parse_post(self, response):
    data = json.loads(response.text)
    title = data['title']
    description = remove_tags(data['newsDetailContent'])
    date = data['displayDate']
    item = ItemLoader(item=CsobskItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response, title):
    description = response.xpath('//div[@class="entry-content"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=CreditagricoleroItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    return item.load_item()
def load_author(response, author):
    string = response.xpath(author['names']).extract()[0].replace('\n', '').strip()
    names = [i.strip() for i in string.replace(' and ', ', ').split(',')]
    for name in names:
        l = ItemLoader(item=AuthorItem(), response=response)
        l.default_output_processor = TakeFirst()
        # author's first name and last name
        flname = name.split()
        fn = flname[0]
        ln = flname[-1]
        l.add_value('fname', fn)
        l.add_value('lname', ln)
        # yield the loaded item, not the loader itself
        yield l.load_item()
def parse_post(self, response):
    title = response.xpath('//h2[@class="article-title"]/text()').get()
    description = response.xpath('//article//text()[normalize-space() and not(ancestor::h2)]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=GbkrsiItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    return item.load_item()
def parse_item(self, response):
    """Returns fields: url_of_item, product, img_url, description, and price."""
    l = ItemLoader(item=EmmiscraperItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    l.add_value('url_of_item', response.url)
    l.add_value('product', response.meta['product'])
    l.add_xpath('img_url', '/html/body/div[3]/div[4]/div[2]/div[2]/div[1]/div[1]/div[1]/a/img/@src')
    l.add_xpath('description', '//*[@class="productListText widthFull noPadding"]/text()')
    l.add_xpath('price', '//*[@class="price"]/text()[2]')
    return l.load_item()
def parse(self, response):
    search_results = response.css('.tpl-append-results>div')
    for product in search_results:
        product_loader = ItemLoader(item=SouqItem(), selector=product)
        product_loader.default_output_processor = TakeFirst()
        product_loader.add_css('title', '.itemTitle::text')
        product_loader.add_css('link', '.item-content> .itemLink::attr(href)')
        product_loader.add_css('price', '.itemPrice::text')
        yield product_loader.load_item()
def parse_node(self, response, selector):
    self.logger.info("selector %s", selector)
    l = ItemLoader(SatItem(), selector=selector, response=response)
    l.default_output_processor = TakeFirst()
    l.add_xpath("metadata", "atom:link[@rel='alternative']/@href")
    l.add_xpath("icon", "atom:link[@rel='icon']/@href")
    l.add_xpath("download", "atom:link/@href")
    l.add_xpath('footprint', "atom:str[@name='footprint']/text()")
    l.add_xpath('id', 'atom:id/text()')
    l.add_xpath('identifier', "atom:str[@name='identifier']/text()")
    l.add_value('requestId', self.request_id)
    return l.load_item()
def parse_post(self, response):
    title = response.xpath('//h1/text()').get()
    description = response.xpath('//div[@class="main-text"]/p/text()').getall()
    description = ' '.join(description)
    date = response.xpath('//div[@class="date"]/text()').get()
    item = ItemLoader(item=KvikaItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//div[@class="leftColumn"]/h3/text()').get()
    description = response.xpath('//div[@class="leftColumn"]/table//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//div[@align="right"]/span[@style]/text()').get()
    item = ItemLoader(item=LibrabankItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_answers(self, response):
    # use selector to extract answers
    selector = Selector(response)
    # iterate over answers
    for answer in selector.xpath(self.answers_list_xpath):
        loader = ItemLoader(item=ZhihuAnswer(), selector=answer)
        # define processors
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        # iterate over fields and add xpaths to the loader
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)
        item = loader.load_item()
        # escape non-ASCII answer text as HTML character references, keeping it a str
        item["answer"] = item["answer"].encode('ascii', 'xmlcharrefreplace').decode('ascii')
        # if the summary has an image, escape it the same way
        if "summary_img" in item:
            item["summary_img"] = item["summary_img"].encode('ascii', 'xmlcharrefreplace').decode('ascii')
        else:
            item['summary_img'] = ""
        # cast the vote count to an integer
        item["vote"] = int(item["vote"])
        # in case of anonymous authors ("anonymous user")
        if "author" not in item:
            item["author"] = '匿名用户'
        # complete relative links
        item["question_link"] = "http://www.zhihu.com" + item["question_link"]
        if "author_link" in item:
            item["author_link"] = "http://www.zhihu.com" + item["author_link"]
        else:
            item["author_link"] = ""
        # record the date when scraped
        item["date"] = date.today()
        yield item
def parse(self, response):
    company_results = response.xpath('//*[@id="scr-res-table"]/div[1]/table/tbody/tr')
    for company in company_results:
        details_loader = ItemLoader(item=CompanyDetailsItem(), selector=company)
        details_loader.default_output_processor = TakeFirst()
        details_loader.add_xpath('company_name', 'td[2]/text()')
        details_loader.add_xpath('company_price_intraday', 'td[3]/span/text()')
        yield details_loader.load_item()
def parse_post(self, response, date):
    title = response.xpath('//span[@class="page_heading_inner"]/text()').get()
    description = response.xpath('//div[@class="page_content"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=NbbankItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response, title, date):
    description = response.xpath('//div[@class="module module--push2 richtext"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=MetzlerItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//h1/text()').get()
    description = response.xpath('//div[@class="blurb"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//div[@class="date"]/text()').get()
    item = ItemLoader(item=FsbwaItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//h1/text()').get()
    description = response.xpath('//div[@class="container containerDetalji"]/div[@class="row"]//text()[normalize-space() and not(ancestor::h1)]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//div[@class="datumDetalji"]/p/text()').get()
    item = ItemLoader(item=PrvabankacgItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse(self, response):
    self.logger.info('%s', response.url)
    fields = WarrantInfoItem.Meta.fields
    rows = response.xpath('//tr[count(td)=21]').extract()
    for row in rows:
        loader = ItemLoader(item=WarrantInfoItem(), selector=Selector(text=row))
        loader.default_input_processor = MapCompose(str, str.strip)
        loader.default_output_processor = TakeFirst()
        loader.add_value('date', self.date)
        for idx, field in enumerate(fields, start=1):
            if field:
                loader.add_xpath(field, f'//td[{idx}]/text()')
        yield loader.load_item()
def parse_post(self, response, date, title):
    content = response.xpath('//div[@class="post-body"]//text()').getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))
    item = ItemLoader(item=CcubbankItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parseCubeInfo(self, response, uid, screen_name, symbolList):
    jsonresponse = json.loads(response.text)
    for s in symbolList:
        loader = ItemLoader(item=StockCubesItem())
        loader.default_input_processor = MapCompose(str)
        loader.default_output_processor = Join(' ')
        for field, path in self.jmes_paths.items():
            loader.add_value(field, SelectJmes(path)(jsonresponse[s]))
        item = loader.load_item()
        owner = OwnerItem()
        owner['id'] = uid
        owner['screen_name'] = screen_name
        item['owner'] = owner
        yield item
def parse_partners(self, response):
    company_url = response.xpath('//*[@id="profile_header_heading"]/a/@href').extract_first()
    partner_selectors = response.css('div.partners').xpath('.//ul/li//h4/a')
    for sel in partner_selectors:
        loader = ItemLoader(item=Partner(), selector=sel)
        loader.default_input_processor = processors.MapCompose(w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()
        loader.add_value('focal_company_url', company_url)
        loader.add_xpath('partner_url', '@href')
        yield loader.load_item()
def parse_advisors(self, response):
    company_url = response.xpath('//*[@id="profile_header_heading"]/a/@href').extract_first()
    employee_selector = response.css('div.advisors').xpath('.//ul/li')
    for sel in employee_selector:
        loader = ItemLoader(item=BoardMember(), selector=sel)
        loader.default_input_processor = processors.MapCompose(w3lib.html.remove_tags)
        loader.default_output_processor = processors.TakeFirst()
        loader.add_value('company_url', company_url)
        loader.add_xpath('person_url', './/h4/a/@href')
        loader.add_xpath('title', './/h5/text()')
        yield loader.load_item()
def parse_post(self, response):
    title = response.xpath('//h1//text()').get()
    description = response.xpath('(//div[@class="ce-bodytext"])[position()<last()]//text()[normalize-space() and not(ancestor::h1)]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=AkabankdeItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//h2/text()').get()
    description = response.xpath('//div[@class="container main"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description if '{' not in p]
    description = ' '.join(description).strip()
    date = re.findall(r'[A-Za-z]+\s\d{1,2},\s\d{4}', description) or ['']
    item = ItemLoader(item=CnbankpaItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date[0])
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//div[@class="contenuti-header-bold-news-rassegna-stampa-dettaglio"]/text()').get()
    # "paddijng-right" presumably reproduces a misspelled inline style attribute as it appears in the page markup
    description = response.xpath('//div[@class="col-xs-12"][@style="paddijng-right; 0px; padding-left: 0px;"]//text()[normalize-space() and not(ancestor::noscript | ancestor::div[@class="contenuti-header-bold-news-rassegna-stampa-dettaglio"])]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('(//div[@class="contenuti-testo-news-rassegna-stampa"]/p/text())[1]').get()
    item = ItemLoader(item=BancacrsItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response, date, title):
    content = response.xpath(
        '//div[@class="news-body"]//text() | '
        '//div[@class="body field"]//text()[not(ancestor::em)] | '
        '//div[contains(@class,"field field-name")]//text()'
    ).getall()
    content = [p.strip() for p in content if p.strip()]
    content = re.sub(pattern, "", ' '.join(content))
    item = ItemLoader(item=BanrepItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    item.add_value('date', date)
    yield item.load_item()
def parse_post(self, response, date, title):
    description = response.xpath('//div[@class="w-auto mw-full rte"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=BcrlocuinteroItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_article(self, response):
    item = ItemLoader(item=Article(), response=response)
    item.default_output_processor = TakeFirst()
    # guard against a missing title node, which would make .strip() fail on None
    title = (response.xpath('//h1[@class="entry-title"]/text()').get() or '').strip()
    content = response.xpath('//div[@class="entry-content"]//text()').getall()
    content = [text for text in content if text.strip()]
    content = "\n".join(content).strip()
    item.add_value('title', title)
    item.add_value('link', response.url)
    item.add_value('content', content)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//h1[@class="page-title"]/text()').get()
    description = response.xpath('//div[@class="l-content"]//text()[normalize-space() and not(ancestor::h1 | ancestor::h4)]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//div[@class="l-content"]//h4/text()').get()
    item = ItemLoader(item=EssexbankItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_item(self, response):
    self.logger.info("starting parse item")
    l = ItemLoader(item=IndexItem(), response=response)
    l.default_output_processor = TakeFirst()
    l.add_value("url", response.url)
    l.add_value("retrived", 0)  # field name spelled this way in IndexItem
    l.add_value("source", response.request.url)
    l.add_value("project", self.settings.get("BOT_NAME"))
    l.add_value("spider", self.name)
    l.add_value("server", socket.gethostname())
    l.add_value("date", datetime.datetime.utcnow())
    yield l.load_item()
def parse_post(self, response):
    title = response.xpath('//h1[@class="edn_articleTitle"]/text()').get()
    description = response.xpath('(//article//p | //article//ul)//text()[normalize-space()]').getall()
    description = [p.strip() for p in description if '{' not in p]
    description = ' '.join(description).strip()
    date = response.xpath('//time/text()').get().split('on')[1]
    item = ItemLoader(item=WaynesavingsItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//div[@class="banner-content no-offer"]/h1/text()').get()
    description = response.xpath('//div[@class="article-text text"]//text()[normalize-space()]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    date = response.xpath('//div[@class="banner-content no-offer"]/p/text()').get()
    item = ItemLoader(item=CommbankcomauItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_post(self, response):
    title = response.xpath('//h1/text()').get()
    description = response.xpath('//div[@id="printableContent"]//text()[normalize-space() and not(ancestor::h1 | ancestor::script)]').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=KbmkItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    return item.load_item()
def parse_post(self, response, date):
    title = response.xpath('//h1/text()').get()
    description = response.xpath('//div[@class="entry-content container px-5"]/p/text()').getall()
    description = [p.strip() for p in description]
    description = ' '.join(description).strip()
    item = ItemLoader(item=GuberItem(), response=response)
    item.default_output_processor = TakeFirst()
    item.add_value('title', title)
    item.add_value('description', description)
    item.add_value('date', date)
    return item.load_item()
def parse_bids(self, response):
    selector = Selector(response)
    for bid in selector.xpath(self.bid_list_xpath):
        loader = ItemLoader(BidItems(), selector=bid)
        loader.default_input_processor = MapCompose(lambda v: v.split(), replace_escape_chars)
        loader.default_output_processor = Join()
        for field, xpath in auction_bid_fields.items():
            loader.add_xpath(field, xpath)
        yield loader.load_item()
def parse_items(self, response):
    gooseobj = self.g.extract(response.url)
    fulltext = gooseobj.cleaned_text
    il = ItemLoader(item=SoloItem(), response=response)
    il.default_output_processor = MapCompose(
        lambda v: v.rstrip(),
        lambda v: re.sub(r"[\',|!]", "", v),
        lambda v: re.sub(r"\s+", " ", v),
    )
    il.add_value("siteurl", parse_base_url(response.url))
    il.add_value("pageurl", response.url)
    # drop non-ASCII characters while keeping the value a str
    il.add_value("text", fulltext.encode("ascii", "ignore").decode("ascii"))
    il.add_xpath("pagetitle", "//title/text()")
    return il.load_item()
def parse(self, response):
    def strip_dollar(x):
        return x.strip('$')

    self.driver.get(response.url)
    try:
        WebDriverWait(self.driver, 15).until(
            EC.presence_of_element_located(
                # '/*[@style=...]' supplies the node test the original xpath was missing
                (By.XPATH, '//*[@id="depart-container"]/div[2]/div[1]/div/*[@style="width: 0%;"]')))
    except TimeoutException:
        print('Page load timed out')
    while True:
        try:
            try:
                WebDriverWait(self.driver, 15).until(
                    EC.presence_of_element_located(
                        (By.XPATH, '//*[@id="depart-container"]/div/div/div/button')))
            except TimeoutException:
                break
            next_button = self.driver.find_element(By.XPATH, '//*[@id="depart-container"]/div/div/div/button')
            next_button.click()
        except ElementNotVisibleException:
            break
    for trips in Selector(text=self.driver.page_source).xpath(self.trips_list_xpath):
        loader = ItemLoader(BusTrip(), selector=trips)
        loader.default_input_processor = MapCompose(str.strip)
        loader.default_output_processor = Join()
        loader.price_in = MapCompose(strip_dollar)
        for field, xpath in self.item_fields.items():
            loader.add_xpath(field, xpath)
        dateoftrip = str(response.url).split("/")[-1]
        loader.add_value('dateoftrip', dateoftrip)
        yield loader.load_item()
def parse_item(self, response):
    """
    Returns fields from each individual attendee.

    @url http://openstacksummitnovember2014paris.sched.org/cfb
    @scrapes name image_url friends title_company_location links about
    """
    l = ItemLoader(item=ItemloadItem(), response=response)
    l.default_output_processor = MapCompose(lambda v: v.strip(), replace_escape_chars)
    l.add_xpath('name', '//*[@id="sched-page-me-name"]/text()')
    l.add_xpath('image_url', '//*[@id="myavatar"]/@src')
    l.add_xpath('friends', '//*[@id="sched-page-me-connections"]/ul/li/a/@title')
    l.add_xpath('title_company_location', '//*[@id="sched-page-me-profile-data"]/text()')
    l.add_xpath('links', '//*[@class="sched-network-link"]/a/@href')
    l.add_xpath('about', '//*[@id="sched-page-me-profile-about"]/text()')
    return l.load_item()
def parse_CatalogRecord(self, response):
    CatalogRecord = ItemLoader(item=catalogscraperItem(), response=response)
    CatalogRecord.default_output_processor = TakeFirst()
    keywords = '|'.join(r"\b" + re.escape(word.strip()) + r"\b"
                        for word in open('Catalog_Scraper/spiders/keys.txt'))
    r = re.compile('.*(%s).*' % keywords, re.IGNORECASE | re.MULTILINE | re.UNICODE)
    if r.search(response.text):
        # Populate the fields defined in items.py: the first argument of add_xpath names
        # the field to fill, the second is an xpath locating the relevant content on the page.
        CatalogRecord.add_xpath('title', './/div[@id="dublin-core-title"]/div[@class="element-text"]/text()')
        # CatalogRecord.add_xpath('subject', '')
        # CatalogRecord.add_xpath('description', '')
        # CatalogRecord.add_xpath('creator', '')
        # CatalogRecord.add_xpath('source', '')
        # CatalogRecord.add_xpath('published', '')
        # CatalogRecord.add_xpath('rights', '')
        # CatalogRecord.add_xpath('citation', '')
        # CatalogRecord.add_xpath('url', '')
    return CatalogRecord.load_item()
def parse_items(self, response):
    # fulltext = self.parse_body_text(response)
    gooseobj = self.g.extract(response.url)
    fulltext = gooseobj.cleaned_text
    il = ItemLoader(item=SuspiderItem(), response=response)
    il.default_output_processor = MapCompose(
        lambda v: v.rstrip(),
        lambda v: re.sub(r'[\',|!]', '', v),
        lambda v: re.sub(r'\s+', ' ', v),
    )
    il.add_value('siteurl', self.parse_base_url(response.url))
    il.add_value('pageurl', response.url)
    # drop non-ASCII characters while keeping the value a str
    il.add_value('text', fulltext.encode('ascii', 'ignore').decode('ascii'))
    il.add_xpath('pagetitle', '//title/text()')
    # il.add_xpath('keywords', '//meta[@name="keywords"]/@content')
    yield il.load_item()
def parse_game_log(self, response):
    games_table = []
    # If the default selector returns empty, try the second selector
    rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[4]/div/table[1]/tr')
    if not rows:
        rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[3]/div/table/tr')
    # Build up the game stats table to be turned into scrapy items
    for row in rows:
        game_row = []
        for col in row.xpath("td"):
            opponent = col.xpath("ul/li[3]/a/text()").extract()
            game_result = col.xpath("a/text()").extract()
            other_value = col.xpath("text()").extract()
            if opponent:
                game_row.append(opponent[0])
            elif game_result:
                game_row.append(game_result[0])
            else:
                game_row.append(other_value[0])
        games_table.append(game_row)
    # Load game stat rows into scrapy items
    for game_row in games_table:
        loader = ItemLoader(item=NFL_RB_Game_2015(), response=response)
        loader.default_output_processor = TakeFirst()
        loader.add_xpath("player_name", '//*[@id="content"]/div[3]/div[2]/h1/text()', MapCompose(str.strip))
        # Handle the regular season totals row
        if game_row[0] == "REGULAR SEASON STATS":
            loader.add_value("is_season_totals", True)
            loader.add_value("date", None)
            loader.add_value("opponent", None)
            loader.add_value("result", None)
            loader.add_value("rush_attempts", int(game_row[1]))
            loader.add_value("rush_yards", int(game_row[2]))
            loader.add_value("avg_yards_per_rush", float(game_row[3]))
            loader.add_value("longest_rush", int(game_row[4]))
            loader.add_value("rush_tds", int(game_row[5]))
            loader.add_value("receptions", int(game_row[6]))
            loader.add_value("rec_yards", int(game_row[7]))
            loader.add_value("avg_yards_per_rec", float(game_row[8]))
            loader.add_value("longest_rec", int(game_row[9]))
            loader.add_value("rec_tds", int(game_row[10]))
            loader.add_value("fumbles", int(game_row[11]))
            loader.add_value("fumbles_lost", int(game_row[12]))
            yield loader.load_item()
        # Handle a regular season individual game row (ignores the two header rows)
        elif game_row[0] != "2015 REGULAR SEASON GAME LOG" and game_row[0] != "DATE":
            # Parse the date from a string containing the day of the week and the date
            day_and_date = game_row[0]
            date = day_and_date.split()[1] + "/15"
            loader.add_value("is_season_totals", False)
            loader.add_value("date", date)
            loader.add_value("opponent", game_row[1])
            loader.add_value("result", game_row[2])
            loader.add_value("rush_attempts", int(game_row[3]))
            loader.add_value("rush_yards", int(game_row[4]))
            loader.add_value("avg_yards_per_rush", float(game_row[5]))
            loader.add_value("longest_rush", int(game_row[6]))
            loader.add_value("rush_tds", int(game_row[7]))
            loader.add_value("receptions", int(game_row[8]))
            loader.add_value("rec_yards", int(game_row[9]))
            loader.add_value("avg_yards_per_rec", float(game_row[10]))
            loader.add_value("longest_rec", int(game_row[11]))
            loader.add_value("rec_tds", int(game_row[12]))
            loader.add_value("fumbles", int(game_row[13]))
            loader.add_value("fumbles_lost", int(game_row[14]))
            yield loader.load_item()
def parse_game_log(self, response):
    games_table = []
    # If the default selector returns empty, try the second selector
    rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[4]/div/table[1]/tr')
    if not rows:
        rows = response.xpath('//*[@id="content"]/div[6]/div[1]/div/div[3]/div/table/tr')
    # Build up the game stats table to be turned into scrapy items
    for row in rows:
        game_row = []
        for col in row.xpath('td'):
            opponent = col.xpath('ul/li[3]/a/text()').extract()
            game_result = col.xpath('a/text()').extract()
            other_value = col.xpath('text()').extract()
            if opponent:
                game_row.append(opponent[0])
            elif game_result:
                game_row.append(game_result[0])
            else:
                game_row.append(other_value[0])
        games_table.append(game_row)
    # Load game stat rows into scrapy items
    for game_row in games_table:
        # Initialize the WR game item to be loaded
        loader = ItemLoader(item=NFL_WR_Game_2015(), response=response)
        # Set the default output processor to take the first item from the selector
        loader.default_output_processor = TakeFirst()
        # Add the player name
        loader.add_xpath('player_name', '//*[@id="content"]/div[3]/div[2]/h1/text()', MapCompose(str.strip))
        # Handle the regular season totals row; the stat columns occupy indices 1-13,
        # mirroring the per-game branch below minus the date/opponent/result columns
        if game_row[0] == 'REGULAR SEASON STATS':
            loader.add_value('is_season_totals', True)
            loader.add_value('date', None)
            loader.add_value('opponent', None)
            loader.add_value('result', None)
            loader.add_value('receptions', int(game_row[1]))
            loader.add_value('targets', int(game_row[2]))
            loader.add_value('rec_yards', int(game_row[3]))
            loader.add_value('avg_yards_per_rec', float(game_row[4]))
            loader.add_value('longest_rec', int(game_row[5]))
            loader.add_value('rec_tds', int(game_row[6]))
            loader.add_value('rush_attempts', int(game_row[7]))
            loader.add_value('rush_yards', int(game_row[8]))
            loader.add_value('avg_yards_per_rush', float(game_row[9]))
            loader.add_value('longest_rush', int(game_row[10]))
            loader.add_value('rush_tds', int(game_row[11]))
            loader.add_value('fumbles', int(game_row[12]))
            loader.add_value('fumbles_lost', int(game_row[13]))
            yield loader.load_item()
        # Handle a regular season individual game row (ignores the two header rows)
        elif game_row[0] != '2015 REGULAR SEASON GAME LOG' and game_row[0] != 'DATE':
            # Parse the date from a string containing the day of the week and the date
            day_and_date = game_row[0]
            date = day_and_date.split()[1] + '/15'
            loader.add_value('is_season_totals', False)
            loader.add_value('date', date)
            loader.add_value('opponent', game_row[1])
            loader.add_value('result', game_row[2])
            loader.add_value('receptions', int(game_row[3]))
            loader.add_value('targets', int(game_row[4]))
            loader.add_value('rec_yards', int(game_row[5]))
            loader.add_value('avg_yards_per_rec', float(game_row[6]))
            loader.add_value('longest_rec', int(game_row[7]))
            loader.add_value('rec_tds', int(game_row[8]))
            loader.add_value('rush_attempts', int(game_row[9]))
            loader.add_value('rush_yards', int(game_row[10]))
            loader.add_value('avg_yards_per_rush', float(game_row[11]))
            loader.add_value('longest_rush', int(game_row[12]))
            loader.add_value('rush_tds', int(game_row[13]))
            loader.add_value('fumbles', int(game_row[14]))
            loader.add_value('fumbles_lost', int(game_row[15]))
            yield loader.load_item()