def parse_first(self, response):
    print('@@@start first page@@@')
    sel = scrapy.Selector(response)
    result_links = sel.xpath('//h3[@class="r"]')
    print("result counts: " + str(len(result_links)))
    for result in result_links:
        url = result.xpath('./a/@href').extract()[0]
        if url.endswith(".pdf"):
            continue
        yield SplashRequest(
            url,
            self.parse,
            slot_policy=SlotPolicy.SINGLE_SLOT,
            args={'wait': 25, 'timeout': 3600})
    pages = sel.xpath('//td/a[@class="fl"]')
    for page in pages:
        url = 'https://www.google.com' + page.xpath('@href').extract()[0]
        print(url)
        yield SplashRequest(
            url,
            self.parse_page,
            slot_policy=SlotPolicy.SINGLE_SLOT,
            args={'wait': 15, 'timeout': 3600})
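# The snippet above relies on the scrapy-splash plugin; a minimal sketch
# of the imports it assumes (the plugin's middlewares must also be
# enabled in settings.py, per the scrapy-splash README):
import scrapy
from scrapy_splash import SplashRequest, SlotPolicy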
def parse(self, response):
    global Page
    sel = scrapy.Selector(response)
    image_urls = sel.xpath(
        '//ol[@class="commentlist"]//div[@class="row"]//div[@class="text"]//img/@src'
    ).extract()
    image_names = sel.xpath(
        '//ol[@class="commentlist"]//div[@class="row"]//div[@class="text"]'
        '//span[@class="righttext"]/a/text()'
    ).extract()
    # The src attributes are protocol-relative ("//img..."), so prepend a scheme.
    new_urls = ['https:' + src for src in image_urls]
    item = JandanPicItem()
    item['image_url'] = new_urls
    item['image_name'] = image_names
    yield item
    if Page < 3:
        next_page = response.xpath(
            '//div[@class="comments"]//div[@class="cp-pagenavi"]'
            '//a[@class="previous-comment-page"]/@href'
        )
        if next_page:
            # index 1 deliberately skips the first matching pager link
            url = response.urljoin(next_page[1].extract())
            yield scrapy.Request(url, self.parse)
            Page += 1
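# A hedged alternative sketch to the module-level Page counter above:
# carry the page number in request meta so the spider keeps no global
# state (parse_with_meta and the trimmed xpath are illustrative, not
# from the original spider):
def parse_with_meta(self, response):
    page = response.meta.get('page', 0)
    # ... extract and yield the item exactly as in parse() above ...
    if page < 3:
        href = response.xpath(
            '//a[@class="previous-comment-page"]/@href').get()
        if href:
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.parse_with_meta,
                                 meta={'page': page + 1})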
def parse(self, response):
    now = time.strftime('%Y-%m-%d %H:%M:%S')
    hxs = scrapy.Selector(response)
    for h in hxs.css('div.list-article > h1'):
        item = DapnewsItem()
        item['categoryId'] = '1'
        name = h.xpath('a/text()')
        if not name:
            print('DAP => [' + now + '] No title')
        else:
            item['name'] = name.extract_first()
        description = h.xpath(
            'following-sibling::div[@class="article-content"][1]/p/text()')
        if not description:
            print('DAP => [' + now + '] No description')
        else:
            item['description'] = description.extract_first()
        url = h.xpath("a/@href")
        if not url:
            # without a URL there is no detail page to follow
            print('DAP => [' + now + '] No url')
            continue
        item['url'] = url.extract_first()
        imageUrl = h.xpath(
            'following-sibling::div[@class="feature-image"][1]/img/@src')
        if not imageUrl:
            print('DAP => [' + now + '] No imageUrl')
            item['imageUrl'] = ''
        else:
            item['imageUrl'] = imageUrl.extract_first()
        request = scrapy.Request(item['url'], callback=self.parse_detail)
        request.meta['item'] = item
        yield request
def _parseRectorOffice(self, response):
    '''
    Rector's Office
    :return:
    '''
    selector = scrapy.Selector(response)
    item = self._inititem()
    item["url"] = response.url
    name = selector.xpath(
        '//section[@class="eight phone-four columns "]/h1/text()').extract()
    if name:
        item["name"] = StrUtil.delWhiteSpace(name[0])
        logger.debug('>>UNU>>leader>>name>>%s' % item["name"])
    else:
        logger.error('Failed to scrape the UNU leader name; '
                     'the page structure may have changed, please check')
    work = selector.xpath(
        '//section[@class="eight phone-four columns "]/h4/text()').extract()
    if work:
        item["work"] = StrUtil.delWhiteSpace(work[0])
    else:
        logger.error("Failed to scrape the Rector's Office member title")
    resume = selector.xpath(
        '//section[@class="eight phone-four columns "]/div/ul/li/div'
    ).xpath('string(.)').extract()
    if resume:
        item["resume"] = StrUtil.delWhiteSpace(resume[0])
    else:
        logger.error("Failed to scrape the Rector's Office member resume")
    logger.debug(">>>OECDleader>>>Rector's Office member work>>>%s" % item["work"])
    logger.debug(">>>OECDleader>>>Rector's Office member name>>>%s" % item["name"])
    logger.debug(">>>OECDleader>>>Rector's Office member resume>>>%s" % item["resume"])
    yield item
def _get_session_by_login(self):
    session = requests.session()
    url1 = ('https://account.sogou.com/connect/login?provider=weixin'
            '&client_id=2017&ru=https://weixin.sogou.com'
            '&third_appid=wx6634d697e8cc0a29'
            '&href=https://dlweb.sogoucdn.com/weixin/css/weixin_join.min.css?v=20170315')
    res1 = session.get(url1)
    url2 = res1.url
    # keep the state token; it is required by the login callback later
    state = re.findall(r'state=(.*)&', url2)[0]
    url3 = ('https://pb.sogou.com/cl.gif?uigs_t=%s&uigs_productid=vs_web'
            '&terminal=web&vstype=weixin&pagetype=index&channel=index_pc'
            '&type=weixin_search_pc&wuid=00F83DEFAFA78A1A5C1BAF0649830928'
            '&snuid=&uigs_uuid=%s&login=0&uigs_cl=home_login_top'
            '&href=javascript:void(0);&uigs_refer=https://weixin.sogou.com/'
            % (str(int(round(time.time() * 1000))),
               str(int(round(time.time() * 1000000)))))
    session.get(url3)
    res2 = session.get(url2)
    res2_html = self._response_decode(res2)
    selector = scrapy.Selector(text=res2_html)
    uuid = selector.xpath('//div[@class="wrp_code"]/img').attrib.get(
        'src').split('/')[-1]
    url4 = 'https://open.weixin.qq.com/connect/qrcode/' + uuid
    res4 = session.get(url4, headers={'Referer': url2})
    with open('./weixin/temp/qrcode.png', 'wb') as f:
        f.write(res4.content)
    img = Image.open('./weixin/temp/qrcode.png')
    img.show()
    time.sleep(10)
    ck = ''
    code = ''
    while ck != '405':
        # poll WeChat until the QR login is confirmed (errcode 405)
        url5 = ('https://long.open.weixin.qq.com/connect/l/qrconnect?uuid=%s&_=%s'
                % (uuid, str(int(round(time.time() * 1000)))))
        res5 = session.get(url5)
        fre = re.findall(
            r"window.wx_errcode=(\d{3});window.wx_code='(.*)'", res5.text)[0]
        ck, code = fre
    img.close()
    url6 = ('https://account.sogou.com/connect/callback/weixin?code=%s&state=%s'
            % (code, state))
    session.get(url6)
    print('logged in')
    return session
def parse_location(self, response):
    json_data = response.meta["json"]
    address = json_data["address"]
    # decode entities
    name = scrapy.Selector(text=json_data["name"]).xpath("//text()").get()
    # These are weird enough that there's no hope of parsing them, but
    # clean the text up
    hours = response.xpath('//strong[text()="Hours:"]/../text()').extract()
    hours = ';'.join(s.strip().replace('\xa0', ' ') for s in hours)
    properties = {
        "ref": re.search(r"postid-(\d+)", response.css("body").attrib["class"])[1],
        "lat": address["latitude"],
        "lon": address["longitude"],
        "addr_full": address["address"],
        "city": address["city"],
        "state": address["state"],
        "postcode": address["zip_code"],
        "name": name,
        "website": response.url,
        "phone": (response.xpath("//*[starts-with(@href, 'tel:')]/@href").get() or "")[4:],
        "opening_hours": hours,
    }
    return GeojsonPointItem(**properties)
def parse(self, response):
    for reviewer in response.xpath('//tr[contains(@id, "reviewer")]/td[3]/a'):
        href = reviewer.xpath('@href').extract()
        rev_url = 'http://www.amazon.com' + href[0]
        self.driver.get(rev_url)
        rev_id = rev_url.split('/')[-1]
        if rev_id == '':
            rev_id = response.url.split('/')[-2]
        email_xpath = '//a[@id="/gp/profile/' + rev_id + '"]'
        email = ''
        try:
            # the email only renders after this link is clicked
            email_link = self.driver.find_element_by_xpath(email_xpath)
            email_link.click()
            time.sleep(1)
        except NoSuchElementException:
            email = '-'
        sel = scrapy.Selector(text=self.driver.page_source)
        if email != '-':
            email = sel.xpath(email_xpath + '/text()').extract()[0]
        name = sel.xpath('//h1/text()').extract()[0]
        item = AmazonItem()
        item['name'] = name
        item['email'] = email
        yield item
    self.i += 1
    if self.i <= self.end:
        yield scrapy.Request(
            'http://www.amazon.com/review/top-reviewers?page=' + str(self.i),
            callback=self.parse)
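# find_element_by_xpath was removed in Selenium 4; a minimal sketch of
# the equivalent lookup with the current API (same locator, same
# behavior; driver and xpath are supplied by the caller):
from selenium.webdriver.common.by import By

def find_email_link(driver, email_xpath):
    return driver.find_element(By.XPATH, email_xpath)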
def _parseDirectors(self, response):
    '''
    Directors
    :return:
    '''
    selector = scrapy.Selector(response)
    item = self._inititem()
    item["work"] = "Directors"
    item["url"] = response.url
    name = selector.xpath(
        '//div[@class="col-sm-9 leftnav-content-wrapper"]/h1/text()').extract()
    if name:
        name[0] = re.sub('-', ',', name[0])
        try:
            item["name"] = StrUtil.delWhiteSpace(name[0].split(',')[0])
        except Exception:
            logger.warning('The director page may have changed; please check')
            item["name"] = StrUtil.delWhiteSpace(name[0])
    elif response.url == "http://www.oecd.org/legal/nicola-bonucci-cv.htm":
        name = selector.xpath(
            '//div[@class="span-19 last"]/h1/text()').extract()[0]
        item["name"] = StrUtil.delWhiteSpace(name.split(',')[0])
    else:
        logger.error('Failed to scrape the director name')
    resume = selector.xpath('//div[@id="webEditContent"]').xpath(
        'string(.)').extract()
    if resume:
        item["resume"] = StrUtil.delWhiteSpace(resume[0])
    else:
        logger.error('Failed to scrape the director resume')
    logger.debug('>>>OECDleader>>>director work>>>%s' % item["work"])
    logger.debug('>>>OECDleader>>>director name>>>%s' % item["name"])
    logger.debug('>>>OECDleader>>>director resume>>>%s' % item["resume"])
    yield item
def parse(self, response):
    xs_item = XsmnItem()
    tmp_data = {}
    data_resp = scrapy.Selector(response)
    xs_item['xs_info'] = [
        # day of the week
        data_resp.xpath("//table[@id='MT0']/tr/th[1]/a/text()").extract_first(),
        # date
        data_resp.xpath("//table[@id='MT0']/tr/th[1]/text()").extract_first(),
        self.now.year
    ]
    for i in range(2, 5):
        # the provinces listed in the lottery table
        tmp_location = data_resp.xpath(
            "//table[@id='MT0']/tr/th[{0}]/a/text()".format(i)).extract_first()
        if tmp_location is None:
            continue
        tmp_data[tmp_location] = {}
        for j in range(2, 11):
            # prize-label column, from the eighth prize up to the special prize
            tmp_giai = data_resp.xpath(
                "//table[@id='MT0']/tr[{0}]/td[1]/text()".format(j)).extract_first()
            # winning numbers in this province's column
            tmp_number = data_resp.xpath(
                "//table[@id='MT0']/tr[{0}]/td[{1}]//text()".format(j, i)).extract()
            tmp_data[tmp_location][tmp_giai] = ", ".join(tmp_number)
    xs_item['xs_data'] = tmp_data
    yield xs_item
def parse_humans_name(self, response):
    if not self.check_page(response.url):
        return
    # Selector(text=...) expects str, so use response.text rather than raw bytes
    html = scrapy.Selector(text=response.text)
    total_page = html.css(
        "#web-content > div > div > div.pl20.pr20.f14 > div.company_pager > div::text"
    ).extract()
    ts = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(time.time()))
    if total_page:
        pages = int(total_page[0]) + 1
        for page in range(1, pages):
            seed = {
                "url": response.url + "/p" + str(page),
                "formUrl": response.url,
                "status": 0,
                "ts": ts
            }
            self.mongo.humans_page_seed_insert(seed)
        print(response.url + "------------------>" + total_page[0])
    else:
        seed = {
            "url": response.url + "/p1",
            "formUrl": response.url,
            "status": 0,
            "ts": ts
        }
        self.mongo.humans_page_seed_insert(seed)
        print(response.url + "------------------>1")
def parse_page(self, response):
    NewSeed = response.meta.get('item', '')
    selector = scrapy.Selector(response)
    for info in selector.xpath('//div[@class="info-box"]/div[@class="info"]'):
        product = info.xpath('h1/text()').extract()
        NewSeed['product'] = ''.join(str(i).strip() for i in product)
        field = info.xpath('ul[@class="subinfo"]/li[@class="l"]/p[1]/a/text()').extract()
        NewSeed['field'] = ''.join(str(i).strip() for i in field)
        platform = info.xpath('ul[@class="subinfo"]/li[@class="l"]/p[2]/span[1]/text()').extract()
        NewSeed['platform'] = ''.join(str(i).strip() for i in platform)
        location = info.xpath('ul[@class="subinfo"]/li[@class="l"]/p[2]/span[2]/text()').extract()
        NewSeed['location'] = ''.join(str(i).strip() for i in location)
        homepage = info.xpath('ul[@class="subinfo"]/li[@class="l"]/p[3]/span[1]/descendant::text()').extract()
        NewSeed['homepage'] = ''.join(str(i).strip() for i in homepage)
        establish_time = info.xpath('ul[@class="subinfo"]/li[@class="r box-fix-r"]/p[1]/text()').extract()
        NewSeed['establish_time'] = ''.join(str(i).strip() for i in establish_time)
        status = info.xpath('ul[@class="subinfo"]/li[@class="r box-fix-r"]/p[2]/text()').extract()
        NewSeed['status'] = ''.join(str(i).strip() for i in status)
        tags = selector.xpath('//div[@class="project-top"]/div[@class="txt"]/div[1]/a/text()').extract()
        NewSeed['tags'] = ''.join(str(i).strip() for i in tags)
        description = selector.xpath('//div[@class="box-plate"]/div[@class="desc"]/text()').extract()
        NewSeed['description'] = re.sub(r'[\n\r ]', '', ''.join(str(i).strip() for i in description))
        contact = info.xpath(
            '//div[@class="project-status"]/div[@class="people-list"]/h4[@class="title"]/a/text()').extract()
        NewSeed['contact'] = ''.join(str(i).strip() for i in contact)
        NewSeed['project_status'] = info.xpath(
            '//div[@class="project-status"]/a/text()').extract_first(default='N/A')
        leadership = selector.xpath(
            '//div[@class="item-list people-list"]/ul/li/div[2]/descendant::text()').extract()
        leadership = [str(i).strip() for i in leadership if len(str(i).strip()) > 1]
        NewSeed['leadership'] = ''.join(leadership)
        logo_url = selector.xpath('//div[@class="img"]/span[@class="img-middle"]/img/@src').extract()
        NewSeed['company_name'] = selector.xpath(
            '//div[@class="company-box"]/dl[1]/p/a/text()').extract_first(default='N/A')
        brief_intro = selector.xpath('//div[@class="company-box"]/dl[1]/dd//text()').extract()
        NewSeed['brief_intro'] = re.sub(r'[\n\r ]', '', ''.join(str(i).strip() for i in brief_intro))
        NewSeed['logo_url'] = ''.join(str(i).strip() for i in logo_url)
        NewSeed['url'] = response.url
        yield NewSeed
def parse(self, response):
    sel = scrapy.Selector(response=response)
    nodes = sel.xpath('//ul[@class="listContent"]/li')

    def first_or_empty(node, query):
        # first extracted value, or '' when the xpath matches nothing
        eles = node.xpath(query).extract()
        return eles[0] if eles else ''

    for node in nodes:
        item = Residential_Brief()
        item['residential_id'] = first_or_empty(
            node, './div[@class="info"]/div[@class="title"]/a/@href')
        item['residential_name'] = first_or_empty(
            node, './div[@class="info"]/div[@class="title"]/a/text()')
        item['district'] = first_or_empty(
            node, './div[@class="info"]/div[@class="positionInfo"]/a[@class="district"]/text()')
        item['bizcircle'] = first_or_empty(
            node, './div[@class="info"]/div[@class="positionInfo"]/a[@class="bizcircle"]/text()')
        item['build_year'] = first_or_empty(
            node, './div[@class="info"]/div[@class="positionInfo"]/text()')
        item['avg_price'] = first_or_empty(
            node, './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]'
                  '/div[@class="totalPrice"]/span/text()')
        item['avg_price_date'] = first_or_empty(
            node, './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemPrice"]'
                  '/div[@class="priceDesc"]/text()')
        item['on_sale_count'] = first_or_empty(
            node, './div[@class="xiaoquListItemRight"]/div[@class="xiaoquListItemSellCount"]'
                  '/a[@class="totalSellCount"]/span/text()')
        yield item

    for node in nodes:
        url = first_or_empty(node, './div[@class="info"]/div[@class="title"]/a/@href')
        if url:
            yield scrapy.Request(url, callback=self.resident_detail_parse)
def get_map(self, response):
    value = response.meta['value']
    city = response.meta['city']
    text = response.text
    text = re.sub(r'\\\n', '', text)
    data = json.loads(text)
    items = []
    if 'markers' in data:
        for d in data['markers']:
            # addresses arrive as HTML fragments, e.g.
            # u'<p>\r\nул.Торайгырова, 53/23\r\n</p>';
            # unescape the entities and strip the markup
            address = d['address']
            address = address.replace('&nbsp;', ' ')
            address = self.html_parser.unescape(address)
            address = scrapy.Selector(
                text=address).xpath('//text()').get("").strip()
            address = address.replace(u'\xa0', ' ')
            name = d['name']
            name = name.replace(u'\xa0', ' ')
            items.append(dict(
                name=name.strip(),
                address=address.strip(),
                lat=d['lat'],
                lon=d['lng'],
            ))
    return response.follow(
        'https://www.bcc.kz/local/tmpl/ajax/getmapdata.php?type={}&city={}&lang=s1'
        .format(item_type, value),
        self.get_map_data,
        meta=dict(items=items, city=city))
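# For reference, get_map above assumes the ajax payload looks roughly
# like this (shape inferred from the keys the code reads; the values
# here are illustrative only):
#
#   {"markers": [{"name": "Branch office",
#                 "address": "<p>\r\nул.Торайгырова, 53/23\r\n</p>",
#                 "lat": 52.28, "lng": 76.96}]}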
def parse(self, response):
    self.count += 1
    self.filename = "news.txt"
    URL = 'https://www.2cto.com'
    selector = scrapy.Selector(response)
    # one <li> per news item; each page carries 15 of them
    books = selector.xpath('//li[@class="clearfix"]')
    for each in books:
        # join all tag/author links into one comma-separated string
        auth = each.xpath('div/p[@class="tags"]/a').xpath('string(.)').extract()
        author = ",".join(auth)
        title = each.xpath('a/text()').extract()
        web = each.xpath('a/@href').extract()
        # append the record locally, fields separated by semicolons
        with open(self.filename, "a", encoding="utf8") as f:
            f.write(web[0] + ";")
            f.write(title[0] + ";")
            f.write(author + "\n")
    print("scraped one page")
    sleep(0.1)
    nextPage = selector.xpath(
        '//div[@class="text-c"]/a[contains(text(),"下一页")]/@href').extract()[0]  # "下一页" = next page
    if self.count <= 1317:
        yield scrapy.http.Request(URL + nextPage, callback=self.parse)
    else:
        # past the last page: store the collected file into the database
        self.database(self.filename)
def explore(self, start_url, scrape_func, next_xpath, max_count=-1):
    try:
        self.browser.get(start_url)
        old_page = self.browser.page_source
        counter = 0
        while True:
            # refer to the following blog for the wait trick:
            # http://www.obeythetestinggoat.com/how-to-get-selenium-to-wait-for-page-load-after-a-click.html
            WebDriverWait(self.browser, NEXT_WAIT_TIMEOUT) \
                .until(EC.element_to_be_clickable((By.XPATH, next_xpath)))
            # always sleep for a while to be polite
            time.sleep(0.3)
            if old_page == self.browser.page_source or \
                    (max_count != -1 and counter >= max_count):
                break
            else:
                old_page = self.browser.page_source
                counter += 1
            response = scrapy.Selector(text=self.browser.page_source)
            yield scrape_func(response)
            next_elem = self.browser.find_element_by_xpath(next_xpath)
            cnt = 0
            while cnt < MAX_RETRY:
                try:
                    ActionChains(self.browser).move_to_element(
                        next_elem).click().perform()
                    break
                except WebDriverException:
                    time.sleep(1)
                    cnt += 1
    except TimeoutException as te:
        sys.stderr.write(
            "Failed to wait for the page to load. Error: {}\n".format(te))
    except Exception as oe:
        sys.stderr.write("unexpected exception: {}".format(oe))
        import traceback
        traceback.print_exc()
        raise
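# A minimal usage sketch for explore() above; the start URL, the scrape
# function, and both XPaths are hypothetical placeholders:
def scrape_titles(selector):
    # pull the article titles out of the rendered page
    return selector.xpath('//h2[@class="title"]/a/text()').extract()

def crawl_all(self):
    for titles in self.explore('https://example.com/list',
                               scrape_titles,
                               '//a[contains(@class, "next")]',
                               max_count=10):
        for title in titles:
            print(title)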
def list_parse(self, response):
    check(response)
    data = json.loads(response.body)
    selector = scrapy.Selector(text=data['data'], type="html")
    cards = selector.xpath("//div[@class='card']")
    for card in cards:
        item = ListItem()
        item['image'] = card.xpath("a/img/@data-src").extract_first()
        item['code'] = card.xpath("div/a[1]/h4/text()").extract_first()
        if item['code'] in self.database:
            item['title'] = "Crawled"
            yield item
            continue
        item['url'] = "https://www5.javmost.com/{}/".format(item['code'])
        item['title'] = card.xpath("div/a[2]/h5/text()").extract_first()
        item['release_time'] = card.xpath(
            "div/p/text()[2]").extract_first().split('\t')[0].split(" ")[-1]
        item['rating'] = card.xpath(
            "div/p/text()[5]").extract_first().split('\t')[0].split(" ")[-1]
        item['duration'] = card.xpath("div/p/span/text()").extract_first()
        item['genre'] = card.xpath(
            "div/p/a[@class='btn btn-warning btn-xs m-r-5 m-t-2']/text()").extract()
        item['star'] = card.xpath(
            "div/p/a[@class='btn btn-danger btn-xs m-r-5 m-t-2']/text()").extract()
        item['maker'] = card.xpath(
            "div/p/a[@class='btn btn-info btn-xs m-r-5 m-t-2']/text()").extract()
        item['director'] = card.xpath(
            "div/p/a[@class='btn btn-success btn-xs m-r-5 m-t-2']/text()").extract()
        item['tags'] = card.xpath(
            "div/p/a[@class='btn btn-inverse btn-xs m-r-5 m-t-2']/text()").extract()
        yield scrapy.Request(item['url'], callback=self.parse, meta={'item': item})
def request_captcha(self, response):
    selector = scrapy.Selector(response)
    captcha_url = selector.xpath("//img[@class='verifyimg']/@src").extract_first()
    randomKey = selector.xpath("//input[@class='randomkey']/@value").extract_first()
    full_captcha_url = self.host + captcha_url
    fileName = self.captcha_file_path()
    urlretrieve(full_captcha_url, fileName)
    # open the captcha image so a human can read it (macOS "open" command)
    os.system("open " + fileName)
    captcha_str = input("Enter the captcha: ")
    return scrapy.FormRequest.from_response(
        response,
        formdata={"regionCode": "+86",
                  "account": "<phone number>",  # placeholder credential
                  "password": "******",
                  "captcha": captcha_str,
                  "randomKey": randomKey},
        meta={'cookiejar': response.meta['cookiejar']},
        callback=self.after_login
    )
def parse(self, response):
    sel = scrapy.Selector(response)
    title = sel.xpath('//div[@id="content"]')
    main_topic = sel.xpath(
        '//div[@class="topic-doc"]//div[@class="topic-content"]')
    reply = sel.xpath(
        '//ul[@class="topic-reply"]//div[@class="reply-doc content"]')
    items = []
    title_item = DoubantopicItem()
    title_item['content'] = title.xpath('h1/text()').extract()[0].strip()
    items.append(title_item)
    main_topic_item = DoubantopicItem()
    main_topic_item['content'] = main_topic.xpath('p/text()').extract()[0]
    items.append(main_topic_item)
    for each in reply:
        item = DoubantopicItem()
        item['content'] = each.xpath('p/text()').extract()[0].strip(' ')
        items.append(item)
    return items
def data_parse(self, response):
    data = scrapy.Selector(response).xpath('//p/text()').extract_first()
    data = json.loads(data)['returndata']
    data_nodes = data['datanodes']
    tag_nodes = data['wdnodes'][0]['nodes']
    # store the indicator names
    for i in tag_nodes:
        item = DataNameItem()
        item['name'] = i['name']
        item['memo'] = i['memo']
        item['zb'] = i['code']
        yield item
    # store the data points
    for j in data_nodes:
        item = DataItem()
        item['zb'] = j['wds'][0]['valuecode']
        item['sj'] = j['wds'][1]['valuecode']
        if j['data']['hasdata']:
            item['data_str'] = j['data']['strdata']
        else:
            item['data_str'] = 'null'
        yield item
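# Shape of the decoded payload that data_parse above depends on
# (inferred from the keys it reads; the values are illustrative):
#
#   {"returndata": {
#        "wdnodes": [{"nodes": [{"code": "A0101", "name": "...", "memo": "..."}]},
#                    {"nodes": [...]}],
#        "datanodes": [{"wds": [{"valuecode": "A0101"}, {"valuecode": "2020"}],
#                       "data": {"hasdata": true, "strdata": "3.2"}}]}}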
def parseBooks(self, response):
    selector = scrapy.Selector(response=response)
    VIEWSTATE = selector.xpath('//*[@id="__VIEWSTATE"]/@value').extract_first()
    EVENTVALIDATION = selector.xpath('//*[@id="__EVENTVALIDATION"]/@value').extract_first()
    VIEWSTATEGENERATOR = selector.xpath('//*[@id="__VIEWSTATEGENERATOR"]/@value').extract_first()
    formdata = {
        # change pages here: the ddbook value selects the book
        "__EVENTTARGET": "ctl00$ContentPlaceHolder1$ddbook",
        "__LASTFOCUS": "",
        "__VIEWSTATE": VIEWSTATE,
        "__VIEWSTATEGENERATOR": VIEWSTATEGENERATOR,
        "__EVENTVALIDATION": EVENTVALIDATION,
        "ctl00$ContentPlaceHolder1$ddbook": "1",
        "__ASYNCPOST": "true"
    }
    header = {
        'User-Agent': u'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36',
        'Cookie': 'ASP.NET_SessionId=3ikackn3wx5ujb5hc2d4y3cx',
        'X-MicrosoftAjax': 'Delta=true',
        'X-Requested-With': 'XMLHttpRequest'
    }
    yield scrapy.FormRequest(url=self.url, formdata=formdata,
                             headers=header, callback=self.parseSections)
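# Hedged sketch of the paging that the "change pages here" comment above
# refers to: reissue the same postback once per dropdown value. The
# range bound is hypothetical; the real option values come from the
# page's <select> element:
def book_requests(self, formdata):
    for book_id in range(2, 5):  # hypothetical option values
        postback = dict(formdata,
                        **{"ctl00$ContentPlaceHolder1$ddbook": str(book_id)})
        yield scrapy.FormRequest(url=self.url, formdata=postback,
                                 callback=self.parseSections)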
def parse(self, response):
    data = response.body.decode()
    selector = scrapy.Selector(text=data)
    total = selector.xpath('//*[@id="main-container"]/div[2]/div')
    for article in total:
        item = MarvelpttItem()
        x1 = article.xpath('./div[2]/a/text()').extract()
        if not x1:
            continue
        item['title'] = x1[0]
        item['postUser'] = article.xpath('./div[3]/div[1]/text()').extract()[0]
        item['time'] = article.xpath('./div[3]/div[3]/text()').extract()[0]
        x2 = article.xpath('./div/span/text()').extract()
        item['push'] = x2[0] if x2 else '0'
        yield item
def parse_result(self, response):
    json_data = json.loads(response.text)
    # extract the html fragment from the json response
    data_html = scrapy.Selector(text=json_data["html"], type="html")
    JOBPOSTS = data_html.xpath('//div[@class="iconcontentpanel"]')
    matched_jobs = {}
    for jobpost in JOBPOSTS:
        JOBTITLE_SELECTOR = 'div div div div h3 span a ::text'  # element containing the job title
        JOBLOC_SELECTOR = '.morelocation span ::text'  # span containing the job location
        JOBID_SELECTOR = '.text ::text'  # element containing the job id
        job = (jobpost.css(JOBTITLE_SELECTOR).extract_first()
               + " - " + jobpost.css(JOBID_SELECTOR).extract_first())
        matched_jobs[job] = jobpost.css(JOBLOC_SELECTOR).extract_first()
    # print the result
    print()
    print("====================Search Result-Jobs====================")
    print()
    for job, location in matched_jobs.items():
        print(job, " - ", location)
    print("==========================================================")
    print()
def parse(self, response):
    sel = scrapy.Selector(response)
    articles = sel.css('h2 a[href^="/wenxue/"]').css('a[href$=".html"]')
    for article in articles:
        articles_url = self.base_url + article.css('a::attr(href)').extract()[0]
        yield scrapy.Request(
            articles_url,
            meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
            callback=self.parsearticle,
            dont_filter=True)
        time.sleep(3)
    # follow the pager link whose text is "下一页" ("next page")
    next_links = sel.css('a[href^="/wenxue/"]')
    for link in next_links:
        if link.css("::text").extract()[0] == "下一页":
            next_url = self.base_url + link.css("::attr(href)").extract()[0]
            yield scrapy.Request(
                next_url,
                meta={'dont_redirect': True, 'handle_httpstatus_list': [302]},
                callback=self.parse,
                dont_filter=True)
def parse(self, response):
    sel = scrapy.Selector(response)
    div_list = sel.xpath('//div[@class="zp-jobNavigater-popContainer"]')
    for div_item in div_list:
        zwlb_big = div_item.xpath(
            'div[@class="zp-jobNavigater-pop-title"]/text()').extract_first()
        for zwlb in div_item.xpath(
                'div[@class="zp-jobNavigater-pop-list"]/a/text()').extract():
            url = ('https://fe-api.zhaopin.com/c/i/sou?start=0&pageSize=60'
                   '&cityId=489&workExperience=-1&education=-1&companyType=-1'
                   '&employmentType=-1&jobWelfareTag=-1&kw={}&kt=3').format(quote(zwlb))
            yield scrapy.Request(url=url,
                                 callback=self.parse_list,
                                 dont_filter=True,
                                 meta={'zwlb_big': zwlb_big,
                                       'zwlb': zwlb,
                                       'p': 1,
                                       'size': 60,
                                       'start': 60},
                                 headers=self.headers)
def parse_info(self, response):
    # collect every info/*.html link on this menu page
    selector = scrapy.Selector(response)
    infos = selector.xpath(
        '//li[re:test(@id, "line_u7_\\d*")]//a//@href').extract()
    # the "next page" link, if any
    next_href = selector.xpath("//a[@class='Next']/@href").extract()
    for info in infos:
        if "../" in info:
            info = info.replace("../", "")
        if "http" not in info:
            info = self.url + info
        if "soc" in info:
            yield scrapy.Request(url=info, callback=self.parse_text)
    if next_href:
        # keep crawling if there is a next page
        next_href = next_href[0]
        if "?" in next_href:
            head_url = response.url.split("?")[0]
        else:
            word = "".join(response.url.split("/")[-1:])
            head_url = response.url.replace(word, "")
        if "../" in next_href:
            next_href = next_href.replace("../", "")
        nextUrl = head_url + next_href
        yield scrapy.Request(url=nextUrl, callback=self.parse_info)
def parse(self, response):
    sel = scrapy.Selector(response)
    # The results page comes in two formats.
    # If searching company_name returns exactly one result, extract that CIK.
    if sel.xpath('//span[@class="companyName"]'):
        self.company_name = sel.xpath(
            '//span[@class="companyName"]/text()').extract_first()
        cik_temp = sel.xpath(
            '//span[@class="companyName"]/a/text()').extract_first()
        self.CIK = cik_temp.split(" ")[0]
        yield scrapy.Request(self.CIK_lookup_url % self.CIK,
                             callback=self.CIK_parse)
    # If it returns multiple results, extract the first result's CIK.
    else:
        sites = sel.xpath('//div/table/tr')
        for site in sites:
            if site.xpath('td/a/@href'):
                self.CIK = site.xpath('td[1]/a/text()').extract_first()
                yield scrapy.Request(self.CIK_lookup_url % self.CIK,
                                     callback=self.CIK_parse)
            if self.CIK is not None:
                break
def parse(self, response): print("CALLING PARSE") selector = scrapy.Selector(response) # print(response.body) urls = selector.xpath( '//a[@class="LkLjZd ScJHi U8Ww7d xjAeve nMZKrb id-track-click "]/@href' ).extract() link_flag = 0 links = [] for link in urls: # print("LINK" + str(link)) links.append(link) for each in urls: yield Request(url="http://play.google.com" + links[link_flag], callback=self.parse_next, dont_filter=True) print("http://play.google.com" + links[link_flag]) link_flag += 1
def parse(self, response):
    data = json.loads(response.text)
    item = scrapy.Selector(text=data['content'], type="html")
    if item.css('div.eventon_list_event p.no_events'):
        # the "no events" placeholder means there is nothing to parse
        return
    data = {
        '_type': 'event',
        'id': self._parse_id(item),
        'name': self._parse_name(item),
        'description': self._parse_description(item),
        'classification': self._parse_classification(item),
        'start_time': self._parse_start(item),
        'end_time': self._parse_end(item),
        'all_day': self._parse_all_day(item),
        'timezone': 'America/Chicago',
        'status': self._parse_status(item),
        'location': self._parse_location(item),
        'sources': self._parse_sources(item)
    }
    data['id'] = self._generate_id(data)
    yield data
def parse_youku(self, response):
    self.loggerWithTime("==================>>>>>>>>>>>>>")
    uid = response.meta["uid"]
    url = response.url
    firstresponse = scrapy.Selector(response)
    if response.status == 200:
        # link to the detail page that carries the director and actor lists
        titleurl = firstresponse.xpath(
            '//div[@class="tvinfo"]/h2/a/@href').extract()
        # video title
        titlelist = firstresponse.xpath(
            '//div[@class="tvinfo"]/h2/a/text()').extract()
        title = titlelist[0] if titlelist else ''
        meta = {"uid": uid, "title": title, "url": url}
        fullurl = 'https:' + titleurl[0] if titleurl else ''
        if fullurl != '':
            yield scrapy.http.Request(url=fullurl,
                                      callback=self.parse_youku_second,
                                      meta=meta,
                                      dont_filter=True)
        else:
            self.updateOnSuccess(response.meta["uid"], '', '', 99, '', '')
            self.loggerWithTime(u"youku-else-video[%s]" % url)
def parse(self, response):
    # current_url = response.url    # the URL requested for this crawl
    # body = response.body          # the returned HTML (bytes)
    # unicode_body = response.text  # the HTML decoded to unicode
    hxs = scrapy.Selector(response)
    if re.match(r'http://www.xiaohuar.com/list-1-\d+.html', response.url):
        items = hxs.xpath('//div[@class="item_list infinite_scroll"]/div')
        # XPath positions are 1-based, so index from 1 to len(items)
        for i in range(1, len(items) + 1):
            src = hxs.xpath(
                '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/a/img/@src'
                % i).extract()
            name = hxs.xpath(
                '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/span/text()'
                % i).extract()
            school = hxs.xpath(
                '//div[@class="item_list infinite_scroll"]/div[%d]//div[@class="img"]/div[@class="btns"]/a/text()'
                % i).extract()
            if src:
                ab_src = "http://www.xiaohuar.com" + src[0]
                file_name = "%s_%s.jpg" % (school[0], name[0])
                file_path = os.path.join("/data/scrapy/pic/", file_name)
                urlretrieve(ab_src, file_path)  # from urllib.request import urlretrieve
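# A hedged alternative to the blocking urlretrieve above: yield the image
# URL and let Scrapy's ImagesPipeline download it (assumes settings.py
# enables scrapy.pipelines.images.ImagesPipeline and sets IMAGES_STORE;
# "image_urls" is the pipeline's default input field):
#
#   yield {"image_urls": [ab_src], "image_name": file_name}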