def parse(self, response): """ URL routing takes place here (for the most part) """ # @TODO figure out how to due away with this upkeep # do not allow crawl_depth value to balloon to a high number if response.meta.get('crawl_depth', self._crawl_depth) < 0: response.meta['crawl_depth'] = 0 # @TODO log if response.data.get('html'): # @FIXME is there a better way to yield from generator? route_generator = self.handle_html( response, selector.Selector(text=response.data['html'])) for route in route_generator: yield route #if len(response.data.get('childFrames', [])) > 0: for child_frames in response.data.get('childFrames', []): for frame_item in child_frames: frame_html = frame_item.get('html', '') # @FIXME is there a better way to yield from generator? route_generator = self.handle_html( response, selector.Selector(text=frame_html)) for route in route_generator: yield route
def htmlToJson(html):
    sel = selector.Selector(text=html)
    table_heads = sel.xpath('//*[@id="xsjbxx"]/tr/td/text()').extract()
    name = table_heads[1]
    id = table_heads[3]
    idcard = table_heads[5]
    major = table_heads[9]
    _class = table_heads[11]
    college = table_heads[7]
    term = ""
    every_score = []
    detail = OrderedDict()
    tr = sel.xpath('//*[@id="xscjxx"]/tr').extract()
    tr.append("colspan")  # sentinel entry so the last term is flushed below
    for i in tr:
        if i.find(u"colspan") != -1:
            if len(every_score) > 1:
                detail[term] = every_score
            if i == 'colspan':
                break
            every_score = []
            try:
                term = re.search('>.*<', i).group()[6:-1]
            except Exception as e:
                pass
        else:
            title, grade, score, Null = re.findall(r'(?<=\>).*(?=<)', i)
            item = {'title': title, 'grade': grade, 'score': score}
            every_score.append(item)
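# htmlToJson as shown above fills `detail` but never serializes or returns it; the
# tail below is only a hypothetical sketch of how the parsed fields could be turned
# into an actual JSON string (the original return statement is not in this snippet).
import json

def scores_to_json(name, college, detail):
    # `detail` maps a term name to its list of {'title', 'grade', 'score'} dicts.
    return json.dumps({'name': name, 'college': college, 'scores': detail},
                      ensure_ascii=False)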
def test_three_td_with_three_four_five_colspan(self):
    # 17 td in total; three of them carry colspan (3, 4 and 5)
    tr_text = '''
    <tr bgcolor="WHITE" align="CENTER">
    <td nowrap=""><font size="1" face="ARIAL"><a target="_self" href="../../Resultspro/2013/ResultsPro13659.asp">02-06-13</a></font></td>
    <td align="CENTER"><font size="1" face="ARIAL">659 D</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">ST tf g/f -A+3 </font></td>
    <td><font size="1" face="ARIAL">1200</font></td>
    <td><font size="1" face="ARIAL">4</font></td>
    <td><font size="1">DISQ</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">P. F. Yiu</font></td>
    <td colspan="3"><font size="1" face="ARIAL">125</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">T. Berry</font></td>
    <td colspan="4"><font size="1" face="ARIAL">T-BOLT</font></td>
    <td><font size="1" face="ARIAL">FAY DEEP</font></td>
    <td><font size="1" face="ARIAL">1:09.5</font></td>
    <td><font size="1" face="ARIAL">(22.8 )</font></td>
    <td><font size="1" face="ARIAL">24.71 22.59 25.48</font></td>
    <td colspan="5"><center><font size="2">DISQ</font></center></td>
    <td align="CENTER"><font size="1" face="ARIAL">1180</font></td>
    <td><font size="1" face="ARIAL">52</font></td>
    </tr>
    '''
    tr = selector.Selector(text=tr_text).xpath('//tr')
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 1), 1)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 8), 8)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 9), 8)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 15), 10)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 16), 11)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 24), 15)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 26), 17)
def test_no_colspan(self):
    tr_text = '''
    <tr bgcolor="WHITE" align="CENTER">
    <td nowrap=""><font size="1" face="ARIAL"><a target="_self" href="../../Resultspro/2013/ResultsPro13754.asp">07-07-13</a></font></td>
    <td align="CENTER"><font size="1" face="ARIAL">754 D</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">ST tf g/f -B+2 </font></td>
    <td><font size="1" face="ARIAL">1200</font></td>
    <td><font size="1" face="ARIAL">4</font></td>
    <td><font size="1" face="ARIAL">8</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">P. F. Yiu</font></td>
    <td><font size="1" face="ARIAL">125</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">T. Berry</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">2</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">B1/XB</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">DINING STAR</font></td>
    <td><font size="1" face="ARIAL">PSYCHOLOGIST</font></td>
    <td><font size="1" face="ARIAL">MULTIEXPRESS</font></td>
    <td><font size="1" face="ARIAL">1:09.5</font></td>
    <td><font size="1" face="ARIAL">(22.7 )</font></td>
    <td><font size="1" face="ARIAL">24.73 22.51 23.08</font></td>
    <td><font size="1" face="ARIAL">1:10.3</font></td>
    <td><font size="1" face="ARIAL">9-9-8</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">5</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">1178</font></td>
    <td><font size="1" face="ARIAL">52</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">5.1</font></td>
    <td align="CENTER"><blink><font color="#FF0000" size="1" face="ARIAL">3.9</font></blink></td>
    <td align="CENTER"><a href="../mg_bf_race.asp?Hores_Name=GRACEFUL+KINGDOM&Horse_id=P145&rdate=07-Jul-2013"><img width="25" height="23" border="0" src="/images/morngall-s.gif"></a></td>
    </tr>
    '''
    tr = selector.Selector(text=tr_text).xpath('//tr')
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 1), 1)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 25), 25)
def download_mag(url, dir, out):
    # decode so the Selector receives text, not bytes (assumes a UTF-8 page)
    sc = urllib.request.urlopen(url).read().decode('utf-8')
    root = sel.Selector(text=sc)
    articles = root.xpath("//li[contains(@class,'cit')]")
    pages = [x.xpath(".//span[contains(@class,'cit-first-page')]/text()").extract()
             for x in articles]
    links = [x.xpath(".//div[contains(@class,'cit-extra')]"
                     "//a[contains(@rel,'full-text.pdf')]/@href").extract()
             for x in articles]
    toc = root.xpath('//*[@id="pdf-matter"]/div[1]/ul/li/a[contains(text(), '
                     '"Print Table of Contents")]/@href').extract()

    # remove articles without links
    no_pdf = [i for i, x in enumerate(links) if not x]
    pages = [p[0] if p else None for i, p in enumerate(pages) if i not in no_pdf]
    links = [p[0] if p else None for i, p in enumerate(links) if i not in no_pdf]
    #articles = [p for i, p in enumerate(articles) if i not in no_pdf]

    # sort pdfs
    sorted_page_index = sort_pages(pages)

    # download pdfs
    if not dir:
        dir = download_pdfs([SCIENCE_BASE + x for x in links])

    # merge pdfs
    merge_pdfs([os.path.join(dir, os.path.basename(links[p]))
                for p in sorted_page_index], out)

    # remove duplicates
    remove_duplicates(out)
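# sort_pages is called above but not defined in this snippet. The sketch below is
# only a plausible implementation, assuming it returns the indices of `pages`
# ordered by numeric first-page value so the PDFs are merged in page order.
def sort_pages(pages):
    def page_key(page):
        try:
            return int(page)
        except (TypeError, ValueError):
            return float('inf')  # articles with no page number go last
    return sorted(range(len(pages)), key=lambda i: page_key(pages[i]))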
def parse(self, response):
    sel = selector.Selector(response)
    sites = sel.xpath('//img/@src').extract()
    items = []
    for each in sites:
        item = ImageItem()
        item['image'] = each
        items.append(item)
    return items
def parse(self, response):
    sel = selector.Selector(response)
    keyword = sel.xpath('//*[@id="keyword"]/@value').extract()
    dd = sel.xpath('//li[@class="con_list_item default_list"]')
    for d in dd:
        position = LagouItem()
        position['type'] = keyword
        print(position['type'])
        position['index'] = d.xpath('@data-index').extract()
        position['salary'] = d.xpath('@data-salary').extract()
        position['company'] = d.xpath('@data-company').extract()
        position['position'] = d.xpath(
            'div[@class="list_item_top"]/div/div/a/span/em/text()'
        ).extract()
        position['positionname'] = d.xpath('@data-positionname').extract()
        position['time'] = time.strftime('%Y-%m-%d',
                                         time.localtime(time.time()))
        yield position
def transport_scores(data_folder='data', filename=None):
    """Table of public transportation scores by city ID from ville-ideale.fr."""
    if not filename:
        filename = path.join(data_folder, 'geo/ville-ideale-transports.html')
    with open(filename, 'rt') as transport_file:
        page_text = transport_file.read()
    page_selector = selector.Selector(text=page_text)

    # Parse the links containing city name and city ID.
    city_ids = (
        link.split('_')[-1]
        for link in page_selector.xpath('//td[@class="ville"]/a/@href').extract())

    # Parse the scores.
    scores = (
        float(note.replace(',', '.'))
        for note in page_selector.xpath('//td[@class="note"]/text()').extract())

    return {
        city_id: score if score >= .1 else .1
        for city_id, score in zip(city_ids, scores)}
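# Hypothetical usage of transport_scores; the city ID below is only an example of
# the identifiers parsed out of the ville-ideale.fr links, not a known value.
if __name__ == '__main__':
    city_scores = transport_scores()
    print(city_scores.get('75056'))  # clamped score for that city, or None if absent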
def parse(self, response):
    sel = selector.Selector(response)
    keyword = sel.xpath('//*[@id="keyword"]/@value').extract()
    dd = sel.xpath('//li[@class="con_list_item default_list"]')
    i = 1
    for d in dd:
        i += 1
        position = LagouItem()
        position['type'] = keyword
        position['index'] = d.xpath('@data-index').extract()
        position['salary'] = d.xpath('@data-salary').extract()
        position['company'] = d.xpath('@data-company').extract()
        position['position'] = d.xpath(
            'div[@class="list_item_top"]/div/div/a/span/em/text()').extract()
        position['positionname'] = d.xpath('@data-positionname').extract()
        position['time'] = time.strftime('%Y-%m-%d', time.localtime(time.time()))
        yield position

    # Follow the "next page" link only if it is an absolute URL.
    purl = sel.xpath('//div[@class="pager_container"]/a[last()]/@href').extract_first()
    if purl and 'http' in purl:
        yield scrapy.Request(purl, callback=self.parse, dont_filter=True)
def test_one_td_with_three_colspan(self):
    # 23 td in total; one of them carries colspan="3"
    tr_text = '''
    <tr bgcolor="WHITE" align="CENTER">
    <td nowrap=""><font size="1" face="ARIAL"><a target="_self" href="../../Resultspro/2013/ResultsPro13659.asp">02-06-13</a></font></td>
    <td align="CENTER"><font size="1" face="ARIAL">659 D</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">ST tf g/f -A+3 </font></td>
    <td><font size="1" face="ARIAL">1200</font></td>
    <td><font size="1" face="ARIAL">4</font></td>
    <td><font size="1">DISQ</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">P. F. Yiu</font></td>
    <td><font size="1" face="ARIAL">125</font></td>
    <td align="LEFT"><font size="1" face="ARIAL">T. Berry</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">2</font></td>
    <td align="CENTER"><font size="1" face="ARIAL"> </font></td>
    <td align="LEFT"><font size="1" face="ARIAL">REGENCY CHAMPION</font></td>
    <td><font size="1" face="ARIAL">T-BOLT</font></td>
    <td><font size="1" face="ARIAL">FAY DEEP</font></td>
    <td><font size="1" face="ARIAL">1:09.5</font></td>
    <td><font size="1" face="ARIAL">(22.8 )</font></td>
    <td><font size="1" face="ARIAL">24.71 22.59 25.48</font></td>
    <td colspan="3"><center><font size="2">DISQ</font></center></td>
    <td align="CENTER"><font size="1" face="ARIAL">1180</font></td>
    <td><font size="1" face="ARIAL">52</font></td>
    <td align="CENTER"><font size="1" face="ARIAL">2.6</font></td>
    <td align="CENTER"><blink><font color="#FF0000" size="1" face="ARIAL">2.8</font></blink></td>
    <td align="CENTER"><a href="../mg_bf_race.asp?Hores_Name=GRACEFUL+KINGDOM&Horse_id=P145&rdate=02-Jun-2013"><img width="25" height="23" border="0" src="/images/morngall-s.gif"></a></td>
    </tr>
    '''
    tr = selector.Selector(text=tr_text).xpath('//tr')
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 1), 1)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 17), 17)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 18), 18)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 19), 18)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 20), 18)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 21), 19)
    self.assertEquals(scmp_spider.HorseSpider.get_td_ind(tr, 25), 23)
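# get_td_ind itself is not part of these test snippets. The sketch below is one
# implementation consistent with the assertions above (an assumption, not
# necessarily the project's actual code): it maps a logical column number to the
# 1-based index of the <td> that covers it, expanding colspan attributes. On
# HorseSpider it would be declared as a @staticmethod.
def get_td_ind(tr, col):
    covered = 0  # number of logical columns covered so far
    for physical, td in enumerate(tr.xpath('.//td'), start=1):
        covered += int(td.xpath('@colspan').extract_first() or 1)
        if covered >= col:
            return physical
    return None  # `col` lies beyond the last cell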
def adie_events2dicts(events_html):
    """Convert the events page of ADIE into our own Event format before Mongo import.

    Args:
        events_html: path to a file with the HTML content of the ADIE events page.
    Returns:
        an iterable of dict with the JSON values of the Event proto.
    """
    with open(events_html, 'rt') as events_file:
        page_text = events_file.read()
    page_selector = selector.Selector(text=page_text)

    # Parse the markers with coordinates.
    map_div = page_selector.xpath('//div[@class="acf-map"]')
    markers = [{
        'data-lat': d.xpath('@data-lat').extract_first(),
        'data-lng': d.xpath('@data-lng').extract_first(),
    } for d in map_div.xpath('div[@class="marker"]')]

    # Parse the other attributes.
    events_script = page_selector.xpath(
        '//script[contains(., "var evenements = []")]/text()').extract_first()
    if not events_script:
        raise ValueError(
            '"{}" does not contain the javascript to create events:\n{}'
            .format(events_html, page_text))
    if 'evenement = []' not in events_script:
        raise ValueError('The [] bug is fixed, please drop the replace code')
    events_script = events_script.replace('evenement = []', 'evenement = {}')
    events = js2py.eval_js(events_script + ';evenements')

    # Join coordinates and other attributes.
    return [
        _adie_event_to_proto(dict(a, **b))
        for a, b in zip(markers, events)
    ]
def parse_item(self, response):
    numb = response.url.split('/')[-1]
    item = VeloItem(url=response.url,
                    crawl_date=response.meta['crawl_date'],
                    image_url=response.meta['img'],
                    title=response.meta['title'])
    hxs = selector.Selector(response, type='html')
    item['price'] = self.get_decimal_price(''.join(
        hxs.xpath(self.price_xpath).extract()))
    item['description'] = ' '.join(
        hxs.xpath(self.description_xpath).extract())
    if not item['image_url']:
        item['image_url'] = ''.join(hxs.xpath(self.img_xpath).extract())
    if not item['title']:
        item['title'] = ''.join(hxs.xpath(self.title_xpath).extract())
    store_data = self.get_store_supply_data(
        numb,
        response.meta['jsession_id'],
        response.url,
        response.meta['csrf_token'],
    )
    item['stores'] = store_data
    yield item
def parse(self, response):
    limit = int(re.search(r'\d+', response.url).group())
    next_url = re.sub(r'\d+', str(limit + 30), response.url, count=1)
    yield scrapy.Request(next_url, self.parse)

    result = re.findall(
        r'<script>pageLoader\(({"id":"tuan-list".*?)</script>', response.text)
    res = result[0][:-2]
    res = re.sub("\'", '\"', res)
    obj = json.loads(res)
    html_response = obj['html']
    html = selector.Selector(text=html_response)
    li_list = html.xpath('//div[@id="list"]//li')
    for li in li_list:
        item = QunaItem()
        item['djscategory'] = re.search('category=(.*?)&', response.url).group(1)
        item['_id'] = re.search(
            r'\d+', li.xpath('./a/@href').extract_first()).group()
        item['djstitle'] = li.xpath(
            './/div[@class="nm"]/@title').extract_first()
        item['djsimg_url'] = li.xpath(
            './/div[@class="imgs loading"]/img/@src').extract_first()
        item['djstype_gt'] = li.xpath(
            './/div[@class="type_gt"]/text()').extract_first()
        item['djsjiejian'] = li.xpath(
            './/div[@class="sm"]/@title').extract_first()
        item['djsprice'] = li.xpath(
            './/div[@class="price"]//em/text()').extract_first()
        item['djsdate'] = li.xpath(
            './/div[@class="tip"]//span[1]/text()').extract_first()
        item['djsnum'] = li.xpath(
            './/div[@class="tip"]//span[2]/em/text()').extract_first()
        item['djsdetail_url'] = li.xpath('./a/@href').extract_first()
        yield item
from scrapy import selector
from bs4 import BeautifulSoup

html_str = '''<table>
<tr><td>姓名</td><td>年龄</td></tr>
<tr><td>龙泽啦啦</td><td>23</td></tr>
<tr><td>餐巾空</td><td>25</td></tr>
</table>'''

html = selector.Selector(text=html_str)
# css() and xpath() can be chained on the same Selector.
cells = html.css("table").xpath("./tr/td/text()").getall()
name = html.xpath("//tr[1]/td[2]/text()").get()

soup = BeautifulSoup(html_str, "html5lib")
name1 = soup.select("td")

print(name, "-----")
print(name1)
def handle_html(self, response, html_selector):
    """ Parse HTML and extract links

    :type response: scrapy.http.Response
    :type html_selector: scrapy.selector.Selector
    :yields: dict, scrapy.Request
    """
    # @TODO handles for different parts of the HTML. eg. body, head, frameset
    log = structlog.get_logger().bind(
        event='PARSE_HTML',
        module=__file__,
        source_url=response.url,
        content_type='HTML')

    crawl_depth = response.meta.get('crawl_depth', self._crawl_depth)
    title = response.data.get('title', response.url)
    try:
        body = html_selector.xpath('//body')[0]
    except IndexError:
        body = selector.Selector(text='')

    yield dict(
        source_url=response.url,
        crawl_timestamp=self._crawl_start_datetime.strftime('%Y-%m-%dT%H:%M:%SZ'),
        title=title,
        content_type='HTML',
        content=body.extract())

    # add domain to set of traversed domains
    parsed_resp_url = http.urlparse(response.url.encode('utf8')).decode()
    self._traversed_domains.add(parsed_resp_url.netloc)

    # extract links
    linkextractor = LxmlLinkExtractor(
        allow=self._patterns_url_whitelist,
        deny=self._patterns_url_blacklist,
        allow_domains=self._patterns_domain_whitelist,
        deny_domains=self._patterns_domain_blacklist)
    href_list = linkextractor.extract_links(response)

    for link in href_list:
        # get the URL in string format
        href = link.url

        # separate meaningful pieces of URL
        try:
            parsed_href = http.urlparse(href.encode('utf8')).decode()
        except:
            # typically href URL is invalid
            log.error(error="INVALID_URL", href=href)
            continue

        # only parse HTTP links
        if parsed_href.scheme.upper() in ['HTTP', 'HTTPS']:
            # split the query string from the href, do not follow _href!
            _href = ''.join([parsed_href.netloc, parsed_href.path])
            # determine file type from the URL
            content_type = self.identify_type_from_url(_href)

            # make routing decision based on content type
            route = None
            if content_type in ['HTML']:
                route = response.follow(
                    href,
                    callback=self.parse,
                    errback=self.errback,
                    meta=dict(
                        crawl_depth=crawl_depth - 1,
                        splash={
                            'endpoint': 'render.json',
                            'args': {
                                'html': 1,
                                'iframes': 1,
                                'timeout': 10,
                            }
                        }
                    )
                )
            elif content_type in self._processable_ext:
                log.info('@TODO')  # @TODO

            # is crawl at 0 depth?
            conditions = any([
                crawl_depth > 0,
                all([
                    crawl_depth <= 0,
                    parsed_href.netloc in self._traversed_domains
                ]),
            ])
            if conditions and route is not None:
                yield route
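# identify_type_from_url is referenced above but not included in this snippet. The
# function below is a minimal sketch under the assumption that it keys off the URL
# path's extension; the names and mapping are illustrative, not the spider's actual
# table (on the spider it is a method rather than a module-level function).
import os

_URL_EXT_TYPES = {'': 'HTML', '.html': 'HTML', '.htm': 'HTML', '.pdf': 'PDF'}

def identify_type_from_url(url):
    # Guess a coarse content type from the extension, defaulting to the
    # upper-cased extension itself so unknown types can still be routed.
    ext = os.path.splitext(url)[1].lower()
    return _URL_EXT_TYPES.get(ext, ext.lstrip('.').upper())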
def get_co_authors(filename):
    '''
    filename: file holding the scholarIDs of the scholars to crawl, one per line
    '''
    # `usr_gent` (current User-Agent string) and `ua` (whose .chrome attribute yields
    # a random Chrome User-Agent) are assumed to be defined at module level.
    global usr_gent
    base_url = 'http://xueshu.baidu.com/scholarID/%s'
    mongo_url = 'mongodb://*****:*****@localhost:27017/admin'  # MongoDB database
    author_ = []
    iter = 0
    headers = {'User-Agent': usr_gent}
    cap = DesiredCapabilities.PHANTOMJS.copy()
    for key, value in headers.items():
        cap['phantomjs.page.customHeaders.{}'.format(key)] = value
    # skip loading images so pages crawl much faster
    cap["phantomjs.page.settings.loadImages"] = False
    # initialize the browser (Chrome would work just as well)
    browser = webdriver.PhantomJS(desired_capabilities=cap)
    with open(filename) as f:
        author = f.readline()
        while author:
            try:
                scholarID = author.strip()
                if iter % 10 == 0:
                    # flush to MongoDB every 10 scholars
                    print(iter, datetime.datetime.now())
                    client = pymongo.MongoClient(mongo_url)
                    db = client['professor']
                    for it in author_:
                        db['professor_net'].insert_one(it)
                    author_ = []
                if iter % 50 == 0:
                    # every 50 scholars, restart PhantomJS with a fresh User-Agent
                    browser.quit()
                    new_usr = ua.chrome
                    while new_usr == usr_gent:
                        new_usr = ua.chrome
                    usr_gent = new_usr
                    headers = {'User-Agent': usr_gent}
                    cap = DesiredCapabilities.PHANTOMJS.copy()
                    for key, value in headers.items():
                        cap['phantomjs.page.customHeaders.{}'.format(key)] = value
                    # skip loading images so pages crawl much faster
                    cap["phantomjs.page.settings.loadImages"] = False
                    browser = webdriver.PhantomJS(desired_capabilities=cap)
                try:
                    a = {}
                    a[scholarID] = []
                    browser.get(base_url % scholarID)
                    WebDriverWait(browser, 5).until(
                        EC.presence_of_element_located((By.CSS_SELECTOR, '.p_name')))
                    try:
                        button = browser.find_element_by_css_selector('.co_author_more')
                        button.click()
                        WebDriverWait(browser, 5).until(
                            EC.presence_of_element_located(
                                (By.CSS_SELECTOR, '.co_relmap_wrapper')))
                        sel = selector.Selector(text=browser.page_source)
                        co_authors = sel.css('.co_person_name')
                        for co_author in co_authors:
                            # name, number of joint papers and affiliation of each co-author
                            name = co_author.css('div::text').extract_first()
                            num = co_author.xpath('./@paper-count').extract_first()
                            affiliate = co_author.xpath('./@affiliate').extract_first()
                            a[scholarID].append(
                                {'name': name, 'num': num, 'affiliate': affiliate})
                        author_.append(a)
                    except:
                        print('fewer than four co-authors')
                    iter += 1
                except:
                    # the page failed to load: restart PhantomJS with a fresh User-Agent
                    print('error')
                    browser.quit()
                    new_usr = ua.chrome
                    while new_usr == usr_gent:
                        new_usr = ua.chrome
                    usr_gent = new_usr
                    headers = {'User-Agent': usr_gent}
                    cap = DesiredCapabilities.PHANTOMJS.copy()
                    for key, value in headers.items():
                        cap['phantomjs.page.customHeaders.{}'.format(key)] = value
                    # skip loading images so pages crawl much faster
                    cap["phantomjs.page.settings.loadImages"] = False
                    browser = webdriver.PhantomJS(desired_capabilities=cap)
                finally:
                    author = f.readline()
            except:
                # ignore any other unexpected error for this scholar and move on
                pass
"Origin": "http://bxjg.circ.gov.cn", "Upgrade-Insecure-Requests": "1", "Referer": "http://bxjg.circ.gov.cn/tabid/6596/Default.aspx", "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36", "Cookie": "__jsluid=646a1aa9a5c9269dde7d980b23569a68; .ASPXANONYMOUS=EDRX24NG1QEkAAAAYjMwOThhNjMtZDRiZC00ZmFlLThmMWYtYTY3ZTJmN2QyNmFj0; Hm_lvt_6a2f36cc16bd9d0b01b10c2961b8900c=1558456002,1559044846,1559741649; ASP.NET_SessionId=zo3kjablw1yc5vyvmrc104vu; language_0=zh-CN; COOKIE_USERID=LTE`" } req1 = requests.get(url1, headers=headers1) print(req1.headers) text1 = req1.text headers1 = req1.headers html1 = selector.Selector(req1) viewstate1 = html1.xpath("input[@name='__VIEWSTATE']/@value").get() print(viewstate1) data1 = { "__EVENTTARGET": "ess$ctr17198$SearchOrganization$lkbSearch", "__EVENTARGUMENT": "", "__VIEWSTATE": viewstate1, "__VIEWSTATEGENERATOR": "CA0B0334", "ScrollTop": "", "__essVariable": "ess$ctr17198$SearchOrganization$lkbSearch", "ess$ctr17198$SearchOrganization$txtComName": "", "ess$ctr17198$SearchOrganization$ddlComType": "-1", "ess$ctr17198$SearchOrganization$txtOrgDateS": "", "ess$ctr17198$SearchOrganization$txtOrgDateE": "", "ess$ctr17198$SearchOrganization$ddlState": "-1", "ess$ctr17198$SearchOrganization$ddlSW": "-1",
# coding: utf-8
# scrapy shell session: fetch() loads the page and populates `response`.
import scrapy.selector as sel

fetch("https://docs.scrapy.org/en/latest/topics/selectors.html#scrapy.selector.SelectorList")
r = response
r = sel.Selector(r)
r.xpath('//div/a/img').extract()
r.xpath('//div[@id="images"]/a/img[@src="image1_thumb.jpg"]').extract()
r.xpath('//div[@id="images"]/a/img')[1].attrib['src']
r.xpath('//base/@href')