def parse(self, response):
    """Scrapy entry point: drive a logged-in Selenium session through the
    paginated cyclopedia listing and yield one item per full-text link.

    The Scrapy ``response`` is unused beyond triggering the callback;
    Selenium re-fetches ``start_urls[0]`` itself so that ``login()`` /
    ``go_to_cylcopedia()`` can interact with the live page.

    :param response: Scrapy response object (ignored — Selenium drives).
    :yields: populated ``cyclopedia_entry`` items (title, author, characters).
    """
    # The "next page" pager link is addressed by the same ASP.NET id on
    # every page; hoist it so the selector appears exactly once.
    next_link_css = (
        '#ctl00_ctl00_MainContentArea_MainContentArea_bottomMultiPage_lnkNext'
    )

    self.driver.get(self.start_urls[0])
    # login and go to cyclopedia
    self.login()
    self.go_to_cylcopedia()

    has_next = True
    num_pages = 1
    while has_next:
        if num_pages in self.skip_pages:
            # Page is in the skip list: click straight through to the next
            # page without scraping this one.
            try:
                self.driver.find_element_by_css_selector(next_link_css).click()
                num_pages += 1
            except Exception:
                # No "next" link found -> assume we hit the last page.
                has_next = False
            if num_pages > self.max_pages + 1:
                break
            continue

        # get page body
        body = self.driver.page_source
        # get all external full-text links on this results page
        link_list = scrapy.Selector(text=body).xpath(
            '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
        ).extract()
        for link in link_list:
            title, author, characters = self.get_full_text_info(link)
            item = cyclopedia_entry()
            item['title'] = title
            item['author'] = author
            item['characters'] = characters
            yield item

        # Advance to the next page; absence of the link ends the crawl.
        try:
            self.driver.find_element_by_css_selector(next_link_css).click()
            num_pages += 1
        except Exception:
            has_next = False
            continue
        if num_pages > self.max_pages + 1:
            break
    self.driver.close()
def parse(self, response):
    """Scrapy entry point: crawl the cyclopedia listing via Selenium,
    waiting for each page's full batch of 50 links to render.

    Fixes a hang in the original: the page-source poll only terminated
    when *exactly* 50 links were present, so the final (shorter) page
    looped forever.  The poll is now bounded and falls back to whatever
    links were last extracted.

    :param response: Scrapy response object (ignored — Selenium drives).
    :yields: populated ``cyclopedia_entry`` items (title, author, characters).
    """
    import time  # local import: only needed for the render-wait below

    next_link_css = (
        '#ctl00_ctl00_MainContentArea_MainContentArea_bottomMultiPage_lnkNext'
    )
    xpath_links = (
        '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
    )

    self.driver.get(self.start_urls[0])
    # login and go to cyclopedia
    self.login()
    self.go_to_cylcopedia()

    has_next = True
    num_pages = 1
    while has_next:
        if num_pages in self.skip_pages:
            # Skip-listed page: advance without scraping.
            try:
                self.driver.find_element_by_css_selector(next_link_css).click()
                num_pages += 1
            except Exception:
                has_next = False
            if num_pages > self.max_pages + 1:
                break
            continue

        # Poll the page source until a full batch of 50 links has rendered
        # (results load asynchronously).  Bounded so the last, shorter page
        # cannot spin forever; we then scrape whatever is present.
        link_list = []
        for _attempt in range(25):
            body = self.driver.page_source
            link_list = scrapy.Selector(text=body).xpath(
                xpath_links
            ).extract()
            if len(link_list) == 50:
                break
            time.sleep(0.2)

        for link in link_list:
            title, author, characters = self.get_full_text_info(link)
            item = cyclopedia_entry()
            item['title'] = title
            item['author'] = author
            item['characters'] = characters
            yield item

        try:
            self.driver.find_element_by_css_selector(next_link_css).click()
            num_pages += 1
        except Exception:
            has_next = False
            continue
        if num_pages > self.max_pages:
            break
    self.driver.close()
def parse(self, response):
    """Scrapy entry point for the cyclopedia crawl (Selenium-driven).

    Logs in, navigates to the cyclopedia listing, then walks each results
    page — skipping any page number in ``self.skip_pages`` — and yields a
    ``cyclopedia_entry`` item for every external full-text link found.
    Stops when the pager's "next" link disappears or ``self.max_pages``
    is exceeded, then closes the browser.

    :param response: Scrapy response object (ignored — Selenium re-fetches
        ``start_urls[0]`` directly).
    :yields: populated ``cyclopedia_entry`` items.
    """
    # Single definition of the ASP.NET pager "next" link selector.
    next_link_css = (
        '#ctl00_ctl00_MainContentArea_MainContentArea_bottomMultiPage_lnkNext'
    )

    self.driver.get(self.start_urls[0])
    # login and go to cyclopedia
    self.login()
    self.go_to_cylcopedia()

    has_next = True
    num_pages = 1
    while has_next:
        if num_pages in self.skip_pages:
            # Skip-listed page: click through without scraping it.
            try:
                self.driver.find_element_by_css_selector(
                    next_link_css
                ).click()
                num_pages += 1
            except Exception:
                # Missing "next" link means the last page was reached.
                has_next = False
            if num_pages > self.max_pages + 1:
                break
            continue

        # get page body
        body = self.driver.page_source
        # get all links
        link_list = scrapy.Selector(text=body).xpath(
            '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
        ).extract()
        for link in link_list:
            title, author, characters = self.get_full_text_info(link)
            item = cyclopedia_entry()
            item['title'] = title
            item['author'] = author
            item['characters'] = characters
            yield item

        try:
            self.driver.find_element_by_css_selector(
                next_link_css
            ).click()
            num_pages += 1
        except Exception:
            has_next = False
            continue
        if num_pages > self.max_pages + 1:
            break
    self.driver.close()
def parse(self, response):
    """Scrapy entry point: Selenium-driven cyclopedia crawl with a
    render-wait on each results page.

    Bug fix versus the original: ``while get_body`` only exited once
    exactly 50 links were present, so a final page with fewer results
    hung the spider forever.  The wait is now capped at a fixed number
    of attempts and the page is scraped with whatever links were last
    extracted.

    :param response: Scrapy response object (ignored — Selenium drives).
    :yields: populated ``cyclopedia_entry`` items.
    """
    import time  # local import: only used to pace the render-wait

    next_link_css = (
        '#ctl00_ctl00_MainContentArea_MainContentArea_bottomMultiPage_lnkNext'
    )
    xpath_links = (
        '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
    )

    self.driver.get(self.start_urls[0])
    # login and go to cyclopedia
    self.login()
    self.go_to_cylcopedia()

    has_next = True
    num_pages = 1
    while has_next:
        if num_pages in self.skip_pages:
            # Skip-listed page: advance the pager without scraping.
            try:
                self.driver.find_element_by_css_selector(
                    next_link_css
                ).click()
                num_pages += 1
            except Exception:
                has_next = False
            if num_pages > self.max_pages + 1:
                break
            continue

        # Wait (bounded) for the asynchronous results to finish rendering:
        # a complete page carries exactly 50 links; a shorter last page
        # simply exhausts the attempts and is scraped as-is.
        link_list = []
        for _attempt in range(25):
            body = self.driver.page_source
            link_list = scrapy.Selector(text=body).xpath(
                xpath_links
            ).extract()
            if len(link_list) == 50:
                break
            time.sleep(0.2)

        for link in link_list:
            title, author, characters = self.get_full_text_info(link)
            item = cyclopedia_entry()
            item['title'] = title
            item['author'] = author
            item['characters'] = characters
            yield item

        try:
            self.driver.find_element_by_css_selector(
                next_link_css
            ).click()
            num_pages += 1
        except Exception:
            has_next = False
            continue
        if num_pages > self.max_pages:
            break
    self.driver.close()