def parse_other(self, response):
    """Parse one Springer search-results page and yield an item per entry.

    No further pagination is scheduled here; parse_first owns paging.
    """
    for entry in response.xpath("//ol[@class='content-item-list']/li"):
        paper = PaperItem()
        title_parts = entry.xpath("./h2/a//text()").getall()
        paper['title'] = "".join(title_parts).replace("\n", "").replace(" ", "")
        snippet_parts = entry.xpath("./p[@class='snippet']/text()").getall()
        paper['snippet'] = "".join(snippet_parts).replace("\n", "").strip()
        author_names = entry.xpath(
            "./p[@class='meta']/span[@class='authors']/a/text()").getall()
        paper['authors'] = ",".join(author_names)
        paper['publication'] = entry.xpath(
            "./p[@class='meta']/span[@class='enumeration']/a/text()").get()
        paper['publicationDate'] = entry.xpath(
            "./p[@class='meta']/span[@class='enumeration']/span/@title").get()
        paper['publisher'] = 'Springer'
        paper['search'] = self.search
        yield paper
def parse_detail_1(self, response):
    """Scrape ICLR 2018 accepted *poster* papers with a PhantomJS browser.

    The OpenReview page is rendered client-side, so Selenium is used
    instead of the Scrapy response.

    Fixes: the original built PaperItem objects but never yielded them
    (every item was silently discarded), tested the always-true
    ``find_elements... is not None``, and never shut the browser down.
    """
    pjs_obj = webdriver.PhantomJS(
        executable_path=
        r'D://phantomjs-2.1.1-windows/phantomjs-2.1.1-windows/bin/phantomjs.exe'
    )
    try:
        pjs_obj.get(
            'https://openreview.net/group?id=ICLR.cc/2018/Conference#accepted-poster-papers'
        )
        sleep(5)  # wait for the client-side JS to render the paper list
        # find_elements_by_xpath returns a (possibly empty) list, never None.
        sele = pjs_obj.find_elements_by_xpath(
            '//*[@id="accepted-poster-papers"]/ul/li')
        for i in sele:
            item = PaperItem()
            item['title'] = i.find_element_by_xpath('./h4/a[1]').text
            item['file_urls'] = [
                i.find_element_by_xpath('./h4/a[2]').get_attribute('href')
            ]
            item['authors'] = i.find_element_by_xpath('./div[1]').text
            item['publicationDate'] = 'ICLR'
            item['publication'] = 'ICLR'
            item['publisher'] = 'ICLR'
            item['snippet'] = 'ICLR'
            item['keyword'] = 'ICLR'
            item['search'] = 'ICLR'
            # BUG FIX: items were previously constructed and then dropped.
            yield item
    finally:
        pjs_obj.quit()  # always release the PhantomJS process
def parse_detail(self, response):
    """Scrape ICLR 2018 accepted *oral* papers, then chain to the posters.

    Fixes: the PhantomJS process was never shut down (now released in a
    ``finally`` block) and the leftover debug ``print`` was removed.
    """
    url = 'https://openreview.net/group?id=ICLR.cc/2018/Conference#accepted-oral-papers'
    pjs_obj = webdriver.PhantomJS(
        executable_path=
        r'D://phantomjs-2.1.1-windows/phantomjs-2.1.1-windows/bin/phantomjs.exe'
    )
    try:
        pjs_obj.get(url)
        sleep(5)  # wait for client-side rendering
        sele = pjs_obj.find_elements_by_xpath(
            '//*[@id="accepted-oral-papers"]/ul/li')
        for i in sele:
            item = PaperItem()
            item['title'] = i.find_element_by_xpath('./h4/a[1]').text
            item['file_urls'] = [
                i.find_element_by_xpath('./h4/a[2]').get_attribute('href')
            ]
            item['authors'] = i.find_element_by_xpath('./div[1]').text
            item['publicationDate'] = 'ICLR'
            item['publication'] = 'ICLR'
            item['publisher'] = 'ICLR'
            item['snippet'] = 'ICLR'
            item['keyword'] = 'ICLR'
            item['search'] = 'ICLR'
            yield item
    finally:
        pjs_obj.quit()
    # Dummy request whose only purpose is to trigger the poster-paper scrape.
    yield scrapy.Request('https://www.baidu.com',
                         callback=self.parse_detail_1,
                         dont_filter=True)
def parse(self, response):
    """Parse the CVF open-access ICCV index: one <dt> per paper title,
    with authors and the PDF link in the following <dd> siblings."""
    for entry in response.xpath('//*[@id="content"]/dl/dt'):
        item = PaperItem()
        item['title'] = entry.xpath('./a/text()').extract_first()
        item['authors'] = entry.xpath(
            './following-sibling::dd/form/a/text()').extract()[0]
        pdf_path = entry.xpath(
            './following-sibling::dd/a/@href').extract_first()
        item['file_urls'] = ['http://openaccess.thecvf.com/' + pdf_path]
        # Placeholder metadata: this listing exposes no per-paper details.
        for field in ('publicationDate', 'publication', 'publisher',
                      'keyword', 'search', 'snippet'):
            item[field] = 'ICCV'
        yield item
def parse_first(self, response):
    """Parse the first ACM DL results page, then schedule follow-up pages.

    Each result is a div.details whose child divs (title, authors, source,
    publisher, ft, abstract, kw) are all optional, so every field is
    guarded by a membership test over the child classes.

    Fixes: the publicationDate XPath used "///span...", which is not valid
    XPath and raises at evaluation time (replaced with a descendant search
    scoped to the current result); the pager regex now uses a raw string
    ("\\d" is an invalid escape in a plain literal).
    """
    papers_list = response.xpath("//div[@class='details']")
    for papers_item in papers_list:
        paper = PaperItem()
        # str() of the selector list embeds every child div's class value,
        # so a substring test tells whether an optional section is present.
        if 'title' in str(papers_item.xpath("./div/@class")):
            paper['title'] = papers_item.xpath(
                "./div[@class='title']/a/text()").extract_first()
        if 'authors' in str(papers_item.xpath("./div/@class")):
            paper['authors'] = ','.join(papers_item.xpath(
                "./div[@class='authors']/a/text()").extract())
        if 'source' in str(papers_item.xpath("./div/@class")):
            # BUG FIX: was "///span[...]" — invalid XPath expression.
            paper['publicationDate'] = papers_item.xpath(
                ".//span[@class='publicationDate']/text()").extract_first()
            paper['publication'] = papers_item.xpath(
                "./div[@class='source']/span[2]/text()").extract_first()
        if 'publisher' in str(papers_item.xpath("./div/@class")):
            # extract()[1] skips the first text node — presumably a label;
            # TODO confirm against the live markup.
            paper['publisher'] = papers_item.xpath(
                "./div[@class='publisher']/text()").extract()[1].replace(
                    "\xa0", "").replace("\n", "")
        if 'ft' in str(papers_item.xpath("./div/@class")):
            paper['file_urls'] = [
                'https://dl.acm.org/' + papers_item.xpath(
                    "./div[@class='ft']/a/@href").extract_first()
            ]
        if 'abstract' in str(papers_item.xpath("./div/@class")):
            paper['snippet'] = papers_item.xpath(
                "./div[@class='abstract']/text()").extract_first().replace(
                    "\n", "")
        if 'kw' in str(papers_item.xpath("./div/@class")):
            paper['keyword'] = papers_item.xpath(
                "./div[@class='kw']/text()").extract()[1].strip(':').replace(
                    "\n", "")
        paper['search'] = self.search
        yield paper
    # Read the largest "start=" offset from the pager links to learn how
    # many result pages exist; 20 results per page, capped at start=400.
    start_num = int(
        re.compile(r".*start=(\d*)").findall(
            str(response.xpath("//div[@class='pagelogic'][1]/a/@href")))[0])
    if start_num / 20 > 20:
        start_num = 400
    for start in range(20, start_num, 20):
        yield Request(url=self.other_url.format(search=self.search,
                                                start=start),
                      callback=self.parse_other)
def parse(self, response):
    """Walk the IJCAI proceedings sections (skipping the first one) and
    yield one item per listed paper."""
    sections = response.xpath('//*[@class="section"]')[1:]
    for section in sections:
        for entry in section.xpath('./div[2]/div'):
            item = PaperItem()
            item['title'] = entry.xpath('./div[1]/text()').extract_first()
            item['authors'] = entry.xpath('./div[2]/text()').extract_first()
            href = entry.xpath('./div[3]/a[1]/@href').extract_first()
            if href is not None:
                base = 'http://www.ijcai.org/proceedings/%s/' % (
                    Keyword.get_search_three())
                item['file_urls'] = [base + href]
            # Placeholder metadata: this listing exposes no per-paper details.
            for field in ('publicationDate', 'publication', 'publisher',
                          'snippet', 'keyword', 'search'):
                item[field] = 'IJCAI'
            yield item
def parse_detail(self, response):
    """Build one item from a NIPS paper detail page."""
    item = PaperItem()
    item['title'] = response.xpath(
        '/html/body/div[2]/div/h2/text()').extract_first()
    author_texts = response.xpath(
        '/html/body/div[2]/div/ul/li//text()').extract()
    # Comma-join the author fragments; rstrip mirrors the original's
    # removal of any trailing commas.
    item['authors'] = ','.join(author_texts).rstrip(',')
    pdf_href = response.xpath(
        '/html/body/div[2]/div/a[1]/@href').extract_first()
    item['file_urls'] = ['http://papers.nips.cc' + pdf_href]
    # Placeholder metadata: the detail page carries no further fields here.
    for field in ('publicationDate', 'publication', 'publisher',
                  'snippet', 'keyword', 'search'):
        item[field] = 'NIPS'
    yield item
def parse_detail(self, response):
    # Parse a Hans Publishers article detail page and chain to
    # parse_download with the item carried in request meta.
    item = PaperItem()
    item['title'] = response.xpath(
        '//*[@id="jouMain"]/div[1]/div[3]/ul/h1/b/text()').extract_first()
    author = response.xpath(
        '//*[@id="jouMain"]/div[1]/div[3]/a/text()').extract()
    num = len(author)
    result = ''
    # NOTE(review): the last entry of `author` is deliberately skipped and
    # the joined string keeps its trailing comma — confirm both are intended.
    for i in range(num - 1):
        result += author[i] + ','
    item['authors'] = result
    sel = response.xpath(
        '//*[@id="ctl00_ContentPlaceHolder1_showJournalIssue"]/span/a/text()'
    ).extract_first()
    # The issue label arrives with CRLF padding; keep only the second line.
    sel = sel.split('\r\n')[1].lstrip(' ')
    item['publicationDate'] = sel
    item['publication'] = 'Hans'
    item['publisher'] = 'Hans'
    item['snippet'] = response.xpath(
        '//*[@id="jouMain"]/div[1]/div[3]/div[1]/div/p[1]/text()'
    ).extract_first()
    if item['snippet'] is None:
        # Fallback: some article layouts put the abstract in a span
        # instead of the first <p>.
        item['snippet'] = response.xpath(
            '//*[@id="jouMain"]/div[1]/div[3]/div[1]/div/div/span[1]/text()'
        ).extract_first()
    sele = response.xpath(
        '//*[@id="jouMain"]/div[1]/div[3]/p[2]/a/text()').extract()
    k = ''
    # NOTE(review): the last two links are excluded from the keywords —
    # presumably they are not keywords; verify against the live page.
    for j in range(len(sele) - 2):
        k += sele[j] + ','
    item['keyword'] = k
    item['search'] = Keyword.get_search()
    # Resolve the download-counter link relative to the current page;
    # deepcopy keeps this item independent of later request handling.
    url = response.xpath('//*[@id="clicknumber"]/@href').extract_first()
    url = parse.urljoin(response.url, url)
    yield scrapy.Request(url,
                         callback=self.parse_download,
                         meta={"item": deepcopy(item)},
                         dont_filter=True)
def parse_first(self, response):
    """Parse the first Springer search page, then schedule pages 2..N
    (N capped at 20) through parse_other."""
    for entry in response.xpath("//ol[@class='content-item-list']/li"):
        paper = PaperItem()
        title_parts = entry.xpath("./h2/a//text()").getall()
        paper['title'] = "".join(title_parts).replace("\n", "").replace(" ", "")
        snippet_parts = entry.xpath("./p[@class='snippet']/text()").getall()
        paper['snippet'] = "".join(snippet_parts).replace("\n", "").strip()
        author_names = entry.xpath(
            "./p[@class='meta']/span[@class='authors']/a/text()").getall()
        paper['authors'] = ",".join(author_names)
        paper['publication'] = entry.xpath(
            "./p[@class='meta']/span[@class='enumeration']/a/text()").get()
        paper['publicationDate'] = entry.xpath(
            "./p[@class='meta']/span[@class='enumeration']/span/@title").get()
        paper['publisher'] = 'Springer'
        pdf_href = entry.xpath(".//span[@class='action'][1]/a/@href").get()
        paper['file_urls'] = ['https://link.springer.com' + pdf_href]
        paper['search'] = self.search
        yield paper
    page_num = int(response.xpath(
        "//div[contains(@class,'functions-bar-top')]//span[@class='number-of-pages']/text()"
    ).get())
    page_num = min(page_num, 20)
    for page in range(2, page_num + 1):
        yield Request(self.other_url.format(page=page, search=self.search),
                      callback=self.parse_other)
def parse_other(self, response):
    # Parse one follow-up ACM DL search-results page; no further paging is
    # scheduled from here.
    papers_list = response.xpath("//div[@class='details']")
    for papers_item in papers_list:
        paper = PaperItem()
        # str() of the selector list embeds every child div's class value,
        # so a substring test tells whether an optional section is present.
        if 'title' in str(papers_item.xpath("./div/@class")):
            paper['title'] = papers_item.xpath(
                "./div[@class='title']/a/text()").extract_first()
        if 'authors' in str(papers_item.xpath("./div/@class")):
            paper['authors'] = ','.join(
                papers_item.xpath(
                    "./div[@class='authors']/a/text()").extract())
        if 'source' in str(papers_item.xpath("./div/@class")):
            # NOTE(review): "///span[...]" is not valid XPath and should
            # raise at evaluation time — likely meant ".//span"; confirm.
            paper['publicationDate'] = papers_item.xpath(
                "///span[@class='publicationDate']/text()").extract_first(
                )
            paper['publication'] = papers_item.xpath(
                "./div[@class='source']/span[2]/text()").extract_first()
        if 'publisher' in str(papers_item.xpath("./div/@class")):
            # extract()[1] skips the first text node — presumably a label;
            # TODO confirm against the live markup.
            paper['publisher'] = papers_item.xpath(
                "./div[@class='publisher']/text()").extract()[1].replace(
                    "\xa0", "").replace("\n", "")
        if 'ft' in str(papers_item.xpath("./div/@class")):
            paper['file_urls'] = [
                'https://dl.acm.org/' + papers_item.xpath(
                    "./div[@class='ft']/a/@href").extract_first()
            ]
            # paper['download_url'] = 'https://dl.acm.org/' + papers_item.xpath(
            #     "./div[@class='ft']/a/@href").extract_first()
        if 'abstract' in str(papers_item.xpath("./div/@class")):
            paper['snippet'] = papers_item.xpath(
                "./div[@class='abstract']/text()").extract_first().replace(
                    "\n", "")
        if 'kw' in str(papers_item.xpath("./div/@class")):
            paper['keyword'] = papers_item.xpath(
                "./div[@class='kw']/text()").extract()[1].strip(
                    ':').replace("\n", "")
        paper['search'] = self.search
        yield paper
def parse(self, response):
    """Parse ICML proceedings entries (the first div is a header and is
    skipped)."""
    for entry in response.xpath('//*[@id="content"]/div/div')[1:]:
        item = PaperItem()
        item['title'] = entry.xpath('./p[1]/text()').extract_first()
        item['file_urls'] = [
            entry.xpath('./p[3]/a[2]/@href').extract_first()
        ]
        raw_authors = entry.xpath('./p[2]/span/text()').extract_first()
        # Trim the surrounding blank padding from each comma-separated name.
        names = [name.strip(' \n') for name in raw_authors.split(',')]
        item['authors'] = ','.join(names)
        # Placeholder metadata: this listing exposes no per-paper details.
        for field in ('publicationDate', 'publication', 'publisher',
                      'snippet', 'keyword', 'search'):
            item[field] = 'ICML'
        yield item
def parse(self, response):
    """Drive a headless Chrome through the CNKI search UI and scrape up to
    the first 10 result pages.

    Fixes over the original: the chromedriver path is built with
    os.path.join instead of an invalid backslash escape; the dead
    ``a = 0`` assignment and debug ``print`` are gone; the two bare
    ``except:`` clauses now catch only Exception (so KeyboardInterrupt /
    SystemExit still propagate); ``browser.quit()`` replaces ``close()``
    so the driver process is released, not just the window; the local
    ``input`` no longer shadows the builtin.
    """
    chrome_options = webdriver.ChromeOptions()
    chrome_options.add_argument('--headless')
    driver_path = os.path.join(
        os.path.split(os.path.realpath(__file__))[0], 'chromedriver.exe')
    browser = webdriver.Chrome(chrome_options=chrome_options,
                               executable_path=driver_path)
    try:
        browser.get(response.request.url)
        search_box = browser.find_element_by_id('txt_SearchText')
        search_box.send_keys(self.search)
        search_box.send_keys(Keys.ENTER)
        wait = WebDriverWait(browser, 10)
        wait.until(
            EC.presence_of_element_located((By.CLASS_NAME, 'newsh_mid')))
        browser.switch_to.frame('iframeResult')
        browser.maximize_window()
        pages_left = 10  # scrape at most the first 10 result pages
        while pages_left > 0:
            try:
                pages_left -= 1
                papers_list = browser.find_elements_by_xpath(
                    "//table[@class='GridTableContent']/tbody/tr")
                for papers_item in papers_list[1:]:  # row 0 is the header
                    paper = PaperItem()
                    paper['title'] = papers_item.find_element_by_class_name(
                        'fz14').text
                    author_links = papers_item.find_elements_by_class_name(
                        'KnowledgeNetLink')
                    paper['authors'] = ','.join(a.text for a in author_links)
                    cells = papers_item.find_elements_by_tag_name('td')
                    paper['publicationDate'] = cells[4].text
                    paper['publication'] = cells[3].text
                    paper['publisher'] = 'CNKI'
                    paper['snippet'] = None
                    paper['keyword'] = None
                    try:
                        download_url = cells[7].find_element_by_tag_name(
                            'a').get_attribute('href')
                        paper['file_urls'] = [
                            urljoin(response.request.url, download_url)
                        ]
                    except Exception:
                        # Rows without a download link are skipped entirely.
                        continue
                    paper['search'] = self.search
                    yield paper
                page = browser.find_element_by_partial_link_text('下一页')
                browser.execute_script(
                    "arguments[0].scrollIntoView(false);", page)
                wait.until(
                    EC.element_to_be_clickable(
                        (By.PARTIAL_LINK_TEXT, '下一页'))).click()
            except Exception:
                # No "next page" link (or a stale page) ends the crawl.
                break
    finally:
        browser.quit()