Beispiel #1
0
 def parse_other(self, response):
     """Yield a PaperItem for every Springer search result on this page.

     Each <li> in the content-item list holds the title, snippet,
     authors and publication metadata; text fragments are joined and
     cleaned of newlines before being stored.
     """
     for entry in response.xpath("//ol[@class='content-item-list']/li"):
         paper = PaperItem()
         title_text = "".join(entry.xpath("./h2/a//text()").getall())
         paper['title'] = title_text.replace("\n", "").replace("  ", "")
         snippet_text = "".join(
             entry.xpath("./p[@class='snippet']/text()").getall())
         paper['snippet'] = snippet_text.replace("\n", "").strip()
         author_names = entry.xpath(
             "./p[@class='meta']/span[@class='authors']/a/text()").getall()
         paper['authors'] = ",".join(author_names)
         paper['publication'] = entry.xpath(
             "./p[@class='meta']/span[@class='enumeration']/a/text()").get()
         paper['publicationDate'] = entry.xpath(
             "./p[@class='meta']/span[@class='enumeration']/span/@title"
         ).get()
         paper['publisher'] = 'Springer'
         paper['search'] = self.search
         yield paper
Beispiel #2
0
 def parse_detail_1(self, response):
     """Scrape the accepted-poster papers from the ICLR 2018 OpenReview page.

     The page is rendered client-side, so a PhantomJS browser loads it
     before the <li> entries are read.  Yields one PaperItem per paper.
     """
     pjs_obj = webdriver.PhantomJS(
         executable_path=
         r'D://phantomjs-2.1.1-windows/phantomjs-2.1.1-windows/bin/phantomjs.exe'
     )
     try:
         pjs_obj.get(
             'https://openreview.net/group?id=ICLR.cc/2018/Conference#accepted-poster-papers'
         )
         sleep(5)  # crude wait for the client-side rendering to finish
         sele = pjs_obj.find_elements_by_xpath(
             '//*[@id="accepted-poster-papers"]/ul/li')
         # find_elements_by_xpath returns a (possibly empty) list, never
         # None, so iterate directly instead of testing "is not None".
         for i in sele:
             item = PaperItem()
             item['title'] = i.find_element_by_xpath('./h4/a[1]').text
             item['file_urls'] = [
                 i.find_element_by_xpath('./h4/a[2]').get_attribute('href')
             ]
             item['authors'] = i.find_element_by_xpath('./div[1]').text
             item['publicationDate'] = 'ICLR'
             item['publication'] = 'ICLR'
             item['publisher'] = 'ICLR'
             item['snippet'] = 'ICLR'
             item['keyword'] = 'ICLR'
             item['search'] = 'ICLR'
             # BUG FIX: the original built each item but never emitted it,
             # so the callback produced nothing.
             yield item
     finally:
         # FIX: always shut the headless browser down so the PhantomJS
         # process is not leaked on every call.
         pjs_obj.quit()
Beispiel #3
0
    def parse_detail(self, response):
        """Scrape the accepted-oral papers from the ICLR 2018 OpenReview page.

        Yields one PaperItem per paper, then chains a throwaway request
        (dont_filter=True) so Scrapy invokes parse_detail_1 for the
        poster papers afterwards.
        """
        url = 'https://openreview.net/group?id=ICLR.cc/2018/Conference#accepted-oral-papers'
        pjs_obj = webdriver.PhantomJS(
            executable_path=
            r'D://phantomjs-2.1.1-windows/phantomjs-2.1.1-windows/bin/phantomjs.exe'
        )
        try:
            pjs_obj.get(url)
            sleep(5)  # crude wait for the client-side rendering to finish
            sele = pjs_obj.find_elements_by_xpath(
                '//*[@id="accepted-oral-papers"]/ul/li')
            # (removed leftover debug print of the element list)
            for i in sele:
                item = PaperItem()
                item['title'] = i.find_element_by_xpath('./h4/a[1]').text
                item['file_urls'] = [
                    i.find_element_by_xpath('./h4/a[2]').get_attribute('href')
                ]
                item['authors'] = i.find_element_by_xpath('./div[1]').text
                item['publicationDate'] = 'ICLR'
                item['publication'] = 'ICLR'
                item['publisher'] = 'ICLR'
                item['snippet'] = 'ICLR'
                item['keyword'] = 'ICLR'
                item['search'] = 'ICLR'

                yield item
        finally:
            # FIX: release the PhantomJS process (the original leaked it).
            pjs_obj.quit()
        yield scrapy.Request('https://www.baidu.com',
                             callback=self.parse_detail_1,
                             dont_filter=True)
Beispiel #4
0
 def parse(self, response):
     """Turn each CVF open-access listing entry (<dt>) into a PaperItem."""
     for entry in response.xpath('//*[@id="content"]/dl/dt'):
         item = PaperItem()
         item['title'] = entry.xpath('./a/text()').extract_first()
         item['authors'] = entry.xpath(
             './following-sibling::dd/form/a/text()').extract()[0]
         pdf_path = entry.xpath(
             './following-sibling::dd/a/@href').extract_first()
         item['file_urls'] = ['http://openaccess.thecvf.com/' + pdf_path]
         # The venue-level fields all carry the same constant tag.
         for field in ('publicationDate', 'publication', 'publisher',
                       'keyword', 'search', 'snippet'):
             item[field] = 'ICCV'
         yield item
Beispiel #5
0
 def parse_first(self, response):
     """Parse the first ACM DL result page and schedule the remaining pages.

     Each result lives in a <div class="details"> whose child <div>s are
     optional, hence the per-field class checks.  After yielding the
     items, the pager links are read to schedule parse_other for the
     remaining result pages, capped at 400 results (20 pages of 20).
     """
     papers_list = response.xpath("//div[@class='details']")
     for papers_item in papers_list:
         paper = PaperItem()
         # The class attributes of the direct children decide which
         # fields are present for this result.
         child_classes = str(papers_item.xpath("./div/@class"))
         if 'title' in child_classes:
             paper['title'] = papers_item.xpath(
                 "./div[@class='title']/a/text()").extract_first()
         if 'authors' in child_classes:
             paper['authors'] = ','.join(
                 papers_item.xpath(
                     "./div[@class='authors']/a/text()").extract())
         if 'source' in child_classes:
             # FIX: the original expression began with "///", which is
             # not valid XPath syntax; search the item subtree instead.
             paper['publicationDate'] = papers_item.xpath(
                 ".//span[@class='publicationDate']/text()").extract_first()
             paper['publication'] = papers_item.xpath(
                 "./div[@class='source']/span[2]/text()").extract_first()
         if 'publisher' in child_classes:
             paper['publisher'] = papers_item.xpath(
                 "./div[@class='publisher']/text()").extract()[1].replace(
                     "\xa0", "").replace("\n", "")
         if 'ft' in child_classes:
             paper['file_urls'] = [
                 'https://dl.acm.org/' + papers_item.xpath(
                     "./div[@class='ft']/a/@href").extract_first()
             ]
         if 'abstract' in child_classes:
             paper['snippet'] = papers_item.xpath(
                 "./div[@class='abstract']/text()").extract_first().replace(
                     "\n", "")
         if 'kw' in child_classes:
             paper['keyword'] = papers_item.xpath(
                 "./div[@class='kw']/text()").extract()[1].strip(
                     ':').replace("\n", "")
         paper['search'] = self.search
         yield paper
     # Derive the highest "start=" offset from the pager links.
     start_num = int(
         re.compile(r".*start=(\d*)").findall(
             str(response.xpath("//div[@class='pagelogic'][1]/a/@href")))
         [0])
     if start_num / 20 > 20:
         start_num = 400  # cap the crawl at 20 follow-up pages
     for start in range(20, start_num, 20):
         yield Request(url=self.other_url.format(search=self.search,
                                                 start=start),
                       callback=self.parse_other)
Beispiel #6
0
 def parse(self, response):
     """Collect IJCAI proceedings papers from every section after the first."""
     sections = response.xpath('//*[@class="section"]')[1:]
     for section in sections:
         for row in section.xpath('./div[2]/div'):
             item = PaperItem()
             item['title'] = row.xpath('./div[1]/text()').extract_first()
             item['authors'] = row.xpath('./div[2]/text()').extract_first()
             href = row.xpath('./div[3]/a[1]/@href').extract_first()
             if href is not None:
                 base = 'http://www.ijcai.org/proceedings/%s/' % (
                     Keyword.get_search_three())
                 item['file_urls'] = [base + href]
             # The venue-level fields all carry the same constant tag.
             for field in ('publicationDate', 'publication', 'publisher',
                           'snippet', 'keyword', 'search'):
                 item[field] = 'IJCAI'
             yield item
Beispiel #7
0
 def parse_detail(self, response):
     """Build a PaperItem from a single NIPS (papers.nips.cc) paper page."""
     item = PaperItem()
     item['title'] = response.xpath(
         '/html/body/div[2]/div/h2/text()').extract_first()
     author = response.xpath(
         '/html/body/div[2]/div/ul/li//text()').extract()
     # FIX: join the author fragments directly instead of building the
     # string with repeated concatenation and stripping a trailing comma.
     item['authors'] = ','.join(author)
     item['file_urls'] = [
         'http://papers.nips.cc' +
         response.xpath('/html/body/div[2]/div/a[1]/@href').extract_first()
     ]
     item['publicationDate'] = 'NIPS'
     item['publication'] = 'NIPS'
     item['publisher'] = 'NIPS'
     item['snippet'] = 'NIPS'
     item['keyword'] = 'NIPS'
     item['search'] = 'NIPS'
     yield item
Beispiel #8
0
    def parse_detail(self, response):
        """Build a PaperItem from a Hans (hanspub.org) article page.

        Fills in title, authors, publication date, snippet and keywords,
        then chains a request to the click-counter URL so that
        parse_download (with a deep copy of the item in meta) can resolve
        the actual download link.
        """
        item = PaperItem()
        item['title'] = response.xpath(
            '//*[@id="jouMain"]/div[1]/div[3]/ul/h1/b/text()').extract_first()
        author = response.xpath(
            '//*[@id="jouMain"]/div[1]/div[3]/a/text()').extract()
        num = len(author)
        result = ''
        # NOTE(review): the loop stops at num - 1, so the last extracted
        # <a> text is deliberately dropped — presumably it is not an
        # author link; confirm against the page markup.  The joined
        # string keeps a trailing comma.
        for i in range(num - 1):
            result += author[i] + ','
        item['authors'] = result
        sel = response.xpath(
            '//*[@id="ctl00_ContentPlaceHolder1_showJournalIssue"]/span/a/text()'
        ).extract_first()
        # The issue text apparently contains CRLF-separated parts; the
        # second part (index 1) is taken as the publication date.
        # NOTE(review): raises if the text has no '\r\n' — verify markup.
        sel = sel.split('\r\n')[1].lstrip(' ')

        item['publicationDate'] = sel
        item['publication'] = 'Hans'
        item['publisher'] = 'Hans'
        item['snippet'] = response.xpath(
            '//*[@id="jouMain"]/div[1]/div[3]/div[1]/div/p[1]/text()'
        ).extract_first()
        # Fallback: some article pages keep the abstract in a <span>
        # instead of a <p>.
        if item['snippet'] is None:
            item['snippet'] = response.xpath(
                '//*[@id="jouMain"]/div[1]/div[3]/div[1]/div/div/span[1]/text()'
            ).extract_first()
        sele = response.xpath(
            '//*[@id="jouMain"]/div[1]/div[3]/p[2]/a/text()').extract()
        k = ''
        # NOTE(review): drops the last two link texts — presumably they
        # are not keywords; confirm.  Trailing comma is kept, as above.
        for j in range(len(sele) - 2):
            k += sele[j] + ','
        item['keyword'] = k
        item['search'] = Keyword.get_search()

        # The click-counter link is relative; resolve it against the
        # current page URL before requesting it.
        url = response.xpath('//*[@id="clicknumber"]/@href').extract_first()
        url = parse.urljoin(response.url, url)
        yield scrapy.Request(url,
                             callback=self.parse_download,
                             meta={"item": deepcopy(item)},
                             dont_filter=True)
Beispiel #9
0
 def parse_first(self, response):
     """Parse the first Springer result page and schedule the remaining ones.

     Yields a PaperItem per <li> in the content-item list, then reads the
     total page count from the pager and requests pages 2..N (capped at
     20) via parse_other.
     """
     papers_list = response.xpath("//ol[@class='content-item-list']/li")
     for papers_item in papers_list:
         paper = PaperItem()
         paper['title'] = "".join(
             papers_item.xpath("./h2/a//text()").getall()).replace(
                 "\n", "").replace("  ", "")
         paper['snippet'] = ("".join(
             papers_item.xpath(
                 "./p[@class='snippet']/text()").getall()).replace(
                     "\n", "")).strip()
         paper['authors'] = ",".join(
             papers_item.xpath(
                 "./p[@class='meta']/span[@class='authors']/a/text()").
             getall())
         paper['publication'] = papers_item.xpath(
             "./p[@class='meta']/span[@class='enumeration']/a/text()").get()
         paper['publicationDate'] = papers_item.xpath(
             "./p[@class='meta']/span[@class='enumeration']/span/@title"
         ).get()
         paper['publisher'] = 'Springer'
         paper['file_urls'] = [
             'https://link.springer.com' +
             papers_item.xpath(".//span[@class='action'][1]/a/@href").get()
         ]
         paper['search'] = self.search
         yield paper
     # FIX: .get() already returns a single string, so the original
     # "".join(...) around it was a no-op character re-join.
     page_num = int(
         response.xpath(
             "//div[contains(@class,'functions-bar-top')]//span[@class='number-of-pages']/text()"
         ).get())
     if page_num > 20:
         page_num = 20  # crawl at most the first 20 result pages
     for page in range(2, page_num + 1):
         yield Request(self.other_url.format(page=page, search=self.search),
                       callback=self.parse_other)
Beispiel #10
0
 def parse_other(self, response):
     """Parse one ACM DL follow-up result page into PaperItems.

     Each result lives in a <div class="details"> whose child <div>s are
     optional, hence the per-field class checks.
     """
     papers_list = response.xpath("//div[@class='details']")
     for papers_item in papers_list:
         paper = PaperItem()
         # The class attributes of the direct children decide which
         # fields are present for this result.
         child_classes = str(papers_item.xpath("./div/@class"))
         if 'title' in child_classes:
             paper['title'] = papers_item.xpath(
                 "./div[@class='title']/a/text()").extract_first()
         if 'authors' in child_classes:
             paper['authors'] = ','.join(
                 papers_item.xpath(
                     "./div[@class='authors']/a/text()").extract())
         if 'source' in child_classes:
             # FIX: the original expression began with "///", which is
             # not valid XPath syntax; search the item subtree instead.
             paper['publicationDate'] = papers_item.xpath(
                 ".//span[@class='publicationDate']/text()").extract_first()
             paper['publication'] = papers_item.xpath(
                 "./div[@class='source']/span[2]/text()").extract_first()
         if 'publisher' in child_classes:
             paper['publisher'] = papers_item.xpath(
                 "./div[@class='publisher']/text()").extract()[1].replace(
                     "\xa0", "").replace("\n", "")
         if 'ft' in child_classes:
             paper['file_urls'] = [
                 'https://dl.acm.org/' + papers_item.xpath(
                     "./div[@class='ft']/a/@href").extract_first()
             ]
         if 'abstract' in child_classes:
             paper['snippet'] = papers_item.xpath(
                 "./div[@class='abstract']/text()").extract_first().replace(
                     "\n", "")
         if 'kw' in child_classes:
             paper['keyword'] = papers_item.xpath(
                 "./div[@class='kw']/text()").extract()[1].strip(
                     ':').replace("\n", "")
         paper['search'] = self.search
         yield paper
Beispiel #11
0
    def parse(self, response):
        """Yield a PaperItem for every ICML proceedings entry on the page."""
        obj_list = response.xpath('//*[@id="content"]/div/div')[1:]
        for obj in obj_list:
            item = PaperItem()
            item['title'] = obj.xpath('./p[1]/text()').extract_first()
            item['file_urls'] = [
                obj.xpath('./p[3]/a[2]/@href').extract_first()
            ]
            author = obj.xpath('./p[2]/span/text()').extract_first()
            # FIX: str.lstrip/rstrip treat their argument as a *set* of
            # characters, so the long literals in the original reduced to
            # stripping spaces and newlines from each name; say that
            # directly and join with commas instead of concatenating and
            # slicing off the trailing comma.
            item['authors'] = ','.join(
                name.strip(' \n') for name in author.split(','))
            item['publicationDate'] = 'ICML'
            item['publication'] = 'ICML'
            item['publisher'] = 'ICML'
            item['snippet'] = 'ICML'
            item['keyword'] = 'ICML'
            item['search'] = 'ICML'
            yield item
Beispiel #12
0
    def parse(self, response):
        """Search CNKI via headless Chrome and yield up to 10 result pages.

        CNKI renders its results inside an iframe, so Selenium drives the
        page: type the query, wait for the result grid, then walk the
        pager ('下一页' = next page) at most 10 times, yielding a
        PaperItem per result row that has a download link.
        """
        chrome_options = webdriver.ChromeOptions()
        chrome_options.add_argument('--headless')
        # chromedriver.exe is expected next to this spider module; build
        # the path portably instead of concatenating with a backslash.
        path = os.path.join(
            os.path.split(os.path.realpath(__file__))[0], 'chromedriver.exe')

        browser = webdriver.Chrome(chrome_options=chrome_options,
                                   executable_path=path)
        try:
            browser.get(response.request.url)
            # Renamed from `input`, which shadowed the builtin.
            search_box = browser.find_element_by_id('txt_SearchText')
            search_box.send_keys(self.search)
            search_box.send_keys(Keys.ENTER)
            wait = WebDriverWait(browser, 10)
            wait.until(
                EC.presence_of_element_located((By.CLASS_NAME, 'newsh_mid')))
            browser.switch_to.frame('iframeResult')
            browser.maximize_window()
            pages_left = 10  # scrape at most the first 10 result pages
            while pages_left > 0:
                try:
                    pages_left -= 1
                    papers_list = browser.find_elements_by_xpath(
                        "//table[@class='GridTableContent']/tbody/tr")
                    # Row 0 is the table header.
                    for papers_item in papers_list[1:]:
                        paper = PaperItem()
                        paper['title'] = papers_item.find_element_by_class_name(
                            'fz14').text
                        authors = papers_item.find_elements_by_class_name(
                            'KnowledgeNetLink')
                        paper['authors'] = ','.join(au.text for au in authors)
                        cells = papers_item.find_elements_by_tag_name('td')
                        paper['publicationDate'] = cells[4].text
                        paper['publication'] = cells[3].text
                        paper['publisher'] = 'CNKI'
                        paper['snippet'] = None
                        paper['keyword'] = None
                        try:
                            download_url = cells[7].find_element_by_tag_name(
                                'a').get_attribute('href')
                            paper['file_urls'] = [
                                urljoin(response.request.url, download_url)
                            ]
                        except Exception:
                            # Rows without a download link are skipped.
                            continue
                        paper['search'] = self.search
                        yield paper
                    page = browser.find_element_by_partial_link_text('下一页')
                    browser.execute_script(
                        "arguments[0].scrollIntoView(false);", page)
                    wait.until(
                        EC.element_to_be_clickable(
                            (By.PARTIAL_LINK_TEXT, '下一页'))).click()
                except Exception:
                    # No next-page link (or a stale page) ends the crawl.
                    break
        finally:
            # FIX: quit() shuts the driver down completely; close() only
            # closed the window and leaked the chromedriver process.
            browser.quit()