def parse(self, response):
        """Scrape every cyclopedia entry reachable from the paginated listing.

        Logs in via Selenium, walks the result pages, and yields one
        ``cyclopedia_entry`` item per entry link found on each page. Pages
        whose number appears in ``self.skip_pages`` are paged past without
        scraping; the crawl stops once ``self.max_pages`` pages have been
        processed or no "next" link exists.

        :param response: Scrapy response (unused; all fetching is done
            through ``self.driver``).
        :yields: populated ``cyclopedia_entry`` items.
        """
        # ASP.NET control id of the paginator's "next" link; used twice below.
        next_selector = ('#ctl00_ctl00_MainContentArea_MainContentArea_'
                         'bottomMultiPage_lnkNext')

        def click_next():
            # Click "next"; report whether another page was available.
            # Narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.
            try:
                self.driver.find_element_by_css_selector(next_selector).click()
                return True
            except Exception:
                return False

        self.driver.get(self.start_urls[0])

        # Log in and navigate to the cyclopedia listing.
        # NOTE(review): "cylcopedia" looks misspelled, but the helper is
        # defined elsewhere under that name, so the call must match it.
        self.login()
        self.go_to_cylcopedia()

        num_pages = 1
        has_next = True
        while has_next:
            if num_pages in self.skip_pages:
                # Skip this page entirely: just advance the paginator.
                has_next = click_next()
                if has_next:
                    num_pages += 1
                if num_pages > self.max_pages + 1:
                    break
                continue

            # Collect every external entry link from the rendered page.
            body = self.driver.page_source
            link_list = scrapy.Selector(text=body).xpath(
                '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
            ).extract()

            for link in link_list:
                title, author, characters = self.get_full_text_info(link)

                item = cyclopedia_entry()
                item['title'] = title
                item['author'] = author
                item['characters'] = characters
                yield item

            has_next = click_next()
            if has_next:
                num_pages += 1
            if num_pages > self.max_pages + 1:
                break

        self.driver.close()
    def parse(self, response):
        """Scrape every cyclopedia entry reachable from the paginated listing.

        Logs in via Selenium, walks the result pages, and yields one
        ``cyclopedia_entry`` item per entry link found on each page. Each
        non-skipped page is polled until its 50 entry links have rendered.
        Pages whose number appears in ``self.skip_pages`` are paged past
        without scraping.

        :param response: Scrapy response (unused; all fetching is done
            through ``self.driver``).
        :yields: populated ``cyclopedia_entry`` items.
        """
        import time  # local import: used only for the render-poll backoff

        # ASP.NET control id of the paginator's "next" link; used twice below.
        next_selector = ('#ctl00_ctl00_MainContentArea_MainContentArea_'
                         'bottomMultiPage_lnkNext')

        def click_next():
            # Click "next"; report whether another page was available.
            # Narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.
            try:
                self.driver.find_element_by_css_selector(next_selector).click()
                return True
            except Exception:
                return False

        self.driver.get(self.start_urls[0])

        # Log in and navigate to the cyclopedia listing.
        # NOTE(review): "cylcopedia" looks misspelled, but the helper is
        # defined elsewhere under that name, so the call must match it.
        self.login()
        self.go_to_cylcopedia()

        num_pages = 1
        has_next = True
        while has_next:
            if num_pages in self.skip_pages:
                # Skip this page entirely: just advance the paginator.
                has_next = click_next()
                if has_next:
                    num_pages += 1
                if num_pages > self.max_pages + 1:
                    break
                continue

            # Poll the page source until all 50 entry links have rendered.
            # The original looped unconditionally, which hangs forever on any
            # page with fewer than 50 links (e.g. a short final page); bound
            # the retries so such a page is scraped as-is instead.
            link_list = []
            for _ in range(200):
                body = self.driver.page_source
                link_list = scrapy.Selector(text=body).xpath(
                    '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
                ).extract()
                if len(link_list) == 50:
                    break
                time.sleep(0.05)

            for link in link_list:
                title, author, characters = self.get_full_text_info(link)

                item = cyclopedia_entry()
                item['title'] = title
                item['author'] = author
                item['characters'] = characters
                yield item

            has_next = click_next()
            if has_next:
                num_pages += 1
            # NOTE(review): this bound lacks the "+ 1" used in the skip
            # branch above — kept as-is to preserve the page count, but the
            # inconsistency is worth confirming with the author.
            if num_pages > self.max_pages:
                break

        self.driver.close()
# Example #3
# 0
    def parse(self, response):
        """Scrape every cyclopedia entry reachable from the paginated listing.

        Logs in via Selenium, walks the result pages, and yields one
        ``cyclopedia_entry`` item per entry link found on each page. Pages
        whose number appears in ``self.skip_pages`` are paged past without
        scraping; the crawl stops once ``self.max_pages`` pages have been
        processed or no "next" link exists.

        :param response: Scrapy response (unused; all fetching is done
            through ``self.driver``).
        :yields: populated ``cyclopedia_entry`` items.
        """
        # ASP.NET control id of the paginator's "next" link; used twice below.
        next_selector = ('#ctl00_ctl00_MainContentArea_MainContentArea_'
                         'bottomMultiPage_lnkNext')

        def click_next():
            # Click "next"; report whether another page was available.
            # Narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.
            try:
                self.driver.find_element_by_css_selector(next_selector).click()
                return True
            except Exception:
                return False

        self.driver.get(self.start_urls[0])

        # Log in and navigate to the cyclopedia listing.
        # NOTE(review): "cylcopedia" looks misspelled, but the helper is
        # defined elsewhere under that name, so the call must match it.
        self.login()
        self.go_to_cylcopedia()

        num_pages = 1
        has_next = True
        while has_next:
            if num_pages in self.skip_pages:
                # Skip this page entirely: just advance the paginator.
                has_next = click_next()
                if has_next:
                    num_pages += 1
                if num_pages > self.max_pages + 1:
                    break
                continue

            # Collect every external entry link from the rendered page.
            body = self.driver.page_source
            link_list = scrapy.Selector(text=body).xpath(
                '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
            ).extract()

            for link in link_list:
                title, author, characters = self.get_full_text_info(link)

                item = cyclopedia_entry()
                item['title'] = title
                item['author'] = author
                item['characters'] = characters
                yield item

            has_next = click_next()
            if has_next:
                num_pages += 1
            if num_pages > self.max_pages + 1:
                break

        self.driver.close()
    def parse(self, response):
        """Scrape every cyclopedia entry reachable from the paginated listing.

        Logs in via Selenium, walks the result pages, and yields one
        ``cyclopedia_entry`` item per entry link found on each page. Each
        non-skipped page is polled until its 50 entry links have rendered.
        Pages whose number appears in ``self.skip_pages`` are paged past
        without scraping.

        :param response: Scrapy response (unused; all fetching is done
            through ``self.driver``).
        :yields: populated ``cyclopedia_entry`` items.
        """
        import time  # local import: used only for the render-poll backoff

        # ASP.NET control id of the paginator's "next" link; used twice below.
        next_selector = ('#ctl00_ctl00_MainContentArea_MainContentArea_'
                         'bottomMultiPage_lnkNext')

        def click_next():
            # Click "next"; report whether another page was available.
            # Narrowed from a bare ``except:`` so KeyboardInterrupt /
            # SystemExit are no longer swallowed.
            try:
                self.driver.find_element_by_css_selector(next_selector).click()
                return True
            except Exception:
                return False

        self.driver.get(self.start_urls[0])

        # Log in and navigate to the cyclopedia listing.
        # NOTE(review): "cylcopedia" looks misspelled, but the helper is
        # defined elsewhere under that name, so the call must match it.
        self.login()
        self.go_to_cylcopedia()

        num_pages = 1
        has_next = True
        while has_next:
            if num_pages in self.skip_pages:
                # Skip this page entirely: just advance the paginator.
                has_next = click_next()
                if has_next:
                    num_pages += 1
                if num_pages > self.max_pages + 1:
                    break
                continue

            # Poll the page source until all 50 entry links have rendered.
            # The original looped unconditionally, which hangs forever on any
            # page with fewer than 50 links (e.g. a short final page); bound
            # the retries so such a page is scraped as-is instead.
            link_list = []
            for _ in range(200):
                body = self.driver.page_source
                link_list = scrapy.Selector(text=body).xpath(
                    '//div[@class="record-formats-wrapper externalLinks"]/span/a/@href'
                ).extract()
                if len(link_list) == 50:
                    break
                time.sleep(0.05)

            for link in link_list:
                title, author, characters = self.get_full_text_info(link)

                item = cyclopedia_entry()
                item['title'] = title
                item['author'] = author
                item['characters'] = characters
                yield item

            has_next = click_next()
            if has_next:
                num_pages += 1
            # NOTE(review): this bound lacks the "+ 1" used in the skip
            # branch above — kept as-is to preserve the page count, but the
            # inconsistency is worth confirming with the author.
            if num_pages > self.max_pages:
                break

        self.driver.close()