Code example #1
    def parseofArizonaSecondStage(self, response):
        self.createNestedRelation(response)

        for url in response.xpath(
                '//a[contains(@href,"/programs")]/@href').extract():
            if 'http' not in url:
                # Relative link: prepend the university's base URL before
                # exporting and requesting it.
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt",
                    self.universityParentSite + url)
                yield scrapy.Request(self.universityParentSite + url,
                                     callback=self.parseofArizonaThirdStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level":
                                             self.stageCount,
                                             "Parent":
                                             response.request.url,
                                             "ParentID":
                                             self.ChildID,
                                             "Child":
                                             self.universityParentSite + url
                                         }
                                     })
            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt", url)
                yield scrapy.Request(url,
                                     callback=self.parseofArizonaThirdStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level": self.stageCount,
                                             "Parent": response.request.url,
                                             "ParentID": self.ChildID,
                                             "Child": url
                                         }
                                     })
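
Every snippet here calls exportCrawledLink.exportCrawledLinkToFile(self, filename, url) to record each discovered link, but that helper is not part of the excerpts. A minimal sketch of what it could look like, assuming it simply appends one URL per line to the named file (the real implementation may differ):

class exportCrawledLink:
    """Hypothetical sketch of the helper assumed by the snippets above."""

    @staticmethod
    def exportCrawledLinkToFile(spider, filename, url):
        # Append each crawled URL to the given text file, one link per line.
        with open(filename, "a", encoding="utf-8") as f:
            f.write(url + "\n")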
Code example #2
    def parse(self, response):
        self.createNestedRelation(response)
        self.ChildID = uuid.uuid4()
        print("Started Parsing" + self.universityParentSite)
        for url in response.xpath(
                '//a[contains(@href,"/future-students")]/@href').extract():
            if "http" not in url:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfWaterloo.txt", self.universityParentSite + url)
                yield scrapy.Request(url=self.universityParentSite + url,
                                     callback=self.parse,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level":
                                             self.stageCount,
                                             "Parent":
                                             response.request.url,
                                             "ParentID":
                                             self.ChildID,
                                             "Child":
                                             self.universityParentSite + url
                                         }
                                     })
            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfWaterloo.txt", url)
                yield scrapy.Request(url=url,
                                     callback=self.parse,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level": self.stageCount,
                                             "Parent": response.request.url,
                                             "ParentID": self.ChildID,
                                             "Child": url
                                         }
                                     })
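
The `if "http" not in url` branching above exists only to turn relative hrefs into absolute URLs by prepending self.universityParentSite. Scrapy's built-in Response.urljoin() resolves relative hrefs against the page currently being parsed, which for root-relative links on the same site yields the same absolute URL, so the stage can be written once without the duplicated branches. A sketch under that assumption, reusing the same attributes and helper as the examples:

    def parse(self, response):
        self.createNestedRelation(response)
        self.ChildID = uuid.uuid4()
        for href in response.xpath(
                '//a[contains(@href,"/future-students")]/@href').extract():
            # urljoin() returns absolute hrefs unchanged and resolves relative
            # ones against the URL of the page being parsed.
            url = response.urljoin(href)
            exportCrawledLink.exportCrawledLinkToFile(self, "UOfWaterloo.txt",
                                                      url)
            yield scrapy.Request(url=url,
                                 callback=self.parse,
                                 meta={
                                     'info': {
                                         "methodName": inspect.stack()[0][3],
                                         "Level": self.stageCount,
                                         "Parent": response.request.url,
                                         "ParentID": self.ChildID,
                                         "Child": url
                                     }
                                 })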
Code example #3
    def parseUOfTorontoSecondStageCrawling(self, response):

        self.createNestedRelation(response)
        regExp = "/academics"
        for url in response.xpath(
                '//a[contains(@href,"/academics")]/@href').extract():
            if "http" not in url:

                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfToronto.txt", self.universityParentSite + url)
                yield scrapy.Request(
                    self.universityParentSite + url,
                    callback=self.parseUOfTorontoThirdStageCrawling,
                    meta={
                        'info': {
                            "methodName": inspect.stack()[0][3],
                            "Level": self.stageCount,
                            "Parent": response.request.url,
                            "ParentID": self.ChildID,
                            "Child": self.universityParentSite + url,
                            "regExp": regExp
                        }
                    })

            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfToronto.txt", url)
                yield scrapy.Request(
                    url,
                    callback=self.parseUOfTorontoThirdStageCrawling,
                    meta={
                        'info': {
                            "methodName": inspect.stack()[0][3],
                            "Level": self.stageCount,
                            "Parent": response.request.url,
                            "ParentID": self.ChildID,
                            "Child": url,
                            "regExp": regExp
                        }
                    })
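
These methods all assume they sit inside a Scrapy spider class that defines universityParentSite, stageCount, ChildID, and createNestedRelation, with scrapy, inspect, and uuid imported at module level. A hypothetical skeleton of such a spider (the class name, base URL, and attribute defaults below are placeholders, not taken from the original project):

import inspect
import uuid

import scrapy


class UOfTorontoSpider(scrapy.Spider):
    # Placeholder values: the real project's spider name, base URL and start
    # page are not shown in the excerpts above.
    name = "u_of_toronto"
    universityParentSite = "https://www.utoronto.ca"
    start_urls = [universityParentSite]

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.stageCount = 0
        self.ChildID = uuid.uuid4()

    def createNestedRelation(self, response):
        # Placeholder: the real method presumably records the parent/child
        # link relation carried in response.meta['info'].
        pass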
Code example #4
    def parseofArizonaFourthStage(self, response):
        self.createNestedRelation(response)

        undergradExtract = response.xpath(
            '//a[contains(@href,"/programs") and contains(@href,"/undergrad")]/@href'
        ).extract()

        gradExtract = response.xpath(
            '//a[contains(@href,"/programs") and contains(@href,"/graduate")]/@href'
        ).extract()
        missing_url = 'https://webapp4.asu.edu'

        for url in undergradExtract:
            if 'http' not in url:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt", missing_url + url)
                yield scrapy.Request(missing_url + url,
                                     callback=self.parseofArizonaFourthStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level":
                                             self.stageCount,
                                             "Parent":
                                             response.request.url,
                                             "ParentID":
                                             self.ChildID,
                                             "Child":
                                             self.universityParentSite + url
                                         }
                                     })
            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt", url)
                yield scrapy.Request(url,
                                     callback=self.parseofArizonaFourthStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level": self.stageCount,
                                             "Parent": response.request.url,
                                             "ParentID": self.ChildID,
                                             "Child": url
                                         }
                                     })

        for url in gradExtract:
            if 'http' not in url:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt", missing_url + url)
                yield scrapy.Request(missing_url + url,
                                     callback=self.parseofArizonaFourthStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level":
                                             self.stageCount,
                                             "Parent":
                                             response.request.url,
                                             "ParentID":
                                             self.ChildID,
                                             "Child":
                                             self.universityParentSite + url
                                         }
                                     })
            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "ArizonaStateUniversity.txt", url)
                yield scrapy.Request(url,
                                     callback=self.parseofArizonaFourthStage,
                                     dont_filter=True,
                                     meta={
                                         'info': {
                                             "methodName":
                                             inspect.stack()[0][3],
                                             "Level": self.stageCount,
                                             "Parent": response.request.url,
                                             "ParentID": self.ChildID,
                                             "Child": url
                                         }
                                     })
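
Each request attaches an 'info' dict via meta describing the relation between the page that yielded it (Parent/ParentID) and the page being fetched (Child). The receiving callback reads it back from response.meta; the original createNestedRelation is not shown, but a minimal sketch of how it might consume that dict is:

    def createNestedRelation(self, response):
        # Hypothetical sketch: the real implementation is not shown in the
        # excerpts above.
        info = response.meta.get('info')
        if info is None:
            return  # the start request carries no relation info
        self.logger.debug(
            "level %s: %s -> %s (parent id %s, yielded by %s)",
            info["Level"], info["Parent"], info["Child"],
            info["ParentID"], info["methodName"])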
Code example #5
    def parseUOfWaterlooThirdStageCrawling(self, response):

        self.createNestedRelation(response)

        # These are XPath selectors, despite the "RegExp" variable names.
        academicsRegExp = '//a[contains(@href,"/academics")]/@href'
        undergradRegExp = '//a[contains(@href,"/progs")]/@href'
        gradRegExp = '//a[contains(@href,"/Pages/Programs")]/@href'

        academicsExtract = response.xpath(academicsRegExp).extract()
        gradExtract = response.xpath(gradRegExp).extract()
        undergradExtract = response.xpath(undergradRegExp).extract()

        for url in academicsExtract:
            if "http" not in url:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfWaterloo.txt", self.universityParentSite + url)
                yield scrapy.Request(
                    self.universityParentSite + url,
                    callback=self.parseUOfWaterlooThirdStageCrawling,
                    meta={
                        'info': {
                            "methodName": inspect.stack()[0][3],
                            "Level": self.stageCount,
                            "Parent": response.request.url,
                            "ParentID": self.ChildID,
                            "Child": self.universityParentSite + url
                        }
                    })
            else:
                exportCrawledLink.exportCrawledLinkToFile(
                    self, "UOfWaterloo.txt", url)
                yield scrapy.Request(
                    url,
                    callback=self.parseUOfWaterlooThirdStageCrawling,
                    meta={
                        'info': {
                            "methodName": inspect.stack()[0][3],
                            "Level": self.stageCount,
                            "Parent": response.request.url,
                            "ParentID": self.ChildID,
                            "Child": url
                        }
                    })

        for url in undergradExtract:

            exportCrawledLink.exportCrawledLinkToFile(self, "UOfWaterloo.txt",
                                                      url)
            yield scrapy.Request(
                url,
                callback=self.parseUOfWaterlooThirdStageCrawling,
                meta={
                    'info': {
                        "methodName": inspect.stack()[0][3],
                        "Level": self.stageCount,
                        "Parent": response.request.url,
                        "ParentID": self.ChildID,
                        "Child": url
                    }
                })

        for url in gradExtract:

            exportCrawledLink.exportCrawledLinkToFile(self, "UOfWaterloo.txt",
                                                      url)
            yield scrapy.Request(
                url,
                callback=self.parseUOfWaterlooThirdStageCrawling,
                meta={
                    'info': {
                        "methodName": inspect.stack()[0][3],
                        "Level": self.stageCount,
                        "Parent": response.request.url,
                        "ParentID": self.ChildID,
                        "Child": url
                    }
                })
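
The stage parsers are ordinary Scrapy callbacks, so the spiders run like any other project spider, either with `scrapy crawl <name>` or from a small script. A sketch of a script runner, assuming the Waterloo methods live in a class named UOfWaterlooSpider (a hypothetical name and module path):

from scrapy.crawler import CrawlerProcess

# Hypothetical module path and class name; adjust to wherever the spider
# containing the methods above is actually defined.
from spiders.uofwaterloo import UOfWaterlooSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(UOfWaterlooSpider)
process.start()  # blocks until the crawl finishes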