Example #1
0
    def __init__(self):
        """Configure the spider from the values supplied by ``ReadSetting``.

        Sets the start URLs, the link matrix (rooted at the start URLs), the
        allowed domains, the xpath value and the crawl rules, then runs the
        parent class initialiser.
        """
        settings = ReadSetting()  # load the configured parameters

        root_urls = settings.readurl()
        self.start_urls = root_urls

        matrix = LinkMatrix(settings.projectname())
        matrix.setroot(root_urls)
        self.linkmatrix = matrix

        self.allowed_domains = settings.readalloweddomain()
        self.xpath = settings.readxpath()

        # Crawl rule: follow every URL. Requests go through the spider
        # middlewares, which filter out URLs outside the allowed domains; the
        # resulting responses are handed to parse_start_url.
        # All Requests pass through the spider middlewares.
        follow_everything = Rule(
            LinkExtractor(),
            follow=True,
            callback="parse_start_url",
        )
        self.rules = [follow_everything]

        super(XpathSpider, self).__init__()
Example #2
0
    def __init__(self):
        """Load the saving options and try to create the output folder.

        Reads the save-name mode, save location, save format and project name
        from ``ReadSetting``, binds ``self.getpath`` to the path builder
        selected by the save-name mode, and attempts to create the folder
        ``self.location``. An already-existing folder is not an error; any
        other failure to create it is logged as a warning.
        """
        # Function-scope imports keep this method self-contained.
        import errno
        import logging

        rs = ReadSetting()  # read the saving parameters from the settings file
        self.savename = rs.savingname()
        self.location = rs.savinglocation()
        self.saveingformat = rs.savingformat()

        # Bind self.getpath to the implementation chosen by savename (acts
        # like a function pointer). Any other value leaves self.getpath unset.
        if self.savename == 1:
            self.getpath = self.getpath_1
        elif self.savename == 2:
            self.getpath = self.getpath_2
        elif self.savename == 3:
            self.getpath = self.getpath_3

        self.projectname = rs.projectname()

        # Create the folder that downloaded content is saved into.
        try:
            os.mkdir(self.location)
        except OSError as e:
            # Only "already exists" is expected; report anything else instead
            # of swallowing it silently.
            if e.errno != errno.EEXIST:
                logging.getLogger(__name__).warning(
                    "could not create save folder %r: %s", self.location, e)
Example #3
0
    def __init__(self):
        """Configure the spider from the values supplied by ``ReadSetting``.

        Sets the start URLs, the link matrix (rooted at the start URLs), the
        allowed domains, the allow/deny match lists with their compiled
        regular expressions, and the crawl rules, then runs the parent class
        initialiser.
        """

        def _build_matcher(literals):
            # One capturing group whose alternatives are the escaped literals.
            # NOTE(review): an empty list yields the pattern '()', which
            # matches any string - confirm the code using these regexes
            # expects that.
            alternatives = '|'.join(map(re.escape, literals))
            return re.compile('({0})'.format(alternatives))

        settings = ReadSetting()  # load the configured parameters

        root_urls = settings.readurl()
        self.start_urls = root_urls

        matrix = LinkMatrix(settings.projectname())
        matrix.setroot(root_urls)
        self.linkmatrix = matrix

        self.allowed_domains = settings.readalloweddomain()
        self.allow, self.deny = settings.readurlmatch()

        # Build the regular expressions.
        self.regex_allow = _build_matcher(self.allow)
        self.regex_deny = _build_matcher(self.deny)

        # Crawl rule: follow every URL. Requests go through the spider
        # middlewares, which filter out URLs outside the allowed domains; the
        # resulting responses are handed to parse_match.
        # All Requests pass through the spider middlewares.
        follow_everything = Rule(
            LinkExtractor(),
            follow=True,
            callback="parse_match",
        )
        self.rules = [follow_everything]

        super(MatchSpider, self).__init__()