Code example #1
0
 def __init__(self, path=''):
     """Load the crawler's site configuration from an XML file.

     Args:
         path: path to the configuration XML; when empty, falls back to
             ``commvals.CRAWLER_CONFIG_XML``.
     """
     super(ConfigLoader, self).__init__()
     self.logger = logger()
     # `path or default` is the idiomatic replacement for `len(path)` truthiness.
     self.path = path or commvals.CRAWLER_CONFIG_XML
     self.doc = etree.parse(self.path)
     self.site = self.doc.xpath(u'//site')
     # Keep only sites whose 'able' attribute is enabled (missing => 'true').
     self.sites = [site.attrib.get('value', '')
                   for site in self.site
                   if site.attrib.get('able', 'true') == 'true']
Code example #2
0
File: crawler.py  Project: hengheng0haha/spiderx
    def __init__(self, m_url, gzip=False, snapshot=False, c_time=0):
        """Prepare a crawler thread that fetches *m_url*.

        Args:
            m_url: target URL to crawl; also used as the Referer header.
            gzip: whether the response should be treated as gzip-compressed.
            snapshot: whether to keep a snapshot of the fetched page.
            c_time: crawl timestamp/interval — semantics set by caller, TODO confirm.
        """
        threading.Thread.__init__(self)
        # Pool of desktop user-agent strings; one is picked at random per
        # instance to make the crawler look less bot-like.
        self.agents = [('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36'
                        ' (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36'),
                       ('Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36'
                        ' (KHTML, like Gecko) Chrome/40.0.2214.94 Safari/537.36'),
                       ('Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.3; WOW64;'
                        ' Trident/7.0; .NET4.0E; .NET4.0C; InfoPath.3)'),
                       ('Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36'
                        ' (KHTML, like Gecko) Chrome/38.0.2125.104 Safari/537.36')]

        self.url = m_url
        self.gzip = gzip
        self.snapshot = snapshot
        # BUG FIX: the header name is 'User-Agent' (singular). The previous
        # 'User-Agents' key is not a recognized HTTP header, so servers never
        # saw the spoofed agent string. random.choice replaces the manual
        # randint(0, len-1) indexing.
        self.header = {'User-Agent': random.choice(self.agents), 'Referer': self.url}
        self.logger = logger()
        self.c_time = c_time

        # Lazy one-shot fetch of the page body (urllib2 => Python 2 codebase).
        self.simple_open_url = lambda: urllib2.urlopen(urllib2.Request(self.url, headers=self.header)).read()