Example #1
import re
import time

class Crawler():

    def __init__(self):
        # DownloadManager, QueueDB, WebpageDB, DuplCheckDB and WebPage are the
        # crawler's own helper classes, assumed to be imported from the project.
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        # URL frontier, fetched-page store and duplicate-URL filter.
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
    
    def add_seeds(self, links):
        # Keep only URLs not seen before, remember them, and queue them for crawling.
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        # rules maps a site URL regex to a list of in-site URL patterns to follow;
        # both sides are compiled here.
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        # Collect the link patterns of every rule whose site regex matches this URL.
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                # Store the raw page, then extract its links and feed them back as seeds.
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        # Sleep n seconds, one second at a time, reporting progress.
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #2
class Crawler():
    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.rules = {}
        self.dbop = OperatorDB()

    def add_seeds(self, links):
        self.dbop.add_seeds(links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while True:
            try:
                url = self.dbop.pop_url()
                print "url: %s" % url
                if url is None:
                    print "crawling task is done."
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                #print error_msg, url, redirected_url, html
                if html is not None:
                    self.webpage = WebPage(url, html)
                    article = self.webpage.extract()
                    if len(article) > 5:
                        addtime = "%s %s" % (article[1], article[2])
                        self.dbop.html2db(url, html,
                                          article[0],
                                          addtime,
                                          article[3],
                                          article[5])
                    else:
                        self.dbop.html2db(url, html)
                    print self.webpage.parse_links()
                    ruptn = self.get_patterns_from_rules(url)
                    links = self.webpage.filter_links(tags=['a'],
                                                      str_patterns=ruptn)
                    self.add_seeds(links)
                self.mysleep(3)
            except Exception as err:
                print "!!error!! Exception happened! %s %s" % (url, err)
                self.dbop.close()
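
A hedged sketch of the same loop, shown only to illustrate closing the OperatorDB handle exactly once with try/finally instead of inside the except handler; the per-page handling is elided:

    def start(self):
        try:
            while True:
                url = self.dbop.pop_url()
                if url is None:
                    print("crawling task is done.")
                    break
                error_msg, url, redirected_url, html = self.downloader.download(url)
                # ... same per-page handling as above ...
                self.mysleep(3)
        finally:
            # Runs whether the loop exits normally or an exception escapes.
            self.dbop.close()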
Example #3
class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(
                url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #4
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        # Links matching this regex are treated as downloadable files; ".+" matches everything.
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        # Tracks which repository files have been fetched successfully.
        self.repodb = RepoStateDB()
 
    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))
    
    def download_files(self, files):
        # Fetch each file with wget (-c resumes partial downloads) into config.repos_dir
        # and record in repodb whether the download succeeded.
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                #print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                # Collect the links that match the file rule and download them.
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                #TODO:
                self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #5
class Crawler():

    def __init__(self):
        self.downloader = DownloadManager()  # object that downloads web pages
        self.webpage = None  # object that parses a fetched page
        self.initDatabase()
        self.rules = {}

    # Initialize the databases.
    def initDatabase(self):
        self.queue = QueueDB()  # the "todo" table
        self.webpagedb = WebpageDB()
        self.duplcheck = DuplCheckDB()

    # Add seed URLs.
    # Parameter: links, a list of URLs.
    def addSeeds(self, links):
        new_links = self.duplcheck.filterDuplUrls(links)  # drop duplicate URLs
        self.duplcheck.addUrls(new_links)  # mark them as seen
        self.queue.pushUrls(new_links)  # push the new URLs onto the todo table
    
    def addRules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    # Start crawling.
    def start(self):
        while True:
            url = self.queue.popUrl()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            #print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.storeHtmlToDb(url, html)  # store the fetched page

                self.webpage = WebPage(url, html)  # start parsing the page
                self.webpage.parseLinks()  # collect all hyperlinks
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)  # may return None
                if links:
                    self.addSeeds(links)
            self.mysleep(3)  # rest a moment, then keep crawling

    def mysleep(self, n):
        for i in range(1, n+1):
            time.sleep(1)
            print "sleep", i, "of", n
Example #6
    # Script-level excerpt: urlparse and the DownloadManager / WebPage classes are
    # assumed to be imported further up in the original file.
    url = "http://www.cnbeta.com/"
    downloader = DownloadManager()
    error_msg, url, redirected_url, html = downloader.download(url)
    print "error_msg=%s" %error_msg
    print "url=%s" %url
    print "redirected_url=%s" %redirected_url
    f = open("www.cnbeta.com.html",'w')
    f.write(html)
    f.close()
    webpage = WebPage(url, html)
    webpage.parse_links()

    website = r'cnbeta\.com'
    patnstr = r'^(http|https)://(.*\.' + website + r')(.+)$'
    links = webpage.filter_links(tags=['a'], str_patterns=[patnstr])
    links.sort()

    f_filter_links = open('filter_links_cnbeta.txt', 'w')

    #print links
    f = open('links_regged_cnbeta.txt', 'w')
    for link in links:
        f_filter_links.write('%s\n' % link)
        f.write('%s\n' % link)
        for elem, attr, lnk, pos in webpage.doc.iterlinks():
            absolute = urlparse.urljoin(webpage.url, lnk.strip())
            if absolute == link and elem.text:
                f.write('%s\n' % elem.text.encode('utf-8'))
    f.close()
    f_filter_links.close()
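
The nested loop above rescans every document link once per filtered link; a small one-pass sketch of the same anchor-text dump, assuming webpage.doc is the lxml document that the iterlinks() call implies:

    text_by_url = {}
    for elem, attr, lnk, pos in webpage.doc.iterlinks():
        absolute = urlparse.urljoin(webpage.url, lnk.strip())
        if elem.text:
            text_by_url[absolute] = elem.text.encode('utf-8')

    f = open('links_regged_cnbeta.txt', 'w')
    for link in links:
        f.write('%s\n' % link)
        if link in text_by_url:
            f.write('%s\n' % text_by_url[link])
    f.close()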