Example #1
import re
import time

class Crawler(object):

    def __init__(self):
        # DownloadManager, QueueDB, WebpageDB, DuplCheckDB and WebPage are
        # provided elsewhere in the project.
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
    
    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        # rules maps a site-URL regex to the link regexes worth following
        # on pages whose URL matches it.
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            # print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #2
def init_database(self):
    self.queue = QueueDB('queue.db')
    self.webpagedb = WebpageDB('webpage.db')
    self.duplcheck = DuplCheckDB('duplcheck.db')
Example #3
import re
import time

class Crawler(object):
    def __init__(self):
        super(Crawler, self).__init__()
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')

    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)

    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))

    def getlinks(self, url, html):
        self.webpage = WebPage(url, html)
        self.webpage.parse_links()
        ruptn = self.get_patterns_from_rules(url)
        #print ruptn
        links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
        return links

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            # print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)
                links = self.getlinks(url, html)
                self.add_seeds(links)
            self.mysleep(3)

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n
Example #4
def init_database(self):
    self.queue = QueueDB('queue.db')
    self.webpagedb = WebpageDB('webpage.db')
    self.duplcheck = DuplCheckDB('duplcheck.db')
Example #5
def init_database(self):
    self.queue = QueueDB('queue.db', 0)
    self.webpagedb = WebpageDB('webpage.db')
    self.feedsdb = FeedsDB('duplcheck.db')
Example #6
import os
import re
import time

class Crawler(object):

    def __init__(self):
        self.downloader = DownloadManager()
        self.webpage = None
        self.init_database()
        self.rules = {}
        self.files = []
        # Regex used to pick out downloadable file links; matches everything by default.
        self.file_rule = ".+"

    def init_database(self):
        self.queue = QueueDB('queue.db')
        self.webpagedb = WebpageDB('webpage.db')
        self.duplcheck = DuplCheckDB('duplcheck.db')
        self.repodb = RepoStateDB()   
 
    def add_seeds(self, links):
        new_links = self.duplcheck.filter_dupl_urls(links)
        self.duplcheck.add_urls(new_links)
        self.queue.push_urls(new_links)
    
    def add_rules(self, rules):
        self.rules = {}
        for url, inurls in rules.items():
            reurl = re.compile(url)
            repatn = []
            for u in inurls:
                repatn.append(re.compile(u))
            self.rules[reurl] = repatn

    def set_file_rule(self, rule):
        self.file_rule = rule

    def get_patterns_from_rules(self, url):
        patns = []
        for purl, ru in self.rules.items():
            if purl.match(url) is not None:
                patns.extend(ru)
        return list(set(patns))
    
    def download_files(self, files):
        # Fetch each file with wget into the repository directory from config,
        # then record success or failure in the repository-state database.
        for f in files:
            #cmd = "wget --force-directories -c " + f + " -P " + config.repos_dir
            cmd = "wget -c " + f + " -P " + config.repos_dir
            ret_code = os.system(cmd)
            self.repodb.update(f, ret_code == 0)

    def start(self):
        while True:
            url = self.queue.pop_url()
            print url
            if url is None:
                print "crawling task is done."
                break
            error_msg, url, redirected_url, html = self.downloader.download(url)
            # print error_msg, url, redirected_url, html
            if html is not None:
                self.webpagedb.html2db(url, html)

                self.webpage = WebPage(url, html)
                self.webpage.parse_links()
                ruptn = self.get_patterns_from_rules(url)
                # print ruptn
                links = self.webpage.filter_links(tags=['a'], patterns=ruptn)
                print links
                self.add_seeds(links)
                file_pattern = []
                file_pattern.append(re.compile(self.file_rule))
                files = self.webpage.filter_links(tags=['a'], patterns=file_pattern)
                self.files.append(files)
                # TODO:
                self.download_files(files)
                print files

    def mysleep(self, n):
        for i in range(n):
            time.sleep(1)
            print "sleep", i, "of", n