def __init__(self, url, dbFile, outputFile, maxCount=None):
    self.url = url  # url to be crawled
    if maxCount is None:
        self.maxCount = -1
    else:
        '''
        maxCount is the maximum number of links the crawler should fetch.
        It is incremented by one because the URL entered by the user is also
        persisted in the repository and therefore counts towards the total.
        For example, if the user asks to crawl python.org and fetch 2 links,
        the program should terminate once the repository holds 3 links,
        since python.org itself is one of them.
        '''
        self.maxCount = maxCount + 1

    self.extracter = LinkExtracter()
    self.dataHandler = DataHandler(self.maxCount, dbFile, outputFile)
    self.log = CrawlerLogger.getlogger()
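A minimal usage sketch, assuming the constructor above belongs to the Crawler class and that Crawl() (used further below) performs the actual crawl: asking for 2 links from python.org should leave 3 entries in the repository, the seed URL plus the 2 fetched links.

# hypothetical usage of the constructor described above
crawler = Crawler('https://www.python.org/', 'crawler.db', 'links.txt', 2)
crawler.Crawl()  # expected to stop once 3 links are stored (seed + 2 fetched)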
import signal
import sys
import webbrowser

# Crawler, CrawlerError, CrawlerLogger and signal_handler come from the
# project's own modules and are assumed to be importable here.

# read the target url and the optional maximum number of links from the command line
cmdlength = len(sys.argv)
maxlinks = None
if cmdlength == 2 or cmdlength == 3:
    url = sys.argv[1]
    if cmdlength == 3:
        try:
            maxlinks = int(sys.argv[2])
        except ValueError:
            print('Invalid maximum links')
            sys.exit()
        if maxlinks < 1:
            print("maximum links should be minimum 1")
            sys.exit()
else:
    print("Invalid number of arguments")
    sys.exit()
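# example invocation (the file name crawler.py is hypothetical):
#     python crawler.py https://www.python.org 2
# asks the crawler to fetch at most 2 links starting from python.org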
log = None
try:
    signal.signal(signal.SIGINT, signal_handler)
    CrawlerLogger.init()
    log = CrawlerLogger.getlogger()
    if not url.endswith('/'):
        url = url + '/'
    crawler = Crawler(url, 'crawler.db', 'links.txt', maxlinks)
    print('Crawling ....')
    res = crawler.Crawl()
    if res:
        webbrowser.open("links.txt")

except CrawlerError as ce:
    print(ce)

except Exception as e:
    # log the unexpected failure with its traceback, if the logger was initialised
    if log is not None:
        log.error(e, exc_info=True)
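The signal_handler registered for SIGINT above is not included in this snippet; a minimal sketch of what it is assumed to do, namely report the interrupt and exit cleanly:

def signal_handler(signum, frame):
    # assumed behaviour: abort the crawl cleanly when the user presses Ctrl+C
    print('Crawling interrupted')
    sys.exit()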
def __init__(self):
    self.log = CrawlerLogger.getlogger()