def __init__(self):
    """Initialise the base Config, then set the user agent, counter and delay.

    ``self.count`` is restored from the first line of ``count.txt`` when
    that file exists; otherwise it starts at 0.  A file whose first line is
    not a valid integer (for example an empty or truncated file) is treated
    the same as a missing file instead of aborting construction.
    """
    Config.__init__(self)
    self.UserAgentString = "UCI Inf141-CS121 crawler 24427400 59359881 33062456 62838370"

    saved_count = 0
    if os.path.exists('count.txt'):
        with open('count.txt', 'r') as count_file:
            first_line = count_file.readline()
        try:
            saved_count = int(first_line)
        except ValueError:
            # Empty or corrupt counter file: fall back to a fresh count.
            saved_count = 0
    self.count = saved_count

    self.PolitenessDelay = 1200
def __init__(self):
    """Initialise the base Config and override its settings for this app.

    Also opens the ``urlDataPersist.shelve`` shelf and keeps it on the
    instance as ``urlToNameMap``.
    """
    Config.__init__(self)

    self.UserAgentString = "lordnahor-libseek-MSR-app"

    # Worker pool and traversal order.
    self.MaxWorkerThreads = 8
    self.DepthFirstTraversal = True

    # The three timeouts all use the same value.
    shared_timeout = 100
    self.FrontierTimeOut = shared_timeout
    self.WorkerTimeOut = shared_timeout
    self.OutBufferTimeOut = shared_timeout

    self.PolitenessDelay = 1000
    # 5 * 2**20 == 5242880 (presumably a size in bytes -- confirm in Config).
    self.MaxPageSize = 5 * 1048576
    self.IgnoreRobotRule = True

    # The shelf stays open for as long as this object holds it.
    self.urlToNameMap = shelve.open("urlDataPersist.shelve")
def __init__(self):
    """Initialise the base Config, then set crawl limits and the database.

    Reads the whole of ``db.conf`` into ``self.dbConf``, opens a database
    connection through ``self.connectDatabase()`` and builds a
    ``NetShelve.PgShelve`` persistence object.
    """
    Config.__init__(self)
    self.UserAgentString = "UCI Inf141-CS121 crawler 63393716 32393047 22863530 82181685"

    self.PolitenessDelay = 600

    # Timeout (seconds) for trying to get the next url from the frontier.
    self.FrontierTimeOut = 60

    # Timeout (seconds) for trying to get a free worker thread
    # (the worker may be taking too long).
    self.WorkerTimeOut = 60

    # Timeout (seconds) for getting data from the output queue.
    self.OutBufferTimeOut = 60

    self.MaxQueueSize = 100

    self.urlValidator = UrlValidator(verbose=False)

    # Read the configuration inside a context manager so the file handle is
    # closed deterministically instead of being left to the garbage collector.
    with open('db.conf') as conf_file:
        self.dbConf = conf_file.read()

    # dbConf is assigned before connecting; presumably connectDatabase()
    # reads it -- confirm against that method.
    self.conn = self.connectDatabase()

    # Parenthesised form prints the same text on Python 2 and Python 3.
    print("Using Postgres shelve implementation...")

    # NOTE(review): this opens a second connection rather than reusing
    # self.conn; kept as in the original -- confirm whether that is intended.
    self.PersistenceObject = NetShelve.PgShelve(self.connectDatabase())
def __init__(self):
    """Initialise the base Config and assign the user-agent string.

    The value assigned here reads as a placeholder that is meant to be
    replaced.
    """
    Config.__init__(self)

    placeholder_agent = "Set This Value!"
    self.UserAgentString = placeholder_agent
def __init__(self):
    """Initialise the base Config and assign this crawler's user-agent string."""
    Config.__init__(self)

    agent = "IR W16 WebCrawler 85686586 42686317 79403075"
    self.UserAgentString = agent
def __init__(self):
    """Initialise the base Config, then set the user agent and thread count."""
    Config.__init__(self)

    agent = "IR W16 WebCrawler 75307532_92707006_48565650"
    self.UserAgentString = agent

    # Override the worker-thread setting inherited from Config.
    self.MaxWorkerThreads = 12
def __init__(self):
    """Initialise the base Config and assign a browser-style user-agent string."""
    Config.__init__(self)

    # Single literal, kept verbatim; it mimics a desktop Chrome browser.
    browser_agent = "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36"
    self.UserAgentString = browser_agent
def __init__(self):
    """Initialise the base Config and assign this crawler's user-agent string."""
    Config.__init__(self)

    agent = "UCI Inf141-CS121 crawler 33819914"
    self.UserAgentString = agent
def __init__(self):
    """Initialise the base Config, then set the user agent and politeness delay."""
    Config.__init__(self)

    self.PolitenessDelay = 1000

    # No MaxDepth override is applied here; an override of 4 had been left
    # commented out at this point.

    agent = "UCI Inf141-CS121 crawler 67995387 90117275 14971857"
    self.UserAgentString = agent
def __init__(self):
    """Initialise the base Config and assign this crawler's user-agent string."""
    Config.__init__(self)

    agent = "INF141 <28859606 76439804 50233729 71903006>"
    self.UserAgentString = agent
def __init__(self):
    """Initialise the base Config, then set the user agent and log settings.

    Stores two file names (``log.txt`` and ``content.txt``) and creates an
    empty ``defaultdict`` whose missing keys default to a new ``dict``.
    """
    Config.__init__(self)

    agent = "UCI Inf141-CS121 crawler ratkins1"
    self.UserAgentString = agent

    # Only the names are stored here; no file is opened in this method.
    self.log = "log.txt"
    self.contentLog = "content.txt"

    # Missing keys are created on first access as empty dicts.
    self.content = defaultdict(dict)