def slowStart(self):
    """Probe for a sustainable thread count by doubling until a thread errors.

    Each round tops the pool up to ``MAX_THREAD_COUNT`` ``HTTPClient``
    threads, waits ten seconds and then inspects every thread's ``error``
    flag. A clean round doubles ``MAX_THREAD_COUNT`` and the loop repeats.
    The first round in which any thread reports an error sets
    ``MAX_THREAD_COUNT`` to half the number of threads currently in the
    pool (never below one), calls ``killThreads()`` and returns.

    The loop also stops once ``kill_received`` becomes true.
    """
    print('slowStart')
    while not self.kill_received:
        # Top the pool up to the current limit. A non-positive difference
        # simply creates nothing.
        missing = self.MAX_THREAD_COUNT - len(self.threads)
        print('Creating ' + str(missing) + ' active threads')
        for _ in xrange(missing):
            worker = HTTPClient()
            worker.start()
            self.threads.append(worker)

        # Give the threads ten seconds to run before evaluating them.
        time.sleep(10)

        print('Evaluating threads')
        # The explicit `== True` comparison is kept from the original so a
        # truthy non-boolean `error` value is treated exactly as before.
        if any(th.error == True for th in self.threads):
            # Fall back to the previous level: half of what is running now.
            # Floor division keeps the limit an integer, and the clamp is
            # applied before reporting so the log shows the value that is
            # actually used (there can never be less than one thread).
            fallback = len(self.threads) // 2
            if fallback < 1:
                print('Setting thread count to 1')
                fallback = 1
            self.MAX_THREAD_COUNT = fallback
            print('Maximum threads: ' + str(self.MAX_THREAD_COUNT))
            self.killThreads()
            print('slowStart finished. MaxThreads: '
                  + str(self.MAX_THREAD_COUNT))
            return

        # No thread reported an error: double the limit and try again.
        print('No threads returned errors')
        self.MAX_THREAD_COUNT = self.MAX_THREAD_COUNT * 2
def run(self):
    """Fill the thread pool, then report per-thread throughput every minute.

    Creates ``HTTPClient`` threads until the pool holds
    ``MAX_THREAD_COUNT`` of them. Then, until ``kill_received`` becomes
    true, it once every 60 seconds:

    * prints the current time and pool size,
    * sums and resets each thread's ``getReqs``, ``postReqs``, ``rxBytes``
      and ``txBytes`` counters,
    * prints each total divided by the number of threads.

    With an empty pool the totals (all zero) are printed as-is instead of
    dividing by zero.
    """
    print("MAX_THREAD_COUNT: " + str(self.MAX_THREAD_COUNT))

    # Top the pool up to the configured limit.
    missing = self.MAX_THREAD_COUNT - len(self.threads)
    print("Creating " + str(missing) + " active threads")
    for _ in xrange(missing):
        worker = HTTPClient()
        worker.start()
        self.threads.append(worker)

    # Loop until interrupted (kill_received is set elsewhere).
    while not self.kill_received:
        now = datetime.datetime.now()
        threadCount = len(self.threads)
        print("[Client:run] Current date and time:" + str(now))
        print("[Client:run] Current concurrent thread count:"
              + str(threadCount))

        # Collect and reset every counter in a single pass over the pool.
        getsPerMinute = 0
        postsPerMinute = 0
        rxBytes = 0
        txBytes = 0
        for th in self.threads:
            getsPerMinute += th.getReqs
            th.getReqs = 0
            postsPerMinute += th.postReqs
            th.postReqs = 0
            rxBytes += th.rxBytes
            th.rxBytes = 0
            txBytes += th.txBytes
            th.txBytes = 0

        # Average per thread; skip the division when the pool is empty so
        # the report does not die with ZeroDivisionError.
        if threadCount:
            getsPerMinute = getsPerMinute / threadCount
            postsPerMinute = postsPerMinute / threadCount
            rxBytes = rxBytes / threadCount
            txBytes = txBytes / threadCount

        print("[Client:run] Operating at " + str(getsPerMinute)
              + " GET/m and " + str(postsPerMinute) + " POST/m")
        print("[Client:run] Downloading at " + str(rxBytes)
              + " B/m and Uploading at " + str(txBytes) + " B/m")

        # Counters are reset each pass, so a 60 s sleep makes them
        # roughly per-minute figures.
        time.sleep(60)
def behaviour(self):
    """Run slow start, then re-level the thread pool from the history file.

    After ``slowStart()`` has settled ``MAX_THREAD_COUNT`` and a 60 second
    pause, the method loops until ``kill_received`` becomes true. Each
    pass it:

    * calls ``parseConfig()`` and prints the ``HIGH``/``MEDIUM``/``LOW``
      entries of ``threadDict``,
    * calls ``checkHistory()``; a result of ``'current'`` (any case) means
      no change,
    * otherwise treats the result as a comma-separated record whose third
      field names the level, kills the current threads and starts
      ``MAX_THREAD_COUNT * threadDict[level] / 100`` new ``HTTPClient``
      threads (at least one),
    * sleeps 60 seconds.

    NOTE(review): the original had a ``return`` between the "Launching"
    message and the thread-creation loop, which left the pool empty after
    ``killThreads()`` and made the loop unreachable. It is removed here on
    the assumption that it was a debugging leftover; confirm.
    """
    self.slowStart()
    print('MAX_THREAD_COUNT: ' + str(self.MAX_THREAD_COUNT))
    time.sleep(60)

    # Loop until interrupted (kill_received is set elsewhere).
    while not self.kill_received:
        print('[Client:behaviour] in method')
        now = datetime.datetime.now()
        print("[Client:behaviour] Current date and time:" + str(now))
        print("[Client:behaviour] Current concurrent thread count:"
              + str(len(self.threads)))

        # Presumably parseConfig() refreshes the module-level threadDict;
        # verify against its definition.
        parseConfig()
        print("[Client:behaviour] High Thread count: " + threadDict['HIGH'])
        print("[Client:behaviour] Medium Thread count: "
              + threadDict['MEDIUM'])
        print("[Client:behaviour] Low Thread count: " + threadDict['LOW'])

        # 'current' is the signal to stay at the present level.
        result = checkHistory()
        if result.lower() == 'current':
            print('No changes to be made')
        else:
            print('Take action')
            # values[0]=id, values[1]=time, values[2]=level
            values = result.split(',')
            # Strip so stray whitespace or a trailing newline around the
            # field cannot break the threadDict lookup.
            level = values[2].strip()
            print('Moving to level: ' + level)

            self.killThreads()

            # threadDict[level] is a percentage of MAX_THREAD_COUNT.
            fraction = float(threadDict[level]) / 100
            # Always launch at least one thread.
            threads = max(1, int(self.MAX_THREAD_COUNT * fraction))
            print('Launching ' + str(threads) + ' threads')

            for _ in xrange(threads):
                print('Creating thread')
                newClient = HTTPClient()
                newClient.start()
                self.threads.append(newClient)

        # Wait a minute before checking the history again.
        time.sleep(60)
logging.basicConfig( format='%(levelname)7s - %(name)s - %(asctime)s: %(message)s', filename='run.log', level=log_level) console = logging.StreamHandler() console.setFormatter( logging.Formatter('%(levelname)7s - %(name)-8s: %(message)s')) logging.getLogger('').addHandler(console) log = logging.getLogger('main') # ---------------------------------------------------------------- # Load various components, and configure the modules that control # the crawling process # corpus_table = CorpusTable.CorpusTable(args.dbdir) # Storage layer spider = HTTPClient.HTTPClient() # Retrieval code url_normaliser = Normalisation.URLNormaliser() # URL normaliser feature_extractor = Features.Features(url_normaliser, ['title', 'h1']) # Feature extractor # URL Fitness Function #url_rank_function = SimplicityURLRank.SimplicityURLRank() # Prefer simple URLs #url_rank_function = SampleURLRank.SampleURLRank() # Sample code url_rank_function = HumanReadableURLRank.HumanReadableURLRank( ) # Prefer human-readable URLs page_filters = [ # Filters for page rejection # FuzzyDuplicateFilter.FuzzyDuplicateFilter(corpus_table), # Fuzzy hash using ssdeep DuplicateFilter.DuplicateFilter(corpus_table), # Perfect duplicate checker MinimumLengthFilter.MinimumLengthFilter(100), # Min length MaximumLengthFilter.MaximumLengthFilter(800000), # Max length URLCountFilter.URLCountFilter(0, 1000), # URL count MetadataRegexpFilter.MetadataRegexpFilter(