def dumpRunArgs(self, args):
    """
    Log every parsed command line argument together with its value.

    `args` is any object accepted by `vars()`; each attribute is written
    as 'name: value' under the 'args' tag at DEBUG level.
    """
    for name, value in vars(args).items():
        dblog.log('args', '%s: %s' % (name, value), level=logging.DEBUG)
def dumpRunArgs(self, args):
    """
    Write each command line argument and its value to the debug log.

    The attributes of `args` are read through `vars()` and logged one per
    line as 'name: value' under the 'args' tag.
    """
    for name, value in vars(args).items():
        entry = '%s: %s' % (name, value)
        dblog.log('args', entry, level=logging.DEBUG)
def dumpConf(self, level=logging.DEBUG):
    """
    Log every configuration option and its value, section by section.

    Within each section the options are emitted in sorted order; the option
    name is title-cased for display and the value is read via getConf().
    `level` is the logging level handed to the logger.
    """
    for section in self.configParser.sections():
        optionNames = sorted(self.configParser.options(section))
        for option in optionNames:
            entry = '%s: %s' % (option.title(), self.getConf(section, option))
            dblog.log('conf', entry, level=level)
def visitUrl(self, url):
    """
    Log the URL, load it in the browser and bump the 'visited' counter.
    """
    dblog.log('visit', url)
    driver = self.browser()
    driver.get(url)
    self.stats['visited'] += 1
def visitUrl(self, url):
    """
    Navigate the browser to `url`.

    The URL is logged under the 'visit' tag before the browser is asked to
    load it; afterwards the 'visited' statistic is incremented by one.
    """
    dblog.log('visit', url)
    browser = self.browser()
    browser.get(url)
    self.stats['visited'] += 1
def takeSelfie(db, url):
    """
    Capture a screenshot of the browser's current page.

    `url` is what was sent to handleUrl(); the browser may have fired an
    action handler since then, so the screenshot is not guaranteed to match
    what a plain browser.get(url) would show.

    The image is written to the configured selfies directory (created if
    missing) as '<time.time()>-<selfieName(url)>.png', giving a consistent
    directory of timestamped files that can be scanned and used as frames
    in a movie / sequence. Success is logged at the default level, failure
    at WARNING.
    """
    # build a unique filename for this running instance
    filename = '%s-%s.png' % (time.time(), selfieName(url))

    # determine where the selfies go; only work with an absolute path
    selfieDir = os.path.abspath(
        db.getConf(conf.conf.OPTIONS, conf.conf.OPTION_SELFIES_DIR))

    # exist_ok avoids the check-then-create race of isdir() + makedirs()
    os.makedirs(selfieDir, exist_ok=True)

    # let os.path pick the separator instead of hard-coding '/'
    fileFull = os.path.join(selfieDir, filename)

    if db.browser().get_screenshot_as_file(fileFull):
        dblog.log('selfie', 'Saved [ %s ]' % fileFull)
    else:
        dblog.log('selfie', 'Failed to save [ %s ]' % fileFull,
                  level=logging.WARNING)
def dumpProcessedUrls(self, level=logging.DEBUG):
    """
    Log every processed URL together with each time it was processed.

    For each URL in self.processedUrls the stored timestamps are walked in
    reverse order and written as '<timestamp> <pipe char> <url>' under the
    'urlhist' tag. `level` is the logging level handed to the logger.
    """
    # '%m' is the month; the previous '%M' printed the minute in its place
    stampFormat = '%Y-%m-%d %H:%M:%S'
    for url in self.processedUrls:
        for ts in reversed(self.processedUrls[url]):
            entry = '%s %s %s' % (
                ts.strftime(stampFormat), dblog.LOG_PIPE_CHAR, url)
            dblog.log('urlhist', entry, level=level)
def waitUntil(self, whenStr):
    """
    Idle until the date/time described by `whenStr`.

    `whenStr` is converted with str() and parsed by dateutil. The remaining
    time is rounded to whole seconds; if nothing remains (the target is in
    the past or rounds to zero) the method returns immediately without
    logging. Otherwise it logs, idles for that many seconds via self.idle(),
    and logs again when done.
    """
    startDtm = dateutil.parser.parse(str(whenStr))
    diff = startDtm - datetime.now()
    diffSec = round(diff.total_seconds(), 0)
    if diffSec > 0:
        dblog.log('wait', 'Waiting %s before starting, resuming at %s'
                  % (diff, startDtm))
        self.idle(diffSec)
        # one placeholder, one argument: passing (diff, startDtm) here
        # raised TypeError once the wait had finished
        dblog.log('wait', 'Finished waiting, target time = %s' % (startDtm,))
def dumpConf(self, level=logging.DEBUG):
    """
    Write all configuration options and their values to the log.

    Sections are visited in the order the config parser reports them and
    the options of each section in sorted order. Each line is
    '<Title-Cased Option>: <value>' under the 'conf' tag at `level`.
    """
    for section in self.configParser.sections():
        for option in sorted(self.configParser.options(section)):
            label = option.title()
            value = self.getConf(section, option)
            dblog.log('conf', '%s: %s' % (label, value), level=level)
def dumpProcessedUrls(self, level=logging.DEBUG):
    """
    Dump all processed URLs and the times at which they were processed.

    Each timestamp stored for a URL in self.processedUrls is logged, in
    reverse of its stored order, as '<timestamp> <pipe char> <url>' under
    the 'urlhist' tag at the given `level`.
    """
    for url in self.processedUrls:
        for ts in reversed(self.processedUrls[url]):
            # month is '%m'; '%M' (minute) was used here by mistake
            stamp = ts.strftime('%Y-%m-%d %H:%M:%S')
            dblog.log('urlhist',
                      '%s %s %s' % (stamp, dblog.LOG_PIPE_CHAR, url),
                      level=level)
def logStats(self):
    """
    Log the skipped / handled / visited counters, their total and the
    elapsed time between stats['tick'] and stats['tock'].

    Each counter is reported with an hourly rate extrapolated from the
    elapsed time. When no time has elapsed the rate is reported as 0.00
    rather than dividing by zero.
    """
    dur = self.stats['tock'] - self.stats['tick']
    elapsedSec = dur.total_seconds()

    # multiplier that turns a raw count into a per-hour rate
    hrRate = float((60 * 60) / elapsedSec) if elapsedSec else 0.0

    skipped = self.stats['skipped']
    handled = self.stats['handled']
    visited = self.stats['visited']
    total = skipped + handled + visited

    rows = (
        ('Skipped', skipped),
        ('Handled', handled),
        ('Visited', visited),
        ('Total', total),
    )
    for label, count in rows:
        dblog.log('stat', '%s: %s [ %.2f url/hr ]'
                  % (label, count, hrRate * count))
    dblog.log('stat', 'Elapsed: %s' % dur)
def loadList(self, filename):
    """
    Read a white/black list file, one entry per line.

    Returns the lines of `filename` with surrounding newline characters
    stripped, or an empty list when `filename` is not an existing file.
    The number of records read is logged under the 'list' tag.
    """
    entries = []
    # only attempt the read when the file is really there
    if os.path.isfile(filename):
        with open(filename) as handle:
            for line in handle:
                entries.append(line.strip('\n'))
        dblog.log('list', 'Found %d records from %s' % (len(entries), filename))
    return entries
def loadList(self, filename):
    """
    Load a white/black list from `filename`.

    Every line of the file becomes one entry (newline characters stripped
    from both ends). A path that is not an existing file yields an empty
    list. The record count is logged under the 'list' tag.
    """
    if not os.path.isfile(filename):
        return []

    with open(filename) as handle:
        records = [line.strip('\n') for line in handle]
    dblog.log('list', 'Found %d records from %s' % (len(records), filename))
    return records
def tryUrlHandler(self, url, handler):
    """
    Try a single (regex, handleFunc) pair against the URL.

    handler[0] is the regex and handler[1] the callable. When the regex
    finds anything in `url`, the callable is invoked as
    handleFunc(self, url, matches), the 'handled' counter is incremented,
    postHandleUrl(url) is called and True is returned. Without a match
    nothing else happens and False is returned.
    """
    matches = re.compile(handler[0]).findall(url)
    if not matches:
        return False

    handleFunc = handler[1]
    dblog.log('handler',
              "'%s' handling url [ %s ]" % (handleFunc.__name__, url))

    # hand the URL and its regex matches over to the handler
    handleFunc(self, url, matches)

    # mark it handled and move on
    self.stats['handled'] += 1
    self.postHandleUrl(url)
    return True
def tryUrlHandler(self, url, handler):
    """
    Apply one regex/handler pair to `url`.

    `handler` is indexable: element 0 is the regex, element 1 the function
    to call on a match (only read when there is a match). Returns True when
    the regex matched and the function was run, otherwise False.
    """
    found = re.findall(handler[0], url)
    if len(found) == 0:
        return False

    callback = handler[1]
    message = "'%s' handling url [ %s ]" % (callback.__name__, url)
    dblog.log('handler', message)

    # the handler receives this object, the URL and the regex matches
    callback(self, url, found)

    # book-keeping once the handler has run
    self.stats['handled'] += 1
    self.postHandleUrl(url)
    return True
def logStats(self):
    """
    Log start-finish statistics in a formatted manner.

    Reports the 'skipped', 'handled' and 'visited' counters and their sum,
    each with a per-hour rate derived from the time between stats['tick']
    and stats['tock'], followed by the elapsed time itself. A zero elapsed
    time yields a rate of 0.00 instead of a ZeroDivisionError.
    """
    dur = self.stats['tock'] - self.stats['tick']
    seconds = dur.total_seconds()

    # guard the division: identical tick/tock would otherwise crash here
    hrRate = float((60 * 60) / seconds) if seconds else 0.0

    counts = [(label, self.stats[label.lower()])
              for label in ('Skipped', 'Handled', 'Visited')]
    counts.append(('Total', sum(count for _, count in counts)))

    for label, count in counts:
        dblog.log('stat', '%s: %s [ %.2f url/hr ]'
                  % (label, count, hrRate * count))
    dblog.log('stat', 'Elapsed: %s' % dur)
def run(self):
    """
    Entry point: parse arguments, bootstrap, browse, then shut down.

    Builds the argument parser, merges the parsed arguments, bootstraps the
    run, optionally loads the URL list from a text file, waits for the
    requested start time and runs the realtime simulation. When the selfies
    option equals 'True' the captured selfies are compiled afterwards.
    shutdown() is called last.
    """
    # handle all the arg setup / parsing
    usage = '%(prog)s txt [txtFile]' if self.txtFile else '%(prog)s run'
    parser = argparse.ArgumentParser(
        description='Launch cruise-control for websites',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        usage=usage)
    self.addConfigParserArgs(parser)
    self.addLocationParserArgs(parser)
    self.addRunParserArgs(parser)
    self.addLogParserArgs(parser)
    args = self.parseAndMergeArgs(parser)
    self.skipHandling = args.skip_urls

    # setup data manager and browser
    self.runBootstrap()
    if self.txtFile:
        self.urllist = self.loadList(args.txtFile)

    # hold off until the requested start date/time
    self.user.waitUntil(args.start)

    # begin magic
    self.simulateRealtime()

    # memories to last a lifetime
    selfiesEnabled = self.getConf(conf.conf.OPTIONS, conf.conf.OPTION_SELFIES)
    if selfiesEnabled == 'True':
        selfieDir = self.getConf(conf.conf.OPTIONS,
                                 conf.conf.OPTION_SELFIES_DIR)
        wholeSeconds = str(time.time()).split('.')[0]
        movieName = '%s-selfies' % wholeSeconds
        outputFileName = selfies.compileSelfies(selfieDir, movieName)
        dblog.log('selfie',
                  'Compiled selfie movie at [ %s ]' % outputFileName)

    # Shuuuut iiit doooooown
    self.shutdown()
def typeKeys(self, db, inputElem, text, sloppify=True, enableDrunkRate=False):
    """
    Send `text` to `inputElem` one character at a time, like a person typing.

    The error rate and words-per-minute speed start from self.typingRand
    and self.typingWpm; with `enableDrunkRate` they are scaled by the
    DRUNK_ERR_FACTOR / DRUNK_WPM_FACTOR constants. With `sloppify` the text
    is first passed through self.sloppify() and the before/after strings
    are logged. After every character self.reactKeyPress() is called with
    the speed. `db` is accepted but not used in this body.
    """
    errRate, speedWpm = self.typingRand, self.typingWpm
    if enableDrunkRate:
        errRate *= self.DRUNK_ERR_FACTOR
        speedWpm *= self.DRUNK_WPM_FACTOR

    if sloppify:
        # mangle the string into a more drunk-like one
        sloppy = self.sloppify(text, errRate)
        # show backspace key codes as '\b' so the log line stays readable
        readable = sloppy.replace(Keys.BACKSPACE, '\\b')
        dblog.log('user', 'Sloppifying [ %s ] -> [ %s ]' % (text, readable))
        text = sloppy

    # send each character, reacting after every key press
    for char in text:
        inputElem.send_keys(char)
        self.reactKeyPress(speedWpm)
def urlPassesWhiteBlackLists(self, url):
    """
    Decide whether a URL is allowed by the white and black lists.

    * both lists empty      -> pass
    * both lists populated  -> pass only if whitelisted and not blacklisted
    * only a whitelist      -> pass only if whitelisted
    * only a blacklist      -> pass only if not blacklisted
    """
    wLen, bLen = len(self.whitelist), len(self.blacklist)
    dblog.log('list', 'white: %d | black: %d' % (wLen, bLen),
              level=logging.DEBUG)

    # no lists equal insta-pass
    if not (wLen or bLen):
        dblog.log('list', "Url [ %s ] didn't test, empty lists" % url,
                  level=logging.DEBUG)
        return True

    # run the URL against both lists
    wMatch = self.urlHasListMatch(self.whitelist, url)
    bMatch = self.urlHasListMatch(self.blacklist, url)
    summary = (
        'Url [ %s ] tested on lists -> white[%d] = %s | black[%d] = %s '
        % (url, wLen, wMatch, bLen, bMatch))
    dblog.log('list', summary, level=logging.DEBUG)

    if wLen and bLen:
        # both lists present: must be whitelisted and not blacklisted
        return wMatch and not bMatch
    if wLen:
        # whitelist only
        return wMatch
    # blacklist only (the lists are not both empty at this point)
    return not bMatch
def simulateRealtime(self):
    """
    Browse URLs at a pace modelled on the current date/time.

    Seed data (a pool of URLs plus a per-half-hour rate) is refreshed on
    the first pass, once its 30 minute validity has run out, and whenever
    the pool has been emptied. In text-file mode the cached self.urllist is
    copied and a random rate is drawn; otherwise the historian supplies
    both. Each pass removes one random URL from the pool and hands it to
    handleUrl(); on success the user idles for the per-URL time +/- 25%,
    on failure a counter is bumped.

    Returns when a refresh yields no URLs, or when the failures since the
    last refresh reach the number of URLs that refresh loaded.
    """
    expiresDtm = None
    while True:
        # first up, refresh the seed data if our time block is up
        now = datetime.now()
        if expiresDtm is None or now >= expiresDtm:
            failures = 0
            expiresDtm = now + timedelta(minutes=30)
            dblog.log('sim',
                      'Refreshing seed data, valid until %s' % expiresDtm)
            if self.txtFile:
                # the URLs are already cached, so only the browsing rate
                # changes for this period
                pool = self.urllist.copy()
                ratePerHalfHour = random.uniform(150, 500)
            else:
                # refresh from the database
                pool, ratePerHalfHour = self.historian.getSeedUrlData(now)

            # nothing to browse means we have to go home
            poolOrigLen = len(pool)
            if not poolOrigLen:
                dblog.log('sim', 'No URLs found :-(')
                return

            # convert 30-min rate to secs/url
            secPerUrl = round(60 / (ratePerHalfHour / 30), 2)

        # pick a random entry and remove it so it's not seen again
        idx = int(random.uniform(0, len(pool)))
        picked = pool.pop(idx)
        url = picked if self.txtFile else picked['url']

        # handle it with our main handler / dispatcher
        if not self.handleUrl(url):
            failures += 1
        else:
            # wait the appropriate amount of time, fudged by +/- 25%
            fudge = round(random.choice([-1, 1]) * (secPerUrl / 4), 2)
            sleepSec = abs(secPerUrl + fudge)
            dblog.log('sim', 'Finished URL, waiting %d seconds' % sleepSec)
            self.user.idle(sleepSec)

        # force a refresh on the next pass if we're out of URLs
        if not len(pool):
            expiresDtm = None

        # stop once we have failed on every URL attempt
        if failures >= poolOrigLen:
            break
def run(self):
    """
    Browse the internet in a similar manner to the history database.

    Steps, in order: build and parse the command line arguments, bootstrap
    the run, load the URL list when running from a text file, wait for the
    start time, run the realtime simulation, compile the selfie movie when
    the selfies option equals 'True', and finally shut down.
    """
    # the usage line depends on whether we run from a text file
    if self.txtFile:
        usageText = '%(prog)s txt [txtFile]'
    else:
        usageText = '%(prog)s run'

    # handle all the arg setup / parsing
    parser = argparse.ArgumentParser(
        description='Launch cruise-control for websites',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        usage=usageText)
    self.addConfigParserArgs(parser)
    self.addLocationParserArgs(parser)
    self.addRunParserArgs(parser)
    self.addLogParserArgs(parser)
    args = self.parseAndMergeArgs(parser)
    self.skipHandling = args.skip_urls

    # setup data manager and browser
    self.runBootstrap()
    if self.txtFile:
        self.urllist = self.loadList(args.txtFile)

    # wait until our start date/time
    self.user.waitUntil(args.start)

    # begin magic
    self.simulateRealtime()

    # memories to last a lifetime
    if self.getConf(conf.conf.OPTIONS, conf.conf.OPTION_SELFIES) == 'True':
        outDir = self.getConf(conf.conf.OPTIONS, conf.conf.OPTION_SELFIES_DIR)
        baseName = '%s-selfies' % str(time.time()).split('.')[0]
        moviePath = selfies.compileSelfies(outDir, baseName)
        dblog.log('selfie', 'Compiled selfie movie at [ %s ]' % moviePath)

    # Shuuuut iiit doooooown
    self.shutdown()
def simulateRealtime(self):
    """
    Use the current date/time to build a "typical" browsing pattern.

    Loops until there is nothing left worth trying: seed URLs and a
    per-half-hour rate are (re)loaded on the first pass, every 30 minutes
    and whenever the URL pool is exhausted. One random URL is taken from
    the pool per pass and given to handleUrl(); a handled URL is followed
    by an idle period of the per-URL time +/- 25%, an unhandled one counts
    as a failure.

    The method returns when a refresh finds no URLs, or as soon as the
    failure count since the last refresh is no longer below the number of
    URLs loaded by that refresh.
    """
    expiresDtm = None
    keepGoing = True
    while keepGoing:
        now = datetime.now()
        seedExpired = expiresDtm is None or now >= expiresDtm
        if seedExpired:
            failures = 0
            expiresDtm = now + timedelta(minutes=30)
            dblog.log('sim',
                      'Refreshing seed data, valid until %s' % expiresDtm)

            if self.txtFile:
                # text file mode: reuse the cached URLs and just vary the
                # browsing rate so behavior changes for this period
                candidates = self.urllist.copy()
                ratePerHalfHour = random.uniform(150, 500)
            else:
                # database mode
                candidates, ratePerHalfHour = \
                    self.historian.getSeedUrlData(now)

            # if there are no URL results, we have to go home
            loadedCount = len(candidates)
            if not loadedCount:
                dblog.log('sim', 'No URLs found :-(')
                return

            # convert 30-min rate to secs/url
            secPerUrl = round(60 / (ratePerHalfHour / 30), 2)

        # take a random URL out of the pool so it's not seen again
        idx = int(random.uniform(0, len(candidates)))
        row = candidates.pop(idx)
        url = row if self.txtFile else row['url']

        # handle it with our main handler / dispatcher
        handled = self.handleUrl(url)
        if handled:
            # fudge the wait by +/- 25%
            direction = random.choice([-1, 1])
            sleepSec = abs(secPerUrl + round(direction * (secPerUrl / 4), 2))
            dblog.log('sim', 'Finished URL, waiting %d seconds' % sleepSec)
            self.user.idle(sleepSec)
        else:
            failures += 1

        # reset the expiration time if we're out of URLs
        if not len(candidates):
            expiresDtm = None

        # only keep going if we haven't failed on every URL attempt
        keepGoing = failures < loadedCount
def urlPassesWhiteBlackLists(self, url):
    """
    Check a URL against the whitelist and the blacklist.

    With no entries in either list every URL passes. Otherwise a populated
    whitelist must match the URL and a populated blacklist must not; when
    both lists are populated both conditions apply.
    """
    whiteCount = len(self.whitelist)
    blackCount = len(self.blacklist)
    dblog.log('list', 'white: %d | black: %d' % (whiteCount, blackCount),
              level=logging.DEBUG)

    # no lists equal insta-pass
    if whiteCount + blackCount == 0:
        dblog.log('list', "Url [ %s ] didn't test, empty lists" % url,
                  level=logging.DEBUG)
        return True

    # test against both lists up front
    onWhite = self.urlHasListMatch(self.whitelist, url)
    onBlack = self.urlHasListMatch(self.blacklist, url)
    dblog.log(
        'list',
        'Url [ %s ] tested on lists -> white[%d] = %s | black[%d] = %s '
        % (url, whiteCount, onWhite, blackCount, onBlack),
        level=logging.DEBUG)

    if not whiteCount:
        # blacklist only: matching it is bad
        return not onBlack
    if not blackCount:
        # whitelist only
        return onWhite
    # both lists: must be whitelisted and not blacklisted
    return onWhite and not onBlack
def log(self, msg, level=logging.INFO):
    """
    Send `msg` to the shared logger under the 'SQL' tag at `level`.
    """
    tag = 'SQL'
    dblog.log(tag, msg, level=level)
def skipUrl(self, url, reason='skip'):
    """
    Record that `url` was skipped.

    The URL is logged with `reason` as the log tag and the 'skipped'
    counter is incremented by one.
    """
    dblog.log(reason, url)
    counters = self.stats
    counters['skipped'] += 1
def log(self, msg, level=logging.INFO):
    """
    Send `msg` to the shared logger under the 'history' tag at `level`.
    """
    tag = 'history'
    dblog.log(tag, msg, level=level)
def skipUrl(self, url, reason='skip'):
    """
    Skip a URL: log it using `reason` as the tag, then count it as skipped.
    """
    dblog.log(reason, url)
    stats = self.stats
    stats['skipped'] += 1
def log(text):
    """
    Log `text` under the 'handlers' tag so all handler functions share one
    logging format.
    """
    tag = 'handlers'
    dblog.log(tag, text)
def log(text):
    """
    Write `text` to the shared logger, tagged 'handlers'.
    """
    source = 'handlers'
    dblog.log(source, text)
def log(self, msg, level=logging.INFO):
    """
    Shortcut to the shared logger: writes `msg` under the 'history' tag at
    the given `level`.
    """
    source = 'history'
    dblog.log(source, msg, level=level)
def log(self, msg, level=logging.INFO):
    """
    Shortcut to the shared logger: writes `msg` under the 'SQL' tag at the
    given `level`.
    """
    source = 'SQL'
    dblog.log(source, msg, level=level)