    def parseRobotFile(self, domain, robotFile):
        if not robotFile:
            # Avoid urls without robots.txt -- debatable issue.
            return None

        splitL = robotFile.split('\n')
        spLen = len(splitL)

        def tokenCreator(v, s=':'):
            # Split a "Key: value" line into stripped tokens.
            return tuple(a.strip(' ') for a in v.split(s))

        domainAllows = {}
        domainDisAllows = {}

        i = 0
        while i < spLen:
            line = splitL[i]
            i += 1
            if (not line) or line[0] == '#':
                continue

            attrs = tokenCreator(line)
            if len(attrs) < 2 or attrs[0] != 'User-agent':
                continue

            clausePresent = (attrs[1] == utils.CRAWLER_NAME or attrs[1] == '*')
            if not clausePresent:
                continue

            # Collect the Allow/Disallow rules belonging to this User-agent block.
            while i < spLen:
                l = splitL[i]
                cont = tokenCreator(l)
                if cont[0] == 'User-agent':
                    # Next block starts here; hand it back to the outer loop.
                    break
                i += 1
                if (not l) or l[0] == '#' or len(cont) < 2:
                    continue

                selector = domainDisAllows
                if cont[0] == 'Allow':
                    selector = domainAllows
                elif not (cont[0] == 'Disallow' and cont[1]):
                    continue

                firstCh = firstLetterCompile.search(cont[1])
                key = firstCh.group(1) if firstCh else '*'
                try:
                    selector.setdefault(key, []).append(utils.regexCompile(cont[1]))
                except Exception:
                    # Skip patterns that fail to compile instead of aborting the parse.
                    pass

        self.__rulesDict__[domain] = {
            'allow': domainAllows,
            'disallow': domainDisAllows,
        }
        return True
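# The method above stores, per domain, rules of the form
#   {'allow': {firstLetter: [compiledRegex, ...]}, 'disallow': {...}}.
# Below is a minimal sketch of how a crawler might consult that structure
# before fetching a path. pathPermitted and the sample rules are hypothetical
# illustrations, not part of the original module; plain re.compile stands in
# for utils.regexCompile.
import re

def pathPermitted(rulesForDomain, path):
    # Bucket lookup mirrors the first-letter keying used by parseRobotFile.
    key = path.lstrip('/')[:1].lower() or '*'
    for bucket in (key, '*'):
        for rule in rulesForDomain.get('disallow', {}).get(bucket, []):
            if rule.search(path):
                return False
    return True

sampleRules = {
    'allow': {},
    'disallow': {'p': [re.compile('/private')]},
}
print(pathPermitted(sampleRules, '/private/data'))        # False
print(pathPermitted(sampleRules, '/public/index.html'))   # True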
def main():
    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            baseUrl = lineIn.strip("\n")

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            rDepth = int(lineIn.strip("\n"))

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            continue
        else:
            if not baseUrl:
                continue
            if extCompile:
                getFiles(baseUrl, extCompile, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
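# readFromStream() is used above but not defined in this snippet. A minimal
# sketch, assuming it reads one line from stdin and reports end-of-input as a
# flag; the signature in the actual project may differ.
import sys

def readFromStream(stream=sys.stdin):
    # Returns (line, eofState): eofState becomes True once the stream is exhausted.
    line = stream.readline()
    if not line:
        return '', True
    return line, False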
def main():
    args, options = restDriver.cliParser()

    # Route manager
    router = Router([
        'http://192.168.1.117:8000',
        'http://192.168.1.110:8008',
        'http://127.0.0.1:8009'
    ])

    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            if lineIn:
                baseUrl = lineIn.strip("\n")
            else:
                continue

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            elif lineIn:
                rDepth = int(lineIn.strip("\n") or 1)
            else:
                rDepth = 1

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            # TODO: [Informative exceptions]:
            #  + Report the traceback via the traceback/sys modules, since
            #    'except Exception as e' is invalid syntax on Python <= 2.5.
            print('Generic exception encountered')
            continue
        else:
            if not baseUrl:
                continue
            if extCompile:
                extractFileUrls(baseUrl, extCompile, router, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
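# Router is referenced above but not defined in this snippet. A minimal
# sketch, assuming it simply rotates through the configured base addresses;
# the class body and its getHost method are hypothetical stand-ins for
# whatever the project actually implements.
import itertools

class Router:
    def __init__(self, addresses):
        self._addresses = list(addresses)
        self._cycle = itertools.cycle(self._addresses)

    def getHost(self):
        # Hand out the next base address in round-robin order.
        return next(self._cycle)

router = Router(['http://127.0.0.1:8009'])
print(router.getHost())  # http://127.0.0.1:8009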
#!/usr/bin/env python3
# Author: Emmanuel Odeke <*****@*****.**>

import time

import utils

firstLetterCompile = utils.regexCompile('([a-z])')


class RobotParser:
    def __init__(self):
        self.__initTime__ = time.time()
        self.__rulesDict__ = dict()

    def addRobotRule(self, url):
        topDomain = utils.getTopDomain(url)
        if topDomain:
            robotPath = utils.robotsTxt(topDomain)

    def parseRobotFile(self, domain, robotFile):
        if not robotFile:
            # Avoid urls without robots.txt -- debatable issue.
            return None

        splitL = robotFile.split('\n')
        spLen = len(splitL)

        def tokenCreator(v, s=':'):
            return tuple(a.strip(' ') for a in v.split(s))

        domainAllows = {}
        domainDisAllows = {}

        i = 0
        while i < spLen:
            line = splitL[i]
            i += 1
            if (not line) or line[0] == '#':
                continue
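# A hypothetical usage sketch for the class above, assuming the project's
# utils module (CRAWLER_NAME, regexCompile, getTopDomain, robotsTxt) is
# importable and the full parseRobotFile shown earlier is in place; the
# sample robots.txt text is made up for illustration.
if __name__ == '__main__':
    sampleRobots = '\n'.join([
        'User-agent: *',
        'Disallow: /private',
        'Allow: /public',
    ])
    parser = RobotParser()
    parser.parseRobotFile('example.org', sampleRobots)
    # Inspect the per-domain allow/disallow rule buckets that were built.
    print(parser.__rulesDict__['example.org'])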