    def parseRobotFile(self, domain, robotFile):
        if not robotFile:
            # Avoid urls without robots.txt -- debatable issue.
            return None

        splitL = robotFile.split('\n')
        spLen = len(splitL)

        def tokenCreator(v, s=':'):
            # Split a "Key: value" line into stripped tokens.
            return tuple(a.strip(' ') for a in v.split(s))

        domainAllows = {}
        domainDisAllows = {}

        i = 0
        while i < spLen:
            line = splitL[i]
            i += 1
            if (not line) or line[0] == '#':
                continue

            attrs = tokenCreator(line)
            if len(attrs) < 2 or attrs[0] != 'User-agent':
                continue

            clausePresent = (attrs[1] == utils.CRAWLER_NAME or attrs[1] == '*')
            if not clausePresent:
                continue

            # Collect the Allow/Disallow rules belonging to this User-agent block.
            while i < spLen:
                l = splitL[i]
                cont = tokenCreator(l)
                if cont[0] == 'User-agent':
                    # Next block starts here; hand it back to the outer loop.
                    break
                i += 1
                if (not l) or l[0] == '#' or len(cont) < 2:
                    continue

                selector = domainDisAllows
                if cont[0] == 'Allow':
                    selector = domainAllows
                elif not (cont[0] == 'Disallow' and cont[1]):
                    continue

                firstCh = firstLetterCompile.search(cont[1])
                key = firstCh.group(1) if firstCh else '*'
                try:
                    selector.setdefault(key, []).append(utils.regexCompile(cont[1]))
                except Exception:
                    # Skip patterns that fail to compile instead of aborting the parse.
                    pass

        self.__rulesDict__[domain] = {
            'allow': domainAllows,
            'disallow': domainDisAllows,
        }
        return True
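# The method above stores, per domain, rules of the form
#   {'allow': {firstLetter: [compiledRegex, ...]}, 'disallow': {...}}.
# Below is a minimal sketch of how a crawler might consult that structure
# before fetching a path. pathPermitted and the sample rules are hypothetical
# illustrations, not part of the original module; plain re.compile stands in
# for utils.regexCompile.
import re

def pathPermitted(rulesForDomain, path):
    # Bucket lookup mirrors the first-letter keying used by parseRobotFile.
    key = path.lstrip('/')[:1].lower() or '*'
    for bucket in (key, '*'):
        for rule in rulesForDomain.get('disallow', {}).get(bucket, []):
            if rule.search(path):
                return False
    return True

sampleRules = {
    'allow': {},
    'disallow': {'p': [re.compile('/private')]},
}
print(pathPermitted(sampleRules, '/private/data'))        # False
print(pathPermitted(sampleRules, '/public/index.html'))   # True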
def main():
    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            baseUrl = lineIn.strip("\n")

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            rDepth = int(lineIn.strip("\n"))

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            continue
        else:
            if not baseUrl:
                continue
            if extCompile:
                getFiles(baseUrl, extCompile, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
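# readFromStream() is used above but not defined in this snippet. A minimal
# sketch, assuming it reads one line from stdin and reports end-of-input as a
# flag; the signature in the actual project may differ.
import sys

def readFromStream(stream=sys.stdin):
    # Returns (line, eofState): eofState becomes True once the stream is exhausted.
    line = stream.readline()
    if not line:
        return '', True
    return line, False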
def main():
    args, options = restDriver.cliParser()

    # Route manager
    router = Router([
        'http://192.168.1.117:8000',
        'http://192.168.1.110:8008',
        'http://127.0.0.1:8009'
    ])

    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            if lineIn:
                baseUrl = lineIn.strip("\n")
            else:
                continue

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState:
                break
            elif lineIn:
                rDepth = int(lineIn.strip("\n") or 1)
            else:
                rDepth = 1

            formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)
        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
            break
        except Exception:
            # TODO: [Informative exceptions]:
            #  + Report the traceback via the traceback/sys modules, since
            #    'except Exception as e' is invalid syntax on Python <= 2.5.
            print('Generic exception encountered')
            continue
        else:
            if not baseUrl:
                continue
            if extCompile:
                extractFileUrls(baseUrl, extCompile, router, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
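# Router is referenced above but not defined in this snippet. A minimal
# sketch, assuming it simply rotates through the configured base addresses;
# the class body and its getHost method are hypothetical stand-ins for
# whatever the project actually implements.
import itertools

class Router:
    def __init__(self, addresses):
        self._addresses = list(addresses)
        self._cycle = itertools.cycle(self._addresses)

    def getHost(self):
        # Hand out the next base address in round-robin order.
        return next(self._cycle)

router = Router(['http://127.0.0.1:8009'])
print(router.getHost())  # http://127.0.0.1:8009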
#!/usr/bin/env python3
# Author: Emmanuel Odeke <*****@*****.**>

import time

import utils

firstLetterCompile = utils.regexCompile('([a-z])')


class RobotParser:
    def __init__(self):
        self.__initTime__ = time.time()
        self.__rulesDict__ = dict()

    def addRobotRule(self, url):
        topDomain = utils.getTopDomain(url)
        if topDomain:
            robotPath = utils.robotsTxt(topDomain)

    def parseRobotFile(self, domain, robotFile):
        if not robotFile:
            # Avoid urls without robots.txt -- debatable issue.
            return None

        splitL = robotFile.split('\n')
        spLen = len(splitL)

        def tokenCreator(v, s=':'):
            return tuple(a.strip(' ') for a in v.split(s))

        domainAllows = {}
        domainDisAllows = {}

        i = 0
        while i < spLen:
            line = splitL[i]
            i += 1
            if (not line) or line[0] == '#':
                continue
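# A hypothetical usage sketch for the class above, assuming the project's
# utils module (CRAWLER_NAME, regexCompile, getTopDomain, robotsTxt) is
# importable and the full parseRobotFile shown earlier is in place; the
# sample robots.txt text is made up for illustration.
if __name__ == '__main__':
    sampleRobots = '\n'.join([
        'User-agent: *',
        'Disallow: /private',
        'Allow: /public',
    ])
    parser = RobotParser()
    parser.parseRobotFile('example.org', sampleRobots)
    # Inspect the per-domain allow/disallow rule buckets that were built.
    print(parser.__rulesDict__['example.org'])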