Code example #1
File: RobotParser.py Project: odeke-em/crawlers
    def canVisit(self, url):
        # Check the cached per-domain robots.txt rules (fetching and parsing
        # /robots.txt on a cache miss) and report whether this url may be crawled.
        topDomain = utils.getTopDomain(url)
        retrRules = self.__rulesDict__.get(topDomain, None)
        if retrRules is None:  # Cache miss
            robotsUrl = utils.robotsTxt(url)
            roboFileBuf = utils.dlAndDecode(robotsUrl)
            if not self.parseRobotFile(topDomain, roboFileBuf):
                return False
            retr = self.__rulesDict__[topDomain]

        sp = tuple(filter(lambda a: a, url.split(topDomain)))
        if sp:
            firstCh = firstLetterCompile.search(sp[0])
            if firstCh:
                # Time to probe
                fCh = firstCh.groups(1)[0]
                retr = self.__rulesDict__[topDomain]['disallow']
                compList = retr.get(fCh, None)
                if compList:
                    for comp in compList:
                        if comp.search(sp[0]):
                            return False

                    return True
        return True
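
For comparison, below is a minimal sketch of the same per-domain robots.txt gate built only on the standard library's urllib.robotparser. It is not the project's code; the cache layout and the refusal-on-error behaviour are assumptions chosen to mirror what canVisit does above.

import urllib.parse
import urllib.robotparser

_robotCache = {}  # hypothetical per-domain cache, mirroring __rulesDict__

def stdlibCanVisit(url, userAgent='*'):
    # Key the cache by host, much as canVisit keys its rules by top domain.
    parts = urllib.parse.urlsplit(url)
    domain = parts.netloc
    parser = _robotCache.get(domain)
    if parser is None:  # Cache miss: fetch and parse /robots.txt once per domain
        parser = urllib.robotparser.RobotFileParser()
        parser.set_url('%s://%s/robots.txt' % (parts.scheme or 'https', domain))
        try:
            parser.read()
        except OSError:
            return False  # Treat an unreadable robots.txt as a refusal, like canVisit
        _robotCache[domain] = parser
    return parser.can_fetch(userAgent, url)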
Code example #2
File: shardy.py Project: odeke-em/crawlers
def extractFileUrls(url,
                    extCompile,
                    router,
                    depth=5,
                    httpDomain=utils.HTTPS_DOMAIN):
    # Args: url => the page to start crawling from
    #       extCompile => a compiled regex of the file extension(s) to match
    #       router => handle passed through to pushUpJob for every matched file url
    #       depth => an integer that indicates how deep to scrape
    #                Note: a negative depth indicates that you want to keep
    #                crawling as far as the program can go
    if not depth:
        return
    elif not restDriver.isCallableAttr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a regex compiled object/result as arg 'extCompile'\n",
            sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    if not robotParser.canVisit(url):
        print('Cannot visit %s due to /robots.txt rules' % (url))
        return

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = [
            utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s)
            for s in urls
        ]

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                rGroup = regSearch.groups(1)
                u = '%s.%s' % (rGroup[0], rGroup[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        uniqFileUrls = set(matchedFileUrls)
        dlResults = [
            pushUpJob(eachUrl, router, url) for eachUrl in uniqFileUrls
        ]
        resultsList = [val for val in dlResults if val]

        depth -= 1
        for eachUrl in plainUrls:
            extractFileUrls(eachUrl, extCompile, router, depth)
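
Both extractFileUrls above and getFiles below rebuild every matched url as '%s.%s' from two capture groups, so the extCompile argument must capture the base of the url and its extension separately. A minimal sketch of such a pattern follows; the extension list is purely illustrative.

import re

# Hypothetical extension pattern: group 1 is everything before the final dot,
# group 2 is the extension itself.
extCompile = re.compile(r'(.+)\.(pdf|docx?)$', re.IGNORECASE)

match = extCompile.search('https://example.com/reports/annual.pdf')
if match:
    base, ext = match.groups(1)
    print('%s.%s' % (base, ext))  # prints the reconstructed url unchanged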
Code example #3
File: fileDownloader.py Project: odeke-em/crawlers
def getFiles(url,
             extCompile,
             recursionDepth=5,
             httpDomain=utils.HTTPS_DOMAIN,
             baseDir=None):
    # Args: url => the page to start crawling from
    #       extCompile => a compiled regex of the file extension(s) to match
    #       recursionDepth => an integer that indicates how deep to scrape
    #                Note: a negative recursion depth indicates that you want
    #                to keep crawling as far as the program can go
    #       baseDir => directory under which downloads are saved; defaults to
    #                  the current working directory
    if not recursionDepth:
        return
    elif not hasattr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n",
            sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = list(
            map(
                lambda s: utils.repeatHttpHeadCompile.sub(
                    utils.HTTP_HEAD_REGEX, s), urls))

        if not urls:
            capableUrls = utils.urlCapableCompile.findall(decodedData)
            trimmedHeadUrl = url.strip('/')

            for capableUrl in capableUrls:
                trimmed = capableUrl.strip('/')
                fixedUrl = '%s/%s' % (trimmedHeadUrl, trimmed)
                urls.append(fixedUrl)

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                g = regSearch.groups(1)
                u = '%s.%s' % (g[0], g[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        if not baseDir:
            baseDir = os.path.abspath(".")

        fullUrlToMemPath = os.path.join(baseDir,
                                        utils.pathCleanseCompile.sub('_', url))
        utils.createDir(fullUrlToMemPath)

        # Time to download all the matched files
        dlResults = []
        for eachUrl in matchedFileUrls:
            dlResults.append(dlData(eachUrl, fullUrlToMemPath))

        resultsList = list(filter(lambda val: val, dlResults))

        # Report to user successful saves
        downloadCount = len(resultsList)
        # print(downloadCount)
        if not downloadCount:
            # Mark this url as a bad one/miss and for the sake of crawling
            # not hitting dead ends, we won't crawl it anymore unless otherwise specified
            urlHash = getHash(url)
            urlScoreTuple = missesDict.get(urlHash, None)
            badCrawlCount = 0

            if urlScoreTuple and len(urlScoreTuple) != 2:
                badCrawlCount = (
                    urlScoreTuple[1]) + 1  # Increment the bad crawl score

            missesDict[urlHash] = (url, badCrawlCount, time.time())
            return  # Cut this journey short
        else:
            utils.streamPrintFlush(
                "For url %s downloaded %d files\n" % (url, downloadCount),
                sys.stderr)

        recursionDepth -= 1
        for eachUrl in plainUrls:
            getFiles(eachUrl,
                     extCompile,
                     recursionDepth,
                     baseDir=fullUrlToMemPath)
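
A minimal sketch of how getFiles might be driven; the import path, the extension pattern, and the target directory are assumptions for illustration only, not taken from the project.

import re

from fileDownloader import getFiles  # assumes fileDownloader.py is importable as-is

# Hypothetical extension pattern with the (base, extension) capture groups getFiles expects.
imageCompile = re.compile(r'(.+)\.(jpg|png)$', re.IGNORECASE)

# With no scheme on the url, getFiles prepends utils.HTTPS_DOMAIN before fetching.
getFiles('example.com/gallery', imageCompile, recursionDepth=2, baseDir='/tmp/crawl')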