Example #1
def extractFileUrls(url,
                    extCompile,
                    router,
                    depth=5,
                    httpDomain=utils.HTTPS_DOMAIN):
    # Args: url => the page URL to crawl
    #       extCompile => a compiled pattern object of the extension(s) to match
    #       depth => an integer that indicates how deep to scrape
    #                Note: a negative recursion depth indicates that you want
    #                to keep crawling as far as the program can go
    if not depth:
        return
    elif not restDriver.isCallableAttr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a regex compiled object/result as arg 'extCompile'\n",
            sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    if not robotParser.canVisit(url):
        print('Cannot visit %s due to /robots.txt rules' % (url))
        return

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = [
            utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s)
            for s in urls
        ]

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                rGroup = regSearch.groups(1)
                u = '%s.%s' % (rGroup[0], rGroup[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        uniqFileUrls = set(matchedFileUrls)
        dlResults = [
            pushUpJob(eachUrl, router, url) for eachUrl in uniqFileUrls
        ]
        resultsList = [val for val in dlResults if val]

        depth -= 1
        for eachUrl in plainUrls:
            extractFileUrls(eachUrl, extCompile, router, depth)
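
A minimal, self-contained sketch of the extension-matching step above, using only the standard re module. The two-group pattern here is an assumption made for illustration; in the surrounding project the compiled pattern normally comes from utils.extensionify / utils.regexCompile, which are not shown in these examples.

import re

# Hypothetical stand-in for the 'extCompile' argument: group 1 captures the
# path portion, group 2 the extension, mirroring the groups(1) indexing above
extCompile = re.compile(r'(\S+)\.(png|jpg|pdf)', re.IGNORECASE)

u = 'https://example.org/images/logo.png?cache=1'
regSearch = extCompile.search(u)
if regSearch:
    g = regSearch.groups(1)      # ('https://example.org/images/logo', 'png')
    u = '%s.%s' % (g[0], g[1])   # query string and other trailing junk dropped

print(u)  # https://example.org/images/logo.png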
Example #2
def extractFileUrls(url, extCompile, router, depth=5, httpDomain=utils.HTTPS_DOMAIN):
  # Args: url => the page URL to crawl
  #       extCompile => a compiled pattern object of the extension(s) to match
  #       depth => an integer that indicates how deep to scrape
  #                Note: a negative recursion depth indicates that you want
  #                to keep crawling as far as the program can go
  if not depth:
    return
  elif not restDriver.isCallableAttr(extCompile, 'search'):
    utils.streamPrintFlush(
     "Expecting a regex compiled object/result as arg 'extCompile'\n", sys.stderr
    )
    return

  if not utils.httpHeadCompile.search(url): 
    url = "%s%s"%(httpDomain, url)

  if not robotParser.canVisit(url):
    print('Cannot visit %s due to /robots.txt rules'%(url))
    return
  
  decodedData = utils.dlAndDecode(url)
  if not decodedData:
    return
  else:
    urls = utils.urlCompile.findall(decodedData)
    urls = [utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s) for s in urls]

    plainUrls = []
    matchedFileUrls = []

    for u in urls:
        pathSelector = plainUrls
        regSearch = extCompile.search(u)
        if regSearch:
            rGroup = regSearch.groups(1)
            u = '%s.%s'%(rGroup[0], rGroup[1])
            pathSelector = matchedFileUrls

        pathSelector.append(u)

    uniqFileUrls = set(matchedFileUrls)
    dlResults = [pushUpJob(eachUrl, router, url) for eachUrl in uniqFileUrls]
    resultsList = [val for val in dlResults if val]

    depth -= 1
    for eachUrl in plainUrls:
      extractFileUrls(eachUrl, extCompile, router, depth)
Example #3
def getFiles(
  url, extCompile, recursionDepth=5, httpDomain=HTTPS_DOMAIN, baseDir=None):
  # Args: url => the page URL to crawl
  #       extCompile => a compiled pattern object of the extension(s) to match
  #       recursionDepth => an integer that indicates how deep to scrape
  #                         Note: a negative recursion depth indicates that you want
  #                         to keep crawling as far as the program can go
  if not recursionDepth: return
  if not hasattr(extCompile, 'search'):
    streamPrintFlush(
     "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n"
    , sys.stderr)
    return

  if not re.search(HTTP_HEAD_REGEX,url): 
    url = "%s%s"%(httpDomain, url)
    print("URL ", url)

  try:
    data = urlGetter.urlopen(url)  # , timeout=DEFAULT_TIMEOUT)
    if pyVersion >= 3:
      decodedData = data.read().decode()
    else:
      decodedData = data.read()
  except Exception:
    pass
  else:
    urls = re.findall(URL_REGEX, decodedData, re.MULTILINE)
    urls = list(map(lambda s : re.sub(REPEAT_HTTP,HTTP_HEAD_REGEX,s), urls))

    # Use lists (not lazy filter objects) so the membership test and the later
    # map over matchedFileUrls see every result under Python 3 as well
    matchedFileUrls = [s for s in urls if extCompile.search(s)]
    plainUrls = [s for s in urls if s not in matchedFileUrls]
    # print(matchedFileUrls)
    # First create that directory
    if not baseDir:
      baseDir = os.path.abspath(".")
    cleanedPath = re.sub('[/:]+','_', url)
    fullUrlToMemPath = os.path.join(baseDir, cleanedPath)
    # print("FULLURL to Mem ", fullUrlToMemPath)
    createDir(fullUrlToMemPath)

    #Time to download all the matched files 
    dlResults = map(
       lambda eachUrl: dlData(eachUrl, fullUrlToMemPath), matchedFileUrls
    )
    resultsList = list(filter(lambda val: val, dlResults))
    #Report to user successful saves
    downloadCount = len(resultsList)
    # print(downloadCount) 
    if not downloadCount:
      # Mark this url as a bad one/miss and for the sake of crawling 
      # not hitting dead ends, we won't crawl it anymore unless otherwise specified
      urlHash = getHash(url)
      urlScoreTuple = missesDict.get(urlHash, None)
      badCrawlCount = 0

      if urlScoreTuple and len(urlScoreTuple) != 2: 
         badCrawlCount = (urlScoreTuple[1]) + 1 # Increment the bad crawl score

      missesDict[urlHash] = (url, badCrawlCount, time.time())
      return # Cut this journey short
    else:
      streamPrintFlush(
       "For url %s downloaded %d files\n"%(url, downloadCount), sys.stderr
      )

    recursionDepth -= 1
    for eachUrl in plainUrls:
      getFiles(eachUrl, extCompile, recursionDepth, baseDir=fullUrlToMemPath)
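
For reference, a tiny standalone illustration of how getFiles derives the per-URL storage directory above; only re and os.path are used, and the url / baseDir values are arbitrary examples.

import os
import re

url = "https://example.org/images/"
baseDir = os.path.abspath(".")

# '[/:]+' collapses '://' and every '/' into single underscores, turning the
# whole URL into one flat, filesystem-friendly directory name
cleanedPath = re.sub('[/:]+', '_', url)               # 'https_example.org_images_'
fullUrlToMemPath = os.path.join(baseDir, cleanedPath)
print(fullUrlToMemPath)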
Example #4
def createDir(dirPath):
  # print("CreateDir:: ", dirPath)
  if dirPath and not os.path.exists(dirPath):
     os.mkdir(dirPath)
     if DEBUG: streamPrintFlush("Done creating %s\n"%(dirPath), sys.stderr)
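
createDir above only creates a single directory level and assumes the parent already exists. Purely as a sketch (not the project's code, and assuming Python 3.2+), a variant that tolerates nested paths and pre-existing directories could look like this:

import os

def createDirSafe(dirPath):
    # Hypothetical variant: also creates missing intermediate directories and
    # is a no-op if the path already exists
    if dirPath:
        os.makedirs(dirPath, exist_ok=True)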
Example #5
def main():
  while True:
    try:
      streamPrintFlush(
        "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr
      )
      lineIn, eofState = readFromStream()
      if eofState: break

      baseUrl = lineIn.strip("\n")

      streamPrintFlush(
       "Your extensions separated by '|' eg png|html: ", sys.stderr
      )

      lineIn, eofState = readFromStream()
      if eofState: break
      extensions = lineIn.strip("\n")
      
      streamPrintFlush(
        "\nRecursion Depth(a negative depth indicates you want script to go as far): "
      ,sys.stderr)

      lineIn, eofState = readFromStream()
      if eofState: break
      
      rDepth = int(lineIn.strip("\n"))

      formedRegex = r"\.(%s)" % (extensions) if extensions else DEFAULT_EXTENSIONS_REGEX

      extCompile = regexCompile(formedRegex)

    except ValueError:
      streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
    except KeyboardInterrupt:
      streamPrintFlush("Ctrl-C applied. Exiting now..\n",sys.stderr)
      break
    except Exception:
      continue
    else:
      if not baseUrl:
        continue

      if extCompile:
        getFiles(baseUrl, extCompile, rDepth)

  streamPrintFlush("Bye..\n",sys.stderr)
Example #6
def dlData(url, dirStoragePath=None):
 #Args: A url
 #Download the data from the url and write it to memory
 #Returns: True iff the data was successfully written, else: False
 if not (url and re.search(HTTP_HEAD_REGEX,url)): return None

 # Let's check the cache first
 # Computing the url's hash
 
 urlStrHash = getHash(url)
 if not urlStrHash:
   streamPrintFlush("Cannot hash the provided URL")
   return
  
 isMiss = missesDict.get(urlStrHash, None) 
 if isMiss:
    if DEBUG: streamPrintFlush("Uncrawlable link: %s"%(url))
    return None

 alreadyIn = hitsDict.get(urlStrHash, None)
 if alreadyIn:
   if DEBUG: streamPrintFlush("\033[32mAlready downloaded %s\033[00m\n"%(url))
   return None

 try: data = urlGetter.urlopen(url)
 except Exception: return False
 else:
   fileSearch = re.findall(END_NAME, url)
   if not fileSearch : return False

   fileName = fileSearch[0]
   fnameExtensionSeparate = re.findall(r"(.*)\.(\w+)$", fileName, re.UNICODE)
   if not fnameExtensionSeparate: return False # Raise error possibly
   proposedName, extension = fnameExtensionSeparate[0]
    
   # availableName = fileNameTrie.getSuggestion(proposedName)
   # if not availableName:
   #    print(
   #      "Sorry no alternate suggestions for %s could be proposed"%(fileName)
   #    )
   #    return False

   fileName = "%s.%s"%(proposedName, extension)
   # fileNameTrie.addSeq(availableName, 0, len(availableName)) # Mark this entry as taken
   if dirStoragePath and os.path.exists(dirStoragePath):
      fileName = os.path.join(dirStoragePath, fileName)

   streamPrintFlush("From url %s\n"%(url), sys.stderr)

   try:
     f = open(fileName,'wb')
     f.write(data.read())
     f.close()
   except Exception:
     streamPrintFlush("Failed to write %s to memory\n"%(fileName), sys.stderr) 
     return False
   else:
     streamPrintFlush("Wrote %s to memory\n"%(fileName), sys.stderr)
     
     # Let's now cache that url and mark its content as already visited
     # where the urlString hash is the key and downloaded urls are the values
     markedContent = hitsDict.get(urlStrHash, [])
     markedContent.append(url)
     hitsDict[urlStrHash] = markedContent

     return True
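
getHash, hitsDict and missesDict belong to the surrounding module and are not shown here. As an illustration only, a hash helper like the one assumed above could be as simple as a hex digest of the URL string (the real implementation may differ):

import hashlib

# missesDict/hitsDict are plain dicts keyed by this hash in the code above
def getHash(url):
    # Hypothetical sketch of the unshown helper
    if not url:
        return None
    return hashlib.md5(url.encode('utf-8')).hexdigest()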
Example #7
def main():
  args, options = restDriver.cliParser()

  # Route manager
  router = Router([
      'http://192.168.1.117:8000', 'http://192.168.1.110:8008', 'http://127.0.0.1:8009'
  ])
  while True:
    try:
      utils.streamPrintFlush(
        "\nTarget Url: eg [www.example.org or http://www.h.com] ", sys.stderr
      )
      lineIn, eofState = readFromStream()
      if eofState: break

      if lineIn:
        baseUrl = lineIn.strip("\n")

      else:
        continue

      utils.streamPrintFlush(
       "Your extensions separated by '|' eg png|html: ", sys.stderr
      )

      lineIn, eofState = readFromStream()
      if eofState: break
      extensions = lineIn.strip("\n")
      
      utils.streamPrintFlush(
        "\nRecursion Depth(a negative depth indicates you want script to go as far): ", sys.stderr
      )

      lineIn, eofState = readFromStream()
      if eofState: break

      if lineIn:
        rDepth = int(lineIn.strip("\n") or 1)
      else:
        rDepth = 1

      formedRegex = utils.extensionify(extensions or utils.DEFAULT_EXTENSIONS_REGEX)
      extCompile = utils.regexCompile(formedRegex)

    except ValueError:
      utils.streamPrintFlush("Recursion depth must be an integer\n", sys.stderr)
    except KeyboardInterrupt:
      utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n", sys.stderr)
      break
    except Exception:
      # TODO: [Informative exceptions]:
      #       + Recover the traceback via sys.exc_info() instead, since the
      #         'except Exception as e' syntax is invalid for Python <= 2.5
      print('Generic exception encountered')
      continue
    else:
      if not baseUrl:
        continue

      if extCompile:
        extractFileUrls(baseUrl, extCompile, router, rDepth)

  utils.streamPrintFlush("Bye..\n",sys.stderr)
Example #8
def main():
    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState: break

            baseUrl = lineIn.strip("\n")

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)

            lineIn, eofState = readFromStream()
            if eofState: break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)

            lineIn, eofState = readFromStream()
            if eofState: break

            rDepth = int(lineIn.strip("\n"))

            formedRegex = utils.extensionify(extensions
                                             or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)

        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n",
                                   sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n",
                                   sys.stderr)
            break
        except Exception:
            continue
        else:
            if not baseUrl:
                continue

            if extCompile:
                getFiles(baseUrl, extCompile, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)
Example #9
def getFiles(url,
             extCompile,
             recursionDepth=5,
             httpDomain=utils.HTTPS_DOMAIN,
             baseDir=None):
    # Args: url => the page URL to crawl
    #       extCompile => a compiled pattern object of the extension(s) to match
    #       recursionDepth => an integer that indicates how deep to scrape
    #                         Note: a negative recursion depth indicates that you want
    #                         to keep crawling as far as the program can go
    if not recursionDepth:
        return
    elif not hasattr(extCompile, 'search'):
        utils.streamPrintFlush(
            "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n",
            sys.stderr)
        return

    if not utils.httpHeadCompile.search(url):
        url = "%s%s" % (httpDomain, url)

    decodedData = utils.dlAndDecode(url)
    if not decodedData:
        return
    else:
        urls = utils.urlCompile.findall(decodedData)
        urls = list(
            map(
                lambda s: utils.repeatHttpHeadCompile.sub(
                    utils.HTTP_HEAD_REGEX, s), urls))

        if not urls:
            capableUrls = utils.urlCapableCompile.findall(decodedData)
            trimmedHeadUrl = url.strip('/')

            for capableUrl in capableUrls:
                trimmed = capableUrl.strip('/')
                fixedUrl = '%s/%s' % (trimmedHeadUrl, trimmed)
                urls.append(fixedUrl)

        plainUrls = []
        matchedFileUrls = []

        for u in urls:
            pathSelector = plainUrls
            regSearch = extCompile.search(u)
            if regSearch:
                g = regSearch.groups(1)
                u = '%s.%s' % (g[0], g[1])
                pathSelector = matchedFileUrls

            pathSelector.append(u)

        if not baseDir:
            baseDir = os.path.abspath(".")

        fullUrlToMemPath = os.path.join(baseDir,
                                        utils.pathCleanseCompile.sub('_', url))
        utils.createDir(fullUrlToMemPath)

        # Time to download all the matched files
        dlResults = []
        for eachUrl in matchedFileUrls:
            dlResults.append(dlData(eachUrl, fullUrlToMemPath))

        resultsList = list(filter(lambda val: val, dlResults))

        # Report to user successful saves
        downloadCount = len(resultsList)
        # print(downloadCount)
        if not downloadCount:
            # Mark this url as a bad one/miss and for the sake of crawling
            # not hitting dead ends, we won't crawl it anymore unless otherwise specified
            urlHash = getHash(url)
            urlScoreTuple = missesDict.get(urlHash, None)
            badCrawlCount = 0

            if urlScoreTuple and len(urlScoreTuple) != 2:
                badCrawlCount = (
                    urlScoreTuple[1]) + 1  # Increment the bad crawl score

            missesDict[urlHash] = (url, badCrawlCount, time.time())
            return  # Cut this journey short
        else:
            utils.streamPrintFlush(
                "For url %s downloaded %d files\n" % (url, downloadCount),
                sys.stderr)

        recursionDepth -= 1
        for eachUrl in plainUrls:
            getFiles(eachUrl,
                     extCompile,
                     recursionDepth,
                     baseDir=fullUrlToMemPath)
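
The fallback branch above (when utils.urlCompile finds nothing) glues relative links onto the page URL. A self-contained trace of that joining logic, with example values standing in for what utils.urlCapableCompile would extract:

url = 'https://example.org/docs/'
capableUrls = ['/papers/intro.pdf', 'img/fig1.png']   # example relative links

trimmedHeadUrl = url.strip('/')    # 'https://example.org/docs'
urls = []
for capableUrl in capableUrls:
    trimmed = capableUrl.strip('/')
    urls.append('%s/%s' % (trimmedHeadUrl, trimmed))

print(urls)
# ['https://example.org/docs/papers/intro.pdf', 'https://example.org/docs/img/fig1.png']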
Example #10
def dlData(url, dirStoragePath=None):
    # Args: A url
    # Download the data from the url and write it to memory
    # Returns: True iff the data was successfully written, else: False
    if not (url and utils.httpHeadCompile.search(url)):
        return None

    urlStrHash = getHash(url)
    if not urlStrHash:
        utils.streamPrintFlush("Cannot hash the provided URL")
        return

    isMiss = missesDict.get(urlStrHash, None)
    if isMiss:
        if DEBUG:
            utils.streamPrintFlush("Uncrawlable link: %s" % (url))
        return None

    alreadyIn = hitsDict.get(urlStrHash, None)
    if alreadyIn:
        if DEBUG:
            utils.streamPrintFlush("\033[32mAlready downloaded %s\033[00m\n" %
                                   (url))
        return None

    try:
        data = utils.urlGetter.urlopen(url)
    except Exception:
        return False
    else:
        fileSearch = utils.endNameCompile.findall(url)
        if not fileSearch:
            return False

        fileName = fileSearch[0]
        fnameExtensionSeparate = utils.fnameCompile.findall(fileName)
        if not fnameExtensionSeparate:
            return False  # Raise error possibly

        proposedName, extension = fnameExtensionSeparate[0]

        # availableName = fileNameTrie.getSuggestion(proposedName)
        # if not availableName:
        #    print(
        #      "Sorry no alternate suggestions for %s could be proposed"%(fileName)
        #    )
        #    return False

        fileName = "%s.%s" % (proposedName, extension)
        # fileNameTrie.addSeq(availableName, 0, len(availableName)) # Mark this entry as taken

        if dirStoragePath and os.path.exists(dirStoragePath):
            fileName = os.path.join(dirStoragePath, fileName)

        utils.streamPrintFlush("From url %s\n" % (url), sys.stderr)

        try:
            f = open(fileName, 'wb')
            f.write(data.read())
            f.close()
        except Exception:
            utils.streamPrintFlush(
                "Failed to write %s to memory\n" % (fileName), sys.stderr)
            return False
        else:
            utils.streamPrintFlush("Wrote %s to memory\n" % (fileName),
                                   sys.stderr)

            # Let's now cache that url and mark its content as already visited
            # where the urlString hash is the key and downloaded urls are the values
            markedContent = hitsDict.get(urlStrHash, [])
            markedContent.append(url)
            hitsDict[urlStrHash] = markedContent

            return True
Example #11
def getFiles(url, extCompile, recursionDepth=5, httpDomain=utils.HTTPS_DOMAIN, baseDir=None):
  # Args: url => the page URL to crawl
  #       extCompile => a compiled pattern object of the extension(s) to match
  #       recursionDepth => an integer that indicates how deep to scrape
  #                         Note: a negative recursion depth indicates that you want
  #                         to keep crawling as far as the program can go
  if not recursionDepth:
    return
  elif not hasattr(extCompile, 'search'):
    utils.streamPrintFlush(
     "Expecting a pattern object/result of re.compile(..) for arg 'extCompile'\n"
    , sys.stderr)
    return

  if not utils.httpHeadCompile.search(url): 
    url = "%s%s"%(httpDomain, url)

  decodedData = utils.dlAndDecode(url)
  if not decodedData:
    return
  else:
    urls = utils.urlCompile.findall(decodedData)
    urls = list(
        map(lambda s: utils.repeatHttpHeadCompile.sub(utils.HTTP_HEAD_REGEX, s), urls)
    )

    if not urls:
       capableUrls = utils.urlCapableCompile.findall(decodedData)
       trimmedHeadUrl = url.strip('/')

       for capableUrl in capableUrls:
          trimmed = capableUrl.strip('/')
          fixedUrl = '%s/%s'%(trimmedHeadUrl, trimmed)
          urls.append(fixedUrl)

    plainUrls = []
    matchedFileUrls = []

    for u in urls:
        pathSelector = plainUrls
        regSearch = extCompile.search(u)
        if regSearch:
            g = regSearch.groups(1)
            u = '%s.%s'%(g[0], g[1])
            pathSelector = matchedFileUrls

        pathSelector.append(u)

    if not baseDir:
      baseDir = os.path.abspath(".")

    fullUrlToMemPath = os.path.join(baseDir, utils.pathCleanseCompile.sub('_', url))
    utils.createDir(fullUrlToMemPath)

    # Time to download all the matched files 
    dlResults = []
    for eachUrl in matchedFileUrls:
        dlResults.append(dlData(eachUrl, fullUrlToMemPath))

    resultsList = list(filter(lambda val: val, dlResults))

    # Report to user successful saves
    downloadCount = len(resultsList)
    # print(downloadCount) 
    if not downloadCount:
      # Mark this url as a bad one/miss and for the sake of crawling 
      # not hitting dead ends, we won't crawl it anymore unless otherwise specified
      urlHash = getHash(url)
      urlScoreTuple = missesDict.get(urlHash, None)
      badCrawlCount = 0

      if urlScoreTuple and len(urlScoreTuple) != 2: 
         badCrawlCount = (urlScoreTuple[1]) + 1 # Increment the bad crawl score

      missesDict[urlHash] = (url, badCrawlCount, time.time())
      return # Cut this journey short
    else:
      utils.streamPrintFlush(
       "For url %s downloaded %d files\n"%(url, downloadCount), sys.stderr
      )

    recursionDepth -= 1
    for eachUrl in plainUrls:
      getFiles(eachUrl, extCompile, recursionDepth, baseDir=fullUrlToMemPath)
Example #12
def main():
    args, options = restDriver.cliParser()

    # Route manager
    router = Router([
        'http://192.168.1.117:8000', 'http://192.168.1.110:8008',
        'http://127.0.0.1:8009'
    ])
    while True:
        try:
            utils.streamPrintFlush(
                "\nTarget Url: eg [www.example.org or http://www.h.com] ",
                sys.stderr)
            lineIn, eofState = readFromStream()
            if eofState: break

            if lineIn:
                baseUrl = lineIn.strip("\n")

            else:
                continue

            utils.streamPrintFlush(
                "Your extensions separated by '|' eg png|html: ", sys.stderr)

            lineIn, eofState = readFromStream()
            if eofState: break
            extensions = lineIn.strip("\n")

            utils.streamPrintFlush(
                "\nRecursion Depth(a negative depth indicates you want script to go as far): ",
                sys.stderr)

            lineIn, eofState = readFromStream()
            if eofState: break

            if lineIn:
                rDepth = int(lineIn.strip("\n") or 1)
            else:
                rDepth = 1

            formedRegex = utils.extensionify(extensions
                                             or utils.DEFAULT_EXTENSIONS_REGEX)
            extCompile = utils.regexCompile(formedRegex)

        except ValueError:
            utils.streamPrintFlush("Recursion depth must be an integer\n",
                                   sys.stderr)
        except KeyboardInterrupt:
            utils.streamPrintFlush("Ctrl-C applied. Exiting now..\n",
                                   sys.stderr)
            break
        except Exception:
            # TODO: [Informative exceptions]:
            #       + Recover the traceback via sys.exc_info() instead, since the
            #         'except Exception as e' syntax is invalid for Python <= 2.5
            print('Generic exception encountered')
            continue
        else:
            if not baseUrl:
                continue

            if extCompile:
                extractFileUrls(baseUrl, extCompile, router, rDepth)

    utils.streamPrintFlush("Bye..\n", sys.stderr)