Exemple #1
0
def spider(init, max=-1, ignore_qs=False, post_func=None,
           excluded_func=None, hosts=None):
  """
  Spider a request by following some links.

  init    - The initial request(s)
  max       - The maximum of request to execute
  post_func - A hook to be executed after each new page fetched
  hosts     - A lists of authorised hosts to spider on. By default
              only the hostname of r_init is allowed.
  excluded_func - A predicate that must indicates if a Request should
                  be executed.
  """
  nb = 0
  checked = []
  if isinstance(init, Request):
    q = deque([init, ])
    hs = [ init.hostname, ]
  elif isinstance(init, RequestSet):
    q = deque(init)
    hs = list(set(init.extract("hostname")))
  else:
    raise TypeError("init must be a Request or a RequestSet")
  if hosts:
    hs += hosts
  try:
    while nb != max and q:
      to_add = []
      r = q.popleft()
      print str(len(checked)) + "/" + str(len(q)),
      clear_line()
      if not r.response:
        r()
      if r.response.content_type:
        if re.match(r'text/html', r.response.content_type):
          to_add += _follow_redirect(r)
          to_add += _get_links(r)
        else:
          print "\nIgnoring", r.response.content_type
      checked.append(r)
      if post_func:
        post_func(r)
      for nr in to_add:
        if nr.hostname not in hs:
          continue
        if excluded_func and excluded_func(nr):
          continue
        if not ignore_qs and any(nr == rc for rc in checked + list(q)):
          continue
        if ignore_qs and any(nr.similar(rc) for rc in checked + list(q)):
          continue
        q.append(nr)
      nb += 1
  except KeyboardInterrupt:
    print str(len(checked)) + "/" + str(len(q))
  return RequestSet(checked)
Exemple #2
0
def spider(init, max=-1, ignore_qs=False, post_func=None,
           excluded_func=None, hosts=None):
  """
  Spider a request by following some links.

  init    - The initial request(s)
  max       - The maximum of request to execute
  post_func - A hook to be executed after each new page fetched
  hosts     - A lists of authorised hosts to spider on. By default
              only the hostname of r_init is allowed.
  excluded_func - A predicate that must indicates if a Request should
                  be executed.
  """
  nb = 0
  checked = []
  if isinstance(init, Request):
    q = deque([init, ])
    hs = [ init.hostname, ]
  elif isinstance(init, RequestSet):
    q = deque(init)
    hs = list(set(init.extract("hostname")))
  else:
    raise TypeError("init must be a Request or a RequestSet")
  if hosts:
    hs += hosts
  try:
    while nb != max and q:
      to_add = []
      r = q.popleft()
      print str(len(checked)) + "/" + str(len(q)),
      clear_line()
      if not r.response:
        r()
      if r.response.content_type:
        if re.match(r'text/html', r.response.content_type):
          to_add += _follow_redirect(r)
          to_add += _get_links(r)
        else:
          print "\nIgnoring", r.response.content_type
      checked.append(r)
      if post_func:
        post_func(r)
      for nr in to_add:
        if nr.hostname not in hs:
          continue
        if excluded_func and excluded_func(nr):
          continue
        if not ignore_qs and any(nr == rc for rc in checked + list(q)):
          continue
        if ignore_qs and any(nr.similar(rc) for rc in checked + list(q)):
          continue
        q.append(nr)
      nb += 1
  except KeyboardInterrupt:
    print str(len(checked)) + "/" + str(len(q))
  return RequestSet(checked)
Exemple #3
0
 def __call__(self, indices=None, stop_event=None, force=False, randomised=False, 
              verbose=1, post_func=None, post_args=[]):
   if not self.reqs:
     raise Exception("No request to process")
   hostnames = set([r.hostname for r in self.reqs])
   ports = set([r.port for r in self.reqs])
   use_ssls = set([r.use_ssl for r in self.reqs])
   if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1:
     raise Exception("Only one host per request set to run it")
   self.hostname = hostnames.pop()
   self.port = ports.pop()
   self.use_ssl = use_ssls.pop()
   if force and verbose:
     print "Clearing previous responses..."
     self.clear()
   conn = self._init_connection()
   if verbose:
     print "Running {} requests...".format(len(self.reqs)),
     clear_line()
   indices = range(len(self.reqs)) if not indices else indices
   if randomised: random.shuffle(indices)
   done = 0
   todo = len(self.reqs)
   for i in indices:
     if stop_event and stop_event.is_set():
       return
     r = self.reqs[i]
     if verbose:
       print "Running {} requests...{:.2f}%".format(todo, done * 100. / todo),
       clear_line()
     next = False
     if r.response and not force:
       todo -= 1
       next = True
     while not next:
       try:
         if verbose == 2: print repr(r)
         r(conn=conn)
         if post_func: post_func(r, *post_args)
         if verbose == 2: print repr(r.response)
         if r.response.closed:
           conn = self._init_connection()
         done += 1
         next = True
       except (socket.error, BadStatusLine):
         conn = self._init_connection()
         next = False
       if conf.delay:
         time.sleep(conf.delay)
   if verbose:
     print "Running {} requests...done.".format(len(self.reqs))
   conn.close()
Exemple #4
0
 def parallel(self, threads=4):
   stop = threading.Event()
   indices = range(len(self.reqs))
   jobs = []
   for ics in chunks(indices, threads):
     t = threading.Thread(target=self.__call__,
                      kwargs={"indices":ics, "verbose":False,
                              "stop_event":stop})
     jobs.append(t)
     t.start()
   try:
     for j in jobs:
       while j.is_alive():
         j.join(1)
         done = len(self.filter(lambda x: x.response))
         print "Running {} requests... {:.2f}%".format(len(self), done * 100. / len(self)),
         clear_line()
   except KeyboardInterrupt:
     stop.set()
   print "Running {} requests...done.".format(len(self))
Exemple #5
0
 def parallel(self, threads=4, verbose=True, **kw):
   stop = threading.Event()
   indices = range(len(self.reqs))
   jobs = []
   for ics in chunks(indices, threads):
     mkw = kw.copy()
     mkw.update({"indices":ics, "stop_event":stop, "verbose":False})
     t = threading.Thread(target=self.__call__, kwargs=mkw)
     jobs.append(t)
     t.start()
   try:
     for j in jobs:
       while j.is_alive():
         j.join(1)
         if verbose:
           done = len(self.filter(lambda x: x.response))
           print "Running {} requests... {:.2f}%".format(len(self), done * 100. / len(self)),
         clear_line()
   except KeyboardInterrupt:
     stop.set()
   if verbose:
     ## the two extra spaces in the end erase the left over "00%" from "100%"
     print "Running {} requests... done.  ".format(len(self))
Exemple #6
0
 def parallel(self, threads=4, verbose=True, **kw):
     stop = threading.Event()
     indices = range(len(self.reqs))
     jobs = []
     for ics in chunks(indices, threads):
         mkw = kw.copy()
         mkw.update({"indices": ics, "stop_event": stop, "verbose": False})
         t = threading.Thread(target=self.__call__, kwargs=mkw)
         jobs.append(t)
         t.start()
     try:
         for j in jobs:
             while j.is_alive():
                 j.join(1)
                 if verbose:
                     done = len(self.filter(lambda x: x.response))
                     print "Running {} requests... {:.2f}%".format(
                         len(self), done * 100. / len(self)),
                 clear_line()
     except KeyboardInterrupt:
         stop.set()
     if verbose:
         print "Running {} requests...done.".format(len(self))
Exemple #7
0
 def __call__(self,
              force=False,
              randomised=False,
              verbose=1,
              retry=0,
              indices=None,
              stop_event=None,
              post_func=None,
              post_args=[]):
     if not self.reqs:
         raise Exception("No request to process")
     hostnames = set([r.hostname for r in self.reqs])
     ports = set([r.port for r in self.reqs])
     use_ssls = set([r.use_ssl for r in self.reqs])
     if len(hostnames) > 1 or len(ports) > 1 or len(use_ssls) > 1:
         raise Exception("Only one host per request set to run it")
     self.hostname = hostnames.pop()
     self.port = ports.pop()
     self.use_ssl = use_ssls.pop()
     if force:
         if verbose:
             print "Clearing previous responses..."
         self.clear()
     conn = self._init_connection()
     if verbose:
         print "Running {} requests...".format(len(self.reqs)),
         clear_line()
     indices = range(len(self.reqs)) if not indices else indices
     if randomised:
         random.shuffle(indices)
     done = 0
     failed = 0
     todo = len(self.reqs)
     for i in indices:
         if stop_event and stop_event.is_set():
             return
         r = self.reqs[i]
         if verbose:
             if failed:
                 print "Running {} requests...{:.2f}% (failed: {})".format(
                     todo, done * 100. / todo, failed),
             else:
                 print "Running {} requests...{:.2f}%".format(
                     todo, done * 100. / todo),
             clear_line()
         next = False
         if r.response and not force:
             todo -= 1
             next = True
         retried = 0
         while not next:
             try:
                 if verbose == 2: print repr(r)
                 r(conn=conn)
                 if post_func: post_func(r, *post_args)
                 if verbose == 2: print repr(r.response)
                 if r.response.closed:
                     conn = self._init_connection()
                 done += 1
                 next = True
             except (socket.error, BadStatusLine):
                 conn = self._init_connection()
                 next = False
                 retried += 1
                 if retried > retry:
                     failed += 1
                     next = True
             if conf.delay:
                 time.sleep(conf.delay)
     if verbose:
         print "Running {} requests...done.".format(len(self.reqs))
     conn.close()