Example #1
    def _http_request(self, url, timeout=10):
        try:
            if not url:
                url = '/'
            conn_func = httplib.HTTPSConnection if self.schema == 'https' else httplib.HTTPConnection
            conn = conn_func(self.host, timeout=timeout)

            conn.request(method='GET', url=url,
                         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                                '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.1',
                                  'Range': 'bytes=0-10240',
                                  'Connection': 'Close'})
            resp = conn.getresponse()
            resp_headers = dict(resp.getheaders())
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 307200:    # 1024 * 300
                html_doc = decode_response_text(resp.read())
            else:
                html_doc = ''
            conn.close()
            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''
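
Every example on this page funnels raw response bytes through decode_response_text, which the page itself never shows. Below is a minimal sketch of what such a helper plausibly does; the name comes from the snippets, but the charset parameter and the fallback order are assumptions, not the project's actual code:

def decode_response_text(data, charset=None):
    # Hypothetical reconstruction: best-effort decode of HTTP response bytes.
    # A real implementation might also sniff <meta charset=...> from the body.
    if not isinstance(data, bytes):
        return data                              # already text
    for enc in (charset, 'utf-8', 'gbk'):        # explicit charset first, then guesses
        if not enc:
            continue
        try:
            return data.decode(enc)
        except (UnicodeDecodeError, LookupError):
            continue
    return data.decode('utf-8', 'replace')       # lossy last resort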
Example #2
 def _http_request(self, url, timeout=20):
     try:
         if not url:
             url = '/'
         url = self.base_url + url
         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.2',
                    'Range': 'bytes=0-10240',
                    'Connection': 'keep-alive'}
         resp = self.session.get(url,
                                 headers=headers,
                                 timeout=(3.0, timeout))
         resp_headers = resp.headers
         status = resp.status_code
         if resp_headers.get('content-type', '').find('text') >= 0 \
                 or resp_headers.get('content-type', '').find('html') >= 0 \
                 or int(resp_headers.get('content-length', '0')) <= 10240:
             html_doc = decode_response_text(resp.content)
         else:
             html_doc = ''
         return status, resp_headers, html_doc
     except Exception:
         return -1, {}, ''
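
Examples #2 and #10 drive the probe through a self.session that is created elsewhere. A plausible construction, assuming a pooled requests Session is wanted to back the 'Connection: keep-alive' header (make_session and its parameters are illustrative, not from the project):

import requests
from requests.adapters import HTTPAdapter

def make_session():
    # Keep-alive Session with connection pooling and a couple of
    # automatic retries per host.
    session = requests.Session()
    adapter = HTTPAdapter(pool_maxsize=10, max_retries=2)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session

Note that timeout=(3.0, timeout) in the snippet is requests' (connect, read) split: give up after 3 seconds if the TCP connect stalls, but allow the full read timeout for the body.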
Example #3
    def crawl_index(self, path):
        try:
            status, headers, html_doc = self._http_request(path)
            if status != 200:
                try:
                    html_doc = decode_response_text(
                        urllib2.urlopen(self.url).read())
                except Exception as e:
                    pass
            soup = BeautifulSoup(html_doc, "html.parser")
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                url, depth = self._cal_depth(url)
                if depth <= self.max_depth:
                    self._enqueue(url)
            if self.find_text(html_doc):
                self.results['/'] = []
                m = re.search('<title>(.*?)</title>', html_doc)
                title = m.group(1) if m else ''
                _ = {
                    'status': status,
                    'url': '%s%s' % (self.base_url, path),
                    'title': title
                }
                if _ not in self.results['/']:
                    self.results['/'].append(_)

        except Exception as e:
            logging.error('[crawl_index Exception] %s' % str(e))
            traceback.print_exc()
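
crawl_index leans on two helpers that never appear on this page, _cal_depth and _enqueue. A rough, hypothetical sketch of _cal_depth, assuming depth is the number of path segments of a same-site link (the real BBScan helper also filters off-site URLs):

try:
    from urlparse import urlparse        # Python 2, like the snippets above
except ImportError:
    from urllib.parse import urlparse    # Python 3

def _cal_depth(self, url):
    # Hypothetical: normalize an href to its directory part and report its
    # depth; non-navigable links get an effectively infinite depth so the
    # `depth <= self.max_depth` check above never enqueues them.
    if url.startswith(('javascript:', 'mailto:', '#')):
        return '', 10000
    path = urlparse(url).path or '/'
    url = path[:path.rfind('/') + 1] or '/'   # keep the directory, drop the file name
    return url, url.count('/')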
Example #4
    def _http_request(self, url, timeout=20):
        conn = None
        try:
            if not url:
                url = '/'

            conn_func = httplib.HTTPSConnection if self.schema == 'https' else httplib.HTTPConnection
            conn = conn_func(self.host, timeout=timeout)

            conn.request(method='GET', url=url,
                         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                                '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.3',
                                  'Range': 'bytes=0-10240',
                                  'Connection': 'Close'})
            resp = conn.getresponse()
            resp_headers = dict(resp.getheaders())
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 20480:    # 1024 * 20
                html_doc = decode_response_text(resp.read())
            else:
                html_doc = ''
            try:
                # SO_LINGER(on, 0s): close with RST instead of FIN so the
                # scanner does not accumulate sockets stuck in TIME_WAIT.
                if conn.sock:
                    conn.sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
            except Exception:
                pass
            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''
        finally:
            if conn:
                conn.close()
Example #5
 def crawl_index(self, path):
     try:
         status, headers, html_doc = self._http_request(path)
         if status != 200:
             try:
                 html_doc = decode_response_text(urllib2.urlopen(self.url).read())
             except Exception as e:
                 pass
         soup = BeautifulSoup(html_doc, "html.parser")
         links = soup.find_all('a')
         for l in links:
             url = l.get('href', '').strip()
             url, depth = self._cal_depth(url)
             if depth <= self.max_depth:
                 self._enqueue(url)
     except Exception as e:
         logging.error('[crawl_index Exception] %s' % str(e))
         traceback.print_exc()
Example #6
 def crawl_index(self, path):
     try:
         status, headers, html_doc = self._http_request(path)
         if status != 200:
             try:
                 html_doc = decode_response_text(
                     urllib2.urlopen(self.url).read())
             except Exception as e:
                 pass
         soup = BeautifulSoup(html_doc, "html.parser")
         links = soup.find_all('a')
         for l in links:
             url = l.get('href', '').strip()
             url, depth = self._cal_depth(url)
             if depth <= self.max_depth:
                 self._enqueue(url)
     except Exception as e:
         logging.error('[crawl_index Exception] %s' % str(e))
         traceback.print_exc()
Example #7
    def http_request(self, url, headers=config.default_headers, timeout=20):
        try:
            if not url:
                url = '/'
            if not self.conn_pool:
                return -1, {}, ''
            if self.args.debug:
                self.print_msg('--> %s' % self.base_url + url)
            resp = self.conn_pool.urlopen('GET',
                                          self.base_url + url,
                                          headers=headers,
                                          assert_same_host=False,
                                          redirect=False,
                                          timeout=timeout,
                                          retries=3)
            if resp.headers.get('content-type', '').find('text') >= 0 \
                    or resp.headers.get('content-type', '').find('html') >= 0 \
                    or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            if resp.status == 502:  # after repeated 502 responses, give up on this site
                self.status_502_count += 1
                if self.status_502_count > 5:
                    self.url_queue.queue.clear()
                    try:
                        if self.conn_pool:
                            self.conn_pool.close()
                    except Exception as e:
                        pass
                    self.conn_pool = None
                    # self.print_msg('Website 502: %s' % self.base_url)

            return resp.status, resp.headers, html_doc
        except (urllib3.exceptions.MaxRetryError, TypeError):
            return -1, {}, ''
        except Exception as e:
            self.print_msg(str(e))
            return -1, {}, ''
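
The self.conn_pool that this variant (and Examples #8 and #9) call urlopen() on is built elsewhere; a plausible construction with urllib3 (make_conn_pool and maxsize are illustrative assumptions):

import urllib3

def make_conn_pool(base_url):
    # One keep-alive connection pool bound to the target host;
    # pool.urlopen('GET', url, ...) is what http_request() above calls.
    return urllib3.connection_from_url(base_url, maxsize=10)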
Example #8
    def request_index(self, path):
        try:
            status, headers, html_doc = self._http_request(path)
            if status != 200:
                try:
                    html_doc = self.conn_pool.urlopen(
                        'GET',
                        self.url,
                        headers=headers_without_range,
                        retries=1).data
                    html_doc = decode_response_text(html_doc)
                except Exception as e:
                    pass
            self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc  # save index content
            soup = BeautifulSoup(self.index_html_doc, "html.parser")
            for link in soup.find_all('a'):
                url = link.get('href', '').strip()
                self.index_a_urls.add(url)

        except Exception as e:
            logging.error('[request_index Exception] %s' % str(e))
            traceback.print_exc()
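
headers_without_range is referenced here but never defined on this page. Given the ranged probes in the other examples, it is presumably the scanner's default header set with 'Range' dropped so the full index page comes back; a hedged reconstruction:

# Assumed definition, mirroring the headers used elsewhere on this page
# minus the 'Range' header.
headers_without_range = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.3',
    'Connection': 'Close',
}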
Example #9
    def _http_request(self, url, timeout=30):
        try:
            if not url:
                url = '/'
            # print 'request', self.base_url + url
            resp = self.conn_pool.urlopen('GET',
                                          self.base_url + url,
                                          redirect=False,
                                          timeout=timeout,
                                          retries=0)
            resp_headers = resp.headers
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
                html_doc = decode_response_text(resp.data)
            else:
                html_doc = ''

            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''
Example #10
 def _http_request(self, url, timeout=20):
     try:
         if not url:
             url = '/'
         url = self.base_url + url
         headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                  '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.2',
                    'Range': 'bytes=0-10240',
                    'Connection': 'keep-alive'
                    }
         resp = self.session.get(url, headers=headers, timeout=(3.0, timeout))
         resp_headers = resp.headers
         status = resp.status_code
         if resp_headers.get('content-type', '').find('text') >= 0 \
                 or resp_headers.get('content-type', '').find('html') >= 0 \
                 or int(resp_headers.get('content-length', '0')) <= 10240:
             html_doc = decode_response_text(resp.content)
         else:
             html_doc = ''
         return status, resp_headers, html_doc
     except Exception:
         return -1, {}, ''
Example #11
    def _http_request(self, url, timeout=10):
        try:
            if not url:
                url = '/'
            conn_func = httplib.HTTPSConnection if self.schema == 'https' else httplib.HTTPConnection
            conn = conn_func(self.host, timeout=timeout)

            conn.request(method='GET', url=url,
                         headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                                                '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.1',
                                  'Range': 'bytes=0-10240',
                                  'Connection': 'Close'})
            resp = conn.getresponse()
            resp_headers = dict(resp.getheaders())
            status = resp.status
            if resp_headers.get('content-type', '').find('text') >= 0 \
                    or resp_headers.get('content-type', '').find('html') >= 0 \
                    or int(resp_headers.get('content-length', '0')) <= 307200:    # 1024 * 300
                html_doc = decode_response_text(resp.read())
            else:
                html_doc = ''
            conn.close()
            return status, resp_headers, html_doc
        except Exception as e:
            return -1, {}, ''
Example #12
 def get_all_info(self):
     result = self.path + self.query
     result = decode_response_text(result)
     return result
Example #13
 def __init__(self, content, url, depth=1):
     self.content = decode_response_text(content)
     self.url = url
     self.depth = depth
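
Taken together, the snippets suggest a driver along these lines; this glue is inferred from the methods above, not shown by any of the projects:

def scan_site(scanner):
    # Hypothetical driver: probe the index with a ranged GET, then crawl
    # the links on it if the site answered normally.
    status, headers, html_doc = scanner._http_request('/')
    if status == 200:
        scanner.crawl_index('/')
    return status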