def _http_request(self, url, timeout=10):
    try:
        if not url:
            url = '/'
        # pick plain or TLS connection class based on the target schema
        conn_func = httplib.HTTPSConnection if self.schema == 'https' else httplib.HTTPConnection
        conn = conn_func(self.host, timeout=timeout)
        conn.request(
            method='GET', url=url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.1',
                'Range': 'bytes=0-10240',    # only request the first 10 KB
                'Connection': 'Close'
            })
        resp = conn.getresponse()
        resp_headers = dict(resp.getheaders())
        status = resp.status
        # only read bodies that look like text/html or are reasonably small
        if resp_headers.get('content-type', '').find('text') >= 0 \
                or resp_headers.get('content-type', '').find('html') >= 0 \
                or int(resp_headers.get('content-length', '0')) <= 307200:  # 1024 * 300
            html_doc = decode_response_text(resp.read())
        else:
            html_doc = ''
        conn.close()
        return status, resp_headers, html_doc
    except Exception:
        return -1, {}, ''
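# `decode_response_text` is referenced throughout this listing but never defined
# here. A minimal sketch of what such a helper presumably does (charset guessing
# with a lossy fallback); the real helper's behavior may differ.
def decode_response_text(data, charset=None):
    for encoding in (charset, 'utf-8', 'gbk'):
        if not encoding:
            continue
        try:
            return data.decode(encoding)
        except (UnicodeDecodeError, LookupError):
            continue
    # last resort: replace undecodable bytes rather than raising
    return data.decode('utf-8', 'replace')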
def _http_request(self, url, timeout=20):
    try:
        if not url:
            url = '/'
        url = self.base_url + url
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                          '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.2',
            'Range': 'bytes=0-10240',    # only request the first 10 KB
            'Connection': 'keep-alive'
        }
        # (connect_timeout, read_timeout) tuple, as supported by requests
        resp = self.session.get(url, headers=headers, timeout=(3.0, timeout))
        resp_headers = resp.headers
        status = resp.status_code
        # only keep bodies that look like text/html or are small enough
        if resp_headers.get('content-type', '').find('text') >= 0 \
                or resp_headers.get('content-type', '').find('html') >= 0 \
                or int(resp_headers.get('content-length', '0')) <= 10240:
            html_doc = decode_response_text(resp.content)
        else:
            html_doc = ''
        return status, resp_headers, html_doc
    except Exception:
        return -1, {}, ''
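# The requests-based variant above assumes `self.session` is prepared elsewhere.
# A minimal sketch of such setup; `make_session` and its settings are
# illustrative assumptions, not taken from the original code.
import requests

def make_session():
    session = requests.Session()     # reuses connections (keep-alive)
    session.max_redirects = 5        # bound redirect chains
    session.verify = False           # scanners commonly skip TLS verification
    return session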
def crawl_index(self, path):
    try:
        status, headers, html_doc = self._http_request(path)
        if status != 200:
            # retry with a plain urllib2 fetch (no Range header) before giving up
            try:
                html_doc = decode_response_text(urllib2.urlopen(self.url).read())
            except Exception:
                pass
        soup = BeautifulSoup(html_doc, "html.parser")
        for link in soup.find_all('a'):
            url = link.get('href', '').strip()
            url, depth = self._cal_depth(url)
            if depth <= self.max_depth:
                self._enqueue(url)
        if self.find_text(html_doc):
            self.results['/'] = []
            m = re.search('<title>(.*?)</title>', html_doc)
            title = m.group(1) if m else ''
            _ = {'status': status, 'url': '%s%s' % (self.base_url, path), 'title': title}
            if _ not in self.results['/']:
                self.results['/'].append(_)
    except Exception as e:
        logging.error('[crawl_index Exception] %s' % str(e))
        traceback.print_exc()
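# `_cal_depth` is not shown in this listing. A plausible sketch of the method,
# assuming it normalizes an href to a site-relative directory and counts its
# depth so the crawler can enforce max_depth; hypothetical, the real logic may differ.
import urlparse  # Python 2; use urllib.parse on Python 3

def _cal_depth(self, url):
    if url.startswith(('javascript:', 'mailto:', '#')):
        return '', 10000                      # effectively never enqueued
    parsed = urlparse.urlparse(url)
    if parsed.netloc and parsed.netloc != self.host:
        return '', 10000                      # skip off-site links
    path = parsed.path
    url = path[:path.rfind('/') + 1]          # keep the directory part only
    return url, url.count('/')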
def _http_request(self, url, timeout=20):
    conn = None
    try:
        if not url:
            url = '/'
        conn_func = httplib.HTTPSConnection if self.schema == 'https' else httplib.HTTPConnection
        conn = conn_func(self.host, timeout=timeout)
        conn.request(
            method='GET', url=url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                              '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.3',
                'Range': 'bytes=0-10240',
                'Connection': 'Close'
            })
        resp = conn.getresponse()
        resp_headers = dict(resp.getheaders())
        status = resp.status
        if resp_headers.get('content-type', '').find('text') >= 0 \
                or resp_headers.get('content-type', '').find('html') >= 0 \
                or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
            html_doc = decode_response_text(resp.read())
        else:
            html_doc = ''
        try:
            # SO_LINGER with a zero timeout makes close() send RST instead of FIN,
            # so mass scanning does not pile up TIME_WAIT sockets
            if conn.sock:
                conn.sock.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER,
                                     struct.pack('ii', 1, 0))
        except Exception:
            pass
        return status, resp_headers, html_doc
    except Exception:
        return -1, {}, ''
    finally:
        if conn:
            conn.close()
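# A standalone illustration of the SO_LINGER trick used above; example.com is a
# placeholder target. Setting linger on/timeout 0 makes close() abort the
# connection with an RST, and the kernel discards any unsent data.
import socket
import struct

s = socket.create_connection(('example.com', 80), timeout=10)
s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, struct.pack('ii', 1, 0))
s.close()  # immediate abortive close, no TIME_WAIT on this side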
def crawl_index(self, path):
    try:
        status, headers, html_doc = self._http_request(path)
        if status != 200:
            # fall back to a plain urllib2 fetch if the ranged request failed
            try:
                html_doc = decode_response_text(urllib2.urlopen(self.url).read())
            except Exception:
                pass
        soup = BeautifulSoup(html_doc, "html.parser")
        for link in soup.find_all('a'):
            url = link.get('href', '').strip()
            url, depth = self._cal_depth(url)
            if depth <= self.max_depth:
                self._enqueue(url)
    except Exception as e:
        logging.error('[crawl_index Exception] %s' % str(e))
def http_request(self, url, headers=config.default_headers, timeout=20):
    try:
        if not url:
            url = '/'
        if not self.conn_pool:
            return -1, {}, ''
        if self.args.debug:
            self.print_msg('--> %s' % (self.base_url + url))
        resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                      headers=headers, assert_same_host=False,
                                      redirect=False, timeout=timeout, retries=3)
        if resp.headers.get('content-type', '').find('text') >= 0 \
                or resp.headers.get('content-type', '').find('html') >= 0 \
                or int(resp.headers.get('content-length', '0')) <= 20480:  # 1024 * 20
            html_doc = decode_response_text(resp.data)
        else:
            html_doc = ''
        if resp.status == 502:
            # too many 502s in a row: drop this site and stop scanning it
            self.status_502_count += 1
            if self.status_502_count > 5:
                self.url_queue.queue.clear()
                try:
                    if self.conn_pool:
                        self.conn_pool.close()
                except Exception:
                    pass
                self.conn_pool = None
                # self.print_msg('Website 502: %s' % self.base_url)
        return resp.status, resp.headers, html_doc
    except urllib3.exceptions.MaxRetryError:
        return -1, {}, ''
    except TypeError:
        return -1, {}, ''
    except Exception as e:
        self.print_msg(str(e))
        return -1, {}, ''
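# `self.conn_pool` above is a urllib3 connection pool; a minimal sketch of how
# one might be created for a target. `connection_from_url` is real urllib3 API,
# but the pool settings here are assumptions, not from the original code.
import urllib3

urllib3.disable_warnings()  # silence unverified-HTTPS warnings while scanning
conn_pool = urllib3.connectionpool.connection_from_url(
    'https://example.com', maxsize=10, cert_reqs='CERT_NONE')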
def request_index(self, path):
    try:
        status, headers, html_doc = self._http_request(path)
        if status != 200:
            # retry once without the Range header to fetch the full index page
            try:
                html_doc = self.conn_pool.urlopen('GET', self.url,
                                                  headers=headers_without_range,
                                                  retries=1).data
                html_doc = decode_response_text(html_doc)
            except Exception:
                pass
        # save index content for later comparisons
        self.index_status, self.index_headers, self.index_html_doc = status, headers, html_doc
        soup = BeautifulSoup(self.index_html_doc, "html.parser")
        for link in soup.find_all('a'):
            url = link.get('href', '').strip()
            self.index_a_urls.add(url)
    except Exception as e:
        logging.error('[request_index Exception] %s' % str(e))
        traceback.print_exc()
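# `headers_without_range` is referenced but not defined in this listing;
# presumably it is the default header set minus the 'Range' entry, e.g.:
headers_without_range = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36 BBScan/1.3',
    'Connection': 'keep-alive',
}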
def _http_request(self, url, timeout=30):
    try:
        if not url:
            url = '/'
        # print 'request', self.base_url + url
        resp = self.conn_pool.urlopen('GET', self.base_url + url,
                                      redirect=False, timeout=timeout, retries=0)
        resp_headers = resp.headers
        status = resp.status
        if resp_headers.get('content-type', '').find('text') >= 0 \
                or resp_headers.get('content-type', '').find('html') >= 0 \
                or int(resp_headers.get('content-length', '0')) <= 20480:  # 1024 * 20
            html_doc = decode_response_text(resp.data)
        else:
            html_doc = ''
        return status, resp_headers, html_doc
    except Exception:
        return -1, {}, ''
def get_all_info(self):
    result = self.path + self.query
    result = decode_response_text(result)
    return result
def __init__(self, content, url, depth=1):
    self.content = decode_response_text(content)
    self.url = url
    self.depth = depth
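# Hypothetical usage of the wrapper above, assuming the class is named Page
# and `raw_bytes` holds a fetched response body:
page = Page(content=raw_bytes, url='http://example.com/', depth=2)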