def __init__(self, url, method='GET', headers=None, cookies=None, refer=None,
             data=None, user_agent=None, **kwargs):
    """Describe an HTTP request for ``url``.

    Args:
        url: A ``URL`` instance, or a value accepted by the ``URL``
            constructor.
        method: HTTP verb; stored upper-cased.
        headers: Optional mapping of initial header names to values. It is
            copied, so the caller's mapping is never modified.
        cookies: Optional value for the ``Cookie`` header.
        refer: Optional value for the ``Referer`` header.
        data: Optional request body, returned later as the POST data.
        user_agent: Optional value for the ``User-Agent`` header.
        **kwargs: Accepted for call compatibility; not used here.
    """
    self._url = url if isinstance(url, URL) else URL(url)
    self._method = method.upper()

    # Work on a private copy: the original called ``headers.update`` on the
    # argument itself, which crashed for ``headers=None`` and otherwise
    # leaked the Cookie header into the caller's dict.
    self._headers = dict(headers) if headers else {}
    self._cookies = cookies
    self._refer = refer
    self._user_agent = user_agent

    if self._cookies:
        self._headers["Cookie"] = self._cookies
    if self._refer:
        self._headers["Referer"] = self._refer
    if self._user_agent:
        self._headers["User-Agent"] = self._user_agent

    # The attribute name ``_get_date`` (sic) is kept because other methods
    # of the class read it under that name.
    self._get_date = self._url.get_querystring()

    # Always define the attribute so reading the POST data never raises
    # AttributeError when no body was supplied.
    self._post_data = data
def _find_header_urls(self, headers):
    """Collect URLs found in response headers into ``self._tag_urls``.

    Only headers whose name is listed in ``self.URL_HEADERS`` are inspected.
    Values starting with ``http`` are parsed directly; anything else is
    joined onto ``self._base_url`` first.

    Args:
        headers: Mapping of header name to header value (string).
    """
    for key, value in headers.items():
        if key not in self.URL_HEADERS:
            continue
        try:
            # ``startswith`` -- the original ``startwith`` raised
            # AttributeError for every candidate header.
            if value.startswith('http'):
                url = URL(value, encoding=self._encoding)
            else:
                url = self._base_url.urljoin(value).url_string
                url = URL(url, encoding=self._encoding)
        except ValueError:
            # Skip unparsable header values, as the tag-attribute scanner
            # does, instead of aborting the whole scan.
            continue
        self._tag_urls.add(url)
def _find_regex_urls(self, doc_str):
    """Extract URLs from raw document text and add them to ``self._re_urls``.

    Two passes are made over ``doc_str``: one with ``HtmlParser.URL_RE``,
    and one with the relative-URL pattern of the nested ``find_relative``
    helper. Candidates that ``URL`` rejects with ``ValueError`` are skipped.

    Args:
        doc_str: The document body as text.
    """
    re_urls = set()

    for url in re.findall(HtmlParser.URL_RE, doc_str):
        try:
            url = URL(url[0], encoding=self._encoding)
        except ValueError:
            pass
        else:
            re_urls.add(url)

    def find_relative(doc_str):
        """Return the set of relative URLs in ``doc_str`` resolved against
        ``self._base_url``."""
        res = set()
        # FIXME: the relative-URL pattern is missing (empty placeholder).
        # With an empty pattern ``findall`` yields only empty strings, which
        # are skipped below, so this pass currently finds nothing.
        regex = ''
        relative_regex = re.compile(regex, re.U | re.I)
        for match in relative_regex.findall(doc_str):
            # ``findall`` returns tuples only when the pattern has several
            # groups; otherwise each match is a plain string.
            match_str = match[0] if isinstance(match, tuple) else match
            if not match_str:
                # Indexing an empty match raised IndexError in the original.
                continue
            try:
                url = self._base_url.join_url(match_str).url_string
                url = URL(url, encoding=self._encoding)
            except ValueError:
                continue
            res.add(url)
        return res

    re_urls.update(find_relative(doc_str))
    self._re_urls.update(re_urls)
def _find_tag_urls(self, tag, attrs):
    """Collect URLs from a tag's attributes into ``self._tag_urls``.

    An attribute is considered when its name is in ``self.URL_ATTRS`` and
    its value is non-empty and does not start with ``#``. Values starting
    with ``http`` are parsed directly; others are joined onto
    ``self._base_url``. Values rejected with ``ValueError`` are ignored.

    Args:
        tag: The tag being processed (not used by this method).
        attrs: Mapping of attribute name to attribute value.
    """
    # ``items()`` works on both Python 2 and 3; ``iteritems()`` does not.
    for attr_name, attr_value in attrs.items():
        if attr_name not in self.URL_ATTRS or not attr_value:
            continue
        # ``startswith`` -- the original ``startwith`` raised AttributeError.
        if attr_value.startswith('#'):
            continue
        try:
            if attr_value.startswith('http'):
                url = URL(attr_value, encoding=self._encoding)
            else:
                url = self._base_url.urljoin(attr_value).url_string
                url = URL(url, encoding=self._encoding)
        except ValueError:
            continue
        self._tag_urls.add(url)
def find_relative(doc_str):
    """Return the set of relative URLs found in ``doc_str``.

    Each match is resolved against ``self._base_url`` (taken from the
    enclosing scope) and wrapped in a ``URL``. Empty matches and candidates
    rejected with ``ValueError`` are skipped.

    Args:
        doc_str: The document body as text.

    Returns:
        A set of ``URL`` objects; empty while the pattern is a placeholder.
    """
    res = set()
    # FIXME: the relative-URL pattern is missing (empty placeholder). With an
    # empty pattern ``findall`` yields only empty strings, which are skipped
    # below, so nothing is found until a real pattern is supplied.
    regex = ''
    relative_regex = re.compile(regex, re.U | re.I)
    for match in relative_regex.findall(doc_str):
        # ``findall`` returns tuples only when the pattern has several
        # groups; otherwise each match is a plain string.
        match_str = match[0] if isinstance(match, tuple) else match
        if not match_str:
            # Indexing an empty match raised IndexError in the original.
            continue
        try:
            url = self._base_url.join_url(match_str).url_string
            url = URL(url, encoding=self._encoding)
        except ValueError:
            continue
        res.add(url)
    return res
def post(self, url, headers=None, data=None, **kwargs):
    """Send a POST request and wrap the outcome with ``_make_response``.

    Args:
        url: A ``URL`` instance, or a value accepted by the ``URL``
            constructor.
        headers: Optional extra headers, passed to
            ``self.get_default_headers`` to build the final header set.
        data: Optional request body forwarded to ``requests.post``.
        **kwargs: Extra keyword arguments forwarded to ``requests.post``.

    Returns:
        Whatever ``self._make_response`` returns. When the request fails,
        ``_make_response`` is called with ``None`` in place of the response.
    """
    # A fresh dict per call: the former ``headers={}`` default was a single
    # object shared by every call.
    if headers is None:
        headers = {}
    default_headers = self.get_default_headers(headers)

    if not isinstance(url, URL):
        url = URL(url)

    try:
        # ``data`` was accepted but never sent by the original code.
        requests_response = requests.post(url.url_string,
                                          headers=default_headers,
                                          data=data,
                                          **kwargs)
    except Exception:
        # Deliberately best-effort: a failed request is reported as a
        # ``None`` response. Unlike the former bare ``except:``, this lets
        # KeyboardInterrupt and SystemExit propagate.
        requests_response = None

    return self._make_response(requests_response, url)
def is_similar_url(urla1, urlb2):
    """Report whether two URLs are considered similar.

    Both arguments are wrapped in ``URL``. If the first URL's path contains
    fewer than two ``/`` characters, the result is ``True``. Otherwise the
    URLs are similar when host, port and the value of
    ``txt_wrap_by('/', '/', path)`` are all equal.

    Args:
        urla1: First URL, in any form the ``URL`` constructor accepts.
        urlb2: Second URL, in any form the ``URL`` constructor accepts.

    Returns:
        ``True`` if the URLs are similar, ``False`` otherwise.
    """
    first = URL(urla1)
    second = URL(urlb2)

    first_host = first.get_host()
    second_host = second.get_host()
    first_port = first.get_port()
    second_port = second.get_port()
    first_path = first.get_path()
    second_path = second.get_path()

    # NOTE(review): this shortcut ignores host and port, so a shallow first
    # path matches any second URL -- confirm that this is intended.
    if first_path.count('/') < 2:
        return True

    # presumably the text between the first two slashes, i.e. the leading
    # path segment -- verify against txt_wrap_by.
    first_segment = txt_wrap_by('/', '/', first_path)
    second_segment = txt_wrap_by('/', '/', second_path)

    return ((first_host, first_port, first_segment) ==
            (second_host, second_port, second_segment))
class WSRequest:
    """In-memory description of an HTTP request.

    Holds the target URL, the HTTP method, the headers and the GET/POST
    data, and can render itself as raw HTTP/1.1 request text via ``str()``.
    """

    def __init__(self, url, method='GET', headers=None, cookies=None,
                 refer=None, data=None, user_agent=None, **kwargs):
        """Describe an HTTP request for ``url``.

        Args:
            url: A ``URL`` instance, or a value accepted by the ``URL``
                constructor.
            method: HTTP verb; stored upper-cased.
            headers: Optional mapping of initial header names to values. It
                is copied, so the caller's mapping is never modified.
            cookies: Optional value for the ``Cookie`` header.
            refer: Optional value for the ``Referer`` header.
            data: Optional request body, returned by ``get_post_parm``.
            user_agent: Optional value for the ``User-Agent`` header.
            **kwargs: Accepted for call compatibility; not used here.
        """
        self._url = url if isinstance(url, URL) else URL(url)
        self._method = method.upper()

        # Private copy: the original called ``headers.update`` on the
        # argument, which crashed for ``headers=None`` and otherwise leaked
        # the Cookie header into the caller's dict.
        self._headers = dict(headers) if headers else {}
        self._cookies = cookies
        self._refer = refer
        self._user_agent = user_agent

        if self._cookies:
            self._headers["Cookie"] = self._cookies
        if self._refer:
            self._headers["Referer"] = self._refer
        if self._user_agent:
            self._headers["User-Agent"] = self._user_agent

        self._get_date = self._url.get_querystring()
        # Always defined, so ``get_post_parm`` never raises AttributeError.
        self._post_data = data

    def _sync_header(self, name, value):
        """Set header ``name`` to ``value``, or drop it when value is empty."""
        if value:
            self._headers[name] = value
        else:
            self._headers.pop(name, None)

    def get_get_param(self):
        """Return the GET data (initially the URL's query string)."""
        return self._get_date

    def get_post_parm(self):
        """Return the POST data, or ``None`` when none was supplied."""
        return self._post_data

    def get_url(self):
        """Return the request's ``URL`` object."""
        return self._url

    def get_method(self):
        """Return the upper-cased HTTP method."""
        return self._method

    def get_headers(self):
        """Return the header dict (the live object, not a copy)."""
        return self._headers

    def get_cookies(self):
        """Return the cookie value given at construction or via the setter."""
        return self._cookies

    def get_id(self):
        """Return an identifier for this request.

        ``__repr__`` relies on this method, which was previously undefined.
        The object's ``id()`` is used as the identifier.
        """
        return id(self)

    def set_post_data(self, postdata):
        """Replace the POST data."""
        self._post_data = postdata

    def set_get_data(self, getdata):
        """Replace the GET data."""
        self._get_date = getdata

    def set_refer(self, refer):
        """Replace the referer and keep the ``Referer`` header in step."""
        self._refer = refer
        self._sync_header("Referer", refer)

    def set_cookies(self, cookies):
        """Replace the cookies and keep the ``Cookie`` header in step."""
        self._cookies = cookies
        self._sync_header("Cookie", cookies)

    def __str__(self):
        """Render the request as raw HTTP/1.1 text.

        The output is the request line, one line per header (with ``Host``
        set from the URL), a blank line and, for POST requests that carry
        data, the stringified POST data.
        """
        # A shallow copy is enough here, because only the top-level mapping
        # is changed by adding ``Host``.
        headers = dict(self._headers)
        headers["Host"] = self._url.get_host()

        lines = [self._method + " " + self._url.url_string + " HTTP/1.1"]
        lines.extend(key + ": " + value for key, value in headers.items())
        result_string = "\r\n".join(lines) + "\r\n\r\n"

        if self._method == "POST" and self._post_data is not None:
            result_string += str(self._post_data)

        # ``__str__`` must return the native ``str`` type. Only a Python 2
        # ``unicode`` result needs encoding; on Python 3 encoding would
        # produce bytes and make ``str(request)`` raise TypeError.
        if not isinstance(result_string, str):
            result_string = result_string.encode("utf-8")
        return result_string

    def __repr__(self):
        """Return a short ``<Request | METHOD | URL + ID>`` summary."""
        vals = {
            'method': self.get_method(),
            'url': str(self.get_url()),
            'id': self.get_id()
        }
        return '<Request | %(method)s | %(url)s + %(id)s>' % vals