def __init__(self, url, cache=None):
    """Fetch *url* (or a cached copy), normalize the page to UTF-8 and
    parse it with BeautifulSoup.

    NOTE(review): Python 2 code (`urllib2`, `unicode`, old except syntax).

    url   -- page URL, fetched when no usable cache file exists
    cache -- optional cache file path; read when present, written
             after a successful fetch when absent
    """
    self.url = url
    self.cache = cache
    try:
        f = None
        if self.cache is not None and os.path.exists(self.cache):
            # cache exists and use it instead
            fname = self.cache
            f = open(self.cache, 'r')
        else:
            fname = self.url
            f = urllib2.urlopen(self.url)
        page = f.read()
        f.close()
        if self.cache is not None and not os.path.exists(self.cache):
            # cache specified but it doesn't exist, create one
            with open(self.cache, 'w') as f:
                f.write(page)
        # check encoding and convert into utf-8 if it is not
        # alternation: use from_encoding="encoding"
        # todo: can from_encoding replace charset=?
        m = re.search(r'charset=([^ "\']+)', page, flags=re.I)
        if m:
            encoding = m.group(1)
            ##print 'encoding={}\n {}'.format(encoding, fname)
            charset = 'charset={}'.format(encoding)
            if re.match(r'gb2312', encoding, re.IGNORECASE):
                # decode with gb18030, a superset of gb2312 -- pages
                # labelled gb2312 often contain gb18030-only characters
                encoding = 'gb18030'
            page = unicode(page, encoding)
            # rewrite the declared charset so the parsed doc matches
            # the now-unicode content
            page = re.sub(charset, 'charset=UTF-8', page, flags=re.IGNORECASE)
        BeautifulSoup.__init__(self, page)
    except Exception, ex:
        print("Fail to load url {}\n{}".format(self.url, ex))
        raise ex
def __init__(self, url, usr=None, session=None, postData=None, vars=None, proxy=None):
    """Fetch *url* via requests and parse HTML responses as a soup.

    url      -- target URL
    usr      -- optional user object carrying a ``.session`` attribute
    session  -- optional requests.Session, used when no usr is given
    postData -- when truthy, issue a POST with this payload; else GET
    vars     -- optional dict of extra HTTP headers
    proxy    -- optional proxies mapping forwarded to requests

    Stores the response (.resp), request (.request), headers (.header),
    body (.content) and user (.usr) on self.  Non-HTML responses keep
    the raw bytes in .content and skip BeautifulSoup parsing.
    """
    self.url = url
    self.postData = postData
    self.vars = vars
    # Pick the transport: plain requests, the user's session, or the
    # explicit session -- in that priority order.
    if not session and not usr:
        if postData:
            r = requests.post(url, data=postData, headers=vars, proxies=proxy)
        else:
            r = requests.get(url, headers=vars, proxies=proxy)
    elif usr:
        if postData:
            r = usr.session.post(url, data=postData, headers=vars, proxies=proxy)
        else:
            r = usr.session.get(url, headers=vars, proxies=proxy)
    elif session:
        if postData:
            r = session.post(url, data=postData, headers=vars, proxies=proxy)
        else:
            r = session.get(url, headers=vars, proxies=proxy)
    self.resp = r
    self.request = r.request
    self.header = r.headers
    self.content = r.text
    self.usr = usr
    # Fix: .get() avoids a KeyError when the server omits Content-Type;
    # the old r.headers['content-type'] lookup crashed on such responses.
    if "text/html" in r.headers.get('content-type', ''):
        BeautifulSoup.__init__(self, r.content)
    else:
        self.content = r.content
def __init__(self, html_doc, html_parser, *args, **kwargs):
    """Parse *html_doc* with *html_parser*, then strip the <html>,
    <head> and <body> elements that html5lib synthesizes when they
    were absent from the original markup."""
    BeautifulSoup.__init__(self, html_doc, html_parser, *args, **kwargs)
    if html_parser != 'html5lib':
        return
    # html5lib wraps fragments in a full document skeleton; undo each
    # part of it that the source text did not contain itself.
    if '<head' not in html_doc:
        self.find('head').decompose()
    if '<html' not in html_doc:
        self.find('html').unwrap()
    if '<body' not in html_doc:
        self.find('body').unwrap()
def __init__(self, number):
    """Fetch the emudepot page for *number* and parse it with html.parser.

    On any request failure the error is printed and an empty document is
    parsed instead, so construction never raises.

    Fix: removed the dead ``status`` flag, which was written in every
    branch but never read.
    """
    import config
    res = ""  # fallback markup used when the request fails
    request_url = config.emudepot_url.format(number)
    request = urllib2.Request(request_url, headers=config.header)
    try:
        res = urllib2.urlopen(request, timeout=5).read()
    except Exception as error:
        print("hook/depotHook.py: Error: Request error occurs")
        print(error)
    print("hook/depotHook.py: Info: depotHook module loaded: {}".format(
        number))
    BeautifulSoup.__init__(self, res, features="html.parser")
def __init__(self, src, encode='utf-8', selector=None):
    """Parse an HTML file path or a raw HTML string.

    src      -- filename of an HTML document, or the markup itself
    encode   -- text encoding used when reading a file
    selector -- optional CSS selector; its first match becomes
                self.mainContent (falls back to <body>)
    """
    if os.path.exists(src):
        with codecs.open(src, 'r', encode) as fp:
            BeautifulSoup.__init__(self, fp)
        self.filename = src
        self.sourceString = None
    else:
        BeautifulSoup.__init__(self, src)
        self.filename = None
        self.sourceString = src
    self.selector = selector
    matches = self.select(selector)
    if matches:
        self.mainContent = matches[0]
    else:
        self.mainContent = self.body
    self.img = []
    # Bug fix: stylesheets are declared on <link rel="stylesheet">
    # elements, not <meta>, so the old find_all('meta', ...) query
    # always matched nothing.  Also renamed the comprehension variable,
    # which shadowed the selector result above.
    self.styles = [
        link.get('href')
        for link in self.find_all('link', rel='stylesheet')
    ]
def __init__(self, infile):
    """Build the soup from *infile* using lxml's XML parser."""
    xml_features = ["lxml", "xml"]
    _BS.__init__(self, infile, xml_features)
def __init__(self, markup):
    """Parse *markup* with the stdlib html.parser backend."""
    backend = 'html.parser'
    BeautifulSoup.__init__(self, markup, backend)
def __init__(self, content):
    """Parse *content* with the lxml HTML parser."""
    chosen_parser = "lxml"
    BS.__init__(self, content, chosen_parser)
def __init__(self, encoding, text=None, avoidParserProblems=True,
             initialTextIsEverything=True):
    """Remember the target *encoding*, then defer to BeautifulSoup.

    The remaining arguments are forwarded positionally, untouched, to
    the (BeautifulSoup 3 style) base initializer.
    """
    self._encoding = encoding
    forwarded = (text, avoidParserProblems, initialTextIsEverything)
    BeautifulSoup.__init__(self, *forwarded)
def __init__(self, *args, **kwargs):
    """Construct the soup, defaulting the parser features to 'xml'
    unless the caller chose one explicitly."""
    kwargs.setdefault('features', 'xml')
    BeautifulSoup.__init__(self, *args, **kwargs)
def __init__(self, infile):
    """Parse *infile* as XML via the lxml backend."""
    features = ["lxml", "xml"]
    BeautifulSoup.__init__(self, infile, features)
def __init__(self, *args, **kwargs):
    # Pure pass-through: accepts exactly what BeautifulSoup.__init__
    # accepts and forwards everything unchanged.
    BeautifulSoup.__init__(self, *args, **kwargs)
def __init__(self, url):
    """Download *url* via self.get_page() and parse the result with
    the stdlib html.parser backend."""
    markup = self.get_page(url)
    BeautifulSoup.__init__(self, markup, 'html.parser')
def __init__(self, raw_html, **kwargs):
    """Parse *raw_html* with lxml, then run greeting extraction over
    the resulting soup, forwarding any extra keyword options."""
    BeautifulSoup.__init__(self, features="lxml", markup=raw_html)
    _parse_greeting(self, **kwargs)
def __init__(self, req):
    """Wrap a completed response: parse its body with lxml and keep
    the response object itself on self.req."""
    body = req.content
    BS.__init__(self, body, "lxml")
    self.req = req
def __init__(self, html):
    """Parse the given *html* string with the builtin html.parser."""
    selected = 'html.parser'
    BeautifulSoup.__init__(self, html, selected)
def __init__(self, ip=None):
    """Fetch data for *ip* (None lets fetch_data choose its default)
    and parse the payload with the lxml backend."""
    self.ip = ip
    self.data = fetch_data(ip=ip)
    BeautifulSoup.__init__(self, self.data, 'lxml')