Ejemplo n.º 1
0
 def __init__(self, url, cache=None):
     """Fetch a page from ``url`` or from a local ``cache`` file and parse it.

     If ``cache`` names an existing file, the page is read from that file;
     otherwise it is downloaded from ``url`` and, when ``cache`` is given,
     saved there for later runs.  When the page declares a ``charset=``,
     the bytes are decoded with that encoding and the declaration is
     rewritten to ``charset=UTF-8`` before the page is parsed.

     :param url: address of the page to download.
     :param cache: optional path of a local copy of the page.
     :raises Exception: whatever the load or parse step raised; it is
         re-raised after a diagnostic line has been printed.
     """
     self.url = url
     self.cache = cache
     try:
         cached = self.cache is not None and os.path.exists(self.cache)
         # Binary mode keeps the raw bytes intact, so the charset declared
         # in the page still applies when a cached copy is decoded below.
         if cached:
             f = open(self.cache, 'rb')
         else:
             f = urllib2.urlopen(self.url)
         try:
             page = f.read()
         finally:
             # Release the file/socket even when read() fails.
             f.close()
         if self.cache is not None and not cached:
             # cache specified but it doesn't exist, create one
             with open(self.cache, 'wb') as out:
                 out.write(page)
         # Decode the page when it declares a charset.
         #     alternative: use from_encoding="encoding"
         #         todo: can from_encoding replace charset=?
         m = re.search(r'charset=([^ "\']+)', page, flags=re.I)
         if m:
             declared = m.group(1)
             encoding = declared
             if re.match(r'gb2312', declared, re.IGNORECASE):
                 # Pages declaring gb2312 are decoded with the gb18030 codec.
                 encoding = 'gb18030'
             page = unicode(page, encoding)
             # Escape the declaration: the charset name comes from the page
             # and must be matched literally, not interpreted as a regex.
             declaration = re.escape('charset={}'.format(declared))
             page = re.sub(declaration, 'charset=UTF-8', page,
                           flags=re.IGNORECASE)
         BeautifulSoup.__init__(self, page)
     except Exception as ex:
         print("Fail to load url {}\n{}".format(self.url, ex))
         # Bare raise keeps the original traceback.
         raise
Ejemplo n.º 2
0
 def __init__(self, url, usr=None, session=None, postData=None, vars=None, proxy=None):
     """Request ``url`` and, for HTML responses, parse the body as a soup.

     The request is sent through ``usr.session`` when ``usr`` is truthy,
     else through ``session`` when it is truthy, else through the
     module-level ``requests`` API.  A truthy ``postData`` makes the
     request a POST; otherwise a GET is issued.

     :param url: address to request.
     :param usr: optional object exposing a ``session`` attribute; takes
         precedence over ``session``.
     :param session: optional session object used when ``usr`` is falsy.
     :param postData: optional payload passed as ``data=`` of a POST.
     :param vars: optional value passed as ``headers=``.
     :param proxy: optional value passed as ``proxies=``.
     """
     self.url = url
     self.postData = postData
     self.vars = vars

     # Pick the sender once instead of repeating the GET/POST pair three
     # times; ``usr`` wins over ``session``.
     if usr:
         client = usr.session
     elif session:
         client = session
     else:
         client = requests

     if postData:
         r = client.post(url, data=postData, headers=vars, proxies=proxy)
     else:
         r = client.get(url, headers=vars, proxies=proxy)

     self.resp = r
     self.request = r.request
     self.header = r.headers
     self.usr = usr

     # A response without a Content-Type header is treated as non-HTML
     # instead of raising KeyError.
     if "text/html" in r.headers.get('content-type', ''):
         self.content = r.text
         BeautifulSoup.__init__(self, r.content)
     else:
         # Non-HTML bodies keep the raw payload; r.text is not evaluated
         # for them because its result would be overwritten anyway.
         self.content = r.content
Ejemplo n.º 3
0
 def __init__(self, html_doc, html_parser, *args, **kwargs):
     """Parse ``html_doc`` and undo the wrapper tags that html5lib adds.

     html5lib adds a head, html and body element to documents that lack
     them; each of those that does not occur in ``html_doc`` is removed
     again after parsing.

     :param html_doc: markup string to parse (it is also searched as text).
     :param html_parser: parser name handed to ``BeautifulSoup``; the
         clean-up only runs when it equals ``'html5lib'``.
     :param args: extra positional arguments for ``BeautifulSoup``.
     :param kwargs: extra keyword arguments for ``BeautifulSoup``.
     """
     import re  # local import so the method does not rely on file-level imports

     BeautifulSoup.__init__(self, html_doc, html_parser, *args, **kwargs)
     if html_parser != 'html5lib':
         return

     def in_source(tag):
         # HTML tag names are case-insensitive, and the lookahead keeps
         # "<head" from matching longer names such as "<header".
         pattern = r'<%s(?=[\s/>])' % tag
         return re.search(pattern, html_doc, re.IGNORECASE) is not None

     if not in_source('head'):
         self.find('head').decompose()
     if not in_source('html'):
         self.find('html').unwrap()
     if not in_source('body'):
         self.find('body').unwrap()
Ejemplo n.º 4
0
 def __init__(self, number):
     """Download the depot page for ``number`` and parse it with html.parser.

     The URL template and the request headers are read from the ``config``
     module.  A failed request is reported on stdout and an empty document
     is parsed instead.

     :param number: value substituted into ``config.emudepot_url``.
     """
     import config
     res = ""
     request_url = config.emudepot_url.format(number)
     request = urllib2.Request(request_url, headers=config.header)
     try:
         response = urllib2.urlopen(request, timeout=5)
         try:
             res = response.read()
         finally:
             # Close the connection even when read() fails.
             response.close()
     except Exception as error:
         # Best effort: report the failure and continue with the empty
         # default document.
         print("hook/depotHook.py: Error: Request error occurs")
         print(error)
     print("hook/depotHook.py: Info: depotHook module loaded: {}".format(
         number))
     BeautifulSoup.__init__(self, res, features="html.parser")
Ejemplo n.º 5
0
    def __init__(self,
                 url,
                 usr=None,
                 session=None,
                 postData=None,
                 vars=None,
                 proxy=None):
        """Request ``url`` and, for HTML responses, parse the body as a soup.

        The request is sent through ``usr.session`` when ``usr`` is truthy,
        else through ``session`` when it is truthy, else through the
        module-level ``requests`` API.  A truthy ``postData`` makes the
        request a POST; otherwise a GET is issued.

        :param url: address to request.
        :param usr: optional object exposing a ``session`` attribute; takes
            precedence over ``session``.
        :param session: optional session object used when ``usr`` is falsy.
        :param postData: optional payload passed as ``data=`` of a POST.
        :param vars: optional value passed as ``headers=``.
        :param proxy: optional value passed as ``proxies=``.
        """
        self.url = url
        self.postData = postData
        self.vars = vars

        # Pick the sender once instead of repeating the GET/POST pair three
        # times; ``usr`` wins over ``session``.
        if usr:
            client = usr.session
        elif session:
            client = session
        else:
            client = requests

        if postData:
            r = client.post(url,
                            data=postData,
                            headers=vars,
                            proxies=proxy)
        else:
            r = client.get(url, headers=vars, proxies=proxy)

        self.resp = r
        self.request = r.request
        self.header = r.headers
        self.usr = usr

        # A response without a Content-Type header is treated as non-HTML
        # instead of raising KeyError.
        if "text/html" in r.headers.get('content-type', ''):
            self.content = r.text
            BeautifulSoup.__init__(self, r.content)
        else:
            # Non-HTML bodies keep the raw payload; r.text is not evaluated
            # for them because its result would be overwritten anyway.
            self.content = r.content
Ejemplo n.º 6
0
 def __init__(self, src, encode = 'utf-8', selector = None):
     """Parse ``src`` (a file path or markup text) and locate its main content.

     When ``src`` is the path of an existing file, the file is opened with
     the ``encode`` codec and parsed; otherwise ``src`` itself is parsed as
     markup.  ``mainContent`` is the first element matching ``selector``,
     or the document body when no selector is given or nothing matches.

     :param src: path of an existing file, or the markup itself.
     :param encode: codec used to read the file when ``src`` is a path.
     :param selector: optional CSS selector for the main content element.
     """
     if os.path.exists(src):
         with codecs.open(src, 'r', encode) as fp:
             BeautifulSoup.__init__(self, fp)
         self.filename = src
         self.sourceString = None
     else:
         BeautifulSoup.__init__(self, src)
         self.filename = None
         self.sourceString = src
     self.selector = selector
     # The default selector is None, which is not a CSS selector string;
     # only query when one was actually supplied.
     matches = self.select(selector) if selector else []
     if matches:
         self.mainContent = matches[0]
     else:
         self.mainContent = self.body
     self.img = []
     # NOTE(review): stylesheets are usually <link rel="stylesheet"> tags;
     # this looks up 'meta' tags -- confirm whether 'link' was intended.
     self.styles = [ tag.get('href')
         for tag in self.find_all('meta', rel= 'stylesheet') ]
Ejemplo n.º 7
0
 def __init__(self, infile):
     """Parse ``infile`` with the parser features ``["lxml", "xml"]``.

     :param infile: markup source handed to ``_BS`` unchanged.
     """
     parser_features = ["lxml", "xml"]
     _BS.__init__(self, infile, parser_features)
Ejemplo n.º 8
0
 def __init__(self, markup):
     """Parse ``markup`` with the ``'html.parser'`` parser.

     :param markup: markup source handed to ``BeautifulSoup`` unchanged.
     """
     parser_name = 'html.parser'
     BeautifulSoup.__init__(self, markup, parser_name)
Ejemplo n.º 9
0
 def __init__(self,content):
     """Parse ``content`` with the ``"lxml"`` parser.

     :param content: markup source handed to ``BS`` unchanged.
     """
     parser_name = "lxml"
     BS.__init__(self, content, parser_name)
Ejemplo n.º 10
0
 def __init__(self, encoding, text=None, avoidParserProblems=True,
              initialTextIsEverything=True):
     """Remember ``encoding`` and pass the other options to ``BeautifulSoup``.

     :param encoding: stored on the instance as ``_encoding``.
     :param text: forwarded as the first ``BeautifulSoup`` argument.
     :param avoidParserProblems: forwarded as the second argument.
     :param initialTextIsEverything: forwarded as the third argument.
     """
     self._encoding = encoding
     forwarded = (text, avoidParserProblems, initialTextIsEverything)
     BeautifulSoup.__init__(self, *forwarded)
Ejemplo n.º 11
0
 def __init__(self, *args, **kwargs):
     """Build the soup, defaulting the parser ``features`` to ``'xml'``.

     The default is applied only when the caller chose no parser, neither
     through the ``features`` keyword nor as the second positional
     argument.  Adding the keyword next to a positional parser would give
     ``BeautifulSoup.__init__`` the same argument twice (``TypeError``).

     :param args: positional arguments for ``BeautifulSoup``.
     :param kwargs: keyword arguments for ``BeautifulSoup``.
     """
     # assumes the bs4 signature (markup, features, ...) -- TODO confirm
     if len(args) < 2:
         kwargs.setdefault('features', 'xml')
     BeautifulSoup.__init__(self, *args, **kwargs)
Ejemplo n.º 12
0
 def __init__(self, infile):
     """Parse ``infile`` with the parser features ``["lxml", "xml"]``.

     :param infile: markup source handed to ``BeautifulSoup`` unchanged.
     """
     parser_features = ["lxml", "xml"]
     BeautifulSoup.__init__(self, infile, parser_features)
Ejemplo n.º 13
0
 def __init__(self, *args, **kwargs):
     """Forward every constructor argument to ``BeautifulSoup`` unchanged.

     :param args: positional arguments for ``BeautifulSoup``.
     :param kwargs: keyword arguments for ``BeautifulSoup``.
     """
     base_init = BeautifulSoup.__init__
     base_init(self, *args, **kwargs)
Ejemplo n.º 14
0
 def __init__(self, url):
     """Obtain the page for ``url`` via ``self.get_page`` and parse it.

     :param url: value passed to ``self.get_page``; the result is parsed
         with the ``'html.parser'`` parser.
     """
     base_init = BeautifulSoup.__init__
     page = self.get_page(url)
     base_init(self, page, 'html.parser')
Ejemplo n.º 15
0
	def __init__(self, raw_html, **kwargs):
		"""Parse ``raw_html`` with lxml, then run ``_parse_greeting`` on the soup.

		:param raw_html: markup passed to ``BeautifulSoup`` as ``markup``.
		:param kwargs: forwarded unchanged to ``_parse_greeting``.
		"""
		soup_options = {"markup": raw_html, "features": "lxml"}
		BeautifulSoup.__init__(self, **soup_options)
		_parse_greeting(self, **kwargs)
Ejemplo n.º 16
0
 def __init__(self,req):
     """Parse ``req.content`` with lxml and keep ``req`` on the instance.

     :param req: object exposing a ``content`` attribute; stored as
         ``self.req`` after parsing.
     """
     markup = req.content
     BS.__init__(self, markup, "lxml")
     self.req = req
Ejemplo n.º 17
0
 def __init__(self, html):
     """Parse ``html`` with the ``'html.parser'`` parser.

     :param html: markup source handed to ``BeautifulSoup`` unchanged.
     """
     parser_name = 'html.parser'
     BeautifulSoup.__init__(self, html, parser_name)
Ejemplo n.º 18
0
    def __init__(self, ip=None):
        """Fetch data for ``ip`` through ``fetch_data`` and parse it with lxml.

        :param ip: stored as ``self.ip`` and forwarded to ``fetch_data``;
            the fetched value is kept as ``self.data``.
        """
        self.ip = ip
        self.data = fetch_data(ip=self.ip)
        parser_name = 'lxml'
        BeautifulSoup.__init__(self, self.data, parser_name)
Ejemplo n.º 19
0
 def __init__(self, infile):
     """Parse ``infile`` with the parser features ``["lxml", "xml"]``.

     :param infile: markup source handed to ``_BS`` unchanged.
     """
     parser_features = ["lxml", "xml"]
     _BS.__init__(self, infile, parser_features)