def parse(self, data):
    """Find the "XLS" link in an HTML page, download it and read the file.

    The page is parsed with ``html5parser.HTMLParser`` using the character
    encoding reported by ``chardet.detect`` (when it reports one).  The first
    anchor whose text is exactly ``XLS`` is resolved against ``self.url``,
    fetched with ``httpc.get_`` and written to ``self.filename``; that file
    is then passed to ``self.readFile``.

    :param data: raw HTML of the page that links to the spreadsheet.
    :returns: the result of ``self.readFile(self.filename)``, or ``None``
        when the page contains no "XLS" link.
    """
    treebuilder = treebuilders.getTreeBuilder("etree", etree)
    # NOTE: the original call had ``tokenizer=sanitizer.HTMLSanitizer``
    # commented out; it stays disabled here.
    parser = html5parser.HTMLParser(tree=treebuilder)

    # Default to None so the parser is simply called without an explicit
    # encoding when detection yields nothing (previously ``encoding`` could
    # be left unbound and raise NameError).
    detected = chardet.detect(data)
    encoding = detected.get('encoding') if detected else None

    if encoding is None:
        document = parser.parse(StringIO(data))
    else:
        document = parser.parse(StringIO(data), encoding=encoding)

    # Locate url
    links = document.xpath('//a[text()="XLS"]')
    if not links:
        return None

    url = urljoin(self.url, links[0].attrib['href'])

    # Error statuses are listed in ``ok`` as well, so the body is saved
    # whatever the server answered (presumably deliberate - TODO confirm).
    _status, _headers, body = httpc.get_(
        url, ok=[500, 501, 300, 301, 503, 200, 404, 405])

    # ``with`` guarantees the handle is closed even if the write fails.
    with open(self.filename, 'wb') as out:
        out.write(body)

    return self.readFile(self.filename)
def test_get_(self):
    """``httpc.get_`` returns the status, headers and body of a GET request.

    Requests ``self.base_url() + 'hello'`` and checks the 200 status, the
    ``x-get`` response header and the response body.
    """
    status, headers, body = httpc.get_(self.base_url() + 'hello')

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(status, 200)
    self.assertEqual(headers.dict['x-get'], 'hello')
    self.assertEqual(body, 'hello world')
def get(self, url):
    """Fetch ``url`` with ``httpc.get_`` and return only the response body.

    The status codes listed below are passed to ``httpc.get_`` as ``ok``;
    the status and headers of the response are discarded.

    :param url: address to request.
    :returns: the third element of the ``httpc.get_`` result (the body).
    """
    accepted_statuses = [500, 501, 300, 301, 503, 200, 404, 405]
    response = httpc.get_(url, ok=accepted_statuses)
    # Unpack all three fields so a malformed result still raises here.
    _status, _headers, payload = response
    return payload