def _isFile(self, filename): isFile = re.match('[\w]+.html', filename) if not isinstance(isFile, types.NoneType): content = open(filename) self.q = gumbo.soup_parse(content.read()) return True else: return False
def _isUrl(self, url): isUrl = re.match('^http', url) if not isinstance(isUrl, types.NoneType): response = urllib2.urlopen(url) self.q = gumbo.soup_parse(response.read()) return True else: return False
def raw_urls():
    """Fetch the FDA 510(k) clearances index page and return the .zip link targets.

    Best-effort: any failure (network error, parse error) yields an empty
    list rather than raising.

    :return: list of href strings ending in .zip (entries may be None when an
             anchor matched but had no usable href attribute)
    """
    def link_extractor(attr_array):
        # BS3 exposes tag attributes as a list of (name, value) pairs.
        for t in attr_array:
            if len(t) == 2:
                (href, link) = t
                if href == "href" and len(link) > 0:
                    return link
        return None

    urls = []
    try:
        req = requests.get("http://www.fda.gov/MedicalDevices/ProductsandMedicalProcedures/DeviceApprovalsandClearances/510kClearances/ucm089428.htm")
        soup = gumbo.soup_parse(req.text)
        links = soup.findAll('a', href=re.compile(r'.*\.zip'))
        attrs = map(lambda x: x.attrs, links)
        urls = map(link_extractor, attrs)
    except Exception:
        # Narrowed from a bare `except:`, which would also swallow
        # KeyboardInterrupt and SystemExit.
        urls = []
    return urls
def benchmark_gumbo_bs3(): parser = gumbo.soup_parse(html_unicode) divs = parser.findAll("div") print "gumbo bs3", len(divs)
def setHtml(self, html):
    """Parse the given HTML string and store the resulting soup on self.q."""
    parsed = gumbo.soup_parse(html)
    self.q = parsed