def testRows(self, MockClass):
    self.setUpMock(MockClass)
    html = self.addingMachine.main()
    p = MyHtmlParser()
    p.feed(html)
    self.assertEqual(p.allParagraphText.strip(),
                     "The sum of 2 and 3 is 5",
                     "got {0}".format(p.allParagraphText))
def testHtml(self):
    text = self.addingMachine.run("AddingMachine.csv")
    parse = MyHtmlParser()
    parse.feed(text)
    lines = parse.allParagraphText.splitlines()
    lineCount = lines.count("The sum of 1 and 2 is 3")
    # The original message called .format() on a string with no placeholder,
    # so the line count was silently discarded; include it directly instead.
    self.assertEqual(lineCount, 1,
                     "Bad sums. Line count: {0}".format(lineCount))
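None of these snippets include the MyHtmlParser class itself (each project defines its own variant). As a minimal sketch of what the tests above appear to rely on, assuming allParagraphText accumulates text found inside <p> elements and links collects <a href> values — the attribute names come from the snippets, the implementation is an assumption:

from html.parser import HTMLParser

class MyHtmlParser(HTMLParser):
    # Minimal sketch: attribute names are taken from the surrounding
    # snippets; the implementation itself is an assumption.

    def __init__(self):
        super().__init__()
        self.allParagraphText = ""   # text found inside <p> elements
        self.links = []              # href values of <a> elements
        self._in_paragraph = False

    def handle_starttag(self, tag, attrs):
        if tag == "p":
            self._in_paragraph = True
        elif tag == "a":
            for name, value in attrs:
                if name == "href" and value:
                    self.links.append(value)

    def handle_endtag(self, tag):
        if tag == "p":
            self._in_paragraph = False
            self.allParagraphText += "\n"   # one line per paragraph

    def handle_data(self, data):
        if self._in_paragraph:
            self.allParagraphText += data

With this sketch, feeding "<p>The sum of 2 and 3 is 5</p>" leaves allParagraphText.strip() equal to "The sum of 2 and 3 is 5", matching the assertion in testRows, and the per-paragraph newline is what makes splitlines() in testHtml count one line per sum.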
def main():
    url = "https://www.airbnb.com/careers/departments/business-development"
    html = HtmlRetriever(url)
    print("length of html string:", len(html))
    parser = MyHtmlParser()
    parser.feed(html)
    print("parser links 2:", parser.links)
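HtmlRetriever is not defined in any of these snippets; judging by its use, it fetches a URL and returns the page body as a string. A hypothetical stand-in under that assumption:

import urllib.request

def HtmlRetriever(url):
    # Hypothetical stand-in: fetch the page and return the decoded body.
    with urllib.request.urlopen(url, timeout=10) as response:
        return response.read().decode("utf-8", errors="replace")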
def parserMetaInfo(context):
    # Parse the new content extracted from the home page to get each title and url.
    my = MyHtmlParser()
    my.feed(context)
    for k in my.kvs:
        print(k + " , " + my.kvs[k])
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        sql = ("insert into blog_news(title,url,source,create_time,update_time) "
               "values('" + k + "', '" + my.kvs[k] + "', 'jobbole.com', '" + now + "', '" + now + "')")
        print(sql)
        commitDate(sql)
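Building the INSERT by string concatenation breaks as soon as a title contains a quote, and it is open to SQL injection. A safer sketch using DB-API parameter binding — shown here with sqlite3's ? placeholders as a stand-in (MySQL drivers such as MySQLdb/PyMySQL use %s instead), with the connection object being an assumption:

import datetime
import sqlite3  # stand-in driver; the original targets a MySQL table

def insert_news(conn, title, url):
    # The driver escapes title/url itself, so a quote in a post title
    # can no longer break out of (or inject into) the statement.
    now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    conn.execute(
        "insert into blog_news(title, url, source, create_time, update_time) "
        "values (?, ?, 'jobbole.com', ?, ?)",
        (title, url, now, now),
    )
    conn.commit()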
def _fetch_and_parse(self, job_id, url, depth):
    """
    Fetch a webpage and parse it for links and images.

    Arguments:
        job_id: integer job id.
        url: string URL.
        depth: integer current depth.

    Returns:
        None.
    """
    html_parser = MyHtmlParser(url)
    request_headers = {'User-Agent': self.user_agent}
    request = urllib_Request(url, headers=request_headers)
    try:
        webpage = urlopen(request).read().decode()
    except Exception:
        data.redis.set(url, 'failed')
        return
    try:
        html_parser.feed(webpage)
    except HTMLParseError:
        data.redis.set(url, 'failed')
        return
    data.add_webpages(url, html_parser.hyperlinks, depth)
    data.redis.set(url, 'complete')
    data.complete_crawl(url)
    if 0 < depth and self._active and not data.job_is_aborted(job_id):
        if html_parser.hyperlinks:
            data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
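A caveat on the second except clause above: html.parser.HTMLParseError was deprecated in Python 3.3 and removed in 3.5, where HTMLParser.feed() no longer raises it. On a modern interpreter that clause would itself fail with a NameError if parsing ever errored, so a broader guard is the safer assumption:

try:
    html_parser.feed(webpage)
except Exception:  # feed() rarely raises on Python 3, but keep the guard
    data.redis.set(url, 'failed')
    return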
from MyHtmlParser import MyHtmlParser
from sources import data

if __name__ == '__main__':
    exclude = []
    for k in data:
        if k in exclude:
            continue
        for v in data[k]:
            parsed = MyHtmlParser(dataset_name=v.get("dataset"),
                                  url=v.get("url"),
                                  dataset_type=k)
            print(v.get('dataset'))
            parsed.set_range_dates(v.get("period"))
            parsed.retrieve_data()
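sources.data is not shown; from the way the loop consumes it, it is presumably a mapping from dataset type to a list of entry dicts. A hypothetical example of that shape, with all keys inferred from the v.get() calls above:

# Hypothetical shape of sources.data, inferred from the loop above.
data = {
    "csv": [
        {"dataset": "example-dataset",
         "url": "https://example.org/data.csv",
         "period": ("2019-01-01", "2019-12-31")},
    ],
}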
import urllib.request
import urllib.parse
import urllib.error
import http.cookiejar
import html.parser

from MyHtmlParser import MyHtmlParser

url = 'http://www.esdict.cn/home/dailysentence'
headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}
# Wrap the URL in a Request so the custom User-Agent header is actually sent.
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request, timeout=10)
html = response.read().decode('utf-8')
htmlParser = MyHtmlParser()
htmlParser.feed(html)
data = htmlParser.HTMLData
htmlParser.clean()
for line in data:
    if line[:4] == '每日一句':  # '每日一句' means "daily sentence"
        print(line)