 def testRows(self, MockClass):
     self.setUpMock(MockClass)
     html = self.addingMachine.main()
     p = MyHtmlParser()
     p.feed(html)
     self.assertEqual(p.allParagraphText.strip(), "The sum of 2 and 3 is 5",
                      "got {0}".format(p.allParagraphText))
Example #3
 def testHtml(self):
     text = self.addingMachine.run("AddingMachine.csv")
     parse = MyHtmlParser()
     parse.feed(text)
     lines = parse.allParagraphText.splitlines()
     lineCount = lines.count("The sum of 1 and 2 is 3")
     self.assertEqual(lineCount, 1,
                      "Bad sums. Line count: {0}".format(lineCount))
Example #5
def main():
    url = "https://www.airbnb.com/careers/departments/business-development"

    html = HtmlRetriever(url)
    print "length of html string:", len(html)

    parser = MyHtmlParser()
    parser.feed(html)
    print "parser links 2: ", parser.links
Example #6
def parserMetaInfo(context):
    # Parse the new content extracted from the front page to get title and url
    my = MyHtmlParser()
    my.feed(context)
    for k in my.kvs:
        print(k + " , " + my.kvs[k])
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        value = [k, my.kvs[k]]
        #sql = "insert into blog_news(title,url,source) values('%s', '%s', 'jobbole.com')"
        sql = "insert into blog_news(title,url,source,create_time,update_time)  \
        values('" + k + "', '" + my.kvs[k] + "','jobbole.com','" + now + "', '" + now + "')"
        print(sql)
        commitDate(sql)
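Concatenating values straight into the SQL string like this breaks as soon as a title contains a quote, and it is an injection risk. Assuming commitDate ultimately executes the statement through a DB-API 2.0 cursor (an assumption; the helper is not shown), a parameterized variant would be safer:

# Hypothetical rewrite of the insert above, assuming a DB-API cursor
# (e.g. pymysql); placeholders let the driver quote and escape values.
sql = ("insert into blog_news(title,url,source,create_time,update_time) "
       "values(%s, %s, 'jobbole.com', %s, %s)")
cursor.execute(sql, (k, my.kvs[k], now, now))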
Example #7
    def _fetch_and_parse(self, job_id, url, depth):
        """
        Fetch a webpage and parse it for links and images.

        Arguments:
            job_id: integer job id.
            url: string URL.
            depth: integer current depth.

        Returns: None.
        """

        html_parser = MyHtmlParser(url)
        request_headers = {'User-Agent': self.user_agent}
        request = urllib_Request(url, headers=request_headers)

        try:
            webpage = urlopen(request).read().decode()
        except Exception as error:
            data.redis.set(url, 'failed')
            return

        try:
            html_parser.feed(webpage)
        except HTMLParseError as error:
            # HTMLParseError was removed from html.parser in Python 3.5+,
            # so this clause assumes an older runtime or a compat shim.
            data.redis.set(url, 'failed')
            return

        data.add_webpages(url, html_parser.hyperlinks, depth)
        data.redis.set(url, 'complete')
        data.complete_crawl(url)

        if 0 < depth and self._active and not data.job_is_aborted(job_id):
            if html_parser.hyperlinks:
                data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
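A note on that except clause: html.parser stopped exporting HTMLParseError in Python 3.5, so on a modern interpreter the name has to come from somewhere else. One plausible arrangement (an assumption; the module's import block is not shown) is a small compatibility shim:

try:
    from html.parser import HTMLParseError  # only exists on Python < 3.5
except ImportError:
    # html.parser no longer raises this; define a stand-in so the
    # except clause above still compiles on newer interpreters.
    class HTMLParseError(Exception):
        pass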
Example #9
from MyHtmlParser import MyHtmlParser
from sources import data

if __name__ == '__main__':

    exclude = []
    for k in data:
        if k in exclude:
            continue
        for v in data[k]:
            parsed = MyHtmlParser(dataset_name=v.get("dataset"),
                                  url=v.get("url"),
                                  dataset_type=k)
            print(v.get('dataset'))
            parsed.set_range_dates(v.get("period"))
            parsed.retrieve_data()
Example #10
import urllib.request
from MyHtmlParser import MyHtmlParser

url = 'http://www.esdict.cn/home/dailysentence'

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}

# Build a Request so the User-Agent header is actually sent
# (the original defined headers but never used them).
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request, timeout=10)
html = response.read().decode('utf-8')

htmlParser = MyHtmlParser()
htmlParser.feed(html)

data = htmlParser.HTMLData

htmlParser.clean()

for line in data:
    if line[:4] == '每日一句':  # lines starting with "Daily Sentence"
        print(line)
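Here again the attributes in play (HTMLData, clean()) are assumptions about MyHtmlParser's interface. A plausible minimal reading, sketched below, is that handle_data appends each non-empty text node to HTMLData and clean() resets the list:

from html.parser import HTMLParser

class MyHtmlParser(HTMLParser):
    # Sketch only: accumulates every text node into HTMLData.
    def __init__(self):
        super().__init__()
        self.HTMLData = []

    def handle_data(self, data):
        text = data.strip()
        if text:
            self.HTMLData.append(text)

    def clean(self):
        self.HTMLData = []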