 def testRows(self, MockClass):
     self.setUpMock(MockClass)
     html = self.addingMachine.main()
     p = MyHtmlParser()
     p.feed(html)
     self.assertEqual(p.allParagraphText.strip(), "The sum of 2 and 3 is 5",
                      "got {0}".format(p.allParagraphText))
Example #3
 def testHtml(self):
     text = self.addingMachine.run("AddingMachine.csv")
     parse = MyHtmlParser()
     parse.feed(text)
     lines = parse.allParagraphText.splitlines()
     lineCount = lines.count("The sum of 1 and 2 is 3")
     self.assertEqual(lineCount, 1,
                      "Bad sums. Line count: {0}".format(lineCount))
Example #5
def main():
    url = "https://www.airbnb.com/careers/departments/business-development"

    html = HtmlRetriever(url)
    print "length of html string:", len(html)

    parser = MyHtmlParser()
    parser.feed(html)
    print "parser links 2: ", parser.links
Example #6
def parserMetaInfo(context):
    # Parse the new content extracted from the front page to get title and url
    my = MyHtmlParser()
    my.feed(context)
    for k in my.kvs:
        print(k + " , " + my.kvs[k])
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        value = [k, my.kvs[k]]
        #sql = "insert into blog_news(title,url,source) values('%s', '%s', 'jobbole.com')"
        sql = "insert into blog_news(title,url,source,create_time,update_time)  \
        values('" + k + "', '" + my.kvs[k] + "','jobbole.com','" + now + "', '" + now + "')"
        print(sql)
        commitDate(sql)
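Concatenating values straight into the SQL string like this breaks as soon as a title contains a quote, and it is an injection risk. Assuming commitDate ultimately executes the statement through a DB-API 2.0 cursor (an assumption; the helper is not shown), a parameterized variant would be safer:

# Hypothetical rewrite of the insert above, assuming a DB-API cursor
# (e.g. pymysql); placeholders let the driver quote and escape values.
sql = ("insert into blog_news(title,url,source,create_time,update_time) "
       "values(%s, %s, 'jobbole.com', %s, %s)")
cursor.execute(sql, (k, my.kvs[k], now, now))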
Example #7
    def _fetch_and_parse(self, job_id, url, depth):
        """
        Fetch a webpage and parse it for links and images.

        Arguments:
            job_id: integer job id.
            url: string URL.
            depth: integer current depth.

        Returns: None.
        """

        html_parser = MyHtmlParser(url)
        request_headers = {'User-Agent': self.user_agent}
        request = urllib_Request(url, headers=request_headers)

        try:
            webpage = urlopen(request).read().decode()
        except Exception as error:
            data.redis.set(url, 'failed')
            return

        try:
            html_parser.feed(webpage)
        except HTMLParseError as error:
            # HTMLParseError was removed from html.parser in Python 3.5+,
            # so this clause assumes an older runtime or a compat shim.
            data.redis.set(url, 'failed')
            return

        data.add_webpages(url, html_parser.hyperlinks, depth)
        data.redis.set(url, 'complete')
        data.complete_crawl(url)

        if 0 < depth and self._active and not data.job_is_aborted(job_id):
            if html_parser.hyperlinks:
                data.redis.sadd('job' + str(job_id), *html_parser.hyperlinks)
            data.redis.publish('deploy', pickle.dumps(job_id))
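A note on that except clause: html.parser stopped exporting HTMLParseError in Python 3.5, so on a modern interpreter the name has to come from somewhere else. One plausible arrangement (an assumption; the module's import block is not shown) is a small compatibility shim:

try:
    from html.parser import HTMLParseError  # only exists on Python < 3.5
except ImportError:
    # html.parser no longer raises this; define a stand-in so the
    # except clause above still compiles on newer interpreters.
    class HTMLParseError(Exception):
        pass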
Example #9
from MyHtmlParser import MyHtmlParser
from sources import data

if __name__ == '__main__':

    exclude = []
    for k in data:
        if k in exclude:
            continue
        for v in data[k]:
            parsed = MyHtmlParser(dataset_name=v.get("dataset"),
                                  url=v.get("url"),
                                  dataset_type=k)
            print(v.get('dataset'))
            parsed.set_range_dates(v.get("period"))
            parsed.retrieve_data()
Example #10
import urllib.request
from MyHtmlParser import MyHtmlParser

url = 'http://www.esdict.cn/home/dailysentence'

headers = {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0"}

# Build a Request so the User-Agent header is actually sent
# (the original defined headers but never used them).
request = urllib.request.Request(url, headers=headers)
response = urllib.request.urlopen(request, timeout=10)
html = response.read().decode('utf-8')

htmlParser = MyHtmlParser()
htmlParser.feed(html)

data = htmlParser.HTMLData

htmlParser.clean()

for line in data:
    if line[:4] == '每日一句':  # lines starting with "Daily Sentence"
        print(line)
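Here again the attributes in play (HTMLData, clean()) are assumptions about MyHtmlParser's interface. A plausible minimal reading, sketched below, is that handle_data appends each non-empty text node to HTMLData and clean() resets the list:

from html.parser import HTMLParser

class MyHtmlParser(HTMLParser):
    # Sketch only: accumulates every text node into HTMLData.
    def __init__(self):
        super().__init__()
        self.HTMLData = []

    def handle_data(self, data):
        text = data.strip()
        if text:
            self.HTMLData.append(text)

    def clean(self):
        self.HTMLData = []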