Example #1
def __init__(self, db, filename):
    'Harvest articles from the list of feeds in filename.'
    self.db = db
    self.filename = filename
    self.htmlparser = HtmlParser()
    feedlist = self.read_feed_list(filename)
    self.articles = self.parse_feedlist(feedlist)
Example #2
def load(html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    # Extract the text of the first element matching each configured xpath.
    values = {}
    for key in xpaths:
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:  # skip keys whose xpath matched nothing
            continue
        value = elements[0][2].encode('utf-8')
        values[key] = value  # keep the extracted text keyed by field name
    return values
Example #3
def load(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()

    # INSERT statement for the job_detail table (assembled here but not used further in this snippet).
    db_sql = "insert into job_detail(url,src_desc,type,title,\
    keywords,department,job_require,job_duty,\
    job_welfare,label,company,company_desc,\
    logo,salary,work_experience,\
    edu, field,location,head_count,pub_time) values("

    jd = page_pb2.JobDescription()  # protobuf job record (population via set_pb is commented out below)
    # Assemble a JSON object by string concatenation, starting with the publish time and URL.
    js = "{\"pub_tm\":\"" + tm + "\","
    js = js + "\"url\":\"" + url + "\","
    for key in xpaths:
#        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    # Write the record to ./data/<id>.dat, dropping the trailing comma.
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
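
This example (and the near-identical Example #9 below) assembles the JSON record by string concatenation, which produces invalid output as soon as an extracted value contains a quote, backslash, or newline. Below is a minimal Python 3 sketch of the same extraction loop using the standard json module; it assumes the HtmlParser interface shown above (constructor taking the HTML and an encoding, parse(), and get_element_by_xpath() returning tuples whose third field is the element text), and the function and file names are illustrative:

import json
from htmlparser import HtmlParser  # module path as imported in Example #14

def load_record(id, tm, url, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    record = {"pub_tm": tm, "url": url}
    for key, xpath in xpaths.items():
        elements = parser.get_element_by_xpath(xpath, encode)
        if not elements:
            print("[ERR] " + key)
            continue
        record[key] = elements[0][2]  # assumed: third tuple field holds the element text, as used above
    # json.dumps handles the quoting and escaping that manual concatenation does not
    with open("./data/" + id + ".dat", "w") as fp:
        fp.write(json.dumps(record, ensure_ascii=False))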
Example #4
def __init__(self,
             url,
             number_of_threads=20,
             allowed_urls=[],
             blocked_urls=[],
             basic_auth=(),
             depth=-1):
    self.url = url
    self.number_of_threads = number_of_threads
    self.allowed_urls = allowed_urls
    # self.blocked_urls = blocked_urls
    self.lost_url = set()
    self.basic_auth = basic_auth
    self.depth = depth
    self.crawl = True
    self.visited = {}
    self.general_visited = set()
    self.unvisited = set()
    self.general_unvisited = {self.url}
    self.fetched_url_record = dict()
    self.csv_table = CsvFormat([
        "url", "status code", "title", "keyword", "description", "h1",
        "h2", "h3", "h4", "h5", "h6", "index", "open tags",
        "external links", "h_tag_format"
    ])
    self.downloaded_pages = {}
    self.record = []
    self.url_parser = UrlParser(url)
    self.parser = HtmlParser()
    self.filemanager = FileManager()
Example #5
    def _toString(self):
        htmlParser = HtmlParser('https://www.worldometers.info/coronavirus/')
        htmlParser.parse()

        timeStr = time.strftime("%d %b %Y %H:%M:%S", time.gmtime())
        # Russian status message: "Statistics of infections as of <time>", then "Infected:", "Deaths:", "Recovered:".
        text = ("Статистика зараженных на " + timeStr +
                "\nЗараженных: " + htmlParser.getContent()[0] +
                "\nУмерших: " + htmlParser.getContent()[1] +
                "\nВыздоровевших: " + htmlParser.getContent()[2])

        return text
Example #6
def crawl(init_url):
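    # Download each page, parse out its content and the next URL to visit, and write the content until no next URL is returned.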
    url_pool = UrlManager()
    downloader = Downloader()
    parser = HtmlParser()
    outputer = Outputer()
    temp_url = init_url
    while temp_url:
        driver = downloader.download(temp_url)
        content, temp_url = parser.parse(driver)
        outputer.write(content)
    outputer.close()
Example #7
def parse_feed(self, feed):
    'Extract list of articles from the feed.'
    articles = []
    htmlparser = HtmlParser()
    for e in feed.entries[:1]:  # read just the first entry while debugging
        article = Article(source=e.author, title=e.title, link=e.link)
        content = htmlparser.parse(e.link)
        article.content = re.sub(r' -.*$', '', content)
        article.save()  # and associated word frequencies
        articles.append(article)
    return articles
Example #8
    def diff_html_from_file(cls, fileName1, fileName2, encode):
        '''Get the differing elements between two HTML files.
        '''

        if fileName1 == "" or fileName2 == "":
            print("class differ : function diff_html_from_file() fileName1 or fileName2 is null")
            return []

        # Parse the first file and dump its elements to a temporary text file.
        html_str1 = open(fileName1, "rb").read()
        html_Parser1 = HtmlParser(html_str1, encode)
        elements1 = html_Parser1.parse()
        html_Parser1.saveElementsToFile(elements1, "./tmp1.txt")

        # Parse the second file the same way.
        html_str2 = open(fileName2, "rb").read()
        html_Parser2 = HtmlParser(html_str2, encode)
        elements2 = html_Parser2.parse()
        html_Parser2.saveElementsToFile(elements2, "./tmp2.txt")

        # Diff the two element dumps line by line.
        diffs = cls.diff_txt_from_file("tmp1.txt", "tmp2.txt")
        return diffs
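
A hypothetical invocation, assuming the classmethod above is defined on a class named Differ (the enclosing class is not shown in the snippet):

diffs = Differ.diff_html_from_file("old.html", "new.html", "utf-8")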
Example #9
def load(id, html, encode, xpaths):
    parser = HtmlParser(html, encode)
    parser.parse()
    jd = page_pb2.JobDescription()  # protobuf job record (population via set_pb is commented out below)
    # Assemble a JSON object by string concatenation.
    js = "{"
    for key in xpaths:
        #        print "[ON]handle " + key
        xpath = xpaths.get(key)
        elements = parser.get_element_by_xpath(xpath, encode)
        if len(elements) == 0:
            print("[ERR] " + key)
            continue
        value = elements[0][2].encode('utf-8')
        js += "\"" + key + "\":\"" + value + "\","
#        set_pb(jd,key,value)
    # Write the record to ./data/<id>.dat, dropping the trailing comma.
    fp = open("./data/" + id + ".dat", 'w')
    fp.write(js.rstrip(',') + "}")
    fp.close()
Example #10
def main():
    # initialize argument parser
    parser = argparse.ArgumentParser()

    # add arguments
    parser.add_argument('url')
    parser.add_argument('keyword')

    # get arguments
    args = parser.parse_args()

    # set keyword and url from arguments
    keyword = args.keyword
    url = args.url

    # do a get request and get html from url
    response = do_request(url)

    # check if keyword is in response
    if keyword in response:
        print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(url))
        results = process_source(response, keyword)

    # initialize html parser
    parser = HtmlParser()

    # parse links from parser
    links = parser.feed(response)

    # iterate through collected links
    for link in links:
        # get the css or js file behind the links
        response = do_request(link)

        # check if keyword is in css or js file
        if keyword in response:
            print(Fore.BLUE + '==>' + Fore.RESET + ' {}'.format(link))
            results = process_source(response, keyword)
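
Assuming the script above is saved as, say, search.py (the actual file name is not shown), the two positional arguments map to an invocation such as:

python search.py https://example.com analytics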
Example #11
def __init__(self, root_url):
    self.parser = HtmlParser()
    self.storage = DataStore()
    self._get_root_urls(root_url)
Example #12
def __init__(self):
    self.manage = UrlManager()
    self.parser = HtmlParser()
    self.downloader = Htmldownloader()
    self.output = DataOutput()
Example #13
def __init__(self):
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()
Example #14
import requests
from bs4 import BeautifulSoup
from htmlparser import HtmlParser
from urlparser import UrlParser
from time import sleep
import codecs
import json
import pandas as pd

# Shared crawl state: visited/unvisited URL sets, the target domain and start URL, parser helpers, and collected result rows.
visited = set()
unvisited = set()
domain = 'www.motoji.co.jp'
siteUrl = f"https://{domain}/"
praser_url = UrlParser(siteUrl)
parser_html = HtmlParser()
DATA = []

def get_res(url):
    '''Fetch url with a custom User-Agent, without following redirects; return False on any request error.'''
    headers_pc = {'User-Agent': 'robot wpmake'}
    try:
        res = requests.get(url, headers=headers_pc, timeout=5.0, allow_redirects=False)
        return res
    except requests.exceptions.RequestException as e:
        print(e)
        return False

def update_data(url, status_code):
    '''Record the URL and its HTTP status code.'''
    DATA.append({"url": url, "status_code": status_code})