Python HtmlDom Examples, htmldom.htmldom.HtmlDom Python Examples

Example #1

0

Show file

    def __init__(self, geocoder):
        """Scrape the Last.fm Brussels event listing and index what it finds.

        Two Brussels venues are geocoded up front and seeded into
        ``venue_dict``.  Every event linked from the listing page is then
        fetched, wrapped in a ``LastfmEvent`` and added to each of its
        artists.  The names of all collected events are printed at the end.

        Args:
            geocoder: Object exposing ``geocode(address)``; it is also kept on
                the instance as ``self.geocoder``.
        """
        seed_venues = [
            self.LastfmVenue(
                'Ancienne Belgique',
                geocoder.geocode('Anspachlaan 110, Brussels, 1000, Belgium')),
            self.LastfmVenue(
                'Botanique',
                geocoder.geocode('Rue Royale 236, Brussels, 1210, Belgium')),
        ]
        self.geocoder = geocoder
        self.venue_dict = {venue.name: venue for venue in seed_venues}
        self.events = []
        self.artist_dict = {}

        site_root = 'https://www.last.fm'
        listing_url = 'https://www.last.fm/events?location_0=Brussels,+Belgium&location_1=50.8503396&radius=10000&location_2=4.351710300000036'
        listing_dom = htmldom.HtmlDom(listing_url).createDom()
        for title_node in listing_dom.find('div.events-list-item-event--title'):
            # Each title wraps a relative link to the event's detail page.
            detail_url = site_root + title_node.children().attr('href')
            detail_dom = htmldom.HtmlDom(detail_url).createDom()

            title = detail_dom.find('h1.header-title').text().strip()
            performers = self.get_or_create_artists(detail_dom)
            image_url = self.get_img_link(detail_dom)
            homepage = self.get_official_page(detail_dom)
            starts_at = self.get_datetime(detail_dom)
            venue = self.get_or_create_venue(detail_dom)

            event = self.LastfmEvent(title, venue, image_url, homepage,
                                     starts_at)
            self.events.append(event)
            for performer in performers:
                performer.add(event)

        self.venues = self.venue_dict.values()
        self.artists = self.artist_dict.values()
        for event in self.events:
            print(event.name)

Example #2

0

Show file

def compare(h1, h2, isShort):
    """Align two HTML documents and return the combined result as a list.

    Both documents are parsed with htmldom, flattened with ``DOMtoArray``,
    aligned with ``nw_align``, and the two alignments are handed to
    ``roadRunner`` together with the result list.  Afterwards every anchor
    opening tag in the result is collapsed to ``<a href=...>``.

    Args:
        h1: HTML text of the first document.
        h2: HTML text of the second document.
        isShort: Passed through unchanged to ``roadRunner``.

    Returns:
        list: ``"<body>"``, whatever ``roadRunner`` added, then ``"</body>"``.
    """
    result = ["<body>"]
    dif1 = htmldom.HtmlDom().createDom(h1)
    dif2 = htmldom.HtmlDom().createDom(h2)

    array1 = []
    DOMtoArray(dif1.referenceToRootElement, array1)

    array2 = []
    DOMtoArray(dif2.referenceToRootElement, array2)

    # The third value returned by nw_align is not needed here.
    almA, almB, _ = nw_align(array1, array2)

    roadRunner(almA, almB, result, isShort)

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    # Slice assignment keeps the same list object that roadRunner received.
    result[:] = [
        re.sub(r"<a\shref=.+?>", '<a href=...>', entry) for entry in result
    ]

    result.append("</body>")

    return result

Example #3

0

Show file

File: bakalari.py Project: Greenscreener/Bakalari

    def getGrades(self):
        """Fetch the grade overview and return the grades grouped by subject.

        ``prehled.aspx?s=2`` is requested once to read the ``__VIEWSTATE`` and
        ``__EVENTVALIDATION`` form tokens, then requested again as a POST with
        the ``Checktypy`` and ``Checkdatumy`` fields set to ``"on"``
        (presumably so the type and date rows are rendered -- verify against
        the site).  Each child of ``table.radekznamky`` is treated as one
        subject.

        Returns:
            list: One dict per subject with keys ``'name'`` and ``'grades'``;
            each grade is a dict with ``'title'``, ``'grade'``, ``'date'`` and
            ``'weight'``.
        """
        overviewUrl = self.baseUrl + 'prehled.aspx?s=2'
        html = self.opener.open(overviewUrl).read()

        dom = htmldom.HtmlDom().createDom(html)
        vs = dom.find("input#__VIEWSTATE").attr("value")
        ev = dom.find("input#__EVENTVALIDATION").attr("value")

        postData = dict()
        postData["__EVENTTARGET"] = "ctl00$cphmain$Checkdetail"
        postData["__EVENTARGUMENT"] = ""
        postData["__LASTFOCUS"] = ""
        postData["__VIEWSTATE"] = vs
        postData["__EVENTVALIDATION"] = ev
        postData["ctl00$cphmain$Flyout2$Checktypy"] = "on"
        postData["ctl00$cphmain$Flyout2$Checkdatumy"] = "on"

        html = self.opener.open(overviewUrl,
                                urllib.urlencode(postData)).read()

        dom = htmldom.HtmlDom().createDom(html)
        subjects = dom.find("table.radekznamky").children().toList()

        subjectsArr = list()
        for sub in subjects:
            subName = sub.firstChild().firstChild().firstChild().firstChild(
            ).text

            subDom = htmldom.HtmlDom().createDom(sub.html())
            # The date and type rows are the same for every grade of this
            # subject, so look them up once instead of once per grade cell.
            dateCells = subDom.find("tr.datum").find("td").toList()
            weightCells = subDom.find("tr.typ").find("td").toList()

            grades = []
            gradeCells = subDom.find("tr.detznamka").find("td").toList()
            for gradeInd, grade in enumerate(gradeCells):
                # Cells in the three rows are matched up by column position.
                grades.append({
                    "title": grade.attr("title").strip(),
                    "grade": grade.getText().strip(),
                    "date": dateCells[gradeInd].getText().strip(),
                    "weight": weightCells[gradeInd].getText().strip()
                })

            subjectsArr.append({'name': subName, "grades": grades})
        return subjectsArr

Example #4

0

Show file

def action_wrapper(hermes, intentMessage, conf):
    """Answer a bus-departure intent with the next two departure times.

    When the intent carries an ``Uhrzeit`` slot, its ``"YYYY-MM-DD HH:MM..."``
    value is sent as ``date`` / ``time`` query parameters and the spoken
    sentence mentions today, tomorrow or the explicit date.  The first line of
    the text of the first two ``li[class=item]`` elements of the response is
    used as the departure times.

    Args:
        hermes: Object used to end the session via ``publish_end_session``.
        intentMessage: Intent message providing ``slots.Uhrzeit`` and
            ``session_id``.
        conf: Unused by this function.
    """
    # Work on a copy: writing the time/date into the shared PARAMS would make
    # a later request without a time slot silently reuse this request's values.
    params = dict(PARAMS)

    if len(intentMessage.slots.Uhrzeit) != 0:
        dtti = intentMessage.slots.Uhrzeit.first().value
        today = datetime.now().date()
        tomorrow = today + timedelta(days=1)

        stamp_parts = dtti.split(' ')
        date_obj = datetime.strptime(stamp_parts[0], "%Y-%m-%d").date()
        clock = stamp_parts[1].split(':')
        time_obj = clock[0] + ":" + clock[1]  # hours and minutes only

        params['time'] = time_obj
        params['date'] = date_obj.strftime("%d.%m.%Y")
        if date_obj == today:
            result_sentence = "Heute nach " + time_obj + " Uhr fährt der Bus um "
        elif date_obj == tomorrow:
            result_sentence = "Morgen nach " + time_obj + " Uhr fährt der Bus um "
        else:
            result_sentence = "Am " + date_obj.strftime(
                "%d.%m.%Y") + " nach " + time_obj + " Uhr fährt der Bus um "
    else:
        result_sentence = "Der Bus fährt um "

    r = requests.get(url=URL, params=params)
    dom = htmldom.HtmlDom().createDom(r.text)
    a = dom.find("li[class=item]")
    result_sentence += a[0].text().split(
        '\n')[0] + " Uhr und dann um " + a[1].text().split('\n')[0] + " Uhr"
    current_session_id = intentMessage.session_id
    hermes.publish_end_session(current_session_id, result_sentence)

Example #5

0

Show file

File: bakalari.py Project: Greenscreener/Bakalari

    def getTimetable(self):
        """Download ``prehled.aspx?s=6`` and print its ``table.rozbunka`` markup.

        Nothing is returned; the HTML of the matched table goes to stdout.
        """
        page = self.opener.open(self.baseUrl + 'prehled.aspx?s=6').read()
        timetable = htmldom.HtmlDom().createDom(page).find("table.rozbunka")
        # Single-argument call form prints the same on Python 2 and 3.
        print(timetable.html())

Example #6

0

Show file

File: AppParser.py Project: KonH/GParser

    def get_info(self, bundle):
        """Fetch the page for ``bundle`` and print selected app details.

        Prints the icon ``src``, the app name, the description and the recent
        changes, each located through its ``div`` class.  Nothing is returned.
        """
        page_content = Parser.get_page_content(self.get_url(bundle))

        dom = htmldom.HtmlDom()
        dom.createDom(page_content)
        details = dom.find('div[class="details-info"]')

        # Icon: the <img> inside the cover container.
        icon = details.find('div[class="cover-container"]').find("img").attr(
            "src")
        print("Icon: " + icon)

        # Name: the <div> below the document title in the info container.
        title_box = details.find('div[class="info-container"]').find(
            'div[class="document-title"]')
        name = title_box.find("div").text()
        print("Name: " + name)

        # TODO: extract the category; the markup looks like
        #   <a class="document-subtitle category"
        #      href="/store/apps/category/GAME_ACTION">
        #     <span itemprop="genre">...</span> </a>

        # Description
        # TODO: narrow the search instead of scanning the whole document
        # TODO: guard against a missing element
        about = dom.find('div[class="id-app-orig-desc"]')
        print("Description: " + about.text())

        # Recent changes
        changelog = dom.find('div[class="recent-change"]')
        print("Recent changes: " + changelog.text())

Example #7

0

Show file

def main():
    """Fill the ``Pogoda`` table with weather entries for days not stored yet.

    Iterates over the dates produced by ``daterange`` from 2013-05-01 to
    today.  For every date for which ``check_if_exist`` is false, the day's
    page is fetched with ``get_weather``, each ``li.weather-entry`` node is
    parsed with ``parse_line`` and inserted as one row, and the day is
    committed.  A failure for one date is reported and the loop continues.
    """
    db_path = '../data/'

    conn = sqlite3.connect(db_path + 'database/db')
    try:
        c = conn.cursor()

        start_date = datetime.date(2013, 5, 1)
        end_date = datetime.datetime.now().date()
        for single_date in daterange(start_date, end_date):
            d = single_date.strftime("%d-%m-%Y")
            # Assumption: if a day's data was not downloaded, that day is not
            # in the database at all.
            try:
                if not check_if_exist(conn, c, d):
                    print("Downloading " + d + "... ", end="")
                    dom = htmldom.HtmlDom().createDom(get_weather(d))
                    for node in dom.find('li.weather-entry'):
                        godz, temp, kier, pred, zach, wilg = parse_line(node)
                        # Columns: date, hour, temp, wind direction,
                        # wind speed, precipitation, humidity.
                        # NOTE(review): values are interpolated into the SQL
                        # text; switching to "?" placeholders needs a look at
                        # what value_or_null returns.
                        c.execute(
                            "insert or ignore into Pogoda values ('%s', %s, %s, '%s', %s, %s, %s)"
                            % (d, godz, value_or_null(temp),
                               value_or_null(kier), value_or_null(pred),
                               value_or_null(zach), value_or_null(wilg)))
                    print("done.")
                    conn.commit()
            except Exception:
                # Drop rows inserted before the failure; otherwise the next
                # day's commit would store a partial day, which would then be
                # treated as already downloaded.
                conn.rollback()
                print("Could not download and parse -> " + d)
        print("Updated.")
    finally:
        conn.close()

Example #8

0

Show file

File: scraper.py Project: aliharis/shopify-scraper

    def get_product_reviews(self, product_id):
        """Return the review texts and the aggregate rating of a product.

        Args:
            product_id: Product identifier placed in the reviews request URL.

        Returns:
            tuple: ``(reviews, rating)``.  ``reviews`` is a list with the text
            of every ``.spr-review-content-body`` element; ``rating`` is the
            ``ratingValue`` of the JSON embedded in the aggregate-rating
            markup.  ``([], 0)`` is returned when the request fails or the
            payload has no reviews, and ``(reviews, 0)`` when the rating
            markup contains no ``<script>`` block.
        """
        try:
            url = f'https://productreviews.shopifycdn.com/proxy/v4/reviews/product?product_id={product_id}&version=v4&shop={self.shopify_url}'
            product = requests.get(url).json()
        except Exception:
            # Reviews are not available for the site.  Best effort, but no
            # longer swallowing KeyboardInterrupt / SystemExit.
            return [], 0

        # Check whether there are any reviews for the product.
        if product["reviews"] == "" or product["aggregate_rating"] == "":
            return [], 0

        dom = htmldom.HtmlDom().createDom(product["reviews"])
        reviews = [
            element.find('.spr-review-content-body').text()
            for element in dom.find(".spr-review")
        ]

        # The aggregate rating is JSON wrapped in a <script> element.
        m = re.search(r'<script[\s\S]*?>([\s\S]*?)</script>',
                      product["aggregate_rating"])
        if m is None:
            return reviews, 0
        aggregate_rating = json.loads(m.group(1))

        return reviews, aggregate_rating["ratingValue"]

Example #9

0

Show file

def parse(html):
	"""Extract ``[bank name, bank code]`` pairs from the bank table markup.

	Each bank is expected as one table row shaped like:

		<tr>
			<td class="bank_textcol">row number</td>
			<td class="bank_namecol">bank logo</td>
			<td class="bank_textcol text-left">bank name</td>
			<td class="bank_codecol">bank code</td>
		</tr>

	The code cells are selected by class, and the name is read from the
	element returned by ``prev()`` on each code cell.

	Returns:
		list: One ``[name, code]`` list per code cell, both stripped.
	"""
	code_cells = HTML.HtmlDom().createDom(html).find("td.bank_codecol")

	return [
		[
			code_cells[idx].prev().text().strip(),
			code_cells[idx].text().strip()
		]
		for idx in range(code_cells.length())
	]

Example #10

0

Show file

    def fetch_data(self, url):
        """Page through a product listing and collect the id of every item.

        Pages ``url?page=1``, ``url?page=2``, ... are requested until a page
        contains no ``.product-item`` element.  For each item the ``data-id``
        attribute is used when present; otherwise the number is taken from a
        ``-p<digits>.html`` fragment of its ``href``.  Items with neither
        attribute are skipped.  After each page ``self.post_data()`` is
        called.

        Args:
            url: Listing URL without the ``page`` query parameter.
        """
        # NOTE(review): these names resemble http.client's ``_http_vsn`` /
        # ``_http_vsn_str``; as spelled they may have no effect -- confirm.
        http.client.HTTPConnection._https_vsn = 10
        http.client.HTTPConnection._https_vsn_str = 'HTTPS/1.0'
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }

        p = 1
        while True:
            page_url = url + '?page=' + str(p)
            print('get data page : ...' + page_url)
            data_ = requests.get(page_url, headers=headers)

            try:
                data = htmldom.HtmlDom().createDom(data_.text)
                lists = data.find('.product-item')
                if lists.length() > 0:
                    for n in lists:
                        # htmldom reports a missing attribute with this
                        # sentinel string.
                        data_id = n.attr('data-id')
                        if data_id != 'Undefined Attribute':
                            ids = data_id
                        else:
                            href = n.attr('href')
                            if href == 'Undefined Attribute':
                                # Previously the id of the preceding item was
                                # appended again (or a NameError was raised
                                # for the very first item).
                                continue
                            ids = re.findall(r"-p(\d+)\.html", href)[0]

                        self.data.append({'id': ids})
                    self.post_data()
                else:
                    break
            except Exception as e:
                print(e)
            p = p + 1

Example #11

0

Show file

File: parser.py Project: sarenkazabojca/planszowki

def getUrlDom(url):
    """Download ``url`` and return its document tree.

    The response body is decoded from windows-1250 and the resulting HTML is
    handed to htmldom, which builds the tree.
    """
    response = urlopen(url)  # request the content of the given address
    markup = response.read().decode('windows-1250')
    return htmldom.HtmlDom().createDom(markup)

Example #12

0

Show file

    def find_element_on_urls(self, url_list):
        """Build the DOM of http://www.android.com and print it.

        Args:
           url_list: List of urls.  Not consulted by the current body, which
               always loads the fixed address above.
        """
        android_page = htmldom.HtmlDom("http://www.android.com")
        print(android_page.createDom())

Example #13

0

Show file

File: task2.py Project: marvpaul/ContentManagementTasks

def getAltAttrFromArticle(link):
    """Return the title text of the comic image of an xkcd article.

    Args:
        link: Article link; joined onto ``http://xkcd.com``.

    Returns:
        list: ``[title, absolute_url]`` where ``title`` is the ``title``
        attribute of the ``img`` inside ``#comic``.
    """
    base = 'http://xkcd.com'
    linkCompl = parse.urljoin(base, link)
    # Close the HTTP response once the body has been read.
    with request.urlopen(linkCompl) as f:
        content = str(f.read(), 'utf-8')

    dom = htmldom.HtmlDom().createDom(content)
    return [dom.find('#comic').find('img').attr('title'), linkCompl]

Example #14

0

Show file

 def build_dom(self, sample):
     """Parse ``sample`` into a DOM and print it and the first child of <html>.

     NOTE(review): the last statement calls ``build_dom`` again
     unconditionally, so as written there is no base case and the method
     cannot return normally -- confirm what the recursion was meant to do.
     """
     dom = htmldom.HtmlDom().createDom(sample)
     print(dom)
     # Select the <html> element of the DOM built above.
     html = dom.find("html")
     # Children of the <html> element.
     chldrn = html.children()
     print(chldrn.first().html())
     # Recurses with the result of find("html"), not with the original input.
     self.build_dom(html)

Example #15

0

Show file

File: HTMLDataResolver.py Project: stanley17112000/TWEleReceipt

    def findDetail(self, content):
        """Report whether ``content`` contains the invoice detail table.

        Returns:
            bool: True when at least one ``table[id=invoiceDetailTable]``
            element is found, False otherwise.
        """
        detail_tables = htmldom.HtmlDom().createDom(content).find(
            "table[id=invoiceDetailTable]")
        return detail_tables.length() != 0

Example #16

0

Show file

File: animetwist.py Project: michcioperz/otaku-enclave

 def show_datafy(self, show):
     """Fill in description and titles of ``show`` from its twist.moe page.

     The JSON text of the page's ``script#series-object`` element supplies
     ``description``, ``title`` and ``altTitle``; missing keys become ``""``.
     Any failure while fetching or parsing leaves ``show`` as it is.

     Args:
         show: Object with a ``codename`` attribute; updated in place.

     Returns:
         The same ``show`` object.
     """
     try:
         page_url = "https://twist.moe/a/%s" % show.codename
         series_json = htmldom.HtmlDom(page_url).createDom().find(
             "script#series-object").text()
         data = json.loads(series_json)
         show.description = data.get("description", "")
         show.title = data.get("title", "")
         show.alternative_title = data.get("altTitle", "")
     except Exception:
         # Deliberately best effort, but no longer swallowing
         # KeyboardInterrupt / SystemExit as the bare ``except`` did.
         pass
     return show

Example #17

0

Show file

def get_matching_item_on_page(url, text, selector):
    """Return the element on ``url`` whose text best matches ``text``.

    Args:
        url: Page to load.
        text: Text to compare the candidates against.
        selector: Selector choosing the candidate elements.

    Returns:
        The candidate at the position of the text picked by
        ``process.extractOne``.
    """
    page = htmldom.HtmlDom(url)
    page.createDom()
    candidates = page.find(selector)

    labels = [candidate.text() for candidate in candidates]
    # extractOne returns a sequence whose first item is the chosen text.
    winning_label = process.extractOne(text, labels)[0]

    return candidates[labels.index(winning_label)]

Example #18

0

Show file

def getComicLinks():
    """Return all anchors inside ``#middleContainer`` of the xkcd archive page.

    Returns:
        The htmldom result of looking up ``a`` elements within
        ``#middleContainer`` of ``http://xkcd.com/archive``.
    """
    base = 'http://xkcd.com'
    archive = parse.urljoin(base, 'archive')

    # Load the archive page and close the HTTP response once it is read.
    with request.urlopen(archive) as f:
        content = str(f.read(), 'utf-8')

    archive_html = htmldom.HtmlDom().createDom(content)
    middleContainer = archive_html.find('#middleContainer')
    return middleContainer.find('a')

Example #19

0

Show file

File: main.py Project: KristijanKanalas/electricity-notification

def get_last_post(page_url):
    """Return the first article link found in the JSON response of ``page_url``.

    Returns:
        dict | None: ``{'url': ..., 'text': ...}`` for the first
        ``a.articles__title`` element of the response's ``html`` field, or
        ``None`` when the status is not 200 or that field is empty.
    """
    response = requests.get(page_url)
    if response.status_code != 200:
        return None

    markup = response.json()['html']
    if not markup:
        return None

    first_link = htmldom.HtmlDom().createDom(markup).find(
        "a.articles__title").first()
    return {'url': first_link.attr('href'), 'text': first_link.text()}

Example #20

0

Show file

def parse_line(node):
    """Pull the individual readings out of one weather-entry node.

    Returns:
        tuple: ``(hour, temp, wind_dir, wind_speed, prec_value, humidity)``,
        all as stripped strings.  The last two characters of the temperature
        text and the last character of the humidity text are cut off, and
        ``%`` is removed from the precipitation text.
    """
    inner = htmldom.HtmlDom().createDom(node.html())

    def stripped(selector):
        # Text of the element(s) matched by ``selector``, without padding.
        return inner.find(selector).text().strip()

    hour = stripped("span.hour")
    temp = stripped("span.forecast-temp")[:-2]
    wind_dir = stripped("span.wind-direction")
    wind_speed = stripped("span.speed-value")
    prec_value = inner.find("span.entry-precipitation-value").text().replace(
        "%", "").strip()
    humidity = stripped("div.entry-humidity")[:-1]

    return hour, temp, wind_dir, wind_speed, prec_value, humidity

Example #21

0

Show file

 def inner_find(self, isbn):
     """Look up a download link for an epub or mobi edition of ``isbn``.

     Returns:
         list | None: ``[href, extension]`` taken from the parent of the
         first ``h2`` whose lowercased text is ``"download"`` on the first
         mirror page of an epub/mobi result, or ``None`` if nothing matches.
     """
     for hit in self.lg.search(isbn, "identifier"):
         if hit['extension'] not in ('epub', 'mobi'):
             continue
         first_mirror = hit['mirrors'][0]
         mirror_dom = htmldom.HtmlDom("http://libgen.io" +
                                      first_mirror).createDom()
         for heading in mirror_dom.find('h2'):
             if heading.text().lower() == "download":
                 return [heading.parent().attr('href'), hit['extension']]
     return None

Example #22

0

Show file

def myquery(name, lastvalue, seasonvalue):
    """Return the markup of the ``li`` items matching a format and a season.

    Args:
        name: Passed to ``queryfornum`` to obtain the number for the query.
        lastvalue: Value required for the ``format`` attribute.
        seasonvalue: Value required for the ``season`` attribute.

    Returns:
        The ``html()`` of the ``li[format=...]`` elements that also pass the
        ``li[season=...]`` filter.
    """
    markup = queryforhtml(queryfornum(name))

    format_selector = "li[format=" + lastvalue + "]"
    season_selector = "li[season=" + seasonvalue + "]"

    parsed = htmldom.HtmlDom().createDom(markup)
    return parsed.find(format_selector).filter(season_selector).html()

Example #23

0

Show file

File: crawler.py Project: ttklm20/vor-knowledge-graph

def download_page(url, selectors, verbose=False):
    """Fetch ``url`` and run every selector function against its DOM.

    Args:
        url: Page to download.
        selectors: Iterable of ``(field, selector)`` pairs; each ``selector``
            is called with the page DOM.
        verbose: When true, print each field name as it is mapped.

    Returns:
        dict: Field name mapped to the value returned by its selector.
    """
    print(colored('Fetching: ', 'green') + colored(url, 'cyan'))
    dom_tree = htmldom.HtmlDom(url).createDom()

    # Selectors run in the order given; a repeated field keeps the last value.
    content = {}
    for field, selector in selectors:
        if verbose:
            print(colored('   Mapping : ', 'green'), field)
        content[field] = selector(dom_tree)

    return content

Example #24

0

Show file

def encryptArticel(link):
    """Encrypt the obfuscated paragraphs of a given spiegel online article.

    Args:
        link: URL of the article to load.

    Returns:
        str: ``encrypt(text, -1)`` of every ``p.obfuscated`` paragraph, each
        followed by a newline.
    """
    # Load the article from the ``link`` argument.  The previous code read an
    # unrelated name ``url`` and ignored the parameter entirely.
    with request.urlopen(link) as f:
        content = str(f.read(), 'utf-8')

    # Create the DOM and select the obfuscated paragraphs.
    dom = htmldom.HtmlDom().createDom(content)
    cryptedText = dom.find("p.obfuscated")

    return "".join(encrypt(text.text(), -1) + "\n" for text in cryptedText)

Example #25

0

Show file

def test_htmldom():
    """Demonstrate the selector flavours accepted by ``find``.

    Builds a small document, prints the markup of its ``p`` elements and then
    prints, for a series of selectors, a label followed by the result of
    ``find``.  Output goes to stdout; nothing is returned.

    The print calls use the single-argument form with ``%`` formatting so the
    function produces the same output on Python 2 and Python 3; the earlier
    print statements were a syntax error on Python 3.
    """
    sample_html = (
        "<html>\n"
        "\t\t\t<div id='one'><p>This is paragraph >1<strong>strong Element >1</strong></p></div>\n"
        "\t\t\t<div id='two'><p>This is paragraph >2<strong>strong Element >2</strong></p></div>\n"
        "\t\t\t<p id='three'><p>This is paragraph >3<strong>strong Element >3</strong></p></p> \n"
        "\t\t\t<h4 id='four'><p>This is paragraph >4<strong>strong Element >4</strong></p></h4></html>"
    )
    dom = htmldom.HtmlDom().createDom(sample_html)

    # Getting p elements; "html" prints the markup of the matched nodes.
    p = dom.find("p")
    print(p.html())
    print("\t")

    # (label, selector) pairs, evaluated and printed in this order.
    demos = [
        # Getting all elements
        ("Getting all elements", "*"),
        # Getting sibling elements using '+'
        ("Getting sibling elements using '+'", "div + div"),
        # Getting descendant element
        ("Getting Descendant element", "div p strong"),
        # Getting child element using '>'
        ("Getting child element using '>'", "div > p > strong"),
        # Selecting elements through attributes, in several spellings
        ("Selecting elements through attributes", "div[id=one]"),
        ("or", "[id]"),
        ("or", "div[id] p"),
        ("or", "div#one"),
        # If 'one' were a class
        ("If 'one' were a class then", "div.one"),
    ]
    for label, selector in demos:
        print("%s %s" % (label, dom.find(selector)))

Example #26

0

Show file

def saveImageFromArticle(link):
    """Download the comic image of an xkcd article into the ``xkcd`` directory.

    The file name is the last path segment of the image ``src`` up to its
    first dot, with underscores replaced by spaces.  The file is always
    written with a ``.png`` suffix.

    Args:
        link: Article link; joined onto ``http://xkcd.com``.
    """
    base = 'http://xkcd.com'
    # Close each HTTP response as soon as its body has been read.
    with request.urlopen(parse.urljoin(base, link)) as page:
        content = str(page.read(), 'utf-8')

    dom = htmldom.HtmlDom().createDom(content)

    extractedSrcPath = dom.find('#comic').find('img').attr('src')
    name = str(extractedSrcPath).split('/')[-1].split('.')[0].replace(
        '_', ' ')
    imageLink = parse.urljoin(base, extractedSrcPath)

    # Download before touching the disk so a failed request does not leave an
    # empty file behind.
    with request.urlopen(imageLink) as image:
        imageData = image.read()

    targetDir = 'xkcd'
    os.makedirs(targetDir, exist_ok=True)
    with open(os.path.join(targetDir, name + '.png'), 'wb') as out:
        out.write(imageData)

Example #27

0

Show file

def link_checker_using_dom():
    """Print every link target and image source found on the OKDRS QA page.

    All ``href`` values of ``a`` elements are printed first, then a separator
    line, then all ``src`` values of ``img`` elements.  Any exception is
    caught and printed instead of being raised.
    """
    url = "https://aem-qa.ok.gov/okdrs"
    try:
        dom = htmldom.HtmlDom(url)
        dom.createDom()

        for anchor in dom.find("a"):
            print(anchor.attr("href"))

        print(
            "--------------------------------------------------------------------------------------------------------"
        )

        for image in dom.find("img"):
            print(image.attr("src"))
    except Exception as e:
        print(e)

Example #28

0

Show file

File: bakalari.py Project: Greenscreener/Bakalari

    def login(self, user, password):
        """Submit the login form with the given credentials.

        The base page is fetched first to read its ``__VIEWSTATE`` value,
        which is posted back to ``login.aspx`` together with the user name
        and password through ``self.opener``.  The response is not inspected.

        Args:
            user: Value for the ``TextBoxjmeno`` form field.
            password: Value for the ``TextBoxheslo`` form field.
        """
        landing_html = urllib.urlopen(self.baseUrl).read()
        view_state = htmldom.HtmlDom().createDom(landing_html).find(
            "input#__VIEWSTATE").attr("value")

        form = {
            "__LASTFOCUS": "",
            "__EVENTTARGET": "",
            "__EVENTARGUMENT": "",
            "__VIEWSTATE": view_state,
            "ctl00$cphmain$TextBoxjmeno": user,
            "ctl00$cphmain$TextBoxheslo": password,
            "ctl00$cphmain$ButtonPrihlas": "",
        }

        login_url = self.baseUrl + 'login.aspx'
        self.opener.open(login_url, urllib.urlencode(form))

Example #29

0

Show file

    def resolve(self, content):
        """Parse ``content`` and extract the data held in its ``lpTb`` table.

        Args:
            content: HTML text to parse.

        Returns:
            ``{}`` when no ``table[class=lpTb] tr td`` cell is found or
            ``findtheData`` raises ``NoRecord``; ``None`` when it raises
            ``NotFoundResult`` or ``NotCorrectFormat``; otherwise the value
            returned by ``findtheData``.
        """
        dom = htmldom.HtmlDom().createDom(content)

        items = dom.find("table[class=lpTb] tr td")
        # ``length`` has to be called.  The earlier ``items.length is 0``
        # compared the bound method itself with 0, which is never true, so
        # the empty case was never detected.
        if items.length() == 0:
            return {}

        try:
            data = self.findtheData(items)
        except NotFoundResult:
            data = None
        except NoRecord:
            data = {}
        except NotCorrectFormat:
            data = None
        return data

Example #30

0

Show file

File: HTMLDataResolver.py Project: stanley17112000/TWEleReceipt

    def resolve(self, content):
        """Resolve the HTML DOM from the response text.

        Returns ``None`` when ``content`` contains the password-error or the
        query-error marker, and ``{}`` when it contains the no-data marker.
        If building the DOM raises, the error and the content are logged and
        the process exits with status 1.

        NOTE(review): the visible code builds ``dom`` but neither uses nor
        returns it; the rest of the method may not be shown here.
        """
        # Known markers are checked against the raw text before any parsing.
        if password_error_str in content:
            log.debug(password_error_msg)
            return None
        elif query_receipt_data_error_str in content:
            log.debug(query_unknown_error)
            return None
        elif no_data_rec_str in content:
            return {}

        try:
            dom = htmldom.HtmlDom().createDom(content)
        except Exception, e:
            # Python 2 except syntax; a parse failure terminates the process.
            log.error(e)
            log.error("error content:{}".format(content))
            exit(1)