def __init__(self, geocoder):
    """Scrape the last.fm Brussels event listing and index what it finds.

    Two venues are geocoded up front and seeded into ``venue_dict``.
    Each event linked from the listing page is then downloaded and
    turned into a ``LastfmEvent`` which is stored in ``self.events`` and
    added to every artist returned for that event.  Finally the names of
    all collected events are printed.

    Args:
        geocoder: Object exposing ``geocode(address)``; kept on the
            instance as ``self.geocoder``.
    """
    seeded_venues = [
        self.LastfmVenue(
            'Ancienne Belgique',
            geocoder.geocode('Anspachlaan 110, Brussels, 1000, Belgium')),
        self.LastfmVenue(
            'Botanique',
            geocoder.geocode('Rue Royale 236, Brussels, 1210, Belgium')),
    ]
    self.geocoder = geocoder
    self.venue_dict = {venue.name: venue for venue in seeded_venues}
    self.events = []
    self.artist_dict = {}

    site_root = 'https://www.last.fm'
    listing_url = 'https://www.last.fm/events?location_0=Brussels,+Belgium&location_1=50.8503396&radius=10000&location_2=4.351710300000036'
    listing = htmldom.HtmlDom(listing_url).createDom()

    for title_node in listing.find('div.events-list-item-event--title'):
        # The title block's child carries the relative link to the event.
        event_url = site_root + title_node.children().attr('href')
        page = htmldom.HtmlDom(event_url).createDom()

        title = page.find('h1.header-title').text().strip()
        performers = self.get_or_create_artists(page)
        image = self.get_img_link(page)
        homepage = self.get_official_page(page)
        when = self.get_datetime(page)
        venue = self.get_or_create_venue(page)

        entry = self.LastfmEvent(title, venue, image, homepage, when)
        self.events.append(entry)
        for performer in performers:
            performer.add(entry)

    self.venues = self.venue_dict.values()
    self.artists = self.artist_dict.values()

    for entry in self.events:
        print(entry.name)
def compare(h1, h2, isShort):
    """Align two HTML documents and return the combined output lines.

    Both inputs are parsed with htmldom, their root elements are handed
    to ``DOMtoArray`` together with a fresh list, the two lists are
    aligned with ``nw_align`` and the alignment is passed to
    ``roadRunner`` which appends to the result list.

    Args:
        h1: First HTML document.
        h2: Second HTML document.
        isShort: Forwarded unchanged to ``roadRunner``.

    Returns:
        list: Starts with ``"<body>"``, ends with ``"</body>"``; every
        entry has its ``<a href=...>`` opening tags collapsed to the
        literal ``<a href=...>``.
    """
    result = ["<body>"]

    tree1 = htmldom.HtmlDom().createDom(h1)
    tree2 = htmldom.HtmlDom().createDom(h2)

    array1 = []
    DOMtoArray(tree1.referenceToRootElement, array1)
    array2 = []
    DOMtoArray(tree2.referenceToRootElement, array2)

    # The third value returned by nw_align is not needed here.
    almA, almB, _sim = nw_align(array1, array2)
    roadRunner(almA, almB, result, isShort)

    # Raw string: "\s" in a normal literal is an invalid escape sequence.
    # Slice assignment keeps the same list object that roadRunner filled.
    result[:] = [
        re.sub(r"<a\shref=.+?>", '<a href=...>', entry) for entry in result
    ]
    result.append("</body>")
    return result
def getGrades(self):
    """Fetch the grades overview with details enabled and return it.

    The overview page (``prehled.aspx?s=2``) is loaded once to read its
    ``__VIEWSTATE`` and ``__EVENTVALIDATION`` values, which are posted
    back with the ``Checkdetail`` event target and the two ``Flyout2``
    checkboxes switched on.  The response is then parsed subject by
    subject.

    Returns:
        list: One ``{'name': ..., 'grades': [...]}`` dict per child of
        ``table.radekznamky``.  Each grade is a dict with the keys
        ``title``, ``grade``, ``date`` and ``weight``.
    """
    overviewUrl = self.baseUrl + 'prehled.aspx?s=2'

    html = self.opener.open(overviewUrl).read()
    dom = htmldom.HtmlDom().createDom(html)
    vs = dom.find("input#__VIEWSTATE").attr("value")
    ev = dom.find("input#__EVENTVALIDATION").attr("value")

    postData = {
        "__EVENTTARGET": "ctl00$cphmain$Checkdetail",
        "__EVENTARGUMENT": "",
        "__LASTFOCUS": "",
        "__VIEWSTATE": vs,
        "__EVENTVALIDATION": ev,
        "ctl00$cphmain$Flyout2$Checktypy": "on",
        "ctl00$cphmain$Flyout2$Checkdatumy": "on",
    }
    html = self.opener.open(overviewUrl, urllib.urlencode(postData)).read()
    dom = htmldom.HtmlDom().createDom(html)

    subjectsArr = []
    for sub in dom.find("table.radekznamky").children().toList():
        subName = sub.firstChild().firstChild().firstChild().firstChild().text
        subDom = htmldom.HtmlDom().createDom(sub.html())

        # Look the date and weight rows up once per subject instead of
        # once per grade; they are addressed by the grade's column index.
        dateCells = subDom.find("tr.datum").find("td").toList()
        weightCells = subDom.find("tr.typ").find("td").toList()
        gradeCells = subDom.find("tr.detznamka").find("td").toList()

        grades = []
        for gradeInd, grade in enumerate(gradeCells):
            grades.append({
                "title": grade.attr("title").strip(),
                "grade": grade.getText().strip(),
                "date": dateCells[gradeInd].getText().strip(),
                "weight": weightCells[gradeInd].getText().strip(),
            })
        subjectsArr.append({'name': subName, "grades": grades})
    return subjectsArr
def action_wrapper(hermes, intentMessage, conf):
    """Answer a bus-departure intent with the next two departure times.

    When the intent carries an ``Uhrzeit`` slot, its ``"YYYY-MM-DD
    HH:MM..."`` value is split into a date and an ``HH:MM`` time that are
    sent as the ``date`` and ``time`` query parameters.  The first two
    ``li[class=item]`` entries of the response become the spoken answer.

    Args:
        hermes: Session handle; ``publish_end_session`` is called on it.
        intentMessage: Intent with ``slots.Uhrzeit`` and ``session_id``.
        conf: Unused by this function.
    """
    # Work on a copy: writing 'time'/'date' into the shared module-level
    # PARAMS would leak them into later requests that carry no time slot.
    params = dict(PARAMS)

    if len(intentMessage.slots.Uhrzeit) != 0:
        dtti = intentMessage.slots.Uhrzeit.first().value
        pieces = dtti.split(' ')
        today = datetime.now().date()
        tomorrow = today + timedelta(days=1)
        date_obj = datetime.strptime(pieces[0], "%Y-%m-%d").date()
        clock = pieces[1].split(':')
        time_obj = clock[0] + ":" + clock[1]

        params['time'] = time_obj
        params['date'] = date_obj.strftime("%d.%m.%Y")

        if date_obj == today:
            day_label = "Heute"
        elif date_obj == tomorrow:
            day_label = "Morgen"
        else:
            day_label = "Am " + date_obj.strftime("%d.%m.%Y")
        result_sentence = (
            day_label + " nach " + time_obj + " Uhr fährt der Bus um ")
    else:
        result_sentence = "Der Bus fährt um "

    r = requests.get(url=URL, params=params)
    dom = htmldom.HtmlDom().createDom(r.text)
    a = dom.find("li[class=item]")
    # Only the first line of each entry's text is spoken.
    result_sentence += a[0].text().split(
        '\n')[0] + " Uhr und dann um " + a[1].text().split('\n')[0] + " Uhr"

    current_session_id = intentMessage.session_id
    hermes.publish_end_session(current_session_id, result_sentence)
def getTimetable(self):
    """Download ``prehled.aspx?s=6`` and print its ``table.rozbunka`` HTML.

    Nothing is returned; the matched markup is only written to stdout.
    """
    html = self.opener.open(self.baseUrl + 'prehled.aspx?s=6').read()
    dom = htmldom.HtmlDom().createDom(html)
    timetableDom = dom.find("table.rozbunka")
    # Parenthesised single-argument form prints the same text on
    # Python 2 and is also valid syntax on Python 3.
    print(timetableDom.html())
def get_info(self, bundle):
    """Print icon, name, description and recent changes of an app page.

    The page address comes from ``self.get_url(bundle)`` and is
    downloaded through ``Parser.get_page_content``.  Each field is
    printed as soon as it has been extracted; nothing is returned.

    Args:
        bundle: Identifier passed on to ``self.get_url``.
    """
    markup = Parser.get_page_content(self.get_url(bundle))
    page = htmldom.HtmlDom()
    page.createDom(markup)

    info_block = page.find('div[class="details-info"]')

    # Icon: the <img> inside the cover container.
    icon_url = (info_block
                .find('div[class="cover-container"]')
                .find("img")
                .attr("src"))
    print("Icon: " + icon_url)

    # Name: the <div> nested in the document title.
    title_node = (info_block
                  .find('div[class="info-container"]')
                  .find('div[class="document-title"]')
                  .find("div"))
    print("Name: " + title_node.text())

    # TODO: category extraction is not implemented yet (the value lives
    # in the "document-subtitle category" link of the info container).

    # Description
    # TODO: make the lookup cheaper and guard against a missing node.
    about = page.find('div[class="id-app-orig-desc"]')
    print("Description: " + about.text())

    # Recent changes
    changelog = page.find('div[class="recent-change"]')
    print("Recent changes: " + changelog.text())
def main():
    """Fill the ``Pogoda`` table with weather entries for missing days.

    Walks every date yielded by ``daterange`` from 2013-05-01 up to
    today.  For each date that ``check_if_exist`` reports as absent, the
    page returned by ``get_weather`` is parsed and one row per
    ``li.weather-entry`` node is inserted, then committed.  A day that
    fails is reported and skipped.
    """
    db_path = '../data/'
    conn = sqlite3.connect(db_path + 'database/db')
    try:
        c = conn.cursor()

        start_date = datetime.date(2013, 5, 1)
        end_date = datetime.datetime.now().date()

        for single_date in daterange(start_date, end_date):
            d = single_date.strftime("%d-%m-%Y")
            # Assumption: if a day's data was never downloaded, that day
            # has no rows in the database at all.
            try:
                if not check_if_exist(conn, c, d):
                    print("Downloading " + d + "... ", end="")
                    dom = htmldom.HtmlDom().createDom(get_weather(d))
                    for node in dom.find('li.weather-entry'):
                        godz, temp, kier, pred, zach, wilg = parse_line(node)
                        # Columns: date, hour, temperature, wind direction,
                        # wind speed, precipitation, humidity.
                        # NOTE(review): values are interpolated into the
                        # SQL text; switching to bound parameters needs
                        # value_or_null's contract to be confirmed first.
                        c.execute(
                            "insert or ignore into Pogoda values ('%s', %s, %s, '%s', %s, %s, %s)"
                            % (d, godz, value_or_null(temp),
                               value_or_null(kier), value_or_null(pred),
                               value_or_null(zach), value_or_null(wilg)))
                    print("done.")
                    conn.commit()
            except Exception:
                # Drop a half-inserted day so the assumption above keeps
                # holding and the next run retries it from scratch.
                conn.rollback()
                print("Could not download and parse -> " + d)
        print("Updated.")
    finally:
        conn.close()
def get_product_reviews(self, product_id):
    """Return the review texts and aggregate rating of a product.

    Queries the Shopify product-reviews proxy for ``self.shopify_url``
    and extracts the body text of every ``.spr-review`` element.  The
    rating is read from the JSON embedded in the ``<script>`` tag of the
    ``aggregate_rating`` field.

    Args:
        product_id: Product identifier placed in the request URL.

    Returns:
        tuple: ``(reviews, rating)``.  ``([], 0)`` when the request or
        its JSON decoding fails or the shop reports no reviews;
        ``(reviews, 0)`` when no rating script can be found.
    """
    url = f'https://productreviews.shopifycdn.com/proxy/v4/reviews/product?product_id={product_id}&version=v4&shop={self.shopify_url}'
    try:
        product = requests.get(url).json()
    except (requests.RequestException, ValueError):
        # Reviews are not available for the site (network error or a
        # response that is not JSON).
        return [], 0

    # Empty strings mean the product has no reviews.
    if product["reviews"] == "" or product["aggregate_rating"] == "":
        return [], 0

    dom = htmldom.HtmlDom().createDom(product["reviews"])
    reviews = [
        element.find('.spr-review-content-body').text()
        for element in dom.find(".spr-review")
    ]

    # The rating is a JSON object wrapped in a <script> element.
    m = re.search(r'<script[\s\S]*?>([\s\S]*?)</script>',
                  product["aggregate_rating"])
    if m is None:
        return reviews, 0
    aggregate_rating = json.loads(m.group(1))
    return reviews, aggregate_rating["ratingValue"]
def parse(html):
    """Extract ``[bank name, bank code]`` pairs from a bank list page.

    Each bank is expected in a table row shaped like::

        <tr>
          <td class="bank_textcol">row number</td>
          <td class="bank_namecol">bank logo</td>
          <td class="bank_textcol text-left">bank name</td>
          <td class="bank_codecol">bank code</td>
        </tr>

    so the name is read from the sibling immediately preceding every
    ``td.bank_codecol`` cell.

    Args:
        html: Markup of the page.

    Returns:
        list: Two-element lists ``[bank name, bank code]``, both stripped.
    """
    dom = HTML.HtmlDom().createDom(html)
    code_cells = dom.find("td.bank_codecol")

    def as_pair(code_cell):
        # The name cell sits directly before the code cell.
        return [code_cell.prev().text().strip(), code_cell.text().strip()]

    return [as_pair(code_cells[pos]) for pos in range(code_cells.length())]
def fetch_data(self, url):
    """Walk the paginated listing at *url* and collect product ids.

    Pages ``?page=1``, ``?page=2``, ... are requested until a page has no
    ``.product-item`` elements.  For each item the id is its ``data-id``
    attribute or, failing that, the digits of the ``-p<digits>.html``
    part of its ``href``.  Ids are appended to ``self.data`` as
    ``{'id': ...}`` and ``self.post_data()`` is called once per page.

    Items offering neither attribute are skipped.

    Args:
        url: Listing address without the ``page`` query parameter.
    """
    # NOTE(review): http.client.HTTPConnection defines `_http_vsn` and
    # `_http_vsn_str`; the `_https_*` names below look like typos and
    # probably have no effect - confirm the intent before changing them.
    http.client.HTTPConnection._https_vsn = 10
    http.client.HTTPConnection._https_vsn_str = 'HTTPS/1.0'
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
    }

    p = 1
    while True:
        page_url = url + '?page=' + str(p)
        print('get data page : ...' + page_url)
        data_ = requests.get(page_url, headers=headers)
        try:
            data = htmldom.HtmlDom().createDom(data_.text)
            lists = data.find('.product-item')
            if lists.length() > 0:
                for n in lists:
                    # attr() answers 'Undefined Attribute' when missing.
                    if n.attr('data-id') != 'Undefined Attribute':
                        ids = n.attr('data-id')
                    elif n.attr('href') != 'Undefined Attribute':
                        ids = n.attr('href')
                        ids = re.findall(r"\-p\d+\.html", ids)[0]
                        ids = ids.replace('-p', '')
                        ids = ids.replace('.html', '')
                    else:
                        # No id source: skip instead of re-using the
                        # previous item's id (or hitting a NameError).
                        continue
                    self.data.append({'id': ids})
                self.post_data()
            else:
                break
        except Exception as e:
            # Best effort: report the problem and move on to the next page.
            print(e)
        p = p + 1
def getUrlDom(url):
    """Download *url* and return its document tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The value of ``htmldom.HtmlDom().createDom`` for the page body,
        decoded from windows-1250.
    """
    # Request the content of the given address.
    response = urlopen(url)
    try:
        # Read the response body and decode it from windows-1250.
        html = response.read().decode('windows-1250')
    finally:
        # Release the connection even when reading or decoding fails.
        response.close()
    # Build the document tree from the markup.
    return htmldom.HtmlDom().createDom(html)
def find_element_on_urls(self, url_list):
    """Build the DOM of ``http://www.android.com`` and print it.

    Args:
        url_list: List of urls.  NOTE(review): the current body never
            reads this argument; the address is hard-coded.
    """
    page = htmldom.HtmlDom("http://www.android.com")
    tree = page.createDom()
    print(tree)
def getAltAttrFromArticle(link):
    """Return the comic image's title text and the absolute page URL.

    Args:
        link: Page link, resolved against ``http://xkcd.com``.

    Returns:
        list: ``[title attribute of the <img> inside #comic, full URL]``.
    """
    base = 'http://xkcd.com'
    linkCompl = parse.urljoin(base, link)
    # Context manager closes the HTTP response once the body is read.
    with request.urlopen(linkCompl) as f:
        content = str(f.read(), 'utf-8')
    dom = htmldom.HtmlDom().createDom(content)
    return [dom.find('#comic').find('img').attr('title'), linkCompl]
def build_dom(self, sample):
    """Parse *sample*, print the DOM and the first child of ``<html>``.

    Args:
        sample: HTML text handed to ``createDom``.
    """
    dom = htmldom.HtmlDom().createDom(sample)
    print(dom)

    # Select the <html> element and list its children.
    html = dom.find("html")
    chldrn = html.children()
    print(chldrn.first().html())
    # Deliberately no self.build_dom(html) here: calling the method again
    # unconditionally has no terminating case, so it could only ever end
    # in an exception.
def findDetail(self, content):
    """Tell whether *content* contains a ``table#invoiceDetailTable``.

    Args:
        content: HTML text to inspect.

    Returns:
        bool: ``True`` when at least one matching table is found.
    """
    tree = htmldom.HtmlDom().createDom(content)
    return tree.find("table[id=invoiceDetailTable]").length() != 0
def show_datafy(self, show):
    """Fill in description and titles of *show* from its twist.moe page.

    The JSON found in the page's ``script#series-object`` element
    supplies ``description``, ``title`` and ``altTitle``; missing keys
    become empty strings.  This is best effort: on any error the show is
    returned with whatever was set so far.

    Args:
        show: Object with a ``codename`` attribute; updated in place.

    Returns:
        The same *show* object.
    """
    import logging

    try:
        page = htmldom.HtmlDom(
            "https://twist.moe/a/%s" % show.codename).createDom()
        data = json.loads(page.find("script#series-object").text())
        show.description = data.get("description", "")
        show.title = data.get("title", "")
        show.alternative_title = data.get("altTitle", "")
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # and SystemExit through; the failure is logged, not hidden.
        logging.getLogger(__name__).debug(
            "Could not load metadata for show %r",
            getattr(show, "codename", None), exc_info=True)
    return show
def get_matching_item_on_page(url, text, selector):
    """Return the element on *url* whose text best matches *text*.

    All elements matching *selector* are collected, their texts are
    ranked with ``process.extractOne`` and the element belonging to the
    winning text is returned.

    Args:
        url: Page address given to ``htmldom.HtmlDom``.
        text: Text to look for.
        selector: Selector choosing the candidate elements.
    """
    dom = htmldom.HtmlDom(url)
    dom.createDom()
    candidates = dom.find(selector)
    labels = [candidate.text() for candidate in candidates]
    # extractOne's result starts with the winning label.
    winner = process.extractOne(text, labels)
    return candidates[labels.index(winner[0])]
def getComicLinks():
    """Return all ``<a>`` elements inside ``#middleContainer`` of the archive.

    The archive page is ``archive`` resolved against ``http://xkcd.com``.
    """
    base = 'http://xkcd.com'
    archive = parse.urljoin(base, 'archive')
    # Context manager closes the HTTP response once the body is read.
    with request.urlopen(archive) as f:
        content = str(f.read(), 'utf-8')
    archive_html = htmldom.HtmlDom().createDom(content)
    middleContainer = archive_html.find('#middleContainer')
    return middleContainer.find('a')
def get_last_post(page_url):
    """Return link and text of the first ``a.articles__title`` on a page.

    The endpoint is expected to answer with JSON whose ``html`` member
    holds the markup to search.

    Args:
        page_url: Address to request.

    Returns:
        dict | None: ``{'url': ..., 'text': ...}``, or ``None`` when the
        status code is not 200 or the ``html`` member is empty.
    """
    response = requests.get(page_url)
    if response.status_code != 200:
        return None

    markup = response.json()['html']
    if not markup:
        return None

    tree = htmldom.HtmlDom().createDom(markup)
    first_title = tree.find("a.articles__title").first()
    return {'url': first_title.attr('href'), 'text': first_title.text()}
def parse_line(node):
    """Pull the individual readings out of one weather entry node.

    Args:
        node: Element whose ``html()`` is re-parsed and queried.

    Returns:
        tuple: ``(hour, temperature, wind direction, wind speed,
        precipitation, humidity)`` as strings.  The temperature loses
        its last two characters, the humidity its last one and the
        precipitation every ``%`` (presumably unit suffixes).
    """
    entry = htmldom.HtmlDom().createDom(node.html())

    def stripped(selector):
        # Text of the matching element without surrounding whitespace.
        return entry.find(selector).text().strip()

    hour = stripped("span.hour")
    # Minutes ("span.minutes") are intentionally not read.
    temperature = stripped("span.forecast-temp")[:-2]
    direction = stripped("span.wind-direction")
    speed = stripped("span.speed-value")
    precipitation = entry.find("span.entry-precipitation-value").text()
    precipitation = precipitation.replace("%", "").strip()
    humidity = stripped("div.entry-humidity")[:-1]
    return hour, temperature, direction, speed, precipitation, humidity
def inner_find(self, isbn):
    """Find a download link for the epub or mobi edition of *isbn*.

    Every ``self.lg.search`` hit with an ``epub`` or ``mobi`` extension
    has its first mirror page (under ``http://libgen.io``) scanned for an
    ``<h2>`` reading "download"; the ``href`` of that heading's parent is
    the link.

    Args:
        isbn: Identifier searched in the ``identifier`` field.

    Returns:
        list | None: ``[href, extension]`` for the first link found,
        otherwise ``None``.
    """
    wanted = ('epub', 'mobi')
    for hit in self.lg.search(isbn, "identifier"):
        if hit['extension'] not in wanted:
            continue
        first_mirror = hit['mirrors'][0]
        page = htmldom.HtmlDom("http://libgen.io" + first_mirror).createDom()
        for heading in page.find('h2'):
            if heading.text().lower() == "download":
                return [heading.parent().attr('href'), hit['extension']]
    return None
def myquery(name, lastvalue, seasonvalue):
    """Return the HTML of the ``<li>`` items matching format and season.

    *name* is resolved through ``queryfornum`` and ``queryforhtml``; the
    resulting markup is searched for ``li[format=<lastvalue>]`` elements
    which are then filtered by ``li[season=<seasonvalue>]``.

    Args:
        name: Value passed to ``queryfornum``.
        lastvalue: Required value of the ``format`` attribute.
        seasonvalue: Required value of the ``season`` attribute.
    """
    markup = queryforhtml(queryfornum(name))
    parser = htmldom.HtmlDom()
    format_selector = "li[format=" + lastvalue + "]"
    season_selector = "li[season=" + seasonvalue + "]"
    tree = parser.createDom(markup)
    return tree.find(format_selector).filter(season_selector).html()
def download_page(url, selectors, verbose=False):
    """Download *url* and build a content dict from selector functions.

    Args:
        url: Page address; loaded as a DOM tree via htmldom.
        selectors: Iterable of ``(field, selector)`` pairs.  Each
            ``selector`` is called with the page DOM and its result is
            stored under ``field``.
        verbose: When true, print every field name as it is mapped.

    Returns:
        dict: Field name to selector result, filled in iteration order.
    """
    print(colored('Fetching: ', 'green') + colored(url, 'cyan'))
    dom = htmldom.HtmlDom(url).createDom()

    package = {}
    for field, extract in selectors:
        if verbose:
            print(colored('  Mapping  : ', 'green'), field)
        package[field] = extract(dom)
    return package
def encryptArticel(link):
    """Run the obfuscated paragraphs of an article through ``encrypt``.

    Args:
        link: Address of the article to download.

    Returns:
        str: For every ``p.obfuscated`` element, ``encrypt(text, -1)``
        followed by a newline, concatenated in document order.
    """
    # Load the article from the address that was actually passed in and
    # close the response once the body is read.
    with request.urlopen(link) as f:
        content = str(f.read(), 'utf-8')

    # Create the DOM and select the obfuscated paragraphs.
    dom = htmldom.HtmlDom().createDom(content)
    return "".join(
        encrypt(paragraph.text(), -1) + "\n"
        for paragraph in dom.find("p.obfuscated"))
def test_htmldom():
    """Print the results of several htmldom selector queries.

    A small inline document is parsed once and queried with element,
    universal, sibling, descendant, child, attribute, id and class
    selectors; each result is printed behind a short label.
    """
    dom = htmldom.HtmlDom().createDom(
        "<html> "
        "<div id='one'><p>This is paragraph >1<strong>strong Element >1</strong></p></div> "
        "<div id='two'><p>This is paragraph >2<strong>strong Element >2</strong></p></div> "
        "<p id='three'><p>This is paragraph >3<strong>strong Element >3</strong></p></p> "
        "<h4 id='four'><p>This is paragraph >4<strong>strong Element >4</strong></p></h4></html>"
    )

    def show(label, found):
        # One formatted argument prints "label value" on both Python 2
        # and Python 3, matching the statement form `print label, value`.
        print("%s %s" % (label, found))

    # The html() method of a node list returns its markup.
    p = dom.find("p")
    print(p.html())
    print("\t")

    # Local name avoids shadowing the builtin all().
    everything = dom.find("*")
    show("Getting all elements", everything)

    sibling = dom.find("div + div")
    show("Getting sibling elements using '+'", sibling)

    desc = dom.find("div p strong")
    show("Getting Descendant element", desc)

    child = dom.find("div > p > strong")
    show("Getting child element using '>'", child)

    # Attribute selectors, in several equivalent spellings.
    elem = dom.find("div[id=one]")
    show("Selecting elements through attributes", elem)
    elem = dom.find("[id]")
    show("or", elem)
    elem = dom.find("div[id] p")
    show("or", elem)
    elem = dom.find("div#one")
    show("or", elem)

    elem = dom.find("div.one")
    show("If 'one' were a class then", elem)
def saveImageFromArticle(link):
    """Download the comic image of an article into the ``xkcd`` folder.

    The file name is the last path segment of the image ``src`` up to
    its first dot, with underscores turned into spaces, plus ``.png``.

    Args:
        link: Article link, resolved against ``http://xkcd.com``.
    """
    base = 'http://xkcd.com'
    with request.urlopen(parse.urljoin(base, link)) as page:
        content = str(page.read(), 'utf-8')

    dom = htmldom.HtmlDom().createDom(content)
    extractedSrcPath = dom.find('#comic').find('img').attr('src')
    name = str(extractedSrcPath).split('/')[-1]
    name = name.split('.')[0].replace('_', ' ')

    # Download before opening the target, so a failed request does not
    # leave an empty file behind.
    imageLink = parse.urljoin(base, extractedSrcPath)
    with request.urlopen(imageLink) as image:
        payload = image.read()

    targetDir = 'xkcd'
    os.makedirs(targetDir, exist_ok=True)
    with open(targetDir + '/' + name + '.png', 'wb') as out:
        out.write(payload)
def link_checker_using_dom():
    """Print every link target and image source of a fixed page.

    The ``href`` of each ``<a>`` is printed first, then a separator
    line, then the ``src`` of each ``<img>``.  Any exception is caught
    and printed instead of propagating.
    """
    target = "https://aem-qa.ok.gov/okdrs"
    try:
        page = htmldom.HtmlDom(target)
        page.createDom()

        for anchor in page.find("a"):
            print(anchor.attr("href"))

        print(
            "--------------------------------------------------------------------------------------------------------"
        )

        for picture in page.find("img"):
            print(picture.attr("src"))
    except Exception as error:
        print(error)
def login(self, user, password):
    """Submit the login form with the given credentials.

    The start page at ``self.baseUrl`` is loaded to read its
    ``__VIEWSTATE`` value, which is posted to ``login.aspx`` through
    ``self.opener`` together with the user name and password fields.

    Args:
        user: Value for the ``TextBoxjmeno`` field.
        password: Value for the ``TextBoxheslo`` field.
    """
    landing = urllib.urlopen(self.baseUrl).read()
    tree = htmldom.HtmlDom().createDom(landing)
    viewstate = tree.find("input#__VIEWSTATE").attr("value")

    form = {
        "__LASTFOCUS": "",
        "__EVENTTARGET": "",
        "__EVENTARGUMENT": "",
        "__VIEWSTATE": viewstate,
        "ctl00$cphmain$TextBoxjmeno": user,
        "ctl00$cphmain$TextBoxheslo": password,
        "ctl00$cphmain$ButtonPrihlas": "",
    }
    self.opener.open(self.baseUrl + 'login.aspx', urllib.urlencode(form))
def resolve(self, content):
    """Parse *content* and extract the data from its ``lpTb`` table cells.

    Args:
        content: HTML text to parse.

    Returns:
        ``{}`` when no ``table[class=lpTb] tr td`` cell exists or
        ``findtheData`` raises ``NoRecord``; ``None`` when it raises
        ``NotFoundResult`` or ``NotCorrectFormat``; otherwise whatever
        ``self.findtheData`` returns.
    """
    dom = htmldom.HtmlDom().createDom(content)
    items = dom.find("table[class=lpTb] tr td")
    # length is a method: it has to be called and compared by value.
    # Testing the bound method itself with `is 0` is always False.
    if items.length() == 0:
        return {}
    try:
        data = self.findtheData(items)
    except NotFoundResult:
        data = None
    except NoRecord:
        data = {}
    except NotCorrectFormat:
        data = None
    return data
def resolve(self, content):
    """Check *content* for known error markers, then parse it as HTML.

    Args:
        content: Response text to inspect.

    Returns:
        ``None`` when the password-error or query-error marker is
        present (a debug message is logged), ``{}`` when the no-data
        marker is present.  A parsing failure is logged and the process
        exits with status 1.
    """
    if password_error_str in content:
        log.debug(password_error_msg)
        return None
    elif query_receipt_data_error_str in content:
        log.debug(query_unknown_error)
        return None
    elif no_data_rec_str in content:
        return {}

    try:
        # NOTE(review): the parsed DOM is not used in the code visible
        # here; the remainder of the function may lie outside this view.
        dom = htmldom.HtmlDom().createDom(content)
    except Exception as e:
        # `except ... as e` is accepted by Python 2.6+ and Python 3; the
        # comma form is a syntax error on Python 3.
        log.error(e)
        log.error("error content:{}".format(content))
        exit(1)