def _parse(self, htmlString, encode='utf8'):
    """ Parse HTML to DOM Document """
    if len(htmlString) < 2:
        return None
    if encode:
        dom = parseString(htmlString, html=1, unfinished=1, htmlencoding=encode)
    else:
        dom = parseString(htmlString, html=1, unfinished=1)
    self._testDom(dom)
    return dom
def fetch_labels(self, query):
    self.__clear_labels()
    self.wq.search(query, sites='en.wikipedia.org', count=self.max_docs)
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', 'MwClient-0.6.4')]
    for idx, url in enumerate(self.wq.urls()[0:self.max_docs]):
        try:
            infile = opener.open(url)
            page = infile.read()
            doc = libxml2dom.parseString(page, html=1)
            if self.debug:
                util.log("url", url)
            labels = DocLabels()
            labels.title = self.__collect_text(doc.xpath("//*[@id='firstHeading']")[0])
            labels.categories = self.__nodes_to_array(doc.xpath("//*[@id='mw-normal-catlinks']/span"))
            # remove disambiguation pages
            dp_str = 'Disambiguation pages'
            if dp_str in labels.categories:
                labels.categories.remove(dp_str)
            # headline text
            labels.headlines = []
            for node in doc.xpath("//h3/*[@class='mw-headline']"):
                labels.headlines.append(self.__collect_text(node))
            labels.num_anchors = len(doc.getElementsByTagName("a"))
            labels.anchors = []
            # only taking external link texts
            for node in doc.xpath("//ul/li/*[@class='external text']"):
                labels.anchors.append(self.__collect_text(node))
            labels.rank = idx + 1
            self.labels_for_urls[url] = labels
        except (urllib2.HTTPError, IndexError), e:
            if self.debug:
                util.error("%s, url: %s" % (e, url))
def getParams(data, formName=None, params={}):
    doc = libxml2dom.parseString(data, html=1)
    xpath = "//form"
    if formName is not None:
        xpath += "[@name = '%s']" % formName
    for form in doc.xpath(xpath):
        for inputElm in form.xpath(".//input"):
            elmType = inputElm.getAttribute("type").lower()
            if elmType == "reset":
                continue
            if not elmType == "checkbox" or inputElm.hasAttribute("checked"):
                params[inputElm.getAttribute("name")] = inputElm.getAttribute("value")
        for selectElm in form.xpath(".//select"):
            options = selectElm.xpath(".//option[@selected]")
            if len(options) > 1:
                print "Error: Multiple selected options not handled: %s (%d)" % (
                    selectElm.getAttribute("name"), len(options))
            for optionElm in options:
                params[selectElm.getAttribute("name")] = optionElm.getAttribute("value")
        for textElm in form.xpath("textarea"):
            params[textElm.getAttribute("name")] = "".join(textElm.xpath(".//text()"))
    return params
def parse_issue_page(link):
    ps = get_page_string(link)
    dom = libxml2dom.parseString(ps, html=1)
    # retrieve id
    id = link[link.rfind('/') + 1:]
    # retrieve subject
    subject_dom = dom.xpath('.//div[@class="subject"]')[0]
    subject = subject_dom.xpath('.//h3/text()')[0].toString()
    attributes_dom = dom.xpath('.//table[@class="attributes"]')[0]
    # retrieve status
    status = attributes_dom.xpath('.//td[@class="status"]/text()')[0].toString()
    # retrieve priority
    priority = attributes_dom.xpath('.//td[@class="priority"]/text()')[0].toString()
    # retrieve category
    category = attributes_dom.xpath('.//td[@class="category"]/text()')[0].toString()
    # retrieve affected version
    aff_dom = attributes_dom.xpath('.//td[@class="cf_4"]/a/text()')
    aff_version = ""
    if len(aff_dom) > 0:
        aff_version = aff_dom[0].toString()
    if not issues_dict.has_key(category):
        issues_dict[category] = []
    issues_dict[category].append([id, status, subject, priority, aff_version])
    print id + " " + category + " " + subject + " " + status + " " + priority + " " + aff_version
def getEinzelwerteListe(self, strIndex):
    url = self.__UrlEinzelwerteListePerIndex.replace("XXX", strIndex)
    page = self.__webConnect.runGETRequest(url)
    page = page.replace("<br>", " ").replace("<br/>", " ").replace("<br />", " ")
    doc = libxml2dom.parseString(page, html=1)
    td_elements = doc.getElementsByTagName("td")
    '''
    Walk through all td elements and check whether an ISIN is found anywhere.
    If so, it is a valid entry.
    '''
    StockList = list()
    for i in td_elements:
        data = i.textContent
        arr = data.rsplit()
        if len(arr) > 1 and self.__checkForISIN(arr[-1]) == 1:
            s = CStock()
            s.ISIN = arr[-1]
            s.Name = string.join(arr[0:-1])
            StockList.append(s)
    if len(StockList) == 0:
        raise NameError('Achtung: Aktienliste fuer ' + strIndex + ' hat keine Werte!')
    return StockList
def test_TreeContent(self):
    """ Unittest to check tree content after load """
    html = """
    <html>
        <body>
            <div>
                Ola
                <div>
                    Mundo
                </div>
                <div>
                    <a>
                        !
                    </a>
                </div>
            </div>
        </body>
    </html>
    """
    dom = parseString(html, html=1)
    tree = Node().loadNodeTree(dom, 0)
    print tree.childNodes[0].childNodes[0].dom.localName
    self.assertEquals(tree.childNodes[0].childNodes[0].depth, 2)
    self.assertEquals(tree.childNodes[0].childNodes[0].height, 5)
    # print tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str
    self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].depth, 4)
    self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].height, 2)
def getArticleInfoForPage(page):
    doc = libxml2dom.parseString(page, html=True)
    doc = doc.getElementById('content_list_view').childNodes[1]
    items = []
    currDomain = ''
    currCat = ''
    for ch in doc.childNodes:
        if ch.tagName == 'h1':
            currDomain = ch.textContent
        if ch.tagName == 'li':
            currCat = ch.textContent.strip()
        if ch.tagName == 'ul':
            try:
                info = []
                if ch.getAttribute('style') == 'margin-left:10px':
                    # the list starts immediately (no sub-category heading)
                    curSubcat = ''
                    info = parseSubCategory(ch)
                    for item in info:
                        item['topic'] = currDomain + '/' + currCat + '/' + curSubcat
                else:
                    curSubcat = ch.childNodes[0].textContent.strip()
                    info = parseSubCategory([t for t in ch.childNodes if t.tagName == 'ul'][0])
                    for item in info:
                        item['topic'] = currDomain + '/' + currCat + '/' + curSubcat
                items += info
            except:
                pass
    return items
def parse(htmlString, encoding=None, url='', category='', language=''):
    """ Parse HTML to DOM Document """
    result = parseString(htmlString, html=1, unfinished=1, htmlencoding='utf8')
    return result
def test_TreeStructure(self):
    """ Unittest tree structure """
    html = """
    <html>
        <body>
            <div>
                Ola
                <div>
                    Mundo
                </div>
                <div>
                    <a>
                        !
                    </a>
                </div>
            </div>
        </body>
    </html>
    """
    dom = parseString(html, html=1)
    tree = Node().loadNodeTree(dom, 0)
    self.assert_(tree.dom == dom)
    self.assertEquals(tree.childNodes[0].dom.localName, "html")
    self.assert_(tree.childNodes[0].dom == dom.childNodes[0])
    # print tree.childNodes[0].childNodes[0].dom.localName
    self.assertEquals(tree.childNodes[0].childNodes[0].str, "bodydivdivdiva")
    # print tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str
    self.assertEquals(tree.childNodes[0].childNodes[0].childNodes[0].childNodes[1].str, "diva")
def __getDataforStock(self, stock):
    t1 = date.today()
    t2 = t1 - timedelta(days=380)
    self.__POSTDataAktie["pkAktieNr"] = stock.FinanzenNetId
    self.__POSTDataAktie["strBoerse"] = stock.strBoerseFinanzenNet
    self.__POSTDataAktie["dtTag1"] = t2.day
    self.__POSTDataAktie["dtMonat1"] = t2.month
    self.__POSTDataAktie["dtJahr1"] = t2.year
    self.__POSTDataAktie["dtTag2"] = t1.day
    self.__POSTDataAktie["dtMonat2"] = t1.month
    self.__POSTDataAktie["dtJahr2"] = t1.year
    page = self.__webConnect.runPOSTRequest(self.__FinanzenNetHistorischeKurseURL, self.__POSTDataAktie)
    doc = libxml2dom.parseString(page, html=1)
    td_elements = doc.getElementsByTagName("td")
    c = 0
    for i in td_elements:
        data = i.textContent
        if self.__validateDate(data) == 1:
            datum = datetime.datetime.strptime(data, '%d.%m.%Y').date()
            '''
            The date is valid. Now check when the required deltas were reached
            in order to set the values.
            '''
            if self.AktienkursHeute == 0 and datum >= date.today() - timedelta(days=3):
                self.AktienkursHeute = float(td_elements[c + 2].textContent.replace(",", "."))
            if self.AktienkursVor6Monaten == 0 and datum <= date.today() - timedelta(days=180):
                self.AktienkursVor6Monaten = float(td_elements[c + 2].textContent.replace(",", "."))
            if self.AktienkursVor12Monaten == 0 and datum <= date.today() - timedelta(days=360):
                self.AktienkursVor12Monaten = float(td_elements[c + 2].textContent.replace(",", "."))
            DatumList = []
            DatumList.append(date.today() - timedelta(date.today().day))
            DatumList.append(DatumList[0] - timedelta(DatumList[0].day))
            DatumList.append(DatumList[1] - timedelta(DatumList[1].day))
            DatumList.append(DatumList[2] - timedelta(DatumList[2].day))
            for i in [0, 1, 2, 3]:
                if self.AktieList[i] == 0 and datum <= DatumList[i]:
                    self.AktieList[i] = float(td_elements[c + 2].textContent.replace(".", "").replace(",", "."))
        c = c + 1
def get_problem(url):
    data = urllib.urlopen(url)
    s = data.read()
    doc = libxml2dom.parseString(s, html=1)
    main = doc.getElementById("main")
    for node in main.childNodes:
        if node.getAttribute("class") == "wiki_text_block":
            return node.toString()
def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):
    doc = libxml2dom.parseString(s, html=html, htmlencoding=htmlencoding,
                                 unfinished=unfinished, impl=(impl or default_impl))
    initialiseEvents(doc)
    return doc
def __init__(self, path, nodes):
    sock = urllib.urlopen(path)
    html_source = sock.read()
    sock.close()
    doc = libxml2dom.parseString(html_source, html=1)
    self.content = doc.getElementById("gc-wrapper")
    self.wrapper = self.__get_wrapper()
    self.__nodes = nodes
    self.__collect_nodes()
def load(self, file_name):
    try:
        f = open(file_name, 'r')
        doc_string = reduce(lambda x, y: x + y, f)
        f.close()
        return libxml2dom.parseString(doc_string, html=1)
    except IOError:
        print "unable to read file ", file_name
def parse(url):
    # parse out the uri for the first recent image
    html = download(url)
    try:
        return libxml2dom.parseString(html, html=1)
    except TypeError, detail:
        print detail
        print "Could not parse document, attempt ", attempts, "; retrying "
        return parse(url)
def __init__(self, gameUrl):
    openGame = urllib2.urlopen(gameUrl.replace("\n", "") + "/rating-systems").read()
    self.docGame = libxml2dom.parseString(openGame, html=1)
    allDiv = self.docGame.getElementsByTagName("div")
    self.descDiv = None
    for div in allDiv:
        for atr in div.attributes:
            if atr.nodeValue == "rightPanelMain":
                self.descDiv = div
def genmoedict(result, dirname, fnames):
    indexfile = os.path.join(dirname, 'index.html')
    assert os.path.exists(indexfile)
    f = open(indexfile, 'r')
    buf = f.read()
    try:
        assert buf.find('</html>') > 0
    except:
        print >> sys.stderr, 'Error: incomplete index.html under ', dirname
    finally:
        flist = getindex.findall(buf)[1:]
        f.close()
    for fname in flist:
        isdir = False
        if fname[-1] == '/':
            fname = fname[:-1]
            isdir = True
        elif fname.find('/') > 0:
            p = fname.find('/')
            if fname[p + 1:] == 'index.html':
                fname = fname[:p]
                isdir = True
        if fname not in fnames:
            print >> sys.stderr, 'Warning: %s not in %s' % (fname, dirname)
            continue
        if isdir:
            continue
        if not (fname.startswith('m_') and fname.endswith('.html')):
            print >> sys.stderr, 'Ignoring ' + fname
            continue
        num = int(fname[2:-5])
        try:
            f = open(os.path.join(dirname, fname), 'r')
            buf = f.read()
            f.close()
        except:
            print >> sys.stderr, 'Error reading %s under %s' % (fname, dirname)
            continue
        try:
            doc = libxml2dom.parseString(buf, html=1, htmlencoding='latin-1')
        except:
            print >> sys.stderr, 'Error parsing %s under %s' % (fname, dirname)
            continue
        try:
            result.append((num, getwordmeanfromdoc(doc)))
        except:
            print >> sys.stderr, 'Error processing %s under %s' % (fname, dirname)
            if DEBUG:
                traceback.print_exc()
                pdb.set_trace()
            raise
def getMeasure(w):
    f = urllib.urlopen("http://compling.org/cgi-bin/DAL_sentence_xml.cgi?sentence=" + w)
    s = f.read()
    f.close()
    # Parse the XML result to obtain valence and activation scores
    doc = libxml2dom.parseString(s)
    measure = doc.getElementsByTagName("measure")
    valence = measure[0].getAttribute("valence")
    activation = measure[0].getAttribute("activation")
    return [valence, activation]
def get_element(self, url):
    """
    Get an HTML element hosted at the given url. If successful, returns the
    DOM of the element; if not, returns None.
    """
    try:
        # doc_string = self.proxy.make_request_without_proxy(url)
        doc_string = self.proxy.make_request(url)
        if doc_string != None:
            return libxml2dom.parseString(doc_string, html=1)
    except IOError:
        print "unable to connect to server "
def parse(html_data, header):
    class St:
        pass
    st = St()
    st.id = -1
    st.level = 0
    st.saved_id = -100
    st.saved_level = -1
    st.results = []
    st.used = False

    def check_header(h):
        if header.__class__ == str:
            return h.strip() == header.strip()
        else:
            return header(h)

    def dfs(node, good_ol=False):
        st.id += 1
        st.level += 1
        # print str(node.name) + " " + str(st.id)
        # print "name=%s id=%d content=%s" % (node.name, st.id, node.textContent)
        if ["h2", "h3"].count(enc(node.name)) and check_header(enc(node.textContent)):
            st.saved_id = st.id
            st.saved_level = st.level
            st.used = False
        elif ["ol", "ul"].count(node.name) and (st.id == st.saved_id + 3 or st.level == st.saved_level):
            good_ol = True
        elif st.level == st.saved_level and node.name != "text" and st.used:
            # print "clearing for item type %s" % (node.name,)
            st.saved_level = -1
        elif st.level < st.saved_level:
            st.saved_level = -1
        if node.name == "a" and good_ol:
            class C:
                pass
            c = C()
            c.link = absolute_url(node.getAttribute("href"))
            c.name = node.textContent.strip()
            st.results.append(c)
            st.used = True
        for x in node.childNodes:
            dfs(x, good_ol)
        st.level -= 1

    dom = libxml2dom.parseString(html_data, html=True, htmlencoding="utf-8")
    dfs(dom)
    return st.results
def getOffers(isbn):
    if isbn in ISBNcache:
        return ISBNcache[isbn]
    g = urllib.urlopen("http://m.bookscouter.com/prices.php?isbn=" + isbn)
    html = libxml2dom.parseString(g.read(), html=1)
    venues = []
    tablecells = html.getElementsByTagName("td")
    for n in xrange(0, min(len(tablecells), NUMBER_VENUES * 2), 2):
        name = tablecells[n].textContent.replace(" ", "_")
        price = tablecells[n + 1].xpath(".//a")[0].textContent
        venues.append([price, name])
    ISBNcache[isbn] = venues
    time.sleep(0.5)  # Be nice to bookscouter
    return ISBNcache[isbn]
def get_problems(page):
    problems = []
    data = urllib.urlopen("http://infoarena.ro/arhiva?display_entries=250&first_entry=" + str(250 * page))
    s = data.read()
    doc = libxml2dom.parseString(s, html=1)
    a_elements = doc.getElementsByTagName("a")
    for node in a_elements:
        if re.match('^\/problema.*', node.getAttribute("href")):
            problems.append(node.getAttribute("href"))
    # getting rid of duplicates caused by open problems which have another link
    problems = list(set(problems))
    for i in range(len(problems)):
        problems[i] = [problems[i], "http://infoarena.ro" + problems[i]]
    return problems
def doWork(self, work):
    frame = Jaime.getInstance().page.mainFrame()
    try:
        self.document = libxml2dom.parseString(frame.toHtml().encode('utf-8', 'ignore'), html=1)
    except Exception as e:
        print 'Error converting the frame to HTML: %s' % e
    # print 'entering doWork'
    for w in work:
        try:
            f = getattr(self, w[0])
            f(*w[1])
        except Exception as e:
            print 'Exception in doWork: %s' % e
def run(self, html):
    doc = libxml2dom.parseString(html, html=1)
    doc_table = doc.getElementsByTagName('table')
    doc_table = doc_table[0]
    self.tds_width = self._some_has_width(doc.getElementsByTagName('td'))
    self.ths_width = self._some_has_width(doc.getElementsByTagName('th'))
    doc_table = self.parse_table(doc_table)
    doc_table = self.set_size_table(doc_table)
    doc_str = doc_table.toString()
    # unescape the entity-encoded angle brackets in the serialised table
    doc_str_lt = doc_str.replace('&lt;', '<')
    doc_str_gt = doc_str_lt.replace('&gt;', '>')
    return doc_str_gt
def __getKBV(self, stock):
    page = self.__webConnect.runGETRequest(self.__UrlKBV + str(stock.ISIN))
    doc = libxml2dom.parseString(page, html=1)
    td_elements = doc.getElementsByTagName("td")
    c = 0
    for i in td_elements:
        data = i.textContent
        if self.KBV == "NA" and data.find("KBV") > -1 and data.find("title=\"Kurs/Buchungs"):
            self.KBV = float(td_elements[c + 1].textContent.replace(",", "."))
        c = c + 1
def get_lecture_text(html_data):
    def try_toc_format(dom):
        def dfs(node):
            count = None
            if node.getAttribute("id") == "bodyContent":
                count = 0
            for x in node.childNodes:
                r = dfs(x)
                if r:
                    return r
                if count != None:
                    count += 1
            return count

        res = ""
        entries = parse(html_data, "Spis treści")
        # d = dfs(dom)
        # print "dfs returned %d, len(entries)=%d" % (d, len(entries))
        # print "html_data: " + html_data
        if [19, 22].count(dfs(dom)) == 0 or len(entries) < 5:
            return None
        for entry in entries:
            # print "subentry: %s %s" % (enc(entry.name), enc(entry.link))
            html_data2 = fetch_html(absolute_url(entry.link))
            res += " " + get_lecture_text(html_data2)
        return res

    def dfs(node, good_ol=False):
        if node.getAttribute("id") == "bodyContent":
            good_ol = True
        if good_ol and node.name == "text":
            st.results += " " + enc(node.textContent)
        for x in node.childNodes:
            dfs(x, good_ol)

    class St:
        pass
    st = St()
    st.results = ""
    dom = libxml2dom.parseString(html_data, html=True, htmlencoding="utf-8")
    res = try_toc_format(dom)
    if res:
        return res
    dfs(dom)
    return st.results
def get_site_description(res):
    doc = libxml2dom.parseString(res.read(), html=1)
    meta_tag_nodes = doc.getElementsByTagName('meta')
    for meta_tag_node in meta_tag_nodes:
        if str(meta_tag_node.getAttribute('name')).lower() == "description":
            return meta_tag_node.getAttribute('content')
    # if no description meta tag was found, try google
    url = ('http://ajax.googleapis.com/ajax/services/search/web'
           '?v=1.0&q=site:' + res.url)
    res = get_http_response(url)
    if not res:
        return ''
    results = simplejson.load(res)['responseData']['results']
    if results:
        return results[0]['content']
    return ''
def getOnvistaId(self, stock):
    page = self.__webConnect.runGETRequest(self.__StockOverviewUrl + str(stock.ISIN))
    onvistaId = ""
    doc = libxml2dom.parseString(page, html=1)
    a_elements = doc.getElementsByTagName("a")
    for i in a_elements:
        if i.textContent == "Kennzahlen" and "kennzahlen/fundamental.html?ID_OSI" in i.attributes["href"].value:
            url = i.attributes["href"].value
            onvistaId = str(url.split("=")[1])
            break
    if onvistaId.isdigit() == False:
        raise NameError('Error: getOnvistaId, Id nicht numeric: ' + onvistaId)
    return onvistaId
def getUrlList(mobygameUrl):
    doc = libxml2dom.parseString(mobygameUrl, html=1)
    # mof_object_list - id of the table containing the list of all games
    table = doc.getElementById("mof_object_list")
    urlList = []
    if table is None:
        print ("ERROR! No table found!")
    tbody = table.getElementsByTagName("tbody")
    if len(tbody) > 1:
        print ("ERROR! More than one tbody in table found!")
    elif len(tbody) == 0:
        print ("ERROR! No tbody found")
    allTr = tbody[0].getElementsByTagName("tr")
    for tr in allTr:
        urlList.append(tr.getElementsByTagName("a")[0].getAttribute("href"))
    return urlList
def getFinanzenNetId(self, stock):
    url = self.__UrlKBV + stock.ISIN
    page = self.__webConnect.runGETRequest(url)
    finanzenNetId = ""
    doc = libxml2dom.parseString(page, html=1)
    a_elements = doc.getElementsByTagName("a")
    for i in a_elements:
        if i.textContent == "Historisch" and "kurse_historisch.asp" in i.attributes["href"].value:
            url = i.attributes["href"].value
            finanzenNetId = str(url.split("=")[1].split("&")[0])
            break
    if finanzenNetId.isdigit() == False:
        raise NameError('Error: getFinanzenNetId, Id nicht numeric: ' + finanzenNetId)
    return finanzenNetId
def __getDataForIndex(self, stock):
    t1 = date.today()
    t2 = t1 - timedelta(days=380)
    url = self.__FinanzenNetHistorischeKurseIndizesURL.replace("XXX", stock.strIndexFinanzenNet)
    self.__POSTDataIndex["dtTag1"] = t2.day
    self.__POSTDataIndex["dtMonat1"] = t2.month
    self.__POSTDataIndex["dtJahr1"] = t2.year
    self.__POSTDataIndex["dtTag2"] = t1.day
    self.__POSTDataIndex["dtMonat2"] = t1.month
    self.__POSTDataIndex["dtJahr2"] = t1.year
    page = self.__webConnect.runPOSTRequest(url, self.__POSTDataIndex)
    doc = libxml2dom.parseString(page, html=1)
    td_elements = doc.getElementsByTagName("td")
    c = 0
    for i in td_elements:
        data = i.textContent
        if self.__validateDate(data) == 1:
            datum = datetime.datetime.strptime(data, '%d.%m.%Y').date()
            '''
            The date is valid. Now check when the required deltas were reached
            in order to set the values.
            '''
            DatumList = []
            DatumList.append(date.today() - timedelta(date.today().day))
            DatumList.append(DatumList[0] - timedelta(DatumList[0].day))
            DatumList.append(DatumList[1] - timedelta(DatumList[1].day))
            DatumList.append(DatumList[2] - timedelta(DatumList[2].day))
            for i in [0, 1, 2, 3]:
                if self.IndexList[i] == 0 and datum <= DatumList[i]:
                    self.IndexList[i] = float(td_elements[c + 2].textContent.replace(".", "").replace(",", "."))
        c = c + 1
def no_header(source, headers, table_index):
    # initiate a list to hold the return list
    return_list = []
    # get a document object out of the source code
    doc = libxml2dom.parseString(source, html=1)
    # get the tables from the document
    tables = doc.getElementsByTagName('table')
    try:
        # try to get focus on the desired table
        main_table = tables[table_index]
    except:
        # if the table doesn't exist then return an error
        return ['The table index was not found']
    # get all of the rows out of the main_table
    rows = main_table.getElementsByTagName('tr')
    # loop through each row
    for row in rows:
        # get all cells from the current row
        cells = row.getElementsByTagName('td')
        # initiate a list to append into the return_list
        cell_list = []
        # loop through the list of desired column indices
        for i in headers:
            try:
                # try to add text from the cell into the cell_list
                cell_list.append(cells[i].textContent)
            except:
                # if there is an error (usually an index error) just continue
                continue
        # append the data scraped into the return_list
        return_list.append(cell_list)
    # return the return list
    return return_list
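# A hedged usage sketch for the no_header() scraper above. The HTML string and
# the column indices below are illustrative assumptions, not taken from the
# original source; it only exercises the function as defined.
sample_table = """
<table>
  <tr><td>alpha</td><td>1</td><td>x</td></tr>
  <tr><td>beta</td><td>2</td><td>y</td></tr>
</table>
"""
# Extract columns 0 and 1 from the first table found in the markup.
rows = no_header(sample_table, [0, 1], 0)
print rows  # expected shape: [[u'alpha', u'1'], [u'beta', u'2']]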
def _retrieve_summary(self, url):
    """Retrieve the episode summary"""
    # reader = HtmlLib.Reader()
    if not url:
        print >> sys.stderr, 'Warning: no URL for summary'
        return ''
    print "Retrieving %s" % url
    show = urllib.urlopen(url)
    showstr = show.read()
    # Website sometimes contains invalid characters, which cause the
    # DOM parser to fail. Discard any non-ASCII character.
    showstr = showstr.decode('ascii', 'ignore')
    show.close()
    doc = libxml2dom.parseString(showstr, html=1)
    for node in doc.getElementsByTagName('p'):
        if node.hasAttribute('class'):
            if 'deck' in node.getAttribute('class'):
                for child_node in node.childNodes:
                    if child_node.nodeType == child_node.TEXT_NODE:
                        summary = child_node.nodeValue.strip()
                        if summary:
                            return summary
    return ''
def __parseOnvistaSummary(self, stock):
    page = self.__webConnect.runGETRequest(self.__OnvistaFundamentaldatenTabelleUrl + str(stock.OnvistaId))
    doc = libxml2dom.parseString(page, html=1)
    td_elements = doc.getElementsByTagName("td")
    c = 0
    for i in td_elements:
        data = i.textContent
        try:
            if data == "Marktkap.:":
                try:
                    tmp = float(td_elements[c + 1].textContent.replace(".", "").replace(",", ".").replace(" Mio EUR", ""))
                    self.__MarktkapitalisierungInEuro = tmp * 1000 * 1000
                except ValueError:
                    self.__MarktkapitalisierungInEuro = "NA"
            if data == "Dividendenrendite in %":
                try:
                    self.__DivRenditeAktJahrProzent = float(td_elements[c + 1].textContent.replace(",", "."))
                except ValueError:
                    self.__DivRenditeAktJahrProzent = "NA"
            if data == u"Kurs-Buchwert-Verhältnis":
                try:
                    self.__KBVAktJahr = float(td_elements[c + 1].textContent.replace(",", "."))
                except ValueError:
                    self.__KBVAktJahr = "NA"
            if data == "KGV":
                try:
                    self.__KGVAktJahr = float(td_elements[c + 1].textContent.replace(",", "."))
                except ValueError:
                    self.__KGVAktJahr = "NA"
                try:
                    summe = 0
                    for j in {1, 2, 3, 4, 5}:
                        summe += float(td_elements[c + j].textContent.replace(",", "."))
                    self.__KGVMean5Years = summe / 5
                except ValueError:
                    self.__KGVMean5Years = "NA"
            if data == "Eigenkapitalquote in %":
                try:
                    self.__EigenkapitalquoteAktJahrProzent = float(td_elements[c + 1].textContent.replace(",", "."))
                except ValueError:
                    self.__EigenkapitalquoteAktJahrProzent = "NA"
            if data == "EBIT-Marge in %":
                try:
                    self.__EbitMargeAktJahrProzent = float(td_elements[c + 1].textContent.replace("%", "").replace(",", "."))
                except ValueError:
                    self.__EbitMargeAktJahrProzent = "NA"
            if data == "Eigenkapitalrendite in %":
                try:
                    self.__EKRAktJahrProzent = float(td_elements[c + 1].textContent.replace("%", "").replace(",", "."))
                except ValueError:
                    self.__EKRAktJahrProzent = "NA"
        except ValueError, e:
            traceback.print_exc()
            raise ValueError("Error parseOnvistaSummary, Stock " + stock.Name + ", ISIN " + stock.ISIN)
        c = c + 1
# Each CMS has an id associated with it
ACIDRE = re.compile(".*start.asp\?acid=([^&]*).*", re.DOTALL)
acidMatch = ACIDRE.match(cmsList)
if acidMatch is None:
    flush_print("Could not find site ID")
    sys.exit(-1)
acid = acidMatch.group(1)
flush_print("Using Site ID: '%s'" % acid)

# Pull the IDs out for each site
cmsListDoc = libxml2dom.parseString(cmsList, html=1)
CMSRE = re.compile("http://([^/]+)/site/lookup.asp\?c=(.+)", re.DOTALL)
MENURE = re.compile("ShowMenu\(event, *'(.*)'.*")
FOLEXPCOLLRE = re.compile("fExpandCollapse\(([0-9]*), *[01][^01]")
FILEXPCOLLRE = re.compile("fExpandCollapse\('([0-9]*)', *1[01]")
CIDRE = re.compile(".*[^a]cid={([^}]*)}.*", re.DOTALL)
BINIDRE = re.compile("{([^}]*)}.*")


# Using the parseFile method is causing the program to abort with a
# "too many files open" error
def parseFile(filename, html=True):
    flush_print("Parsing: %s" % filename)
    fileHandle = open(filename)
    fileDoc = libxml2dom.parse(fileHandle)
#         if child.nodeType == child.TEXT_NODE:
#             result += child.nodeValue
#         else:
#             result += TextInNode(child)
#     return result

filelist = glob.glob('*.htm')
filenum = len(filelist)
num = 0
errorfiles = []
for filename in filelist:
    num += 1
    print >> sys.stderr, filename, num, 'of', filenum
    try:
        fp = open(filename, 'r')
        doc = libxml2dom.parseString(fp.read(), html=1)
        fp.close()
        style = doc.getElementsByTagName("style")[0].textContent
        style = re.search(r'(?s)\s*\.(\S+)\s*{\s*display:\s*none', style)
        displaynone = style.group(1)
        tabpages = doc.getElementsByTagName("div")
        tabpages = filter(lambda s: s.getAttribute("class") == "tab-page", tabpages)
        for tabpage in tabpages:
            found = False
            for node in tabpage.childNodes:
                if node.nodeType == node.ELEMENT_NODE and node.name == 'h2':
                    if node.textContent == whattoextract:
                        found = True
                        break
            if found:
def appendHTML(node, html):
    html = "<html>%s</html>" % html
    doc = libxml2dom.parseString(html, html=1)
    if doc.documentElement.childNodes.length > 0:
        for child in doc.documentElement.childNodes[0].childNodes:
            node.appendChild(node.importNode(child, True))
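# A hedged usage sketch for appendHTML() above. The base markup and the target
# element are illustrative assumptions; the sketch simply reuses the helper as
# defined to graft parsed HTML children onto an existing node.
import libxml2dom

base = libxml2dom.parseString("<html><body><div id='target'></div></body></html>", html=1)
target = base.getElementsByTagName("div")[0]
appendHTML(target, "<p>first</p><p>second</p>")
print target.toString()  # the two paragraphs should now be children of the div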
def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):
    return libxml2dom.parseString(s, html=html, htmlencoding=htmlencoding,
                                  unfinished=unfinished, impl=(impl or default_impl))
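# A minimal usage sketch for the wrapper above, assuming default_impl is
# defined in the surrounding module as the signature implies; the markup is an
# illustrative assumption.
doc = parseString("<html><body><p class='msg'>hello</p></body></html>", html=1)
for p in doc.getElementsByTagName("p"):
    print p.textContent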
        logging.critical("""To use downloadFromSomeSite function, \
you must provide options baselink and urlsearch in your config""")
        return None
    global libxml2dom
    try:
        libxml2dom
    except NameError:
        import libxml2dom
    try:
        a = codecs.open(os.path.join(directory, filename), 'rb')
    except IOError, m:
        failedProcedure(
            u"""%s: could not even open our just written file. leaving \
function..""" % m,
            directory, filename, threadName, rssItemNode, downloadDict)
        return None
    p = libxml2dom.parseString(a.read(), html=True)
    try:
        link = "%s%s" % (baselink, [
            x.getAttribute('href') for x in p.getElementsByTagName('a')
            if x.hasAttribute('href') and x.getAttribute('href').count(urlsearch)
        ][0])
        # if you want a regex, then instead of
        # x.getAttribute('href').count(urlsearch) do:
        # re.search(urlsearch, x.getAttribute('href'))
    except IndexError, m:
        failedProcedure(
            u"""%s: could not find href for downloaded %s item for \
redownload""" % (m, threadName),
            directory, filename, threadName, rssItemNode, downloadDict)
        return None
    try: