def extract_urls(self, doc, base_url):
    """Parses a document and returns URLS found in it.

    As shown here, the only explicit return is the empty set handed back
    when ``TidyHTMLTreeBuilder.parse(doc)`` raises; the error is written to
    the ``debug`` stream first.

    NOTE(review): no URL-collecting code follows the parse, and ``base_url``
    is unused in this view -- the rest of the body may be missing; confirm.
    """
    try:
        tree = TidyHTMLTreeBuilder.parse(doc)
    except Exception, e:
        # Any parse failure is reported and treated as "no URLs found".
        print >> debug, "..Error while parsing: %r" % e
        return set()
def getPluginHTMLpage(element):
    """Fetch and parse the wiki page of the plugin referenced by *element*.

    The relative link is read from ``element.attrib['link']`` and combined
    with ``WIKI_URL`` through ``__join``.

    Returns whatever ``TidyHTMLTreeBuilder.parse`` produces for the page
    (a parsed document, not a raw HTML string).

    Raises ``KeyError`` when *element* has no ``link`` attribute.
    """
    rel_link = element.attrib['link']
    url = __join(WIKI_URL, rel_link)
    response = urllib.urlopen(url)
    try:
        plugin_page = TidyHTMLTreeBuilder.parse(response)
    finally:
        # Close the response ourselves instead of relying on the parser or
        # on garbage collection, so the connection is released even when
        # parsing fails.
        response.close()
    return plugin_page
def getPluginHTMLpage(element):
    """Download the plugin's wiki page and return it parsed.

    *element* must carry a ``link`` attribute holding the page location
    relative to ``WIKI_URL``; the two are combined with ``__join``.

    The return value is the result of ``TidyHTMLTreeBuilder.parse`` for the
    downloaded page, i.e. a parsed document rather than raw HTML text.

    Raises ``KeyError`` if the ``link`` attribute is missing.
    """
    url = __join(WIKI_URL, element.attrib['link'])
    page = urllib.urlopen(url)
    try:
        return TidyHTMLTreeBuilder.parse(page)
    finally:
        # Always release the HTTP response, including when parsing raises.
        page.close()
def fetch(self, type, isbn):
    """Look up *isbn* on thatscrazyhot.com and return the citation text.

    *type* is passed through as the ``citetype`` query parameter and *isbn*
    as the ``search`` parameter.  The text of the first ``div`` whose ``id``
    is ``leftcolumn`` is extracted with ``self.get_text`` and returned; when
    no such ``div`` exists the string ``"isbn <isbn> not found"`` is
    returned instead.
    """
    from socket import setdefaulttimeout
    # NOTE: this sets the default timeout for every socket created
    # afterwards in this process, not just for this request.
    setdefaulttimeout(60)
    ns = 'http://www.w3.org/1999/xhtml'
    url = 'http://thatscrazyhot.com/search/searchisbn?citetype=%s&search=%s' % (type, isbn)
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # Release the HTTP response even if the page cannot be parsed.
        response.close()
    for e in tree.findall('.//{%s}div' % ns):
        if e.attrib.get('id') == 'leftcolumn':
            return self.get_text(e)
    return "isbn %s not found" % isbn
def euph(self, irc, msg, args):
    """generate a euphemism
    """
    # The docstring is deliberately left untouched: presumably the bot
    # framework shows it as the command's help text -- confirm before editing.
    url = "http://walkingdead.net/perl/euphemism"
    response = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # Release the HTTP response even if parsing fails.
        response.close()
    ns = 'http://www.w3.org/1999/xhtml'
    td = tree.find('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % {'ns': ns})
    # Compare with None explicitly: an Element without children is falsy,
    # so a bare truth test confuses "not found" with "found but childless".
    h2 = td.find('.//{%s}h2' % ns) if td is not None else None
    if h2 is not None:
        irc.reply("%s ... %s" % (td.text, h2.text))
    else:
        # Either the cell or its heading is missing from the page.
        irc.reply("uhoh, couldn't get a euphemism, sorry dude")
def euph(self, irc, msg, args):
    """generate a euphemism
    """
    # Docstring kept verbatim; it presumably doubles as user-visible help
    # text for the bot command -- confirm before rewording it.
    ns = 'http://www.w3.org/1999/xhtml'
    url = "http://walkingdead.net/perl/euphemism"
    page = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(page)
    finally:
        # Do not leak the HTTP response when parsing raises.
        page.close()
    td = tree.find('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % {'ns': ns})
    heading = None
    # Element truthiness reflects "has children", not "was found", so the
    # lookups are checked against None explicitly.
    if td is not None:
        heading = td.find('.//{%s}h2' % ns)
    if heading is None:
        irc.reply("uhoh, couldn't get a euphemism, sorry dude")
        return
    irc.reply("%s ... %s" % (td.text, heading.text))
def fetch(self, type, isbn):
    """Return the citation text for *isbn* from thatscrazyhot.com.

    The request carries *type* as ``citetype`` and *isbn* as ``search``.
    The first ``div`` with ``id="leftcolumn"`` in the reply is converted to
    text via ``self.get_text``; if the page has no such ``div`` the string
    ``"isbn <isbn> not found"`` is returned.
    """
    from socket import setdefaulttimeout
    # NOTE: affects the default timeout of all sockets created later in
    # this process, not only the request below.
    setdefaulttimeout(60)
    ns = 'http://www.w3.org/1999/xhtml'
    url = 'http://thatscrazyhot.com/search/searchisbn?citetype=%s&search=%s' % (
        type, isbn)
    page = urlopen(url)
    try:
        tree = TidyHTMLTreeBuilder.parse(page)
    finally:
        # Close the response regardless of whether parsing succeeded.
        page.close()
    for div in tree.findall('.//{%s}div' % ns):
        if div.attrib.get('id') == 'leftcolumn':
            return self.get_text(div)
    return "isbn %s not found" % isbn
def __init__(self):
    """Populate the item by scraping the page at ``self.url``.

    Every ``table/tr/td`` cell of the page is collected and the attributes
    ``date``, ``holdings``, ``title``, ``author``, ``publisher`` and
    ``contributedby`` are read from fixed cell positions.

    Raises ``IndexError`` or ``AttributeError`` when the page does not have
    the cells (or the ``br`` element in the first cell) at those positions.
    """
    ns = {'ns': LastItem.ns}
    response = urlopen(self.url)
    try:
        tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # Release the HTTP response even if the page cannot be parsed.
        response.close()
    root = tree.getroot()
    cells = root.findall('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % ns)
    # The date is the text that follows the <br> inside the first cell.
    self.date = cells[0].find('.//{%(ns)s}br' % ns).tail.strip()
    self.holdings = cells[2].text.replace('Total holdings in WorldCat: ', '')
    self.title = cells[6].text
    self.author = cells[8].text
    self.publisher = cells[10].text
    # The contributor sits in a different cell depending on how many cells
    # the page has (17 cells -> index 16, otherwise index 18).
    if len(cells) == 17:
        self.contributedby = cells[16].text
    else:
        self.contributedby = cells[18].text
def __init__(self):
    """Scrape ``self.url`` and store the item's fields on the instance.

    Sets ``date``, ``holdings``, ``title``, ``author``, ``publisher`` and
    ``contributedby`` from fixed positions in the list of ``table/tr/td``
    cells found on the page.

    Raises ``IndexError`` or ``AttributeError`` if the expected cells, or
    the ``br`` element inside the first one, are not present.
    """
    ns = {'ns': LastItem.ns}
    page = urlopen(self.url)
    try:
        tree = TidyHTMLTreeBuilder.parse(page)
    finally:
        # Always close the response so the connection is not leaked.
        page.close()
    cells = tree.getroot().findall('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % ns)
    # Text trailing the <br> in the first cell holds the date.
    self.date = cells[0].find('.//{%(ns)s}br' % ns).tail.strip()
    self.holdings = cells[2].text.replace('Total holdings in WorldCat: ', '')
    self.title = cells[6].text
    self.author = cells[8].text
    self.publisher = cells[10].text
    # A 17-cell page keeps the contributor at index 16; any other layout is
    # read from index 18.
    contributor_index = 16 if len(cells) == 17 else 18
    self.contributedby = cells[contributor_index].text
def parseHtml():
    """Scrape http://www.lottery.co.uk/statistics/ into a dictionary.

    Inside every ``div`` of class ``main``, each direct child ``table`` that
    has a ``style`` attribute is scanned cell by cell
    (``./tr/td/table/tr/td``).  A cell containing an ``img`` supplies the
    next key (the image's ``alt`` text); a following cell that has a
    ``bgcolor`` attribute, no ``width`` attribute and some text supplies the
    value (the text minus its first two characters).  Each pair is printed
    and stored.

    Returns the resulting ``{alt_text: cell_text}`` dictionary.
    """
    XHTML = "{http://www.w3.org/1999/xhtml}"
    num_freq_dict = {}
    page = urllib2.urlopen('http://www.lottery.co.uk/statistics/')
    try:
        tree = TidyHTMLTreeBuilder.parse(page)
    finally:
        # Release the HTTP response even if parsing fails.
        page.close()
    docRoot = tree.getroot()

    # Normalise the XHTML to HTML - removes namespace.  The iterator already
    # visits every element, so each tag is checked and stripped exactly once
    # and tags outside the namespace are left alone.
    for elem in docRoot.getiterator():
        if elem.tag.startswith(XHTML):
            elem.tag = elem.tag[len(XHTML):]

    # Key waiting for its value.  It starts as None and is reset to None
    # after every stored pair, so a value cell without a preceding image is
    # recorded under None instead of raising NameError.
    k = None
    for d in docRoot.getiterator():
        if d.tag != "div" or d.attrib.get("class") != 'main':
            continue
        for e in d:
            if e.tag != 'table' or "style" not in e.attrib:
                continue
            for h in e.findall("./tr/td/table/tr/td"):
                z = h.find('img')
                if z is not None:
                    k = z.attrib['alt']
                    continue
                if "bgcolor" not in h.attrib or "width" in h.attrib:
                    continue
                if h.text is None:
                    continue
                v = h.text[2:]
                print(str(k) + " " + str(v))
                num_freq_dict[k] = v
                k = None
    return num_freq_dict
def parse_xls_file(xls_file):
    """Parse an HTML-table "xls" file into labels and a numeric matrix.

    *xls_file* is handed to ``TidyHTMLTreeBuilder.parse``.  The first table
    directly under ``body`` is read row by row using ``child_texts``:

    * the first row provides the column names (its first cell is skipped);
    * every later row with exactly one more cell than there are column
      names contributes its first cell as a data-set name and the remaining
      cells, converted with ``float``, as a matrix row;
    * rows of any other length are ignored.

    Returns ``(data_sets, col_names, matrix)`` where ``matrix`` is a NumPy
    array.

    Raises ``AttributeError`` when the document has no ``body/table`` and
    ``ValueError`` when a cell cannot be converted to ``float``.
    """
    tree = TidyHTMLTreeBuilder.parse(xls_file)
    root = tree.getroot()
    table = root.find(
        '{http://www.w3.org/1999/xhtml}body/{http://www.w3.org/1999/xhtml}table')
    col_names = None
    data_sets = []
    matrix = []
    for row in table.findall('{http://www.w3.org/1999/xhtml}tr'):
        row_texts = child_texts(row)
        if col_names is None:
            col_names = row_texts[1:]
        elif len(row_texts) == len(col_names) + 1:
            data_sets.append(row_texts[0])
            # Build a real list so each row is a sequence of floats no
            # matter whether ``map`` is eager or lazy.
            matrix.append([float(text) for text in row_texts[1:]])
    matrix = np.array(matrix)
    return data_sets, col_names, matrix
def inst(self, irc, msg, args):
    """<institution>

    Look up a oclc participating institution by code
    """
    # Docstring wording kept as-is; it presumably serves as the command's
    # help text in the bot framework -- confirm before editing.
    from elementtree.ElementTree import tostring
    from socket import setdefaulttimeout
    from re import sub
    # NOTE: changes the default timeout for all sockets created afterwards
    # in this process.
    setdefaulttimeout(60)
    ns = 'http://www.w3.org/1999/xhtml'
    inst = args[0]
    self.log.info("looking up oclc institution %s" % inst)
    url = "http://www.oclc.org/common/cgi-oclc/pi.pl?max=1&sym=%s" % inst
    response = ""
    try:
        page = urlopen(url)
        try:
            tree = TidyHTMLTreeBuilder.parse(page)
        finally:
            # Release the HTTP response even if parsing fails.
            page.close()
        root = tree.getroot()
        # The institution details are taken from the ninth <font> element.
        info = tostring(root.findall('.//{%s}font' % ns)[8])
        # Flatten newlines, then strip the markup to leave plain text.
        response = sub("<.*?>", '', info.replace("\n", ' '))
    except Exception:
        # Any lookup/parsing failure is reported as "no hits", but
        # KeyboardInterrupt and SystemExit are no longer swallowed.
        response = "sorry no hits for %s" % inst
    irc.reply(response)
def inst(self, irc, msg, args):
    """<institution>

    Look up a oclc participating institution by code
    """
    # The docstring text is unchanged on purpose: it presumably is shown to
    # users as command help -- confirm before rewording.
    from elementtree.ElementTree import tostring
    from socket import setdefaulttimeout
    from re import sub
    # NOTE: process-wide default for sockets created from here on.
    setdefaulttimeout(60)
    ns = 'http://www.w3.org/1999/xhtml'
    inst = args[0]
    self.log.info("looking up oclc institution %s" % inst)
    url = "http://www.oclc.org/common/cgi-oclc/pi.pl?max=1&sym=%s" % inst
    response = ""
    try:
        handle = urlopen(url)
        try:
            root = TidyHTMLTreeBuilder.parse(handle).getroot()
        finally:
            # Close the response whether or not the page parsed.
            handle.close()
        # Details live in the ninth <font> element of the result page.
        fonts = root.findall('.//{%s}font' % ns)
        info = tostring(fonts[8])
        # Replace newlines with spaces and drop all tags.
        response = sub("<.*?>", '', info.replace("\n", ' '))
    except Exception:
        # Treat every ordinary failure as "no hits"; unlike a bare
        # ``except:`` this lets KeyboardInterrupt/SystemExit propagate.
        response = "sorry no hits for %s" % inst
    irc.reply(response)
def parse_error(self, html):
    """Inspect a returned HTML page and react to the form it contains.

    *html* is parsed with ``TidyHTMLTreeBuilder.parse``.  The form is looked
    up at ``body/form`` and, failing that, at ``body/blockquote/form``; the
    value of its ``name`` attribute selects the handling:

    * ``frmConfirmation`` -- existing sessions are displayed and closed via
      ``self.close_sessions``, then a session is requested again.
    * ``frmLogin`` -- if an ``input`` named ``key`` is present, the
      post-auth "Proceed" request is sent; otherwise the user is asked for
      credentials again and ``self.log_in()`` is retried.
    * ``frmNextToken`` -- the user is prompted for the next token and
      ``self.log_in()`` is retried.

    Returns the session from ``self.get_session()`` when it is not None; in
    the first two cases a None session leads to a recursive call on
    ``self.args.out_file``.  All other paths fall off the end and return
    None.

    NOTE(review): statement nesting here was reconstructed from the token
    order; confirm the placement of the statements following each ``if``.
    """
    tree = TidyHTMLTreeBuilder.parse(html)
    prefix = '{http://www.w3.org/1999/xhtml}'
    root = tree.getroot()
    self.form = root.find('{}body/{}form'.format(prefix, prefix))
    # Some of the returned html has a blockquote before the form, some don't
    if self.form is None:
        self.form = root.find(
            '{}body/{}blockquote/{}form'.format(*[prefix] * 3))
    if self.form is not None:
        # Attributes of the <form> element; 'name' identifies the page.
        fields = dict(self.form.items())
        if fields['name'] == 'frmConfirmation':
            # Existing sessions open
            print("There are existing sessions open")
            #submit = form.find('{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}input'.format(*[prefix]*13))
            submit = find_by_name(self.form, 'btnContinue')
            values = dict(submit.items())
            table = find_sessions(self.form)
            display_session(table)
            # The label of the continue button decides how close_sessions
            # is called.
            if values["value"] == "Close Selected Sessions and Log in":
                self.close_sessions(table, True)
            elif values[
                    "value"] == "Log in (and optionally Close Selected Sessions)":
                self.close_sessions(table)
            session = self.get_session()
            if session is not None:
                return session
            else:
                # Presumably out_file holds the most recent response page;
                # verify against get_session().
                return self.parse_error(self.args.out_file)
        elif fields['name'] == 'frmLogin':
            print "Found login form, looking post-auth message"
            # This may be the post-login form
            # check whether there's a hidden field with a key
            #print tostring(self.form)
            temp = self.form.getiterator('{}input'.format(prefix))
            isSecondary = 0
            if temp is not None:
                for t in temp:
                    #print tostring(t)
                    if t.get('name') == 'key':
                        key = t.get('value')
                        isSecondary = 1
                        break
            if isSecondary == 1:
                # A 'key' input was found: send the post-auth "Proceed"
                # request with that key.
                dat = {"key": key, "sn-postauth-proceed": "Proceed"}
                #print dat
                self.opener.open(
                    "https://{}{}".format(self.args.hostname,
                                          self.args.login_path),
                    urllib.urlencode(dat))
                #print 'Attempting to get session'
                session = self.get_session()
                if session is not None:
                    return session
                else:
                    return self.parse_error(self.args.out_file)
            else:
                # Invalid user/pass try again
                print("Invalid user/pass, please try again")
                self.get_user()
                passwords = self.get_passwords()
                self.data = self.configure_data(passwords)
                self.log_in()
                session = self.get_session()
                if session is not None:
                    return session
                else:
                    return self.parse_error(self.args.out_file)
        elif fields['name'] == 'frmNextToken':
            # Wait till the next token pops up and then enter it
            # NOTE(review): this path evaluates to '{ns}/input', with a
            # slash between the namespace and the tag -- confirm it matches.
            temp = self.form.find('{}/input'.format(prefix))
            if temp is not None:
                values = dict(temp.items())
                try:
                    if values['name'] == 'key':
                        # NOTE(review): `key` is not read again below.
                        key = values['value']
                    else:
                        print(
                            "Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong."
                        )
                        return
                except KeyError:
                    # The input has no 'name' or 'value' attribute.
                    return
            password = getpass(
                "Please enter the next securID token to appear on your fob"
            )
            self.data = self.configure_data({"password": password})
            #self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), self.data)
            self.log_in()
            # NOTE(review): the session is fetched but not returned here, so
            # this branch yields None -- confirm that is intended.
            session = self.get_session()
        else:
            # Unknown case, note where the file is so they can see what happened
            print(
                "An unhandled case has come up. Please view the page at {}"
                .format(self.args.out_file))
    else:
        # Neither form location matched.
        print("Unable to parse the html, please view it at {}".format(
            self.args.out_file))
quotechar='|', quoting=csv.QUOTE_MINIMAL) dataout.writerow(['season', 'week', 'away', 'home', 'awayscore', 'homescore']) chdir(sdir) weeks = regs[:] if y != 2009: weeks.extend(ps) for i in range(len(weeks)): w = weeks[i] if not exists(w): logging.error("No data for %d %s" % (y, w)) continue tree = TidyHTMLTreeBuilder.parse(w) for elt in tree.getiterator(): if elt.tag == div and elt.get('class') == 'col scorebox-container': game = dict() uls = elt.findall('.//'+ul) for u in uls: if u.get('class').endswith('-team'): for l in u.getiterator(): if l.tag == li and l.get('class') == 'team-logo': game[u.get('class')[0:4]] = l.get('id') break divs = elt.findall('.//'+div) for d in divs: if d.get('class') is not None and d.get('class').endswith('-score'): for s in d.getiterator(): if s.tag == div and s.get('class') == 'the-score':
def getPluginListHTMLpage():
    """Fetch and parse the wiki page that collects the plugin hierarchy.

    The page at ``PLUGIN_LIST_URL`` is downloaded and handed to
    ``TidyHTMLTreeBuilder.parse``; the parser's result is returned (a parsed
    document, not a raw HTML string).
    """
    response = urllib.urlopen(PLUGIN_LIST_URL)
    try:
        plugin_list_page_tree = TidyHTMLTreeBuilder.parse(response)
    finally:
        # Close the response ourselves so the connection is released even
        # when parsing fails.
        response.close()
    return plugin_list_page_tree
def parse_error(self, html):
    """Inspect a returned HTML page and react to the form it contains.

    *html* is parsed with ``TidyHTMLTreeBuilder.parse``.  The form is looked
    up at ``body/form`` and, failing that, at ``body/blockquote/form``; the
    value of its ``name`` attribute selects the handling:

    * ``frmConfirmation`` -- existing sessions are displayed and closed via
      ``self.close_sessions``, then a session is requested again.
    * ``frmLogin`` -- if an ``input`` named ``key`` is present, the
      post-auth "Proceed" request is sent; otherwise an error message is
      printed (taken from the element with id ``table_LoginPage_5`` when
      ``find_by_id`` locates it), credentials are requested again and
      ``self.log_in()`` is retried.
    * ``frmNextToken`` -- the user is prompted for the next token and
      ``self.log_in()`` is retried.

    Returns the session from ``self.get_session()`` when it is not None; in
    the first two cases a None session leads to a recursive call on
    ``self.args.out_file``.  All other paths fall off the end and return
    None.

    NOTE(review): statement nesting here was reconstructed from the token
    order; confirm the placement of the statements following each ``if``.
    """
    tree = TidyHTMLTreeBuilder.parse(html)
    prefix = '{http://www.w3.org/1999/xhtml}'
    root = tree.getroot()
    self.form = root.find('{}body/{}form'.format(prefix, prefix))
    # Some of the returned html has a blockquote before the form, some don't
    if self.form is None:
        self.form = root.find('{}body/{}blockquote/{}form'.format(*[prefix]*3))
    if self.form is not None:
        # Attributes of the <form> element; 'name' identifies the page.
        fields = dict(self.form.items())
        if fields['name'] == 'frmConfirmation':
            # Existing sessions open
            print("There are existing sessions open")
            #submit = form.find('{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}table/{}tr/{}td/{}input'.format(*[prefix]*13))
            submit = find_by_name(self.form, 'btnContinue')
            values = dict(submit.items())
            table = find_sessions(self.form)
            display_session(table)
            # The label of the continue button decides how close_sessions
            # is called.
            if values["value"] == "Close Selected Sessions and Log in":
                self.close_sessions(table, True)
            elif values["value"] == "Log in (and optionally Close Selected Sessions)":
                self.close_sessions(table)
            session = self.get_session()
            if session is not None:
                return session
            else:
                # Presumably out_file holds the most recent response page;
                # verify against get_session().
                return self.parse_error(self.args.out_file)
        elif fields['name'] == 'frmLogin':
            #print "Found login form, looking post-auth message"
            # This may be the post-login form
            # check whether there's a hidden field with a key
            #print tostring(self.form)
            temp = self.form.getiterator('{}input'.format(prefix))
            isSecondary = 0
            if temp is not None:
                for t in temp:
                    #print tostring(t)
                    if t.get('name') == 'key':
                        key = t.get('value')
                        isSecondary = 1
                        break
            if isSecondary == 1:
                # A 'key' input was found: send the post-auth "Proceed"
                # request with that key.
                dat = {"key": key, "sn-postauth-proceed": "Proceed"}
                #print dat
                self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), urllib.urlencode(dat))
                #print 'Attempting to get session'
                session = self.get_session()
                if session is not None:
                    return session
                else:
                    return self.parse_error(self.args.out_file)
            else:
                # Generic message, replaced below when the page carries its
                # own error text.
                error_msg = "Undetermined error. Possible invalid user/pass, please try again (HTML of last page is written to {}.".format(self.args.out_file)
                error_table = find_by_id(self.form, 'table_LoginPage_5')
                if error_table is not None:
                    # Use the text of the table's first tr/td cell.
                    error_msg = error_table.find('{}tr/{}td'.format(*[prefix]*2)).text
                print(error_msg)
                self.get_user()
                passwords = self.get_passwords()
                self.data = self.configure_data(passwords)
                self.log_in()
                session = self.get_session()
                if session is not None:
                    return session
                else:
                    return self.parse_error(self.args.out_file)
        elif fields['name'] == 'frmNextToken':
            # Wait till the next token pops up and then enter it
            # NOTE(review): this path evaluates to '{ns}/input', with a
            # slash between the namespace and the tag -- confirm it matches.
            temp = self.form.find('{}/input'.format(prefix))
            if temp is not None:
                values = dict(temp.items())
                try:
                    if values['name'] == 'key':
                        # NOTE(review): `key` is not read again below.
                        key = values['value']
                    else:
                        print("Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong.")
                        return
                except KeyError:
                    # The input has no 'name' or 'value' attribute.
                    return
            password = getpass("Please enter the next securID token to appear on your fob")
            self.data = self.configure_data({"password":password})
            #self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), self.data)
            self.log_in()
            # NOTE(review): the session is fetched but not returned here, so
            # this branch yields None -- confirm that is intended.
            session = self.get_session()
        else:
            # Unknown case, note where the file is so they can see what happened
            print("An unhandled case has come up. Please view the page at {}".format(self.args.out_file))
    else:
        # Neither form location matched.
        print("Unable to parse the html, please view it at {}".format(self.args.out_file))
def getPluginListHTMLpage():
    """Download the plugin-hierarchy wiki page and return it parsed.

    Reads ``PLUGIN_LIST_URL`` and returns the result of
    ``TidyHTMLTreeBuilder.parse`` for that page, i.e. a parsed document
    rather than raw HTML text.
    """
    page = urllib.urlopen(PLUGIN_LIST_URL)
    try:
        return TidyHTMLTreeBuilder.parse(page)
    finally:
        # Always release the HTTP response, including when parsing raises.
        page.close()