Ejemplo n.º 1
0
def getPluginHTMLpage(element):
    """Fetch and parse the wiki page of the plugin referenced by ``element``.

    The relative link is read from ``element.attrib['link']`` and joined onto
    ``WIKI_URL`` with ``__join``.

    Returns the tree produced by ``TidyHTMLTreeBuilder.parse`` (a parsed
    document, not a raw HTML string).

    Raises KeyError if ``element`` has no ``link`` attribute.
    """
    url = __join(WIKI_URL, element.attrib['link'])
    response = urllib.urlopen(url)
    try:
        return TidyHTMLTreeBuilder.parse(response)
    finally:
        # parse() only reads from the handle it is given, so release the
        # connection here, even when parsing fails.
        response.close()
Ejemplo n.º 2
0
def getPluginHTMLpage(element):
    """Return the parsed wiki page of the plugin referenced by ``element``.

    ``element.attrib['link']`` holds a link relative to ``WIKI_URL``; the two
    are combined with ``__join`` and the resulting URL is downloaded.

    Returns whatever ``TidyHTMLTreeBuilder.parse`` builds from the response,
    i.e. a parsed tree rather than the raw HTML text.

    Raises KeyError if ``element`` carries no ``link`` attribute.
    """
    rel_link = element.attrib['link']
    page_handle = urllib.urlopen(__join(WIKI_URL, rel_link))
    try:
        plugin_page = TidyHTMLTreeBuilder.parse(page_handle)
    finally:
        # Always release the connection; the parser only reads from it.
        page_handle.close()
    return plugin_page
Ejemplo n.º 3
0
 def extract_urls(self, doc, base_url):
     """Parse a document and return the URLs found in it.

     Only the parsing step is visible in this excerpt: ``doc`` is handed to
     ``TidyHTMLTreeBuilder.parse`` and, if that raises, the error is written
     to ``debug`` and an empty set is returned.

     NOTE(review): the code that walks ``tree`` and uses ``base_url`` is not
     part of this excerpt -- presumably it follows; confirm against the full
     source.
     """
     try:
         tree = TidyHTMLTreeBuilder.parse(doc)
     except Exception, e:
         # Best effort: any parse failure is reported on the ``debug`` stream
         # (Python 2 ``print >>`` syntax) and treated as "no URLs".
         print >> debug, "..Error while parsing: %r" % e
         return set()
Ejemplo n.º 4
0
class HtmlDom:
    """DOM plus XPath context built from an HTML file or URL.

    Attributes set by ``__init__``:
      dom     -- document returned by ``PyExpat.Reader().fromString``
      nss     -- namespace map binding the ``html`` prefix to XHTML_NAMESPACE
      context -- ``xml.xpath`` context over ``dom`` that uses ``nss``
    """

    def __init__(self, url):
        """Load ``url``, run it through tidy and build the DOM.

        ``url`` is first opened as a local file; if that raises IOError the
        content is retrieved with ``fetch(url, agent=MOZILLA_AGENT)`` instead.

        Raises IOError('invalid URL') when the fetch fails as well.
        """
        try:
            f = open(url)
            try:
                data = f.read()
            finally:
                # Close the handle even when read() fails.
                f.close()
        except IOError:
            # Not a readable local file: treat the argument as a remote URL.
            try:
                result = fetch(url, agent=MOZILLA_AGENT)
                data = result['data']
            except Exception:
                # Exception (not a bare except) so that KeyboardInterrupt and
                # SystemExit are not turned into an "invalid URL" error.
                raise IOError('invalid URL')

        # create parser
        parser = tidy.TreeBuilder()
        parser.feed(data)
        xmlText = _etree.tostring(parser.close())

        # create the DOM
        reader = PyExpat.Reader()
        self.dom = reader.fromString(xmlText)

        self.nss = {u'html': XHTML_NAMESPACE}
        self.context = xml.xpath.Context.Context(self.dom, processorNss=self.nss)
Ejemplo n.º 5
0
 def fetch(self, type, isbn):
     """Fetch a citation for ``isbn`` from the thatscrazyhot.com ISBN search.

     ``type`` is sent as the ``citetype`` query parameter and ``isbn`` as
     ``search``; both are interpolated into the URL as given.

     Returns ``self.get_text`` of the first XHTML ``div`` whose ``id`` is
     ``leftcolumn``, or the string ``"isbn <isbn> not found"`` when the page
     contains no such element.
     """
     from socket import setdefaulttimeout
     # NOTE: this sets the process-wide default timeout for new sockets.
     setdefaulttimeout(60)
     ns = 'http://www.w3.org/1999/xhtml'
     url = 'http://thatscrazyhot.com/search/searchisbn?citetype=%s&search=%s' % (type, isbn)
     response = urlopen(url)
     try:
         tree = TidyHTMLTreeBuilder.parse(response)
     finally:
         # Release the connection whether or not parsing succeeded.
         response.close()
     for e in tree.findall('.//{%s}div' % ns):
         if e.attrib.get('id') == 'leftcolumn':
             return self.get_text(e)
     return "isbn %s not found" % isbn
Ejemplo n.º 6
0
 def euph(self, irc, msg, args):
     """generate a euphemism
     """
     # Scrapes http://walkingdead.net/perl/euphemism and replies on ``irc``
     # with the text of the first table cell plus the text of an <h2> inside
     # it; replies with an apology when either element is missing.
     # NOTE(review): the docstring above looks like user-facing bot help
     # text, so it is intentionally left as written -- confirm.
     url = "http://walkingdead.net/perl/euphemism"
     ns = 'http://www.w3.org/1999/xhtml'
     response = urlopen(url)
     try:
         tree = TidyHTMLTreeBuilder.parse(response)
     finally:
         # Release the connection whether or not parsing succeeded.
         response.close()
     td = tree.find('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % {'ns':ns})
     # Compare against None explicitly: an Element without children is
     # falsy, so a plain truth test does not mean "not found".
     h2 = None
     if td is not None:
         h2 = td.find('.//{%s}h2' % ns)
     if h2 is not None:
         irc.reply("%s ... %s" % (td.text, h2.text))
     else:
         irc.reply("uhoh, couldn't get a euphemism, sorry dude")
Ejemplo n.º 7
0
 def fetch(self, type, isbn):
     """Look up ``isbn`` on the thatscrazyhot.com ISBN search page.

     The request URL carries ``type`` as ``citetype`` and ``isbn`` as
     ``search``, both inserted into the query string unchanged.

     Returns ``self.get_text`` applied to the first XHTML ``div`` with
     ``id="leftcolumn"``; if the page has none, returns the string
     ``"isbn <isbn> not found"``.
     """
     from socket import setdefaulttimeout
     # NOTE: affects the default timeout of every socket created afterwards.
     setdefaulttimeout(60)
     ns = 'http://www.w3.org/1999/xhtml'
     url = 'http://thatscrazyhot.com/search/searchisbn?citetype=%s&search=%s' % (
         type, isbn)
     page = urlopen(url)
     try:
         tree = TidyHTMLTreeBuilder.parse(page)
     finally:
         # The parser only reads from the handle; close it ourselves.
         page.close()
     for e in tree.findall('.//{%s}div' % ns):
         if e.attrib.get('id') == 'leftcolumn':
             return self.get_text(e)
     return "isbn %s not found" % isbn
Ejemplo n.º 8
0
 def euph(self, irc, msg, args):
     """generate a euphemism
     """
     # Downloads http://walkingdead.net/perl/euphemism, takes the first
     # table/tr/td cell and an <h2> below it, and sends "<cell text> ...
     # <h2 text>" through ``irc.reply``.  If either element is missing an
     # apology is sent instead.
     # NOTE(review): the docstring is presumably shown to users as command
     # help, so its wording is left untouched -- confirm.
     url = "http://walkingdead.net/perl/euphemism"
     ns = 'http://www.w3.org/1999/xhtml'
     page = urlopen(url)
     try:
         tree = TidyHTMLTreeBuilder.parse(page)
     finally:
         # The parser only reads from the handle; close it ourselves.
         page.close()
     td = tree.find('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % {'ns': ns})
     # ``find`` returns None when nothing matches; testing the Element's
     # truth value instead would only tell us whether it has children.
     heading = td.find('.//{%s}h2' % ns) if td is not None else None
     if heading is None:
         irc.reply("uhoh, couldn't get a euphemism, sorry dude")
     else:
         irc.reply("%s ... %s" % (td.text, heading.text))
Ejemplo n.º 9
0
 def __init__(self):
     """Populate the item by scraping the table found at ``self.url``.

     Every XHTML ``table/tr/td`` cell of the page is collected and the
     fields are read from fixed cell positions: ``date`` (tail text of the
     ``br`` in cell 0), ``holdings`` (cell 2, label removed), ``title`` (6),
     ``author`` (8), ``publisher`` (10) and ``contributedby`` (cell 16 when
     there are exactly 17 cells, otherwise cell 18).

     Raises IndexError or AttributeError when the page does not have the
     expected layout.
     """
     ns = { 'ns' : LastItem.ns }
     response = urlopen(self.url)
     try:
         tree = TidyHTMLTreeBuilder.parse(response)
     finally:
         # Release the connection whether or not parsing succeeded.
         response.close()
     root = tree.getroot()
     cells = root.findall('.//{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % ns )
     self.date = cells[0].find('.//{%(ns)s}br' % ns).tail.strip()
     self.holdings = cells[2].text.replace('Total holdings in WorldCat: ','')
     self.title = cells[6].text
     self.author = cells[8].text
     self.publisher = cells[10].text
     if len(cells) == 17:
         self.contributedby = cells[16].text
     else:
         self.contributedby = cells[18].text
Ejemplo n.º 10
0
 def __init__(self):
     """Fill in the item's fields from the page at ``self.url``.

     All XHTML ``table/tr/td`` cells are gathered in document order and the
     attributes are taken from hard-coded positions: ``date`` is the tail
     text after the ``br`` in cell 0, ``holdings`` is cell 2 without its
     "Total holdings in WorldCat: " label, ``title`` is cell 6, ``author``
     cell 8, ``publisher`` cell 10, and ``contributedby`` is cell 16 for a
     17-cell page and cell 18 otherwise.

     Raises IndexError or AttributeError if the page layout differs.
     """
     ns = {'ns': LastItem.ns}
     page = urlopen(self.url)
     try:
         tree = TidyHTMLTreeBuilder.parse(page)
     finally:
         # The parser only reads from the handle; close it ourselves.
         page.close()
     cells = tree.getroot().findall(
         './/{%(ns)s}table/{%(ns)s}tr/{%(ns)s}td' % ns)
     self.date = cells[0].find('.//{%(ns)s}br' % ns).tail.strip()
     self.holdings = cells[2].text.replace('Total holdings in WorldCat: ',
                                           '')
     self.title = cells[6].text
     self.author = cells[8].text
     self.publisher = cells[10].text
     # The contributor cell moves depending on how many cells the page has.
     contributor_index = 16 if len(cells) == 17 else 18
     self.contributedby = cells[contributor_index].text
Ejemplo n.º 11
0
def parseHtml():
    """Scrape key/value pairs from http://www.lottery.co.uk/statistics/.

    The page is parsed with ``TidyHTMLTreeBuilder``, the XHTML namespace
    prefix is stripped from element tags, and the nested tables inside
    ``<div class="main">`` are walked.  A cell containing an ``img`` supplies
    a key (the image's ``alt`` text); the next cell that has a ``bgcolor``
    attribute, no ``width`` attribute and some text supplies the value (the
    cell text without its first two characters).

    Each pair is printed and stored.  Returns a dict mapping ``alt`` text to
    the value string.  A value cell with no pending key is ignored.
    """
    XHTML = "{http://www.w3.org/1999/xhtml}"
    num_freq_dict = {}

    page = urllib2.urlopen('http://www.lottery.co.uk/statistics/')
    try:
        tree = TidyHTMLTreeBuilder.parse(page)
    finally:
        # Release the connection whether or not parsing succeeded.
        page.close()
    docRoot = tree.getroot()

    # Normalise the XHTML to HTML - removes namespace.  getiterator() already
    # visits every element of the tree, so a single pass is enough.
    for elem in docRoot.getiterator():
        if elem.tag.startswith(XHTML):
            elem.tag = elem.tag[len(XHTML):]

    # Pending key: alt text of the most recent image cell not yet paired.
    k = None
    for d in docRoot.getiterator():
        if d.tag != "div" or d.attrib.get("class") != 'main':
            continue
        for e in d:
            if e.tag != 'table' or "style" not in e.attrib:
                continue
            for h in e.findall("./tr/td/table/tr/td"):
                z = h.find('img')
                if z is not None:
                    k = z.attrib['alt']
                    continue
                if "bgcolor" not in h.attrib or "width" in h.attrib:
                    continue
                if h.text is None or k is None:
                    continue
                v = h.text[2:]
                print(str(k) + " " + str(v))
                num_freq_dict[k] = v
                k = None
    # Return only after the whole tree has been scanned (the return used to
    # sit inside the outer loop and fired on the first element).
    return num_freq_dict
Ejemplo n.º 12
0
def parse_xls_file(xls_file):
    """Parse an "xls" file that is really an HTML table.

    The file is parsed with ``TidyHTMLTreeBuilder`` and the first XHTML
    ``body/table`` is read row by row via ``child_texts``.  The first row
    supplies the column names (its first cell is dropped); every later row
    whose length is ``len(col_names) + 1`` contributes its first cell as a
    data-set label and the remaining cells, converted with ``float``, as a
    matrix row.  Rows of any other length are skipped.

    Returns ``(data_sets, col_names, matrix)`` where ``matrix`` is a numpy
    array.

    Raises AttributeError if the document has no ``body/table`` and
    ValueError if a cell cannot be converted to float.
    """
    xhtml = '{http://www.w3.org/1999/xhtml}'
    tree = TidyHTMLTreeBuilder.parse(xls_file)
    root = tree.getroot()
    table = root.find('%sbody/%stable' % (xhtml, xhtml))
    col_names = None
    data_sets = []
    matrix = []
    for row in table.findall(xhtml + 'tr'):
        row_texts = child_texts(row)
        if col_names is None:
            col_names = row_texts[1:]
        elif len(row_texts) == len(col_names) + 1:
            data_sets.append(row_texts[0])
            # Build a real list: on Python 3 ``map`` is lazy and np.array
            # would wrap the iterator object instead of the numbers.
            matrix.append([float(text) for text in row_texts[1:]])
    matrix = np.array(matrix)
    return data_sets, col_names, matrix
Ejemplo n.º 13
0
    def inst(self, irc, msg, args):
        """<institution>

        Look up a oclc participating institution by code
        """
        # Queries the OCLC participating-institution page for ``args[0]`` and
        # replies on ``irc`` with the ninth XHTML <font> element, serialised
        # and stripped of markup.  Any failure while fetching or parsing
        # yields a "sorry no hits" reply instead.
        # NOTE(review): the docstring above looks like user-facing bot help
        # text, so it is intentionally left as written -- confirm.
        from elementtree.ElementTree import tostring
        from socket import setdefaulttimeout
        from re import sub
        # NOTE: this sets the process-wide default timeout for new sockets.
        setdefaulttimeout(60)
        ns = 'http://www.w3.org/1999/xhtml'
        inst = args[0]
        self.log.info("looking up oclc institution %s" % inst)
        url = "http://www.oclc.org/common/cgi-oclc/pi.pl?max=1&sym=%s" % inst
        response = ""
        try:
            page = urlopen(url)
            try:
                tree = TidyHTMLTreeBuilder.parse(page)
            finally:
                # Release the connection whether or not parsing succeeded.
                page.close()
            root = tree.getroot()
            info = tostring(root.findall('.//{%s}font' % ns)[8])
            response = sub("<.*?>", '', info.replace("\n", ' '))
        except Exception:
            # Exception (not a bare except) so KeyboardInterrupt/SystemExit
            # still propagate.
            response = "sorry no hits for %s" % inst
        irc.reply(response)
Ejemplo n.º 14
0
    def inst(self, irc, msg, args):
        """<institution>

        Look up a oclc participating institution by code
        """
        # Looks up the institution code in ``args[0]`` on the OCLC
        # participating-institution page.  The ninth XHTML <font> element of
        # the result is serialised, newlines become spaces, tags are removed
        # and the remaining text is sent through ``irc.reply``.  If anything
        # goes wrong while fetching or parsing, a "sorry no hits" message is
        # sent instead.
        # NOTE(review): the docstring is presumably shown to users as command
        # help, so its wording is left untouched -- confirm.
        from elementtree.ElementTree import tostring
        from socket import setdefaulttimeout
        from re import sub
        # NOTE: affects the default timeout of every socket created later.
        setdefaulttimeout(60)
        ns = 'http://www.w3.org/1999/xhtml'
        inst = args[0]
        self.log.info("looking up oclc institution %s" % inst)
        url = "http://www.oclc.org/common/cgi-oclc/pi.pl?max=1&sym=%s" % inst
        try:
            handle = urlopen(url)
            try:
                tree = TidyHTMLTreeBuilder.parse(handle)
            finally:
                # The parser only reads from the handle; close it ourselves.
                handle.close()
            fonts = tree.getroot().findall('.//{%s}font' % ns)
            info = tostring(fonts[8])
            response = sub("<.*?>", '', info.replace("\n", ' '))
        except Exception:
            # Narrower than a bare except: interpreter-exit signals are not
            # swallowed any more.
            response = "sorry no hits for %s" % inst
        irc.reply(response)
Ejemplo n.º 15
0
  def parse_error(self, html):
    """Inspect a returned HTML page and drive the next login step.

    ``html`` is parsed with ``TidyHTMLTreeBuilder``; the form directly under
    ``body`` (or under ``body/blockquote``) is stored in ``self.form`` and
    handled according to its ``name`` attribute:

    * ``frmConfirmation`` -- sessions are already open: they are displayed,
      ``close_sessions`` is called according to the label of the
      ``btnContinue`` button, and a session is requested again.
    * ``frmLogin`` -- if the form has an input named ``key`` the post-auth
      "Proceed" form is submitted; otherwise an error is printed, the user
      is asked for credentials again and a new login is attempted.
    * ``frmNextToken`` -- the user is asked for the next token and a new
      login is attempted.

    Returns the session from ``self.get_session()`` when there is one,
    otherwise the result of re-parsing ``self.args.out_file``.  Returns None
    for the ``frmNextToken`` branch, for unknown forms and for pages without
    a form.
    """
    tree = TidyHTMLTreeBuilder.parse(html)
    prefix = '{http://www.w3.org/1999/xhtml}'
    root = tree.getroot()

    self.form = root.find('{}body/{}form'.format(prefix, prefix))
    # Some of the returned html has a blockquote before the form, some don't
    if self.form is None:
      self.form = root.find('{}body/{}blockquote/{}form'.format(*[prefix]*3))
    if self.form is None:
      print("Unable to parse the html, please view it at {}".format(self.args.out_file))
      return

    # A form without a name attribute ends up in the "unhandled" branch
    # below instead of raising KeyError.
    form_name = self.form.get('name')
    if form_name == 'frmConfirmation':
      # Existing sessions open
      print("There are existing sessions open")
      submit = find_by_name(self.form, 'btnContinue')
      values = dict(submit.items())
      table = find_sessions(self.form)
      display_session(table)
      if values["value"] == "Close Selected Sessions and Log in":
        self.close_sessions(table, True)
      elif values["value"] == "Log in (and optionally Close Selected Sessions)":
        self.close_sessions(table)
      session = self.get_session()
      if session is not None:
        return session
      return self.parse_error(self.args.out_file)
    elif form_name == 'frmLogin':
      # This may be the post-login form
      # check whether there's a hidden field with a key
      key = None
      isSecondary = False
      for t in self.form.getiterator('{}input'.format(prefix)):
        if t.get('name') == 'key':
          key = t.get('value')
          isSecondary = True
          break

      if isSecondary:
        dat = {"key": key, "sn-postauth-proceed": "Proceed"}
        self.opener.open("https://{}{}".format(self.args.hostname, self.args.login_path), urllib.urlencode(dat))
        session = self.get_session()
        if session is not None:
          return session
        return self.parse_error(self.args.out_file)
      else:
        error_msg = "Undetermined error. Possible invalid user/pass, please try again (HTML of last page is written to {}.".format(self.args.out_file)
        # Prefer the message shown on the login page when it is present.
        error_table = find_by_id(self.form, 'table_LoginPage_5')
        if error_table is not None:
          error_msg = error_table.find('{}tr/{}td'.format(*[prefix]*2)).text
        print(error_msg)
        self.get_user()
        passwords = self.get_passwords()
        self.data = self.configure_data(passwords)
        self.log_in()
        session = self.get_session()
        if session is not None:
          return session
        return self.parse_error(self.args.out_file)
    elif form_name == 'frmNextToken':
      # Wait till the next token pops up and then enter it
      # NOTE(review): '{}/input' puts a '/' right after the namespace, while
      # the other lookups in this method use '{}input'.  Left unchanged --
      # confirm which path is intended.
      temp = self.form.find('{}/input'.format(prefix))
      if temp is not None:
        values = dict(temp.items())
        try:
          if values['name'] == 'key':
            # Only read to verify that the input carries a value; a missing
            # attribute raises KeyError and aborts below.
            key = values['value']
          else:
            print("Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong.")
            return
        except KeyError:
          return
        password = getpass("Please enter the next securID token to appear on your fob")
        self.data = self.configure_data({"password":password})
        self.log_in()
        # NOTE(review): unlike the other branches the session is not
        # returned here -- confirm whether that is intended.
        session = self.get_session()
    else:
      # Unknown case, note where the file is so they can see what happened
      print("An unhandled case has come up.  Please view the page at {}".format(self.args.out_file))
Ejemplo n.º 16
0
def getPluginListHTMLpage():
    """Fetch and parse the wiki page that collects the plugin hierarchy.

    Returns the tree produced by ``TidyHTMLTreeBuilder.parse`` for the page
    at ``PLUGIN_LIST_URL`` (a parsed document, not a raw HTML string).
    """
    response = urllib.urlopen(PLUGIN_LIST_URL)
    try:
        return TidyHTMLTreeBuilder.parse(response)
    finally:
        # parse() only reads from the handle it is given, so release the
        # connection here, even when parsing fails.
        response.close()
Ejemplo n.º 17
0
    def parse_error(self, html):
        """Work out which page the server returned and react to it.

        ``html`` is parsed with ``TidyHTMLTreeBuilder``.  The form found at
        ``body/form`` (or ``body/blockquote/form``) is kept in ``self.form``
        and its ``name`` attribute selects the action:

        * ``frmConfirmation`` -- existing sessions are displayed,
          ``close_sessions`` is called according to the label of the
          ``btnContinue`` button, and a session is requested again.
        * ``frmLogin`` -- when an input named ``key`` is present the
          post-auth "Proceed" form is submitted; otherwise the user is told
          the credentials were invalid, asked again, and a new login is
          attempted.
        * ``frmNextToken`` -- the user is asked for the next token and a new
          login is attempted.

        Returns the session from ``self.get_session()`` if there is one,
        otherwise the result of re-parsing ``self.args.out_file``.  Returns
        None for the ``frmNextToken`` branch, for unknown forms and when no
        form is found.
        """
        tree = TidyHTMLTreeBuilder.parse(html)
        prefix = '{http://www.w3.org/1999/xhtml}'
        root = tree.getroot()

        self.form = root.find('{}body/{}form'.format(prefix, prefix))
        # Some of the returned html has a blockquote before the form, some don't
        if self.form is None:
            self.form = root.find(
                '{}body/{}blockquote/{}form'.format(*[prefix] * 3))
        if self.form is None:
            print("Unable to parse the html, please view it at {}".format(
                self.args.out_file))
            return

        # A form without a name attribute ends up in the "unhandled" branch
        # below instead of raising KeyError.
        form_name = self.form.get('name')
        if form_name == 'frmConfirmation':
            # Existing sessions open
            print("There are existing sessions open")
            submit = find_by_name(self.form, 'btnContinue')
            values = dict(submit.items())
            table = find_sessions(self.form)
            display_session(table)
            if values["value"] == "Close Selected Sessions and Log in":
                self.close_sessions(table, True)
            elif values[
                    "value"] == "Log in (and optionally Close Selected Sessions)":
                self.close_sessions(table)
            session = self.get_session()
            if session is not None:
                return session
            return self.parse_error(self.args.out_file)
        elif form_name == 'frmLogin':
            # Function-call form so this line matches the other print()
            # calls in the method; the output is unchanged.
            print("Found login form, looking post-auth message")
            # This may be the post-login form
            # check whether there's a hidden field with a key
            key = None
            isSecondary = False
            for t in self.form.getiterator('{}input'.format(prefix)):
                if t.get('name') == 'key':
                    key = t.get('value')
                    isSecondary = True
                    break

            if isSecondary:
                dat = {"key": key, "sn-postauth-proceed": "Proceed"}
                self.opener.open(
                    "https://{}{}".format(self.args.hostname,
                                          self.args.login_path),
                    urllib.urlencode(dat))
                session = self.get_session()
                if session is not None:
                    return session
                return self.parse_error(self.args.out_file)
            else:
                # Invalid user/pass try again
                print("Invalid user/pass, please try again")
                self.get_user()
                passwords = self.get_passwords()
                self.data = self.configure_data(passwords)
                self.log_in()
                session = self.get_session()
                if session is not None:
                    return session
                return self.parse_error(self.args.out_file)
        elif form_name == 'frmNextToken':
            # Wait till the next token pops up and then enter it
            # NOTE(review): '{}/input' puts a '/' right after the namespace,
            # while the other lookups in this method use '{}input'.  Left
            # unchanged -- confirm which path is intended.
            temp = self.form.find('{}/input'.format(prefix))
            if temp is not None:
                values = dict(temp.items())
                try:
                    if values['name'] == 'key':
                        # Only read to verify that the input carries a
                        # value; a missing attribute raises KeyError and
                        # aborts below.
                        key = values['value']
                    else:
                        print(
                            "Unable to find the key for the next token form, either we detected the form incorrectly or somthing went wrong."
                        )
                        return
                except KeyError:
                    return
                password = getpass(
                    "Please enter the next securID token to appear on your fob"
                )
                self.data = self.configure_data({"password": password})
                self.log_in()
                # NOTE(review): unlike the other branches the session is not
                # returned here -- confirm whether that is intended.
                session = self.get_session()
        else:
            # Unknown case, note where the file is so they can see what happened
            print(
                "An unhandled case has come up.  Please view the page at {}"
                .format(self.args.out_file))
0
                         quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # Header row for the output written through ``dataout``.
    dataout.writerow(['season', 'week', 'away', 'home', 'awayscore', 'homescore'])

    # From here on the week entries are checked and opened relative to sdir.
    chdir(sdir)

    # Work on a copy so that extending the list does not modify ``regs``.
    weeks = regs[:]
    # NOTE(review): ``ps`` is presumably the list of post-season weeks, which
    # is left out for 2009 -- confirm against the rest of the file.
    if y != 2009:
        weeks.extend(ps)

    for i in range(len(weeks)):
        w = weeks[i]
        # Each entry is a path to a saved page; missing ones are logged and
        # skipped.
        if not exists(w):
            logging.error("No data for %d %s" % (y, w))
            continue

        tree  = TidyHTMLTreeBuilder.parse(w)
        # ``div``, ``ul`` and ``li`` are names defined outside this excerpt
        # (presumably namespace-qualified tag names -- confirm).
        for elt in tree.getiterator():
            if elt.tag == div and elt.get('class') == 'col scorebox-container':
                game = dict()
                uls = elt.findall('.//'+ul)
                for u in uls:
                    if u.get('class').endswith('-team'):
                        for l in u.getiterator():
                            if l.tag == li and l.get('class') == 'team-logo':
                                # Key: first four characters of the ul's
                                # class; value: id of the team-logo element.
                                game[u.get('class')[0:4]] = l.get('id')
                                break
                divs = elt.findall('.//'+div)
                for d in divs:
                    if d.get('class') is not None and d.get('class').endswith('-score'):
                        for s in d.getiterator():
                            if s.tag == div and s.get('class') == 'the-score':
Ejemplo n.º 19
0
def getPluginListHTMLpage():
    """Return the parsed wiki page that collects the plugin hierarchy.

    The page at ``PLUGIN_LIST_URL`` is downloaded and handed to
    ``TidyHTMLTreeBuilder.parse``; the resulting tree (not the raw HTML
    text) is returned.
    """
    page_handle = urllib.urlopen(PLUGIN_LIST_URL)
    try:
        plugin_list_page_tree = TidyHTMLTreeBuilder.parse(page_handle)
    finally:
        # Always release the connection; the parser only reads from it.
        page_handle.close()
    return plugin_list_page_tree