def get_courses():
    """Fetch the course page and return selected cells of its HTML tables.

    Sends a GET request to the module-level ``url`` using ``headers``, feeds
    the response text to an ``HTMLTableParser`` and reads fixed positions out
    of the parsed tables.

    Returns:
        A dict with three keys, or ``None`` if the request or any step of
        the parsing raises:

        * ``'info'``    -- the cell at ``tables[0][0][1]``.
        * ``'summary'`` -- the row at ``tables[2][2]``.
        * ``'courses'`` -- a list of dicts (keys ``title``, ``title2``,
          ``code``, ``v``, ``grp``, ``score``, ``prof``), one for every row
          of ``tables[2]`` from index 5 onwards whose column 2 is non-empty.
    """
    try:
        # NOTE(review): verify=False turns off TLS certificate verification
        # and no timeout is passed, so a stalled server blocks indefinitely.
        # Both are left as-is to preserve behaviour -- confirm they are
        # intentional.
        response = requests.get(url, headers=headers, verify=False)
        parser = HTMLTableParser()
        parser.feed(response.text)

        rows = parser.tables[2]
        courses = [
            {
                'title': course[1],
                'title2': course[2],
                'code': course[3],
                'v': course[4],
                'grp': course[5],
                'score': course[6],
                'prof': course[8],
            }
            # Rows 0-4 are skipped; row 2 is returned separately as the
            # summary (presumably the rest are header rows -- TODO confirm).
            for course in rows[5:]
            if course[2]  # Non TA!
        ]
        return {
            'info': parser.tables[0][0][1],
            'summary': rows[2],
            'courses': courses,
        }
    except Exception:
        # Best-effort contract: any network or parsing failure (including an
        # unexpected table layout raising IndexError) yields None. Unlike a
        # bare ``except:``, this lets KeyboardInterrupt/SystemExit propagate.
        return None
<tr> <td>Business Cloud</td> <td>$999.00</td> <td>14,500</td> <td>$0.07</td> <td>$0.08</td> <td>$25.55/mo</td> </tr> <tr> <td colspan="3"> </td> <td colspan="3" style="text-align: center; padding-top:10px;"> <a href="www.python.org">Go to Python</a> </td> </tr> </tbody> </table> </body> </html> """ # Parse the data p.feed(myData) except Exception, e: print e # Show results pprint(p.tables)
#!/usr/bin/env python
"""Example script: download a web page and pretty-print its HTML tables."""
import urllib
from contextlib import closing
from pprint import pprint

from HTMLTableParser import HTMLTableParser

# Create the parser
p = HTMLTableParser()

try:
    # Get tables from this webpage
    url = "http://www.franjeado.com/stats.php"
    # urllib.urlopen is the Python 2 API; its result is not a context
    # manager there, so closing() is used to release the connection even
    # when reading or parsing fails.
    with closing(urllib.urlopen(url)) as req:
        # Parse the data
        p.feed(req.read())
except Exception as e:
    # Best-effort: report the failure and still show whatever was parsed.
    print(e)

# Show results
pprint(p.tables)