import requests

from HTMLTableParser import HTMLTableParser


def get_courses():
    try:
        # Fetch the page (certificate verification is intentionally disabled);
        # `url` and `headers` are expected to be defined at module level.
        response = requests.get(url, headers=headers, verify=False)

        # Feed the raw HTML into the table parser.
        parser = HTMLTableParser()
        parser.feed(response.text)

        # Student info and the term summary live in the first and third tables.
        info = parser.tables[0][0][1]
        summary = parser.tables[2][2]

        # Course rows start at index 5 of the third table.
        courses = []
        for i in range(5, len(parser.tables[2])):
            course = parser.tables[2][i]
            if len(course[2]) > 0:  # Non TA!
                courses.append({
                    'title': course[1],
                    'title2': course[2],
                    'code': course[3],
                    'v': course[4],
                    'grp': course[5],
                    'score': course[6],
                    'prof': course[8],
                })

        return {
            'info': info,
            'summary': summary,
            'courses': courses,
        }
    except Exception:
        return None
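
# A minimal usage sketch for get_courses() above, assuming the module-level
# `url` and `headers` used inside the function have already been set to the
# page being scraped (their actual values are not part of this snippet).
if __name__ == '__main__':
    from pprint import pprint

    result = get_courses()
    if result is None:
        print('Failed to fetch or parse the course table')
    else:
        pprint(result['info'])
        pprint(result['summary'])
        for course in result['courses']:
            print(course['code'], course['title'], course['score'])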
import pandas as pd


def crawl(fileName):
    # getInputStockCode() and HTMLTableParser come from the surrounding project;
    # parse_url(code, page) returns one page of price history as a DataFrame.
    stockCodes = getInputStockCode(fileName)
    hp = HTMLTableParser()

    for code in stockCodes:
        finalDF = pd.DataFrame()

        # Fetch up to 50 pages per stock code, stopping at the first empty page.
        for i in range(1, 51):
            tableDF = hp.parse_url(code, i)
            if tableDF.empty:
                break
            if finalDF.empty:
                finalDF = tableDF
            else:
                # DataFrame.append() is deprecated; pd.concat() is the
                # supported equivalent.
                finalDF = pd.concat([finalDF, tableDF])

        if not finalDF.empty:
            finalDF.sort_values(by=['date'], inplace=True, ascending=True)
            finalDF.to_csv("./results/" + ''.join(code) + ".csv", index=False)
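
# crawl() relies on a getInputStockCode() helper that is not shown in this
# snippet. A hypothetical sketch is given below, assuming the input file simply
# lists one stock code per line; the real helper may read a different format.
def getInputStockCode(fileName):
    with open(fileName) as f:
        # Strip whitespace and skip blank lines.
        return [line.strip() for line in f if line.strip()]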
#!/usr/bin/env python

from pprint import pprint

from HTMLTableParser import HTMLTableParser

# Create the parser
p = HTMLTableParser()

try:
    # Create some html data to feed in the parser
    myData = """
    <html>
    <body>
        <table id="pricingTable">
            <thead>
                <tr>
                    <th class="rowHeader">
                        Server Sizes:
                    </th>
                    <th>
                        Linux®<span style="font-size:70%; vertical-align: top;">***</span>
                        <div class="subtitle">Hourly (Estimated Monthly)</div>
                    </th>
                    <th>
                        Windows®
                        <div class="subtitle">Hourly (Estimated Monthly)</div>
                    </th>
                </tr>
            </thead>
            <tbody>
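
# The snippet above is cut off inside the <tbody>, so a separate, self-contained
# sketch of the same feed-and-inspect flow is shown here with a made-up two-row
# table; it only assumes the feed()/tables interface used throughout these
# examples, not the real pricing data.
from pprint import pprint

from HTMLTableParser import HTMLTableParser

p2 = HTMLTableParser()
p2.feed("""
<table>
  <tr><th>Name</th><th>Price</th></tr>
  <tr><td>Small</td><td>$0.06/hr</td></tr>
  <tr><td>Large</td><td>$0.24/hr</td></tr>
</table>
""")

# tables is nested as tables[table_index][row_index][cell_index], as the other
# snippets' indexing suggests.
pprint(p2.tables)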
#!/usr/bin/env python

import urllib
from pprint import pprint

from HTMLTableParser import HTMLTableParser

# Create the parser
p = HTMLTableParser()

try:
    # Get tables from this webpage
    url = "http://www.franjeado.com/stats.php"
    req = urllib.urlopen(url)

    # Parse the data
    p.feed(req.read())
except Exception, e:
    print e

# Show results
pprint(p.tables)
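
# The example above is Python 2 (urllib.urlopen() and the old "except X, e"
# syntax). A rough Python 3 equivalent is sketched below, assuming
# HTMLTableParser exposes the same feed()/tables interface; the response bytes
# are decoded before being handed to the parser (UTF-8 is assumed here).
from pprint import pprint
from urllib.request import urlopen

from HTMLTableParser import HTMLTableParser

p3 = HTMLTableParser()

try:
    url = "http://www.franjeado.com/stats.php"
    with urlopen(url) as req:
        p3.feed(req.read().decode("utf-8", errors="replace"))
except Exception as e:
    print(e)

# Show results
pprint(p3.tables)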