def get_courses():
    """Fetch and parse the course listing page.

    Reads the module-level ``url`` / ``headers`` globals, downloads the
    page, and extracts data from the parsed HTML tables.

    Returns:
        dict with keys ``'info'`` (header cell), ``'summary'`` (summary
        row), and ``'courses'`` (list of per-course dicts), or ``None``
        if the request or the table parsing fails for any reason.
    """
    try:
        # SECURITY NOTE(review): verify=False disables TLS certificate
        # validation. Kept to preserve existing behavior (the target
        # server presumably has a bad cert — confirm), but this should
        # be revisited.
        response = requests.get(url, headers=headers, verify=False)
        parser = HTMLTableParser()
        parser.feed(response.text)

        # Layout assumed by the original code: table 0 holds the student
        # info header, table 2 holds the summary row (index 2) and the
        # course rows starting at index 5 — TODO confirm against a live page.
        info = parser.tables[0][0][1]
        summary = parser.tables[2][2]

        courses = []
        for course in parser.tables[2][5:]:
            if not course[2]:
                # Empty second-title cell marks a TA row — skip it
                # (original comment: "Non TA!").
                continue
            courses.append({
                'title': course[1],
                'title2': course[2],
                'code': course[3],
                'v': course[4],
                'grp': course[5],
                'score': course[6],
                'prof': course[8],
            })

        return {
            'info': info,
            'summary': summary,
            'courses': courses,
        }
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt
        # and SystemExit. `except Exception` keeps the documented
        # "return None on any failure" contract (network errors, missing
        # tables, short rows) without trapping interpreter exits.
        return None
def crawl(fileName):
    """Crawl paginated stock tables and write one CSV per stock code.

    For each code returned by ``getInputStockCode(fileName)``, fetches
    result pages 1..50 via ``HTMLTableParser.parse_url`` until a page
    comes back empty, then concatenates the pages, sorts by the ``date``
    column ascending, and writes ``./results/<code>.csv``.

    Args:
        fileName: path passed through to ``getInputStockCode``.
    """
    stockCodes = getInputStockCode(fileName)
    hp = HTMLTableParser()
    for code in stockCodes:
        # Collect per-page frames and concatenate once at the end.
        # The original called DataFrame.append per page, which was
        # deprecated in pandas 1.4 and REMOVED in pandas 2.0, and was
        # also O(n^2) in total rows copied.
        frames = []
        for page in range(1, 51):  # hard cap of 50 pages per code
            tableDF = hp.parse_url(code, page)
            if tableDF.empty:
                break  # an empty page means no more data
            frames.append(tableDF)
        if frames:
            finalDF = pd.concat(frames, ignore_index=True)
            finalDF.sort_values(by=['date'], inplace=True, ascending=True)
            # ''.join(code) kept as-is: code is presumably an iterable
            # of characters — confirm against getInputStockCode.
            finalDF.to_csv("./results/" + ''.join(code) + ".csv", index=False)