def cwb_crawler(days_back=90, days_forward=31):
    """Download CWB sensible-earthquake event pages and archive them in SQLite.

    Crawls the Central Weather Bureau (CWB) monthly earthquake listing pages
    for the window [today - days_back, today + days_forward], follows every
    per-event link found in each listing, stores each event page
    LZMA-compressed in the ``data_crawler_cwb_sensible_earthquake_download``
    table, and finally removes duplicate rows on (id, lzma_html).

    Args:
        days_back: days before today at which the crawl window starts
            (default 90, the original hard-coded value).
        days_forward: days after today at which the crawl window ends
            (default 31, the original hard-coded value).

    Returns:
        None. Side effect only: rows appended to
        ``data_crawler_cwb_earthquake_list.sqlite``.
    """
    import Lily.crawler.url_string as curlstr

    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')
    db_target_tab = 'data_crawler_cwb_sensible_earthquake_download'
    eq_dir_pa = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx?ItemId=20&Date={0}'''
    sub_url = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx{0}'''

    # One timestamp per month; freq='M' yields month-end dates, and
    # strftime('%Y%m') below turns each into the listing-page Date argument.
    days = pandas.date_range(
        datetime.datetime.today() - datetime.timedelta(days=days_back),
        datetime.datetime.today() + datetime.timedelta(days=days_forward),
        freq='M')

    for mon in days:
        eq_list = pandas.DataFrame(
            columns=['id', 'lzma_html', 'download_time'])
        time.sleep(90 / 1000.0)  # throttle: ~90 ms pause before each listing request
        day_url = eq_dir_pa.format(mon.strftime('%Y%m'))
        listing = curlstr.url_string(day_url)
        doc = LH.fromstring(listing.to_str())
        hrefs = doc.xpath('//tr/td/a/@href')
        seen = set()  # skip links listed more than once on the same page
        for elem in hrefs:
            # Event links look like '?ItemId=...'; elem[1:7] checks the
            # query-string key while skipping the leading '?'.
            if elem not in seen and elem[1:7] == 'ItemId':
                print('download html', elem)
                page = curlstr.url_string(sub_url.format(elem))
                seen.add(elem)
                eq_list.loc[len(eq_list)] = [
                    elem[22:],            # event-id portion of the query string
                    page.to_lzma_xz(),    # compressed page payload
                    datetime.datetime.now()]
        # NOTE(review): the original collapsed source is ambiguous about
        # whether to_sql ran per month or once at the end; appending per
        # month is assumed here — confirm against the original layout.
        eq_list.to_sql(db_target_tab, db.connect,
                       if_exists='append', index=False)

    # Keep only the newest row for each (id, lzma_html) pair.
    db.connect.execute(
        '''delete from {0} where rowid not in
           (select max (rowid) from {0} group by id, lzma_html)'''
        .format(db_target_tab))
    db.connect.commit()
import Lily.ctao.database as cdb
import Lily.crawler.url_string as url


def _download_sheet(target_tpl, path_tpl, n):
    """Fetch one year-107 traffic-volume HTML sheet and save it locally.

    Formats both templates with *n*, downloads the sheet, writes it to
    the local path, and echoes the (url, path) pair — exactly the body
    the two original copy-paste loops shared.
    """
    target = target_tpl.format(n)
    out_path = path_tpl.format(n)  # renamed from 'file': don't shadow the builtin
    page = url.url_string(target)
    page.to_file(out_path)
    print(target, out_path)


# Stations S001..S009 (percent-encoded path segments are Chinese labels
# for "traffic volume" / "year 107"; preserved byte-for-byte).
for a in range(1, 10):
    _download_sheet(
        'http://163.29.251.188/botedata/%E4%BA%A4%E9%80%9A%E6%B5%81%E9%87%8F/'
        '107%E5%B9%B4%E5%BA%A6/HTM/S00{0}.files/sheet001.htm',
        'D:/107_s00{0}_0001.html',
        a)

# Stations S010..S090 (digit in the middle position).
for a in range(1, 10):
    _download_sheet(
        'http://163.29.251.188/botedata/%E4%BA%A4%E9%80%9A%E6%B5%81%E9%87%8F/'
        '107%E5%B9%B4%E5%BA%A6/HTM/S0{0}0.files/sheet001.htm',
        'D:/107_s0{0}0_0001.html',
        a)