コード例 #1
0
def cwb_crawler():
    """Download recent CWB earthquake pages into the local SQLite database.

    For every date produced by ``pandas.date_range`` (frequency ``'M'``)
    between 90 days ago and 31 days from now, the monthly listing page is
    fetched and its ``//tr/td/a/@href`` links are scanned. Each distinct
    link whose characters 1-6 spell ``ItemId`` is downloaded, and one row
    ``(id, lzma_html, download_time)`` per link is appended to the table
    ``data_crawler_cwb_sensible_earthquake_download``.

    After all months are processed, rows that repeat an existing
    ``(id, lzma_html)`` pair are deleted (the row with the highest rowid is
    kept) and the transaction is committed.

    The function relies on the names ``cdb``, ``pandas``, ``datetime``,
    ``time`` and ``LH`` being available at module level.
    NOTE(review): ``LH`` is presumably ``lxml.html`` -- confirm against the
    file's imports.

    Returns:
        None
    """
    import Lily.crawler.url_string as curlstr

    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')
    db_target_tab = 'data_crawler_cwb_sensible_earthquake_download'
    columns = ['id', 'lzma_html', 'download_time']

    eq_dir_pa = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx?ItemId=20&Date={0}'''
    sub_url = '''https://scweb.cwb.gov.tw/earthquake/Page.aspx{0}'''

    # Take "now" once so both ends of the window share the same reference.
    today = datetime.datetime.today()
    days = pandas.date_range(
        today - datetime.timedelta(days=90),
        today + datetime.timedelta(days=31),
        freq='M')

    for mon in days:
        # Short pause (90 ms) before each monthly listing request.
        time.sleep(90 / 1000.0)

        day_url = eq_dir_pa.format(mon.strftime('%Y%m'))
        listing = LH.fromstring(curlstr.url_string(day_url).to_str())
        hrefs = listing.xpath('//tr/td/a/@href')

        seen = set()   # hrefs already downloaded for this month
        rows = []      # one [id, lzma_html, download_time] entry per page

        for elem in hrefs:
            # Skip repeated links and links that are not '?ItemId...' style.
            if elem in seen or not elem.startswith('ItemId', 1):
                continue

            print('download html', elem)
            page = curlstr.url_string(sub_url.format(elem))
            seen.add(elem)
            rows.append([
                elem[22:],  # id: everything after the first 22 characters
                page.to_lzma_xz(),
                datetime.datetime.now(),
            ])

        # Build the frame once per month instead of enlarging it row by row.
        # An empty month still yields an empty frame with the same columns.
        eq_list = pandas.DataFrame(rows, columns=columns)
        eq_list.to_sql(db_target_tab,
                       db.connect,
                       if_exists='append',
                       index=False)

    # Drop duplicates, keeping the newest row of each (id, lzma_html) pair.
    db.connect.execute('''delete from {0} where rowid not in 
                         (select max (rowid) from {0} group by id, lzma_html)'''
                       .format(db_target_tab))

    db.connect.commit()
    return
コード例 #2
0
import Lily.ctao.database as cdb
import Lily.crawler.url_string as url


# Pairs of (remote sheet URL template, local output path template).
# The percent-encoded path segments decode to "交通流量" and "107年度".
# The first pair covers sheets S001..S009, the second S010..S090.
_SHEET_TEMPLATES = (
    (
        '''http://163.29.251.188/botedata/%E4%BA%A4%E9%80%9A%E6%B5%81%E9%87%8F/107%E5%B9%B4%E5%BA%A6/HTM/S00{0}.files/sheet001.htm''',
        '''D:/107_s00{0}_0001.html''',
    ),
    (
        '''http://163.29.251.188/botedata/%E4%BA%A4%E9%80%9A%E6%B5%81%E9%87%8F/107%E5%B9%B4%E5%BA%A6/HTM/S0{0}0.files/sheet001.htm''',
        '''D:/107_s0{0}0_0001.html''',
    ),
)

for url_template, path_template in _SHEET_TEMPLATES:
    for number in range(1, 10):
        source_url = url_template.format(number)
        local_path = path_template.format(number)
        # url_string is a project helper; presumably it fetches the URL and
        # to_file() writes the content to the given path -- verify in Lily.
        url.url_string(source_url).to_file(local_path)
        print(source_url, local_path)