def __init__(self, sub_group='M03A'):
    import os
    import Lily.ctao.hostmetadata as chmd
    import Lily.ctao.database as cdb

    self.sub_group = sub_group
    self.hostmetadata = chmd.hostmetadata()
    self.database = cdb.database(self.hostmetadata.database)
    self.sub_warehouse = '{0}/crawler_ETC_{1}'.format(self.hostmetadata.warehouse, self.sub_group)
    self.excel_filename = '{0}/data_crawler_ETC_{1}_list.xlsx'.format(self.hostmetadata.warehouse, self.sub_group)
    self.sqlite_tablename = 'data_crawler_ETC_{0}_list'.format(self.sub_group)
    self.sqlite_tablepull = 'data_crawler_ETC_{0}_pull'.format(self.sub_group)

    # create the per-group warehouse directory if it does not exist
    if not os.path.exists(self.sub_warehouse):
        os.mkdir(self.sub_warehouse)

    # date regular expression, YYYYMMDD
    date_YYYYMMDD_pattern = r'([12]\d{3}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))'

    self.url = 'http://tisvcloud.freeway.gov.tw/history/TDCS/{0}/'.format(self.sub_group)
    self.cloud_archive_pattern = r'href="({0}_{1}\.tar\.gz)"'.format(self.sub_group, date_YYYYMMDD_pattern)
    self.local_archive_pattern = r'({0}_{1}\.tar\.gz)'.format(self.sub_group, date_YYYYMMDD_pattern)
    self.check_archive_list()
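# Illustration of the archive-name patterns built above (a standalone sketch, not part
# of the class): for the default sub_group 'M03A', local_archive_pattern matches names
# such as 'M03A_20180214.tar.gz', and cloud_archive_pattern matches the same name inside
# an href attribute of the TDCS index page.
#
#     import re
#     pat = r'(M03A_([12]\d{3}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))\.tar\.gz)'
#     re.findall(pat, 'href="M03A_20180214.tar.gz"')
#     # -> [('M03A_20180214.tar.gz', '20180214', '02', '14')]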
def cwb_melt2():
    db = cdb.database('data_crawler.sqlite')
    df = db.to_dataframe('data_rdset_pylily_cwb_sensible_earthquake')
    station = []
    for ind, row in df.iterrows():
        # each record lists its reporting stations separated by ';',
        # with station name and local intensity separated by an ideographic space (U+3000)
        for st in df.at[ind, 'Stations'].split(';'):
            if u'地區最大震度' not in st and st != '':
                rdset = [df.at[ind, 'id'],
                         df.at[ind, 'time'],
                         float(df.at[ind, 'px'][4:-2]),
                         float(df.at[ind, 'py'][4:-2]),
                         float(df.at[ind, 'depth'][:-3]),
                         float(df.at[ind, 'ML']),
                         df.at[ind, 'Location'],
                         ''.join(st.split('\u3000')[:-1]),
                         float(st.split('\u3000')[-1:][0])]
                station.append(rdset)
    df2 = pandas.DataFrame(station, columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
    df2.to_sql('data_rdset_pylily_cwb_sensible_earthquake_LocalSeismicIntensity',
               db.connect, if_exists='replace', index=False)
    return
def check_docx(docx_file_name):
    from docx import Document
    from Lily.ctao.database import database
    from Lily.ctao.nsgstring import alnum
    from Lily.ctao.hostmetadata import hostmetadata
    from Lily.blacksmith.file_feature import get_feature

    host = hostmetadata()
    db = database(host.database)
    doc = Document(docx_file_name)
    feature = get_feature(docx_file_name)
    excelfile = feature['path'] + '/' + feature['name'] + '.xlsx'
    tablename = feature['name'] + '_{0}'
    writer = pandas.ExcelWriter(excelfile, engine='xlsxwriter')

    counter = 1
    sheetlist = []
    # dump every table in the .docx into both the sqlite database and the Excel workbook
    for tab in doc.tables:
        data1 = []
        for row in tab.rows:
            data1.append([cell.text for cell in row.cells])
        df = pandas.DataFrame(data1)
        counter = counter + 1
        table_name = tablename.format(str(counter).zfill(3))
        sheetlist.append(table_name)
        df.to_sql(table_name, db.connect, if_exists='replace')
        df.to_excel(writer, sheet_name=table_name)
    writer.save()
    writer.close()
    return sheetlist
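# Usage sketch for check_docx (the .docx path below is hypothetical; any readable Word
# file works). The call writes every table of the document into the host sqlite database
# and a sibling .xlsx workbook, and returns the generated table/sheet names:
#
#     sheets = check_docx('d:/warehouse/report.docx')
#     print(sheets)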
def __init__(self):
    self.ctaohost = hmd.hostmetadata()
    today = datetime.datetime.today()
    self.database_filename = self.ctaohost.warehouse + '/ctao_data_crawler_vehicledetect_{0}.sqlite'.format(
        today.strftime('%Y%m'))
    self.database = cdb.database(self.database_filename)
    self.sub_group = 'data_crawler_vd'

    # all open-data sources: name -> [url, exchange-time regex, exchange-time datetime format]
    self.dict_data = {
        'tpec_vddata': ['https://tcgbusfs.blob.core.windows.net/blobtisv/GetVDDATA.xml.gz',
                        '<ExchangeTime>(.*)</ExchangeTime>', '%Y/%m/%dT%H:%M:%S'],
        'tpec_vd':     ['https://tcgbusfs.blob.core.windows.net/blobtisv/GetVD.xml.gz',
                        '<vd:ExchangeTime>(.*)</vd:ExchangeTime>', '%Y/%m/%dT%H:%M:%S'],
        'nfbx_1968':   ['http://tisvcloud.freeway.gov.tw/xml/1min_incident_data_1968.xml',
                        'time="([^"]*)"', '%Y-%m-%d %H:%M:%S'],
        'nfbx_rlx1':   ['http://tisvcloud.freeway.gov.tw/roadlevel_value.xml.gz',
                        'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'],
        'nfbx_rlx5':   ['http://tisvcloud.freeway.gov.tw/roadlevel_value5.xml.gz',
                        'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'],
        'nfbx_vdx1':   ['http://tisvcloud.freeway.gov.tw/vd_value.xml.gz',
                        'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S'],
        'nfbx_vdx5':   ['http://tisvcloud.freeway.gov.tw/vd_value5.xml.gz',
                        'updatetime="([^"]*)"', '%Y/%m/%d %H:%M:%S']
    }

    self.list_df = pandas.DataFrame.from_dict(
        self.dict_data, orient='index',
        columns=['url', 'exchange_time_repattern', 'exchange_time_datetimepattern'])
    self.list_df['gzip_context'] = numpy.random.bytes(1)
    self.list_df['download_datetime'] = numpy.datetime64(datetime.datetime.now())
    self.list_df['exchange_datetime'] = numpy.datetime64(datetime.datetime.now())
def cwb_melt1():
    import lzma
    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')
    sql = '''select max(rowid) rowid, id, lzma_html
             from data_crawler_cwb_sensible_earthquake_download
             group by id'''
    df = pandas.read_sql(sql, db.connect, index_col=['rowid'])
    df = df.reindex(columns=['id', 'lzma_html', 'time', 'py', 'px', 'depth',
                             'ML', 'Location', 'Stations'], fill_value='')
    for ind, row in df.iterrows():
        # print ('melt', row[0])
        # each record stores the downloaded report page as lzma-compressed html
        html_tables = pandas.read_html(lzma.decompress(
            sqlite3.Binary(row['lzma_html'])), encoding='utf-8')
        arg2 = html_tables[2]
        df.at[ind, 'time'] = arg2.iat[0, 1]
        df.at[ind, 'py'] = arg2.iat[1, 1]
        df.at[ind, 'px'] = arg2.iat[2, 1]
        df.at[ind, 'depth'] = arg2.iat[3, 1]
        df.at[ind, 'ML'] = arg2.iat[4, 1]
        df.at[ind, 'Location'] = arg2.iat[5, 1]
        # collect the per-station intensity cells into one ';'-separated string
        for ind2, row2 in html_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem, str):
                        df.at[ind, 'Stations'] = df.at[ind, 'Stations'] + ';' + elem
            else:
                if isinstance(row2, str):
                    df.at[ind, 'Stations'] = df.at[ind, 'Stations'] + ';' + row2
    df = df.drop(columns=['lzma_html'])
    df.to_sql('data_rdset_pylily_cwb_sensible_earthquake', db.connect,
              if_exists='append', index=False)
    # keep only the newest row per event id
    db.connect.execute('''delete from {0} where rowid not in
                          (select max(rowid) from {0} group by id)'''.format(
                              'data_rdset_pylily_cwb_sensible_earthquake'))
    db.connect.commit()
    return
def cwb_crawler():
    import Lily.crawler.url_string as curlstr
    db = cdb.database('data_crawler_cwb_earthquake_list.sqlite')
    db_target_tab = 'data_crawler_cwb_sensible_earthquake_download'
    eq_dir_pa = 'https://scweb.cwb.gov.tw/earthquake/Page.aspx?ItemId=20&Date={0}'
    sub_url = 'https://scweb.cwb.gov.tw/earthquake/Page.aspx{0}'
    # crawl the monthly sensible-earthquake index pages, from 90 days ago up to next month
    days = pandas.date_range(datetime.datetime.today() - datetime.timedelta(days=90),
                             datetime.datetime.today() + datetime.timedelta(days=31),
                             freq='M')
    for mon in days:
        eq_list = pandas.DataFrame(columns=['id', 'lzma_html', 'download_time'])
        time.sleep(90 / 1000.0)
        day_url = eq_dir_pa.format(mon.strftime('%Y%m'))
        cur1 = curlstr.url_string(day_url)
        arg1 = LH.fromstring(cur1.to_str())
        arg2 = arg1.xpath('//tr/td/a/@href')
        arg3 = {}
        for elem in arg2:
            # follow each event link once; keep the page as lzma-compressed html
            if elem not in arg3 and elem[1:7] == 'ItemId':
                print('download html', elem)
                cur2 = curlstr.url_string(sub_url.format(elem))
                arg3[elem] = True
                eq_list.loc[len(eq_list)] = [elem[22:], cur2.to_lzma_xz(),
                                             datetime.datetime.now()]
        eq_list.to_sql(db_target_tab, db.connect, if_exists='append', index=False)
        db.connect.execute('''delete from {0} where rowid not in
                              (select max(rowid) from {0} group by id, lzma_html)'''.format(db_target_tab))
        db.connect.commit()
    return
def to_database(target_dir):
    import Lily.ctao.database as cdb
    import Lily.ctao.nsgstring as nstr
    import Lily.ctao.hostmetadata as chmd

    host = chmd.hostmetadata()
    p1 = nstr.alnum(host.platform)
    h1 = nstr.alnum(host.hostname)
    d1 = nstr.alnum(target_dir)
    db = cdb.database(host.database)
    dflist = get_all_filefeature_with_md5sum(target_dir)
    table_name = 'data_rdset_filemd5_{0}_{1}_hhot_{2}'.format(p1, h1, d1)
    dflist.to_sql(table_name, db.connect, if_exists='replace', index=False)
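# Usage sketch for to_database (the directory below is hypothetical). It assumes
# get_all_filefeature_with_md5sum is available in this module; the call md5-sums every
# file under the directory and replaces the per-host result table in the ctao database:
#
#     to_database('d:/warehouse')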
def cwb_melt1():
    import lzma
    db = cdb.database('data_crawler.sqlite')
    sql = '''select Id, routeId, nameZh, nameEn, seqNo, pgp,
                    longitude, showLon, showLat, vector
             from
             group by id'''
    df = pandas.read_sql(sql, db.connect, index_col=['rowid'])
    df = df.reindex(columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'], fill_value='')
    for ind, row in df.iterrows():
        # print ('melt', row[0])
        json_tables = pandas.read_html(lzma.decompress(
            sqlite3.Binary(row['lzma_html'])), encoding='utf-8')
        arg2 = json_tables[2]
        df.at[ind, 'routeId'] = arg2.iat[0, 1]
        df.at[ind, 'nameZh'] = arg2.iat[1, 1]
        df.at[ind, 'nameEn'] = arg2.iat[2, 1]
        df.at[ind, 'seqNo'] = arg2.iat[3, 1]
        df.at[ind, 'pgp'] = arg2.iat[4, 1]
        df.at[ind, 'longitude'] = arg2.iat[5, 1]
        for ind2, row2 in json_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem, str):
                        df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + elem
            else:
                if isinstance(elem, str):
                    df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + row2
    # df = df.drop(columns=['lzma_html'])
    df.to_sql('data_rdset_pylily', db.connect, if_exists='append', index=False)
    db.connect.execute('''delete from {0} where rowid not in
                          (select max(rowid) from {0} group by id)'''.format('data_rdset_pylily'))
    db.connect.commit()
    return
def cwb_melt1():
    import lzma
    db = cdb.database('data_crawler.sqlite')
    sql = '''select Id, routeId, nameZh, nameEn, seqNo, pgp,
                    longitude, showLon, showLat, vector
             from 'd:/0702/臺北市站牌.gz'
             group by id'''
    df = pandas.read_sql(sql, db.connect, index_col=['routeId'])
    df = df.reindex(columns=['Id', 'routeId', 'nameZh', 'nameEn', 'seqNo', 'pgp',
                             'longitude', 'showLon', 'showLat', 'vector'], fill_value='')
    for ind, row in df.iterrows():
        # print ('melt', row[0])
        arg2 = json_tables[0]
        df.at[ind, 'routeId'] = arg2.iat[0, 1]
        df.at[ind, 'nameZh'] = arg2.iat[1, 1]
        df.at[ind, 'nameEn'] = arg2.iat[2, 1]
        df.at[ind, 'seqNo'] = arg2.iat[3, 1]
        df.at[ind, 'pgp'] = arg2.iat[4, 1]
        df.at[ind, 'longitude'] = arg2.iat[5, 1]
        df.at[ind, 'showLon'] = arg2.iat[6, 1]
        df.at[ind, 'showLat'] = arg2.iat[7, 1]
        df.at[ind, 'vector'] = arg2.iat[8, 1]
        for ind2, row2 in json_tables[3].iterrows():
            if isinstance(row2, pandas.core.series.Series):
                for elem in row2:
                    if isinstance(elem, str):
                        df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + elem
            else:
                if isinstance(elem, str):
                    df.at[ind, 'i'] = df.at[ind, 'i'] + ',' + row2
    df.to_sql('data_rdset_pylily', db.connect, if_exists='append', index=False)
    db.connect.execute('''delete from {0} where rowid not in
                          (select max(rowid) from {0} group by id)'''.format('data_rdset_pylily'))
    db.connect.commit()
    return
def check_time():
    import Lily.ctao.database as cdb
    import Lily.ctao.nsgstring as nstr
    import Lily.ctao.hostmetadata as chmd
    import re

    host = chmd.hostmetadata()
    db = cdb.database(host.database)

    # ^\d\d\d\d-(0?[1-9]|1[0-2])-(0?[1-9]|[12][0-9]|3[01]) (00|[0-9]|1[0-9]|2[0-3]):([0-9]|[0-5][0-9]):([0-9]|[0-5][0-9])$
    pattern0 = r'(0?[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])'
    pattern1 = r'([0-2][0-9]):([0-5][0-9])'
    pattern2 = r'^(0?[1-9]|1[0-2])/(0?[1-9]|[12][0-9]|3[01])|(0?[1-9]|1[0-2])/(0?[1-9]|[12][0-9]|3[01])$'

    df = db.to_dataframe('hln_0206_3')
    df = df.iloc[1:]
    for ind, row in df.iterrows():
        # a row may carry zero, one, or two MM/DD dates and HH:MM times;
        # pad to exactly two of each so every row yields a begin/end pair
        twoday = [day for day in re.findall(pattern0, row[1])]
        twotim = [tim for tim in re.findall(pattern1, row[2])]
        if len(twoday) == 0:
            twoday = [('01', '01'), ('01', '01')]
        if len(twoday) == 1:
            twoday = [twoday[0], twoday[0]]
        if len(twotim) == 0:
            twotim = [('00', '00'), ('00', '00')]
        if len(twotim) == 1:
            twotim = [twotim[0], twotim[0]]
        date1 = '2018-{0}-{1} {2}:{3}'.format(twoday[0][0], twoday[0][1], twotim[0][0], twotim[0][1])
        date2 = '2018-{0}-{1} {2}:{3}'.format(twoday[1][0], twoday[1][1], twotim[1][0], twotim[1][1])
        # the strptime format must match the '2018-MM-DD HH:MM' strings built above
        df.at[ind, 'beg'] = datetime.datetime.strptime(date1, '%Y-%m-%d %H:%M')
        df.at[ind, 'end'] = datetime.datetime.strptime(date2, '%Y-%m-%d %H:%M')
def __init__(self):
    import Lily.ctao.database as cdb
    import Lily.ctao.hostmetadata as chmd
    self.this_host = chmd.hostmetadata()
    self.log_database = cdb.database(self.this_host.database)