def __init__(self, sub_group='M03A'):
    """Prepare warehouse paths, table names, and archive-name patterns for
    one ETC sub-group (e.g. 'M03A'), then scan the local archive list.

    :param sub_group: TDCS sub-group code used in URLs, paths and table names.
    """
    import Lily.ctao.hostmetadata as chmd
    import Lily.ctao.database as cdb
    self.sub_group = sub_group
    self.hostmetadata = chmd.hostmetadata()
    self.database = cdb.database(self.hostmetadata.database)
    self.sub_warehouse = '{0}/crawler_ETC_{1}'.format(self.hostmetadata.warehouse, self.sub_group)
    # NOTE(review): 'clawler' looks like a typo for 'crawler', but the name is
    # kept unchanged because existing files on disk may already use it.
    self.excel_filename = '{0}/data_clawler_ETC_{1}_list.xlsx'.format(self.hostmetadata.warehouse, self.sub_group)
    self.sqlite_tablename = 'data_crawler_ETC_{0}_list'.format(self.sub_group)
    self.sqlite_tablepull = 'data_crawler_ETC_{0}_pull'.format(self.sub_group)
    # Create the sub-group warehouse directory if it does not exist.
    # makedirs(exist_ok=True) also tolerates a missing parent directory and
    # avoids the race between the exists() check and the mkdir() call.
    if not os.path.exists(self.sub_warehouse):
        os.makedirs(self.sub_warehouse, exist_ok=True)
    # Date regular expression YYYYMMDD. Raw strings avoid the invalid-escape
    # warnings the original non-raw '\d' / '\.' sequences produced.
    date_YYYYMMDD_pattern = r'([12]\d{3}(0[1-9]|1[0-2])(0[1-9]|[12]\d|3[01]))'
    self.url = 'http://tisvcloud.freeway.gov.tw/history/TDCS/{0}/'.format(self.sub_group)
    # Pattern matching archive links in the remote directory listing.
    self.cloud_archive_pattern = r'href="({0}_{1}\.tar\.gz)"'.format(self.sub_group, date_YYYYMMDD_pattern)
    # Pattern matching archive file names already present locally.
    self.local_archive_pattern = r'({0}_{1}\.tar\.gz)'.format(self.sub_group, date_YYYYMMDD_pattern)
    self.check_archive_list()
def check_docx(docx_file_name):
    """Extract every table from a .docx file into both the host sqlite
    database and a sibling .xlsx workbook (one sheet per table).

    :param docx_file_name: path of the Word document to mine.
    :return: list of generated sheet/table names.
    """
    from Lily.ctao.database import database
    from Lily.ctao.hostmetadata import hostmetadata
    from Lily.blacksmith.file_feature import get_feature

    host = hostmetadata()
    db = database(host.database)
    doc = Document(docx_file_name)
    feature = get_feature(docx_file_name)
    excelfile = feature['path'] + '/' + feature['name'] + '.xlsx'
    tablename = feature['name'] + '_{0}'
    writer = pandas.ExcelWriter(excelfile, engine='xlsxwriter')
    sheetlist = []
    # start=2 keeps the historical numbering: the original incremented its
    # counter *before* naming, so the first table has always been '..._002'.
    for counter, tab in enumerate(doc.tables, start=2):
        rows = [[cell.text for cell in row.cells] for row in tab.rows]
        df = pandas.DataFrame(rows)
        table_name = tablename.format(str(counter).zfill(3))
        sheetlist.append(table_name)
        df.to_sql(table_name, db.connect, if_exists='replace')
        df.to_excel(writer, sheet_name=table_name)
    writer.save()
    writer.close()
    return sheetlist
def check_module():
    """Smoke-test helper: on Windows hosts, open the tkui dialog named
    'check_module' and print every key/value it collected."""
    import Lily.ctao.hostmetadata as chmd
    meta = chmd.hostmetadata()
    if meta.platform[:7] == 'Windows':
        ui = tkui('check_module')
        for key in ui.values:
            print(key, ui.values[key])
def __init__(self):
    """Set up the monthly vehicle-detector crawler: open this month's
    sqlite file and build a DataFrame describing every opendata source.
    """
    self.ctaohost = hmd.hostmetadata()
    today = datetime.datetime.today()
    # One database file per month, suffixed YYYYMM.
    self.database_filename = self.ctaohost.warehouse + '/ctao_data_crawler_vehicledetect_{0}.sqlite'.format(
        today.strftime('%Y%m'))
    self.database = cdb.database(self.database_filename)
    self.sub_group = 'data_crawler_vd'
    # source name -> [download url,
    #                 regex capturing the exchange time inside the payload,
    #                 strptime pattern for that captured time]
    self.dict_data = {
        'tpec_vddata': [
            'https://tcgbusfs.blob.core.windows.net/blobtisv/GetVDDATA.xml.gz',
            '<ExchangeTime>(.*)</ExchangeTime>',
            '%Y/%m/%dT%H:%M:%S'
        ],
        'tpec_vd': [
            'https://tcgbusfs.blob.core.windows.net/blobtisv/GetVD.xml.gz',
            '<vd:ExchangeTime>(.*)</vd:ExchangeTime>',
            '%Y/%m/%dT%H:%M:%S'
        ],
        'nfbx_1968': [
            'http://tisvcloud.freeway.gov.tw/xml/1min_incident_data_1968.xml',
            'time="([^"]*)"',
            '%Y-%m-%d %H:%M:%S'
        ],
        'nfbx_rlx1': [
            'http://tisvcloud.freeway.gov.tw/roadlevel_value.xml.gz',
            'updatetime="([^"]*)"',
            '%Y/%m/%d %H:%M:%S'
        ],
        'nfbx_rlx5': [
            'http://tisvcloud.freeway.gov.tw/roadlevel_value5.xml.gz',
            'updatetime="([^"]*)"',
            '%Y/%m/%d %H:%M:%S'
        ],
        'nfbx_vdx1': [
            'http://tisvcloud.freeway.gov.tw/vd_value.xml.gz',
            'updatetime="([^"]*)"',
            '%Y/%m/%d %H:%M:%S'
        ],
        'nfbx_vdx5': [
            'http://tisvcloud.freeway.gov.tw/vd_value5.xml.gz',
            'updatetime="([^"]*)"',
            '%Y/%m/%d %H:%M:%S'
        ]
    }  #all opendata source
    self.list_df = pandas.DataFrame.from_dict(
        self.dict_data,
        orient='index',
        columns=[
            'url', 'exchange_time_repattern', 'exchange_time_datetimepattern'
        ])
    # Placeholder columns, presumably overwritten per download — confirm:
    # one random byte seeds 'gzip_context' so the column holds bytes values,
    # and both datetime columns start at "now".
    self.list_df['gzip_context'] = numpy.random.bytes(1)
    self.list_df['download_datetime'] = numpy.datetime64(
        datetime.datetime.now())
    self.list_df['exchange_datetime'] = numpy.datetime64(
        datetime.datetime.now())
def check_module():
    """Smoke-test Lily.ctao.hostmetadata and the host database connection,
    printing metadata, table list, and (on Windows) the asktablename UI."""
    import Lily.ctao.hostmetadata as chmd
    hobj1 = chmd.hostmetadata()
    # Fixed message typo: 'moudel' -> 'module'.
    print('check module Lily.ctao.hostmetadata')
    print(hobj1.callname, hobj1.hostname, hobj1.platform)
    print(hobj1.database, hobj1.warehouse, hobj1.factory)
    dobj2 = database(hobj1.database)
    print(dobj2.tables())
    print('No news is good news')
    if hobj1.platform[:7] == 'Windows':
        ui = asktablename()
        print(ui.mydb.tables())
def __init__(self, database_path):
    """Open a sqlite connection to *database_path* with the SpatiaLite
    extension loaded, and keep a cursor ready for queries.

    :param database_path: filesystem path of the sqlite database file.
    """
    self.database_path = database_path
    self.connect = sqlite3.connect(database_path)
    # Must be enabled before load_extension() may be called.
    self.connect.enable_load_extension(True)
    import Lily.ctao.hostmetadata as ho
    # First six chars of the platform string, e.g. 'Linux-' vs 'Window'.
    self.platform = ho.hostmetadata().platform[:6]
    if self.platform == 'Linux-':
        self.connect.load_extension('libspatialite')
    else:
        # Non-Linux hosts (presumably Windows/macOS) ship the extension
        # under the name 'mod_spatialite' — confirm per deployment.
        self.connect.load_extension('mod_spatialite')
    self.cursor = self.connect.cursor()
    # Running counter, used elsewhere for generating table/view aliases —
    # TODO confirm against callers.
    self.alias_count = 0
def to_database(target_dir):
    """Collect md5 file features of everything under *target_dir* and store
    them in the host database, one table per (platform, host, directory)."""
    import Lily.ctao.database as cdb
    import Lily.ctao.nsgstring as nstr
    import Lily.ctao.hostmetadata as chmd

    host = chmd.hostmetadata()
    db = cdb.database(host.database)
    # Sanitised fragments that make the table name unique per host/dir.
    platform_tag = nstr.alnum(host.platform)
    hostname_tag = nstr.alnum(host.hostname)
    dir_tag = nstr.alnum(target_dir)
    table_name = 'data_rdset_filemd5_{0}_{1}_hhot_{2}'.format(
        platform_tag, hostname_tag, dir_tag)
    features = get_all_filefeature_with_md5sum(target_dir)
    features.to_sql(table_name, db.connect, if_exists='replace', index=False)
def check_moudle():
    """Entry point: pick a target directory (GUI on Windows, command-line
    argument, or interactive prompt) and run to_database() on it through
    the multiprocessing pool.
    """
    import sys
    import Lily.ctao.hostmetadata as chmd
    from Lily.blacksmith.mppool import mppool
    pool = mppool()
    this_host = chmd.hostmetadata()
    if this_host.platform[:7] == 'Windows':
        from Lily.ctao.userargument import tkui
        ui = tkui('select_target_directory', [['target', 'sel', 'directory']])
        pool.run(to_database, ui.values['target'], 'get_all_file_feature')
    # BUG FIX: the original compared the argv *list* itself to 2
    # ('sys.argv == 2'), which is always False, so the command-line
    # branch was unreachable.
    elif len(sys.argv) == 2:
        pool.run(to_database, sys.argv[1], 'get_all_file_feature')
    else:
        target = input("Enter a directory name:(path)")
        pool.run(to_database, target, 'get_all_file_feature')
def check_time():
    """Parse begin/end month-day and hour-minute pairs out of the free-text
    columns of table 'hln_0206_3' and write them into 'beg'/'end' columns
    as 2018 datetimes.
    """
    import Lily.ctao.database as cdb
    import Lily.ctao.hostmetadata as chmd
    import re
    host = chmd.hostmetadata()
    db = cdb.database(host.database)
    # MM/DD and HH:MM fragments; up to two of each per row (begin and end).
    patern0 = r'(0?[1-9]|1[0-2])/(0[1-9]|[12][0-9]|3[01])'
    patern1 = r'([0-2][0-9]):([0-5][0-9])'
    df = db.to_dataframe('hln_0206_3')
    df = df.iloc[1:]  # drop the first row — presumably a header row; confirm
    for ind, row in df.iterrows():
        twoday = re.findall(patern0, row[1])
        twotim = re.findall(patern1, row[2])
        # Pad missing/partial matches: no date -> 01/01 for both ends,
        # one match -> reuse the same value for begin and end.
        if len(twoday) == 0:
            twoday = [('01', '01'), ('01', '01')]
        if len(twoday) == 1:
            twoday = [twoday[0], twoday[0]]
        if len(twotim) == 0:
            twotim = [('00', '00'), ('00', '00')]
        if len(twotim) == 1:
            twotim = [twotim[0], twotim[0]]
        date1 = '2018-{0}-{1} {2}:{3}'.format(twoday[0][0], twoday[0][1],
                                              twotim[0][0], twotim[0][1])
        date2 = '2018-{0}-{1} {2}:{3}'.format(twoday[1][0], twoday[1][1],
                                              twotim[1][0], twotim[1][1])
        # BUG FIX: the strptime pattern must match the strings built above
        # ('%Y-%m-%d %H:%M', not '%Y%m%d %H%M' which always raised
        # ValueError), and chained indexing (df.iloc[ind][col] = ...)
        # assigned into a temporary copy; .at assigns by label in place.
        df.at[ind, 'beg'] = datetime.datetime.strptime(date1, '%Y-%m-%d %H:%M')
        df.at[ind, 'end'] = datetime.datetime.strptime(date2, '%Y-%m-%d %H:%M')
# NOTE(review): this chunk is the tail of a larger function (its `def` and the
# loop providing `ind`, `df`, `station`, `db` are outside this view);
# indentation below is reconstructed from context.
for st in df.at[ind, 'Stations'].split(';'):
    # Keep only per-station entries; skip blanks and the summary entry that
    # reports the region-wide maximum seismic intensity (地區最大震度).
    if u'''地區最大震度''' not in st and st != '':
        rdset = [
            df.at[ind, 'id'],
            df.at[ind, 'time'],
            # px/py appear to embed a number between a 4-char prefix and a
            # 2-char suffix (degree strings) — TODO confirm format.
            float(df.at[ind, 'px'][4:-2]),
            float(df.at[ind, 'py'][4:-2]),
            float(df.at[ind, 'depth'][:-3]),  # strip trailing unit text
            float(df.at[ind, 'ML']),          # presumably local magnitude
            df.at[ind, 'Location'],
            # Station name: everything before the last full-width space.
            ''.join(st.split('\u3000')[:-1]),
            # Intensity value: the token after the last full-width space.
            float(st.split('\u3000')[-1:][0])
        ]
        station.append(rdset)
df2 = pandas.DataFrame(
    station,
    columns=['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i'])
df2.to_sql(
    'data_rdset_pylily_cwb_sensible_earthquake_LocalSeismicIntensity',
    db.connect, if_exists='replace', index=False)
return

# Script entry: run the CWB crawler pipeline from the warehouse directory.
if __name__ == '__console__' or __name__ == '__main__':
    import os
    thost = chmd.hostmetadata()
    os.chdir(thost.warehouse)
    cwb_crawler()
    cwb_melt1()
    cwb_melt2()
def __init__(self):
    """Bind the host metadata and open the host's logging database."""
    import Lily.ctao.database as cdb
    import Lily.ctao.hostmetadata as chmd

    self.this_host = chmd.hostmetadata()
    self.log_database = cdb.database(self.this_host.database)