def pdf2hdfs(self, mode='append', hdfs_path=''):
    from sparkmodule import PySparkManager
    """
    Save the pandas dataframe to HDFS in parquet format.
    :param mode: write mode, 'append' or 'overwrite'
    :param hdfs_path: target HDFS path; if empty, the default path is used
    :return: nothing
    """
    if hdfs_path == '':
        path = self._hdfs_path
    else:
        path = hdfs_path

    try:
        firstrow = list(self._pdf.iloc[0])
    except Exception as e:
        Log.e(self.tag, 'pdf is empty! : ', e.__class__.__name__)
        return

    Log.d(self.tag, 'pdf(%s) -> %s' % (firstrow, path))

    # make spark dataframe
    self._spdf = PySparkManager().sqlctxt.createDataFrame(self._pdf)

    # append new data
    try:
        self._spdf.write.mode(mode).parquet(path)
    except Exception:
        Log.e(self.tag, 'cannot append row(s).')
        self._spdf.show()
        return

    Log.d(self.tag, 'parquet write completed.')
def _req_api(self, method: str, query_param: str, payload):
    """
    Send a single request to the API.
    If no response data is received, the request is retried until a valid response arrives.
    :param method: HTTP method name, 'get' or 'post'
    :param query_param: return value of self._make_query_param()
    :param payload: return value of self._make_payload()
    :return:
    """
    json_response = None
    while not json_response:
        try:
            Log.d(self.tag, 'req', method, ':', self._base_url + query_param,
                  'payload:', str(payload))
            if method == 'get':
                json_response = requests.get(self._base_url + query_param)
            elif method == 'post':
                json_response = requests.post(self._base_url + query_param, data=payload)
        except Exception as e:
            Log.e(self.tag, '_req_api() : Exception occurred!', e.__class__.__name__)
            Log.e(self.tag, 'trying to recall api...')
            continue

    self._json_dict = json.loads(json_response.text)
def pdf2hdfs(self, mode='append', hdfs_path=''):
    """
    Save the pandas dataframe to HDFS in parquet format.
    :param mode: write mode, 'append' or 'overwrite'
    :param hdfs_path: target HDFS path; if empty, the default path is used
    :return: nothing
    """
    if hdfs_path == '':
        path = self._hdfs_path
    else:
        path = hdfs_path

    try:
        Log.d('pdf2hdfs()', 'pdf -> hdfs ::\n', self._pdf.iloc[:30])
    except Exception as e:
        Log.e('pdf is empty! : ', e.__class__.__name__)
        return

    # make spark dataframe
    self._spdf = PySparkManager().sqlctxt.createDataFrame(self._pdf)

    # append new data
    self._spdf.write.mode(mode).parquet(path)
    Log.d('pdf2hdfs()', 'parquet write completed.')
def log(self, db_type: list, mode='append', **log_prop):
    # station_list = []
    try:
        if log_prop['station'] == 'all':
            station_list = self.stnCodeList.values()
        else:
            try:
                station_list = [self.stnCodeList[log_prop['station']]]
            except KeyError:
                Log.e(self.tag, 'invalid station name.')
                return
            except Exception as e:
                Log.e(self.tag, 'Exception occurred!! : ', e.__class__.__name__)
                return

        for _station_code in station_list:
            payload = self._make_payload(stn_code=_station_code, **log_prop)
            self._req_api(method='post', query_param='', payload=payload)
            Log.d(self.tag, 'response:', self._json_dict)

            self._json2pdf(term=log_prop['term'])

            if 'hdfs' in db_type:
                self.pdf2hdfs(mode=mode)
            # if 'mysql' in db_type:
            #     self.pdf2mysql(table_name='kma_uvi', if_exists='append')
    except KeyError:
        Log.e(self.tag, 'wrong log properties error! :', log_prop)
def _log(self):
    Log.d(self.tag, 'logging start...')
    self.api.log(db_type=self._log_properties['db_type'],
                 mode=self._log_properties['mode'],
                 station=self._log_properties['station'],
                 term=self._log_properties['term'])
    Log.d(self.tag, 'logging end.')
def start_logging(self):
    Log.d(self.tag, 'start logging thread.')
    if not self._on:
        self._on = True
    if not self._running:
        self._running = True
        self.start()
def _init_driver(self):
    # TODO: it would be better to wrap the driver and options in a class and make it a singleton
    import os
    Log.d(self.tag, 'init driver...')
    chrome_driver_path = os.getcwd() + '/../driver/chromedriver'
    self._options = webdriver.ChromeOptions()
    self._options.add_argument('headless')
    self._options.add_argument('disable-gpu')
    self._driver = webdriver.Chrome(chrome_driver_path, options=self._options)
    Log.d(self.tag, 'driver init completed.')
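# A minimal sketch of the singleton wrapper the TODO above suggests.
# The class name and attribute are hypothetical, not part of the original code;
# it simply reuses one headless Chrome instance across crawler objects.
from selenium import webdriver

class _ChromeDriverSingleton:
    _instance = None

    def __new__(cls, driver_path: str):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            options = webdriver.ChromeOptions()
            options.add_argument('headless')
            options.add_argument('disable-gpu')
            # same driver construction as _init_driver(), created only once
            cls._instance.driver = webdriver.Chrome(driver_path, options=options)
        return cls._instance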
def yearly(self, dirname, year: int):
    dirlist = ku.search(dirname)
    Log.d('yearly', 'dirlist:', dirlist)

    for dirpath in dirlist:
        fnamelist = ku.search(dirpath + '/' + str(year))
        for fname in fnamelist:
            Log.d('yearly()', 'fname:', fname)
            ku.read_single_file(fname, fname.split('_')[1], fname.split('/')[4])
def to_db(self, pdf: pd.DataFrame, db_type='mysql', **kwargs):
    Log.d(self.tag, 'db type : ' + db_type)
    if db_type == 'mysql':
        from sqlalchemy import create_engine
        args = (kwargs['username'], kwargs['passwd'], kwargs['host'],
                kwargs['port'], kwargs['db_name'])
        engine = create_engine('mysql+pymysql://%s:%s@%s:%d/%s' % args, encoding='utf-8')
        conn = engine.connect()

        # db insert
        pdf.to_sql(name=kwargs['table_name'], con=engine, if_exists='append', index=False)
        conn.close()
def _bind_conf(self, conf_path):
    import configparser
    conf_section = 'ConnectionInfo'
    Log.d(self.tag, 'load .conf file:: filepath:', conf_path, ', section: ', conf_section)

    self.conf_parser = configparser.ConfigParser()
    self.conf_parser.read(conf_path)
    self.ip = self.conf_parser.get(conf_section, 'ip')
    self.port = self.conf_parser.get(conf_section, 'port')
    self.id = self.conf_parser.get(conf_section, 'id')
    self.password = self.conf_parser.get(conf_section, 'password')
def _json2pdf(self, station):
    """
    Built to fetch only the most recently announced data.
    The API request uses a base time set to minute 15, but the base time in
    the JSON response is reported on the hour, so the minute part of the
    datetime string format must be '00'.
    :param station:
    :return:
    """
    obj_baseDt = self.get_last_basedt(datetime.datetime.now())
    obj_fcstDt = obj_baseDt + datetime.timedelta(hours=4)

    baseDate, baseTime = obj_baseDt.strftime('%Y%m%d %H00').split(' ')
    fcstDate, fcstTime = obj_fcstDt.strftime('%Y%m%d %H00').split(' ')
    # self._dbg.print_e('json base time:', baseDate, baseTime, ', fcsttime', fcstDate, fcstTime)

    wdata = self._json_dict['response']['body']['items']['item']

    # make dict for one measurement
    tmpdict = {}
    for col in self._column:
        tmpdict[col] = ''

    # fill dict using api measurement data
    for item in wdata:
        # debug: check the request worked by printing the json content
        # self._dbg.print_e('item in wdata: ', item)

        # get last weather data that matches base datetime
        if str(item['baseDate']) == baseDate \
                and str(item['baseTime']) == baseTime \
                and str(item['fcstDate']) == fcstDate \
                and str(item['fcstTime']) == fcstTime \
                and item['category'] in self._column:
            # the item matches the wanted time and a wanted column, so take it
            tmpdict[item['category']] = [str(item['fcstValue'])]
        else:
            pass

    # make pdf
    tmpdict['station'] = station
    tmpdict['datehour'] = [obj_fcstDt.strftime('%Y-%m-%d %H')]
    self._pdf = pd.DataFrame(tmpdict)
    Log.d(self.tag, 'kma last local weather data as pdf ↓\n' + str(self._pdf))
def scrap(self, **kwargs):
    url = self._make_url(**kwargs)
    Log.d(self.tag, 'request url: ' + url)

    try:
        self._driver.get(url)
    except Exception as e:
        Log.e(self.tag, 'chromedriver get error!:', e.__class__.__name__)
        return

    soup = BeautifulSoup(self._driver.page_source, 'lxml-xml')
    Log.d(self.tag, 'response received. parsing...')

    table_list = []

    if kwargs['term'] == 'hourly':
        tr_list = soup.select('#sun-height-table > table > tbody')[0].find_all('tr')
        for html_row in tr_list:
            row = []
            for html_element in html_row.find_all('td'):
                # print(html_element.getText())
                if len(html_element.getText()) > 2:
                    element = self._transform_angle(html_element.getText())
                else:
                    element = kwargs['date'] + ' ' + html_element.getText()
                row.append(element)
            solar_zenith = 90 - float(row[2])
            row.insert(3, solar_zenith)
            table_list.append(row)

    elif kwargs['term'] == 'minutely':
        objdt = datetime.datetime(year=int(kwargs['date'].split('-')[0]),
                                  month=int(kwargs['date'].split('-')[1]),
                                  day=int(kwargs['date'].split('-')[2]),
                                  hour=kwargs['hour'],
                                  minute=kwargs['minute'],
                                  second=0)
        html_row = soup.select('#sun-height-table > table > tbody')[0].find('tr')
        row = [objdt.strftime('%Y-%m-%d %H:%M')]
        for html_element in html_row.find_all('td'):
            row.append(self._transform_angle(html_element.getText()))
        solar_zenith = 90 - float(row[2])
        row.insert(3, solar_zenith)
        table_list.append(row)

    return table_list
def _connect(self):
    if self.conn:
        Log.d(self.tag, 'already connected: ', self.ip, self.port, self.id)

    try:
        self.conn = pymongo.MongoClient(
            'mongodb://%s:%s@%s:%s/' % (self.id, self.password, self.ip, self.port))
        dict_serverinfo = self.conn.server_info()
        Log.d(self.tag, 'Successfully connected to MongoDB Server :: ok:',
              dict_serverinfo['ok'], ', version:', dict_serverinfo['version'])
    except Exception as e:
        import traceback
        Log.e(self.tag, 'failed to connect to MongoDB Server (%s)' % e.__class__.__name__)
        Log.e(self.tag, traceback.format_exc())
def getLatLon(address: str):
    """
    * api info
        GET /v2/local/search/address.{format} HTTP/1.1
        Host: dapi.kakao.com
        Authorization: KakaoAK {app_key}

        request keys
            query   search term (address)          required                        String
            page    result page number             optional (default 1)            Integer
            size    documents per page             optional (default 10, 1-30)     Integer

    * usage
    :param address: address whose latitude/longitude is wanted (road-name address recommended)
    :return: tuple(lat: float, lon: float)

    * important issues
        1. the API has a call-rate limit, so use with care
    """
    # address_test = '천안대로 1223-24'
    apikey = 'cf606c4c1964ec437d0134cbb5a8deb9'
    url = 'https://dapi.kakao.com/v2/local/search/address.json'
    headers = {'Authorization': 'KakaoAK ' + apikey}
    params = {'query': address}

    response = requests.get(url, headers=headers, params=params)

    try:
        lat = response.json()['documents'][0]['road_address']['y']
        lon = response.json()['documents'][0]['road_address']['x']
    except TypeError:  # when the result only has a lot-number (jibeon) address
        lat = response.json()['documents'][0]['address']['y']
        lon = response.json()['documents'][0]['address']['x']
    except IndexError:  # when the address is wrong
        Log.e('getLatLon()', 'wrong address:', address)
        lat = 0.0
        lon = 0.0

    Log.d('getLatLon()', 'latitude:', lat, ', longitude:', lon)
    return lat, lon
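# Usage sketch for getLatLon() (the address below is only an example; the
# hardcoded API key above is rate-limited, see "important issues" in the docstring):
#   lat, lon = getLatLon('천안대로 1223-24')
#   print(float(lat), float(lon))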
def dataset_to_db(self, start_date, end_date):
    import pandas as pd

    # set vals for scrap
    oneday = timedelta(days=1)
    sdt = datetime.strptime(start_date, '%Y-%m-%d')
    edt = datetime.strptime(end_date, '%Y-%m-%d')

    while sdt <= edt:
        sunrslist = self.scrap(date=sdt.strftime('%Y-%m-%d'))
        pdf = pd.DataFrame([sunrslist], columns=['date', 'rise', 'culmination', 'set'])
        Log.d(self.tag, '\n', pdf)
        self.to_db(pdf,
                   username='******',
                   passwd='defacto8*jj',
                   host='210.102.142.14',
                   port=3306,
                   db_name='nl_witlab',
                   table_name='kasi_sun_riseset')
        sdt = sdt + oneday
def run(self):
    # TODO: should be changed so the API is called at every official base ("announcement") time...
    dt_now = datetime.datetime.now()
    dt_next_base = self.api.get_last_basedt(dt_now) + datetime.timedelta(hours=3)

    # log once at start
    self._log()

    while self._on:
        next_base_str = dt_next_base.strftime('%Y-%m-%d %H:%M')
        now_str = datetime.datetime.now().strftime('%Y-%m-%d %H:%M')
        # self._dbg.print_e('compare time str:', next_base_str, now_str)

        if next_base_str <= now_str:
            self._log()
            dt_next_base += datetime.timedelta(hours=3)
            Log.d(self.tag, 'next base time:', dt_next_base.strftime('%Y-%m-%d %H:%M'))

        time.sleep(0.5)
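# Sketch of what get_last_basedt() might look like; the real implementation
# belongs to the API class and is not shown in this section. Assumptions:
# base times recur every 3 hours (consistent with the +3h steps above) and the
# request-side base time uses minute 15, as described in _json2pdf().
import datetime

def get_last_basedt_sketch(now: datetime.datetime) -> datetime.datetime:
    base_hours = [2, 5, 8, 11, 14, 17, 20, 23]  # assumed 3-hourly base times
    candidate = now.replace(minute=15, second=0, microsecond=0)
    # walk back hour by hour to the most recent base time that has already passed
    while candidate.hour not in base_hours or candidate > now:
        candidate -= datetime.timedelta(hours=1)
    return candidate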
def pdf2mysql(self, table_name: str, if_exists: str = 'append'):
    from dbs.mysqlmodule import MysqlManager
    """
    Save the pandas dataframe as a table in MySQL; the table must already exist
    (creating the table when it does not exist still needs to be added).
    :param table_name: table name
    :param if_exists: to_sql() param, e.g. 'append', 'replace', 'fail'
    :return: nothing
    """
    Log.d(self.tag, 'pdf -> mysql :: ' + str(list(self._pdf.iloc[0])))

    # connect to mysql
    mm = MysqlManager()
    mm.init(self._mysql_conn_args)

    # write to sql
    self._pdf.to_sql(name=table_name, con=mm.engine, if_exists=if_exists, index=False)

    # db close
    mm.close()
    Log.d(self.tag, 'mysql write completed.')
def normalize_parquet(self, hdfs_path='', sort_col=None):
    from sparkmodule import PySparkManager
    """
    TROUBLE ISSUES
        191022) deprecated: DON'T USE THIS! IT MAY CORRUPT YOUR DATAFRAME!

    Normalize a parquet-format spark dataframe by removing duplicates,
    sorting by time, and so on. If the same day was logged twice, this has to
    be run once to clean the data.
    :param hdfs_path:
    :return:
    """
    if hdfs_path == '':  # default path
        path = self._hdfs_path
    else:  # specific path
        path = hdfs_path

    if not sort_col:
        sort_col = ['station', 'datehour']
    else:
        pass

    Log.d(self.tag, 'normalizing: read parquet from hdfs... :', path)
    spdf = PySparkManager().sqlctxt.read.parquet(path)

    Log.d(self.tag, 'normalizing: remove coupled rows and sort by %s...' % sort_col)
    spdf_new = spdf.distinct().sort(sort_col).cache()

    Log.d(self.tag, 'normalizing: write parquet...')
    spdf_new.write.mode('overwrite').parquet(path)
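# A possible safer variant (a sketch, not the author's method): the corruption
# warned about above can occur because the same HDFS path is read lazily and
# then overwritten in place. Writing the normalized frame to a temporary path
# first, then rewriting the original path from that copy, avoids the overlap:
#   tmp_path = path + '_normalize_tmp'          # hypothetical temp location
#   spdf_new.write.mode('overwrite').parquet(tmp_path)
#   PySparkManager().sqlctxt.read.parquet(tmp_path) \
#       .write.mode('overwrite').parquet(path)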
def xlsxdir2parquet(self, dirpath: str, hdfs_outpath: str):
    from pyspark.sql.functions import udf
    from pyspark.sql.types import StringType

    udf_mergeCol = udf(lambda s, t: s + ' ' + t, StringType())

    infilelist = self.search(dirpath)

    # read the confirmed-data files in the directory one by one and merge them into 'merged'
    Log.d(self.tag + '.xlsxdir2parquet()', 'target file name:', infilelist[0])

    # read xlsx and make spdf
    merged = self.xlsx2spdf(infilelist[0])

    # concatenate two columns
    merged = merged.withColumn('location', udf_mergeCol('location', 'station_name'))
    merged = merged.drop('station_name')

    Log.d(self.tag + '.xlsxdir2parquet()', 'target file converted to spdf')
    merged.show()

    for i in range(1, len(infilelist)):
        Log.d(self.tag + '.xlsxdir2parquet()', 'target file name:', infilelist[i])

        # read xlsx and make spdf
        spdf = self.xlsx2spdf(infilelist[i])

        # concatenate two columns
        spdf = spdf.withColumn('location', udf_mergeCol('location', 'station_name'))
        spdf = spdf.drop('station_name')

        # merge spdf
        merged = merged.union(spdf)

        Log.d(self.tag + '.xlsxdir2parquet()', 'target file converted to spdf')
        merged.show()

    merged.show()
    merged.write.mode('overwrite').parquet(hdfs_outpath)
    Log.d(self.tag + '.xlsxdir2parquet()', 'parquet write completed.')
def scrap(self, **kwargs):
    try:
        date = str(kwargs['date'])
    except KeyError:  # set today
        date = datetime.now().strftime('%Y-%m-%d')
        kwargs['date'] = date

    try:
        lat, lon = str(kwargs['lat']), str(kwargs['lon'])
    except KeyError:  # witlab
        lat = '36.850490744236744'
        lon = '127.15250390636234'
        kwargs['lat'] = lat
        kwargs['lon'] = lon

    url = self._make_url(**kwargs)
    Log.d(self.tag, 'request url :' + url)

    self._driver.get(url)
    soup = BeautifulSoup(self._driver.page_source, 'html.parser')

    sunrise = soup.find_all('span', {'class': 'sunrise'})[0].string
    culmination = soup.find_all('span', {'class': 'culmination'})[0].string
    sunset = soup.find_all('span', {'class': 'sunset'})[0].string
    Log.d(self.tag, 'result:', sunrise, culmination, sunset)

    sr = sunrise[0:2] + ':' + sunrise[4:-1]
    cul = culmination[0:2] + ':' + culmination[4:-1]
    ss = sunset[0:2] + ':' + sunset[4:-1]
    Log.d(self.tag, date, lat, lon, ': %s %s %s' % (sr, cul, ss))

    return [kwargs['date'], sr, cul, ss]
def _get_localweather_coord(self, station='충청남도 천안시서북구 부성동'):  # format: 'province city/district neighborhood'
    top_url = 'http://www.kma.go.kr/DFSROOT/POINT/DATA/top'
    mdl_url = 'http://www.kma.go.kr/DFSROOT/POINT/DATA/mdl'
    leaf_url = 'http://www.kma.go.kr/DFSROOT/POINT/DATA/leaf'
    tail = '.json.txt'

    # top
    res1 = requests.get(top_url + tail)
    res1.encoding = 'utf-8'  # MUST DO IT!!!
    json_top = json.loads(res1.text)
    dict_top = {}
    for item in json_top:
        dict_top[item['value']] = item['code']

    # mdl
    res2 = requests.get(mdl_url + '.' + dict_top[station.split()[0]] + tail)
    res2.encoding = 'utf-8'  # MUST DO IT!!!
    json_mdl = json.loads(res2.text)
    dict_mdl = {}
    for item in json_mdl:
        dict_mdl[item['value']] = item['code']

    # leaf
    res3 = requests.get(leaf_url + '.' + dict_mdl[station.split()[1]] + tail)
    res3.encoding = 'utf-8'  # MUST DO IT!!!
    json_leaf = json.loads(res3.text)
    dict_leaf = {}
    for item in json_leaf:
        dict_leaf[item['value']] = [item['x'], item['y']]

    coord = dict_leaf[station.split()[2]]
    Log.d(self.tag, 'kma coord:', coord)
    return coord[0], coord[1]
def xlsx2spdf(self, infilepath: str):
    data = pd.read_excel(infilepath, encoding='utf-8')  # read as pandas dataframe
    Log.d(self.tag + '.xlsx2spdf()', 'before changing column\n', data.iloc[:2])  # for debug

    if '망' in data.columns:  # if exists column name '망'
        data = data.drop(['망'], axis=1)  # drop it
        Log.d(self.tag, 'dropped column "망"\n', data.iloc[:2])  # for debug

    data.columns = self.col_list_src  # change column name
    Log.d(self.tag, 'after changing column\n', data.iloc[:2])  # for debug

    # correct datetime
    data['datetime'] = data['datetime'].apply(self._datetime_corrector)

    df = self.sqlctxt.createDataFrame(data)
    return df
if __name__ == '__main__':
    import pandas as pd

    sac = SunAngleCrawler(debug=True)

    # this code works with hourly term only!!!!
    obj_startdt = datetime.datetime.strptime('2020-06-02 14:19', '%Y-%m-%d %H:%M')
    obj_enddt = datetime.datetime.strptime('2020-12-31 23:59', '%Y-%m-%d %H:%M')

    while obj_startdt <= obj_enddt:
        table_list = sac.scrap(date=obj_startdt.strftime('%Y-%m-%d'),
                               address='충남+천안시+서북구+천안대로+1223-24',
                               hour=obj_startdt.hour,
                               minute=obj_startdt.minute,
                               second=0,
                               term='minutely')
        pdf = pd.DataFrame(table_list,
                           columns=['datetime', 'azimuth', 'altitude',
                                    'solar_zenith', 'right_asc', 'left_asc'])
        Log.d(sac.tag, '\n', pdf)

        sac.to_db(pdf,
                  username='******',
                  passwd='defacto8*jj',
                  host='210.102.142.14',
                  port=3306,
                  db_name='nl_witlab',
                  table_name='kasi_sun_angle')

        obj_startdt += datetime.timedelta(minutes=1)

    sac.close()  # must close!
if __name__ == "__main__":
    import pandas as pd
    from pyspark.sql.types import StructField, StructType, StringType, DoubleType
    from sparkmodule import PySparkManager

    pdf = pd.read_csv('/home/witlab/uvmon_location.csv', encoding='utf-8')
    lat_col = []
    lon_col = []

    for i in range(len(pdf)):
        address = pdf.iloc[i]['address']
        Log.d('__main__', 'address:', address)
        lat, lon = getLatLon(address)
        lat_col.append(float(lat))
        lon_col.append(float(lon))

    pdf['lat'] = lat_col
    pdf['lon'] = lon_col
    Log.d('__main__', 'pdf:\n', pdf)

    # create spark dataframe
    # col : [location, station_code, address]
    schema = StructType([
        StructField('location', StringType()),
        StructField('station_code', StringType()),
        StructField('address', StringType()),
        StructField('lat', DoubleType()),
        StructField('lon', DoubleType()),
    ])
def _json2pdf(self, **kwargs):
    param = self._json_dict['parm']
    data_list = self._json_dict['list']

    """
    dict-to-pandas-DataFrame example
    >>> data = {'row_1': [3, 2, 1, 0], 'row_2': ['a', 'b', 'c', 'd']}
    >>> pd.DataFrame.from_dict(data, orient='index')
           0  1  2  3
    row_1  3  2  1  0
    row_2  a  b  c  d

    We do not need the row key here.

    The pm data schema is as follows:
        parm  stationName -> station
        list  dataTime    -> datehour
              khaiGrade
              khaiValue
              no2Value    -> no2
              coValue     -> co
              o3Value     -> o3
              pm10Value   -> pm10
              pm25Value   -> pm25
              so2Value    -> so2
    mapping) json key -> pandas DataFrame column name
    """
    rawdata = []  # raw data built as a list of rows; converted to a DataFrame below
    station = param['stationName']

    for data in data_list:
        # map each record fetched from the api onto the schema and append it as one row
        row = [station,
               self._datetime_corrector(data['dataTime']),
               data['khaiGrade'],
               data['khaiValue'],
               data['no2Value'],
               data['coValue'],
               data['o3Value'],
               data['so2Value'],
               data['pm10Value'],
               data['pm25Value']]
        rawdata.append(row)

    if not rawdata:  # bail out if mapping the raw data failed
        Log.e(self.tag, 'parse error: rawdata is empty.')
        return False

    Log.d(self.tag, 'get data from api:', str(rawdata))

    try:
        # make pandas dataframe
        self._pdf = pd.DataFrame(rawdata)

        # set new column name
        self._pdf.columns = self._column

        if kwargs['term'] == 'hourly':  # hourly data
            self._pdf = self._pdf.sort_values(by=['datehour'], ascending=False).iloc[:1]

        return True
    except Exception as e:
        Log.e(self.tag, 'exception occurred: %s' % e)
        return False
def close(self):
    Log.d(self.tag, 'driver closing...')
    self._driver.close()
    Log.d(self.tag, 'driver closed.')
def insert_doc(self, data: dict, db_name: str, collection_name: str):
    db = self.conn.get_database(db_name)
    collection = db.get_collection(collection_name)
    id = collection.insert_one(data).inserted_id
    Log.d(self.tag, 'insertion succeed:',
          str(data)[:50] + '...' if len(str(data)) > 50 else data, ':', id)
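# Usage sketch for insert_doc() (the document, database, and collection names
# below are hypothetical examples, not taken from the original code):
#   mongo.insert_doc({'station': '부성동', 'datehour': '2020-06-02 14', 'uvi': 3.2},
#                    db_name='nl_witlab', collection_name='kma_uvi')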
def close(self):
    Log.d(self.tag, 'disconnecting from MongoDB Server...')
    self.conn.close()