def 뉴스TBL에_뉴스URL이_존재하면_뉴스원본TBL의_파싱완료를_True로_일괄업뎃(dbg_on=False, shown_cnt=1, 사전검증=False):
    """Flag PARSING_TBL documents as parsed when their URL is in PARSED_TBL.

    Reads the distinct '뉴스_url' values from PARSED_TBL (restricted to
    documents where that field exists) and sets '파싱완료' to True on every
    PARSING_TBL document whose '뉴스_url' is in that list. No documents are
    inserted (upsert=False).

    Args:
        dbg_on: Forwarded to the dbg helpers and to every mg call.
        shown_cnt: Forwarded to mg.distinct.
        사전검증: Forwarded to mg.update_many. Its exact effect lives in the
            mg wrapper -- presumably a pre-validation / dry-run switch;
            confirm against mg.update_many.
    """
    # The assignments are kept (even though the values are not read here)
    # because dbg.inputs inspects this frame.
    whoami = dbg.whoami(sys.modules[__name__].__file__, inspect.stack()[0][3], dbg_on)
    inputs = dbg.inputs(inspect.currentframe(), dbg_on)

    # Step 1: every URL that is present in the parsed table.
    query = {'뉴스_url': {'$exists': 1}}
    뉴스url_li = mg.distinct(
        db명=DB명, tbl명=PARSED_TBL, col명='뉴스_url', query=query,
        dbg_on=dbg_on, shown_cnt=shown_cnt)

    # Step 2: bulk-flag the matching source documents.
    query = {'뉴스_url': {'$in': 뉴스url_li}}
    update = {'$set': {'파싱완료': True}}
    mg.update_many(
        db명=DB명, tbl명=PARSING_TBL, query=query, update=update,
        upsert=False, dbg_on=dbg_on, 사전검증=사전검증)
def RD_TBL에서_뉴스XX의_뉴스id가_존재하면_뉴스TBL에_수집완료True를_업뎃(타겟col명='뉴스제목', analyzerMethod='srl', dbg_on=False, shown_cnt=1, 사전검증=False):
    """Set '<타겟col명>_ETRI언어분석_수집완료' to True on '뉴스' documents.

    The ids to flag come from
    RD_TBL에서_ETRI언어분석수집이_정상완료된_뉴스idli로딩; they are matched against
    '_id' in the '뉴스' collection. After the update a banner is printed and
    the flagged documents are queried again through mg.find (result is not
    returned).

    Args:
        타겟col명: Column-name prefix used to build the completion flag name
            and forwarded to the id loader.
        analyzerMethod: Forwarded to the id loader.
        dbg_on: Forwarded to the dbg helpers, the id loader and the mg calls.
        shown_cnt: Forwarded to the id loader.
        사전검증: Forwarded to mg.update_many (semantics defined by the mg
            wrapper -- TODO confirm).
    """
    # Names kept as-is: dbg.inputs inspects this frame.
    whoami = dbg.whoami(sys.modules[__name__].__file__, inspect.stack()[0][3], dbg_on)
    inputs = dbg.inputs(inspect.currentframe(), dbg_on)

    done_ids = RD_TBL에서_ETRI언어분석수집이_정상완료된_뉴스idli로딩(
        타겟col명=타겟col명,
        analyzerMethod=analyzerMethod,
        dbg_on=dbg_on,
        shown_cnt=shown_cnt,
    )
    flag_col = '{}_ETRI언어분석_수집완료'.format(타겟col명)

    mg.update_many(
        db명=DB명,
        tbl명='뉴스',
        query={'_id': {'$in': done_ids}},
        update={'$set': {flag_col: True}},
        upsert=False,
        dbg_on=dbg_on,
        사전검증=사전검증,
    )

    # Verification pass: re-read the documents that should now carry the flag.
    print('\n{}{}_검증'.format('*' * 60, inspect.stack()[0][3]))
    mg.find(
        db명=DB명,
        tbl명='뉴스',
        query={'_id': {'$in': done_ids}, flag_col: True},
        projection=None,
        dbg_on=dbg_on,
        컬럼순서li=[],
        df보고형태='df',
    )
def 뉴스_원본TBL에_있는_뉴스url이_화면배치TBL에도_있다면_수집완료True_업뎃(dbg_on=False, 사전검증=False):
    """Flag COLLECTING_TBL documents whose 'href' is a URL known to PARSING_TBL.

    Prints a banner with this function's name, reads all distinct '뉴스_url'
    values from PARSING_TBL (no filter) and sets '수집완료' to True on every
    COLLECTING_TBL document whose 'href' is in that list. No documents are
    inserted (upsert=False).

    Args:
        dbg_on: Forwarded to the mg calls.
        사전검증: Forwarded to mg.update_many (semantics defined by the mg
            wrapper -- TODO confirm).
    """
    print(''.join(['\n', '=' * 60, inspect.stack()[0][3]]))

    collected_urls = mg.distinct(
        db명=DB명,
        tbl명=PARSING_TBL,
        col명='뉴스_url',
        query=None,
        dbg_on=dbg_on,
        shown_cnt=1,
    )

    mg.update_many(
        db명=DB명,
        tbl명=COLLECTING_TBL,
        query={'href': {'$in': collected_urls}},
        update={'$set': {'수집완료': True}},
        upsert=False,
        dbg_on=dbg_on,
        사전검증=사전검증,
    )
def 화면배치TBL의_수집일시를_뉴스원본TBL에_업뎃(dbg_on=False, 사전검증=False):
    """Copy each href's first '수집일시' from '화면배치' into PARSING_TBL.

    Loads '_id', 'href' and '수집일시' from the '화면배치' collection, keeps one
    row per 'href' (the first after sorting by 'href' then '수집일시'), and for
    each kept row sets '수집일시' and '화면배치id' on the PARSING_TBL documents
    whose '뉴스_url' equals that href. The update uses upsert=True, so a
    document is created when no match exists.

    Args:
        dbg_on: Forwarded to the mg calls; when equal to True a verification
            query is issued after the loop.
        사전검증: Forwarded to mg.update_many (semantics defined by the mg
            wrapper -- TODO confirm).
    """
    print('\n' + '=' * 60 + inspect.stack()[0][3])

    projection = {'_id': 1, 'href': 1, '수집일시': 1}
    df = mg.find(
        db명=DB명, tbl명='화면배치', query=None, projection=projection,
        dbg_on=dbg_on, 컬럼순서li=[], df보고형태='df')

    # Ascending sort + keep='first' leaves the smallest 수집일시 per href.
    df = df.sort_values(['href', '수집일시'])
    df = df.drop_duplicates(subset=['href'], keep='first', inplace=False)
    dicli = df.to_dict('records')
    dicli_len = len(dicli)

    last_rec = None
    for i, rec in enumerate(dicli, start=1):
        print('\n' + '-' * 60 + '{}/{}, 수집일시:{}'.format(i, dicli_len, rec['수집일시']))
        query = {'뉴스_url': rec['href']}
        update = {'$set': {'수집일시': rec['수집일시'], '화면배치id': rec['_id']}}
        mg.update_many(
            db명=DB명, tbl명=PARSING_TBL, query=query, update=update,
            upsert=True, dbg_on=dbg_on, 사전검증=사전검증)
        last_rec = rec

    # NOTE(review): the verification is assumed to run once after the loop,
    # checking only the last processed record -- confirm this is intended.
    # `== True` is kept so the set of dbg_on values that enable it is unchanged.
    # The last_rec guard avoids touching an unbound record when there were no rows.
    if dbg_on == True and last_rec is not None:
        print('\n' + '=' * 60 + inspect.stack()[0][3] + '_검증')
        query = {'화면배치id': last_rec['_id']}
        projection = {'_id': 1, '화면배치id': 1, '수집일시': 1, '뉴스_url': 1}
        mg.find(
            db명=DB명, tbl명=PARSING_TBL, query=query, projection=projection,
            dbg_on=dbg_on, 컬럼순서li=[], df보고형태='dicli')
def TBL의_뉴스id가_RDTBL에도_존재한다면_파싱완료는_True여야한다(타겟col명, analyzerMethod='srl', subMethod='WSD', texttype='NNG', dbg_on=False):
    """Mark RD_TBL명 documents as parsed when TBL명 already holds their result.

    Column names are derived from the arguments:
      * parse source : 타겟col명 + analyzerMethod + '_res'
      * parse output : 타겟col명 + analyzerMethod + '_' + subMethod + texttype + 'li'
      * done flag    : parse source + '파싱완료'

    Steps:
      1. Collect the distinct '뉴스id' values of TBL명 documents whose parse
         output column is not None.
      2. Collect the '_id' values of RD_TBL명 documents whose '뉴스id' is in
         that list and whose parse source column is not None.
      3. Set the done flag to True on those RD_TBL명 documents.

    Args:
        타겟col명: Column-name prefix used to build the three column names.
        analyzerMethod: Analyzer part of the column names.
        subMethod: Sub-method part of the parse output column name.
        texttype: Text-type part of the parse output column name.
        dbg_on: Forwarded to the mg calls.
    """
    파싱타겟col명 = 타겟col명 + analyzerMethod + '_res'
    파싱저장col명 = 타겟col명 + analyzerMethod + '_' + subMethod + texttype + 'li'
    파싱완료col명 = 파싱타겟col명 + '파싱완료'

    query = {파싱저장col명: {'$ne': None}}
    뉴스id_li = mg.distinct(
        db명=DB명, tbl명=TBL명, col명='뉴스id', query=query,
        dbg_on=dbg_on, shown_cnt=1)

    # This filter used to be built and then ignored (query=None was passed),
    # which flagged every document in RD_TBL명. It is now applied.
    query = {'뉴스id': {'$in': 뉴스id_li}, 파싱타겟col명: {'$ne': None}}
    뉴스id_RD_li = mg.distinct(
        db명=DB명, tbl명=RD_TBL명, col명='_id', query=query,
        dbg_on=dbg_on, shown_cnt=1)
    print(len(뉴스id_RD_li))

    query = {'_id': {'$in': 뉴스id_RD_li}}
    update = {'$set': {파싱완료col명: True}}
    mg.update_many(
        db명=DB명, tbl명=RD_TBL명, query=query, update=update,
        upsert=False, dbg_on=dbg_on, 사전검증=False)