def get_uuid_date_condition(category):
    """Collect the uuids of refined rows last updated 30 or more days ago.

    Args:
        category: Category label. It is not applied to the query; it is only
            passed through as the first item of the returned list.

    Returns:
        ``[category, 'from get_uuid_date_condition', uuid_list]`` where
        ``uuid_list`` is the result of ``query(RefinedData.uuid).all()``.
    """
    from contextlib import closing
    from datetime import timedelta

    from sqlalchemy.sql import func

    # The cutoff is built from func.now(), so it is a SQL expression that is
    # compared against update_date inside the query.
    thirty_days_ago = func.now() - timedelta(days=30)

    refine_engine = db_setting.get_refineddata_engine()
    # closing() releases the session even when the query raises.
    with closing(db_setting.get_sesstion(refine_engine)) as refine_db_session:
        uuid_list = refine_db_session.query(RefinedData.uuid).filter(
            RefinedData.update_date <= thirty_days_ago).all()

    # NOTE(review): the original author's note says only uuids whose refined
    # JSON category matches `category` should be fetched; no such filter is
    # applied here.
    return [category, 'from get_uuid_date_condition', uuid_list]
def insert_refined(uuid_list):
    """Build a refined JSON entry for every uuid and store it in the refined DB.

    For each uuid the newest raw row per ``source_site`` is read from the raw
    table selected by the category, handed to ``crawler.make_json`` and the
    result is added to the refined session and committed. A failure for one
    uuid is logged and the loop moves on to the next one.

    Args:
        uuid_list: A three-item sequence ``[category, source, rows]``. ``rows``
            must be an iterable whose items yield the uuid at index 0. To
            process a single uuid, wrap it in a list before passing it in.

    Returns:
        None.
    """
    log_init('insert_refined' + crawler.get_date())
    from contextlib import closing

    from sqlalchemy import and_
    from sqlalchemy.sql import func

    category = uuid_list[0]
    rows = uuid_list[2]

    # 'movie' reads the movie table; every other category reads the
    # restaurant table.
    raw_model = Rawdata_movie if category == 'movie' else RawdataRestaurant

    rawdata_engine = db_setting.get_rawdata_engine()
    refine_engine = db_setting.get_refineddata_engine()

    # closing() guarantees both sessions are released on every exit path.
    with closing(db_setting.get_sesstion(rawdata_engine)) as rawdata_db_session, \
            closing(db_setting.get_sesstion(refine_engine)) as refine_db_session:
        for row in rows:
            # Keep the raw item for the log message in case indexing fails.
            uuid_value = row
            try:
                uuid_value = row[0]

                # Latest date per source site for this uuid.
                latest = rawdata_db_session.query(
                    raw_model.source_site.label('source_site'),
                    func.max(raw_model.date).label('lastdate'),
                ).filter(
                    raw_model.uuid == uuid_value,
                ).group_by(
                    raw_model.source_site,
                ).subquery('sub')

                # Restrict the outer query to this uuid and pair every row
                # with its own site's latest date; matching on the date alone
                # would also return rows of other uuids (or stale rows of
                # another site) that happen to share that date.
                ans = rawdata_db_session.query(
                    raw_model.source_site,
                    raw_model.data,
                    raw_model.recovery,
                ).filter(
                    and_(
                        raw_model.uuid == uuid_value,
                        raw_model.source_site == latest.c.source_site,
                        raw_model.date == latest.c.lastdate,
                    )
                ).all()

                # The finished JSON document for this uuid.
                json_form = crawler.make_json(category, str(uuid_value), ans)

                refine_db_session.add(RefinedData(uuid_value, json_form))
                refine_db_session.commit()
            except Exception as e:
                logging.error('In uuid : ' + str(uuid_value) + ' Exception ' + str(e))
                logging.exception('Got exception.. ')
                logging.error('**********************************')
                # Reset both sessions so one failed uuid does not leave a
                # broken transaction behind for the remaining ones.
                refine_db_session.rollback()
                rawdata_db_session.rollback()
def get_uuid_refine_notexist_review(category):
    """Return the refined uuids that have no row in the movie raw-data table.

    Args:
        category: Category label. It does not influence the queries; it is
            only passed through as the first item of the returned list.

    Returns:
        ``[category, 'from get_uuid_refine', not_in_review_uuid]`` where
        ``not_in_review_uuid`` is the set difference (refined uuid rows minus
        raw movie uuid rows) as a list, in no particular order.
    """
    from contextlib import closing

    refine_engine = db_setting.get_refineddata_engine()
    rawdata_engine = db_setting.get_rawdata_engine()

    # closing() releases both sessions even when one of the queries raises.
    with closing(db_setting.get_sesstion(refine_engine)) as refine_db_session, \
            closing(db_setting.get_sesstion(rawdata_engine)) as rawdata_db_session:
        # NOTE(review): Rawdata_movie is queried regardless of `category` --
        # confirm this is intended.
        review_uuid = rawdata_db_session.query(Rawdata_movie.uuid).group_by(
            Rawdata_movie.uuid).all()
        refine_uuid = refine_db_session.query(RefinedData.uuid).all()

    not_in_review_uuid = list(set(refine_uuid) - set(review_uuid))

    # NOTE(review): the tag names get_uuid_refine rather than this function;
    # it is kept as-is because callers may rely on the exact string.
    return [category, 'from get_uuid_refine', not_in_review_uuid]
def get_uuid_not_exist(category):
    """Return the distinct raw-data uuids that are missing from the refined table.

    Args:
        category: ``'movie'`` reads uuids from ``Rawdata_movie``; any other
            value reads them from ``RawdataRestaurant``.

    Returns:
        ``[category, 'from get_uuid_not_exist', new_uuid_list]`` where
        ``new_uuid_list`` is the set difference (raw uuid rows minus refined
        uuid rows) as a list, in no particular order.
    """
    from contextlib import closing

    rawdata_engine = db_setting.get_rawdata_engine()
    refine_engine = db_setting.get_refineddata_engine()

    raw_model = Rawdata_movie if category == 'movie' else RawdataRestaurant

    # closing() releases both sessions even when one of the queries raises.
    with closing(db_setting.get_sesstion(rawdata_engine)) as rawdata_db_session, \
            closing(db_setting.get_sesstion(refine_engine)) as refine_db_session:
        # NOTE(review): the original author's note says the refined side
        # should only contain uuids of the matching category; this query is
        # not filtered by category.
        refine_list = refine_db_session.query(RefinedData.uuid).all()
        raw_list = rawdata_db_session.query(raw_model.uuid).group_by(
            raw_model.uuid).all()

    new_uuid_list = list(set(raw_list) - set(refine_list))
    return [category, 'from get_uuid_not_exist', new_uuid_list]
def get_uuid_refine(category):
    """Return every uuid stored in the refined table.

    Args:
        category: Category label. It does not influence the query; it is only
            passed through as the first item of the returned list.

    Returns:
        ``[category, 'from get_uuid_refine', uuid_list]`` where ``uuid_list``
        is the result of ``query(RefinedData.uuid).all()``.
    """
    from contextlib import closing

    refine_engine = db_setting.get_refineddata_engine()
    # closing() releases the session even when the query raises.
    with closing(db_setting.get_sesstion(refine_engine)) as refine_db_session:
        uuid_list = refine_db_session.query(RefinedData.uuid).all()

    return [category, 'from get_uuid_refine', uuid_list]