def insert_review(category, source_site):
    from sqlalchemy import and_
    log_init('insert_review_test' + crawler.get_date())
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(rawdata_engine)  # create session
    # elem = get_uuid_refine_notexist_review(category)
    elem = get_uuid_refine(category)
    uuid_list = elem[2]  # rows come back as [(UUID('...'),), (UUID('...'),), ...]
    for uuid in uuid_list:
        uuid = uuid[0]
        source_list = rawdata_db_session.query(Rawdata_movie.source_id).filter(
            and_(Rawdata_movie.uuid == uuid,
                 Rawdata_movie.source_site == source_site)).group_by(
                     Rawdata_movie.source_id).all()  # TODO: revise query; db-backup shell script
        for source in source_list:
            source_id = source[0]
            review_list = crawler.get_review(category, source_site, source_id)
            bulk_object = []
            review_id = 0
            for data in review_list:
                review_id += 1
                user_name, review, date, rating = data
                entry = Review_movie(review_id=review_id,
                                     uuid=uuid,
                                     source_site=source_site,
                                     source_id=source_id,
                                     user_name=user_name,
                                     date=date,
                                     review=review,
                                     rating=rating)  # TODO: remove date
                bulk_object.append(entry)
            try:
                # bulk insert replaces the old per-row add()/commit() loop
                rawdata_db_session.bulk_save_objects(bulk_object)
                rawdata_db_session.commit()
            except Exception as e:
                rawdata_db_session.rollback()  # keep the session usable after a failed commit
                logging.error('In uuid : ' + str(uuid) + ' Exception ' + str(e))
                logging.exception('Got exception.. ')
                logging.error('**********************************')
                continue
    rawdata_db_session.close()
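# Usage sketch: get_uuid_refine() (defined elsewhere in this module) supplies the
# uuid rows, so the only inputs are the category and the source site whose reviews
# to collect ('naver' here is a hypothetical site name; real values come from the
# crawler's site config):
#
#   insert_review('movie', 'naver')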
def get_uuid_list_all(category):
    # fetch every distinct uuid present in the rawdata table
    engine = db_setting.get_rawdata_engine()  # create connection to crawling db
    db_session = db_setting.get_sesstion(engine)  # create session
    uuid_list = db_session.query(Rawdata_movie.uuid).group_by(
        Rawdata_movie.uuid).all()  # all uuids from the table for this category
    db_session.close()
    return [category, 'from get_uuid_list_all', uuid_list]  # rows are tuples
def insert_refined(uuid_list):
    log_init('insert_refined' + crawler.get_date())
    from sqlalchemy import and_
    from sqlalchemy.sql import func
    # create connections to the crawling db and the refined db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(rawdata_engine)  # create session
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)
    # expects a uuid list; for a single update, wrap the uuid in a list before passing it in
    category = uuid_list[0]
    source = uuid_list[1]
    uuid_list = uuid_list[2]
    for uuid in uuid_list:
        try:
            uuid = uuid[0]
            if category == 'movie':
                # latest crawl date per source site for this uuid
                sub = rawdata_db_session.query(
                    func.max(Rawdata_movie.date).label('lastdate')).filter(
                        Rawdata_movie.uuid == uuid).group_by(
                            Rawdata_movie.source_site).subquery('sub')
                ans = rawdata_db_session.query(
                    Rawdata_movie.source_site, Rawdata_movie.data,
                    Rawdata_movie.recovery).filter(
                        and_(Rawdata_movie.uuid == uuid,
                             Rawdata_movie.date == sub.c.lastdate)).all()
            else:
                sub = rawdata_db_session.query(
                    func.max(RawdataRestaurant.date).label('lastdate')).filter(
                        RawdataRestaurant.uuid == uuid).group_by(
                            RawdataRestaurant.source_site).subquery('sub')
                ans = rawdata_db_session.query(
                    RawdataRestaurant.source_site, RawdataRestaurant.data,
                    RawdataRestaurant.recovery).filter(
                        and_(RawdataRestaurant.uuid == uuid,
                             RawdataRestaurant.date == sub.c.lastdate)).all()
            json_form = crawler.make_json(category, str(uuid), ans)  # the assembled JSON document
            entry = RefinedData(uuid, json_form)
            refine_db_session.add(entry)
            refine_db_session.commit()
        except Exception as e:
            refine_db_session.rollback()  # keep the session usable after a failed commit
            logging.error('In uuid : ' + str(uuid) + ' Exception ' + str(e))
            logging.exception('Got exception.. ')
            logging.error('**********************************')
            continue
    rawdata_db_session.close()
    refine_db_session.close()
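# Usage sketch: the argument follows the [category, description, uuid_rows] shape
# the get_uuid_* helpers return, so the two compose directly:
#
#   insert_refined(get_uuid_not_exist('movie'))
#
# For a single update, wrap the uuid to match that shape (hypothetical uuid shown):
#
#   one = uuid.UUID('00000000-0000-0000-0000-000000000000')
#   insert_refined(['movie', 'manual', [(one,)]])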
def insert_rawdata(input):
    log_init('insert_rawdata' + crawler.get_date())
    engine = db_setting.get_rawdata_engine()  # create connection to crawling db
    db_session = db_setting.get_sesstion(engine)  # create session
    category = input[0]
    source = input[1]
    elem_list = input[2]
    sites = crawler.read_site(category, 'all')
    for elem in elem_list:
        uid = uuid.uuid4()
        sleep(1.3 + random.random())  # throttle requests to the source sites
        for site in sites:
            try:
                content_url = crawler.get_url(category, site, elem)
                if content_url is None:
                    continue
                raw_data = crawler.get_raw_data(content_url)
                source_site = site['site_name']
                source_id = crawler.get_source_id(content_url)
                recovery = crawler.request_recovery(category, elem, content_url, site)
                if category == 'movie':
                    entry = Rawdata_movie(
                        uuid=check_uuid(db_session, category, source_id, uid),
                        source_site=source_site,
                        source_id=source_id,
                        data=raw_data,
                        recovery=recovery)  # date column removed
                elif category == 'restaurant':
                    entry = RawdataRestaurant(
                        uuid=check_uuid(db_session, category, source_id, uid),
                        source_site=source_site,
                        source_id=source_id,
                        data=raw_data,
                        recovery=recovery)  # date column removed
                db_session.add(entry)
                db_session.commit()
            except Exception as e:
                db_session.rollback()  # keep the session usable after a failed commit
                logging.error('In ' + source + ' Exception while <' +
                              site['site_name'] + '> searching <' + str(elem) +
                              '> ' + str(e))
                logging.exception('Got exception.. ')
                logging.error('**********************************')
                continue
    db_session.close()
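# Usage sketch (assumptions: the third element is the list of search keys that
# crawler.get_url() expects for the category, e.g. titles, and the second element
# is only used in log messages):
#
#   insert_rawdata(['movie', 'seed-list', ['Parasite', 'Oldboy']])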
def get_uuid_refine_notexist_review(category):
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(rawdata_engine)  # create session
    review_uuid = rawdata_db_session.query(Rawdata_movie.uuid).group_by(
        Rawdata_movie.uuid).all()
    refine_uuid = refine_db_session.query(RefinedData.uuid).all()
    # refined uuids that have no matching rawdata rows yet
    not_in_review_uuid = list(set(refine_uuid) - set(review_uuid))
    rawdata_db_session.close()
    refine_db_session.close()
    return [category, 'from get_uuid_refine_notexist_review', not_in_review_uuid]
def get_identify(uuid_list):
    # given a list of uuid tuples, return the matching list of identifiers
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(rawdata_engine)  # create session
    # TODO: use category to pick the table via the Rawdata_+category naming scheme
    category = uuid_list[0]
    source = uuid_list[1]
    uuid_list = uuid_list[2]
    identify_list = []
    for uuid in uuid_list:
        uuid = uuid[0]
        ans = rawdata_db_session.query(
            Rawdata_movie.recovery).filter(Rawdata_movie.uuid == uuid).first()
        identify = ans[0][1]  # second element of the stored recovery data
        identify_list.append(identify)
    rawdata_db_session.close()
    return [category, 'uuid_to_identify', identify_list]
def get_uuid_not_exist(category):
    # fetch every distinct uuid that is not yet in the refine table
    # create connections to the crawling db and the refined db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(rawdata_engine)  # create session
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)
    refine_list = refine_db_session.query(RefinedData.uuid).all()
    if category == 'movie':
        raw_list = rawdata_db_session.query(Rawdata_movie.uuid).group_by(
            Rawdata_movie.uuid).all()
    else:
        raw_list = rawdata_db_session.query(RawdataRestaurant.uuid).group_by(
            RawdataRestaurant.uuid).all()
    # TODO: have the refine query fetch only uuids of the matching category before
    # the set difference; the rawdata side already picks its table by category
    new_uuid_list = list(set(raw_list) - set(refine_list))
    rawdata_db_session.close()
    refine_db_session.close()
    return [category, 'from get_uuid_not_exist', new_uuid_list]
def create_table():
    engine = db_setting.get_rawdata_engine()
    # db_setting.create_rawdata_table(engine)
    db_setting.create_rawdata_table_restaurant(engine)
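# End-to-end sketch of the pipeline these functions form (a minimal sketch; the
# actual entry point / scheduler lives outside this file, and 'naver' and the
# seed titles are hypothetical):
#
#   create_table()                                  # one-time schema setup
#   insert_rawdata(['movie', 'seed', ['Oldboy']])   # 1. crawl raw pages
#   insert_refined(get_uuid_not_exist('movie'))     # 2. refine new uuids into JSON
#   insert_review('movie', 'naver')                 # 3. collect reviews per source site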