Example #1
def insert_review(category, source_site):
    from sqlalchemy import and_
    log_init('insert_review_test' + crawler.get_date())
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(
        rawdata_engine)  # create session

    # elem = get_uuid_refine_notexist_review(category)
    elem = get_uuid_refine(category)
    uuid_list = elem[2]  # comes back as a list of one-element tuples, e.g. [(UUID('...'),), (UUID('...'),)]
    for uuid in uuid_list:
        uuid = uuid[0]
        source_list = rawdata_db_session.query(Rawdata_movie.source_id).filter(
            and_(Rawdata_movie.uuid == uuid,
                 Rawdata_movie.source_site == source_site)).group_by(
                     Rawdata_movie.source_id).all()  # TODO: revise query; write DB backup shell script
        for source in source_list:
            source_id = source[0]
            review_list = crawler.get_review(category, source_site, source_id)
            bulk_object = []
            review_id = 0
            for data in review_list:
                review_id += 1
                user_name = data[0]
                review = data[1]
                date = data[2]
                rating = data[3]
                entry = Review_movie(review_id=review_id,
                                     uuid=uuid,
                                     source_site=source_site,
                                     source_id=source_id,
                                     user_name=user_name,
                                     date=date,
                                     review=review,
                                     rating=rating)  # remove date
                bulk_object.append(entry)
            try:
                rawdata_db_session.bulk_save_objects(bulk_object)
                rawdata_db_session.commit()
            except Exception as e:
                logging.error('In uuid :  ' + str(uuid) + ' Exception ' +
                              str(e))
                logging.exception('Got exception.. ')
                logging.error('**********************************')
                continue

    rawdata_db_session.close()
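
A minimal invocation sketch (not from the source): the concrete category and source_site labels come from the crawler configuration, so the values below are placeholders.

# Hypothetical call; 'movie' and the site label are assumed values.
insert_review('movie', 'example_source_site')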
Example #2
def get_uuid_list_all(category):  # fetch every distinct uuid from the rawdata table (__tablename__)
    engine = db_setting.get_rawdata_engine()  # create connection to crawling db
    db_session = db_setting.get_sesstion(engine)  # create session
    uuid_list = db_session.query(Rawdata_movie.uuid).group_by(
        Rawdata_movie.uuid).all()
    # fetch all uuids from the table for this category
    db_session.close()
    return [category, 'from get_uuid_list_all', uuid_list]  # returns a list of (uuid,) tuples
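
A short usage sketch, assuming 'movie' is a valid category: the return value is a three-element list whose last element is a list of one-element uuid tuples.

category, label, uuid_rows = get_uuid_list_all('movie')  # 'movie' is an assumed category value
for row in uuid_rows:
    uuid_value = row[0]  # each row is a one-element tuple, e.g. (UUID('...'),)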
Example #3
def insert_refined(uuid_list):
    log_init('insert_refined' + crawler.get_date())
    from sqlalchemy import and_
    from sqlalchemy.sql import func

    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(
        rawdata_engine)  # create session
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)
    # must be a uuid list; for a single update, wrap the uuid in a list before passing it in

    category = uuid_list[0]
    source = uuid_list[1]
    uuid_list = uuid_list[2]

    for uuid in uuid_list:
        try:
            uuid = uuid[0]
            if category == 'movie':
                sub = rawdata_db_session.query(
                    func.max(Rawdata_movie.date).label('lastdate')).filter(
                        Rawdata_movie.uuid == uuid).group_by(
                            Rawdata_movie.source_site).subquery('sub')
                # for this uuid, keep only the rows whose date matches a per-site latest date
                ans = rawdata_db_session.query(
                    Rawdata_movie.source_site, Rawdata_movie.data,
                    Rawdata_movie.recovery).filter(
                        and_(Rawdata_movie.uuid == uuid,
                             Rawdata_movie.date == sub.c.lastdate)).all()
            else:
                sub = rawdata_db_session.query(
                    func.max(RawdataRestaurant.date).label('lastdate')).filter(
                        RawdataRestaurant.uuid == uuid).group_by(
                            RawdataRestaurant.source_site).subquery('sub')
                # for this uuid, keep only the rows whose date matches a per-site latest date
                ans = rawdata_db_session.query(
                    RawdataRestaurant.source_site, RawdataRestaurant.data,
                    RawdataRestaurant.recovery).filter(
                        and_(RawdataRestaurant.uuid == uuid,
                             RawdataRestaurant.date == sub.c.lastdate)).all()

            json_form = crawler.make_json(category, str(uuid),
                                          ans)  # the assembled JSON payload
            entry = RefinedData(uuid, json_form)
            refine_db_session.add(entry)
            refine_db_session.commit()
        except Exception as e:
            logging.error('In uuid :  ' + str(uuid) + ' Exception ' + str(e))
            logging.exception('Got exception.. ')
            logging.error('**********************************')
            continue
    rawdata_db_session.close()
    refine_db_session.close()
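
A sketch of how insert_refined might be fed, assuming the triple returned by get_uuid_not_exist (Example #7) is the intended input; this chaining is inferred, not confirmed by the source.

pending = get_uuid_not_exist('movie')  # [category, label, list of (uuid,) tuples]
insert_refined(pending)                # refine only the uuids missing from the refined table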
Example #4
def insert_rawdata(input):
    log_init('insert_rawdata' + crawler.get_date())
    engine = db_setting.get_rawdata_engine()  # create connection to crawling db
    db_session = db_setting.get_sesstion(engine)  # create session

    category = input[0]
    source = input[1]
    input = input[2]

    sites = crawler.read_site(category, 'all')
    for elem in input:
        uid = uuid.uuid4()
        sleep(1.3 + random.random())
        for site in sites:
            try:
                content_url = crawler.get_url(category, site, elem)
                if content_url is None:
                    continue
                raw_data = crawler.get_raw_data(content_url)

                source_site = site['site_name']
                source_id = crawler.get_source_id(content_url)
                recovery = crawler.request_recovery(category, elem,
                                                    content_url, site)
                if category == 'movie':
                    entry = Rawdata_movie(uuid=check_uuid(
                        db_session, category, source_id, uid),
                                          source_site=source_site,
                                          source_id=source_id,
                                          data=raw_data,
                                          recovery=recovery)  # remove date
                elif category == 'restaurant':
                    entry = RawdataRestaurant(uuid=check_uuid(
                        db_session, category, source_id, uid),
                                              source_site=source_site,
                                              source_id=source_id,
                                              data=raw_data,
                                              recovery=recovery)  # remove date
                db_session.add(entry)
                db_session.commit()

            except Exception as e:
                logging.error('In ' + source + ' Exception while <' +
                              site['site_name'] + '> searching <' + str(elem) +
                              '> ' + str(e))
                logging.exception('Got exception.. ')
                logging.error('**********************************')
                continue
    db_session.close()
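
A hypothetical call to insert_rawdata; the exact shape of the search elements is whatever crawler.get_url expects, which the source does not show, so the values below are placeholders.

insert_rawdata(['movie', 'manual_batch', ['placeholder_item_1', 'placeholder_item_2']])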
Example #5
def get_uuid_refine_notexist_review(category):
    from sqlalchemy.sql import func
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(
        rawdata_engine)  # create session

    review_uuid = rawdata_db_session.query(Rawdata_movie.uuid).group_by(
        Rawdata_movie.uuid).all()
    refine_uuid = refine_db_session.query(RefinedData.uuid).all()

    not_in_review_uuid = list(set(refine_uuid) - set(review_uuid))
    rawdata_db_session.close()
    refine_db_session.close()
    return [category, 'from get_uuid_refine', not_in_review_uuid]
Example #6
def get_identify(uuid_list):  # given a list of (uuid,) tuples, return the matching list of identifiers
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(
        rawdata_engine)  # create session

    # TODO: use this later to select the table as Rawdata_ + category when fetching data
    category = uuid_list[0]
    source = uuid_list[1]
    uuid_list = uuid_list[2]

    identify_list = []
    for uuid in uuid_list:
        uuid = uuid[0]
        ans = rawdata_db_session.query(
            Rawdata_movie.recovery).filter(Rawdata_movie.uuid == uuid).first()
        identify = ans[0][1]
        identify_list.append(identify)
    rawdata_db_session.close()
    return [category, 'uuid_to_identify', identify_list]
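
A usage sketch chaining get_uuid_list_all (Example #2) into get_identify, assuming 'movie' as the category; the chaining is inferred from the matching [category, label, uuid_list] shape.

category, label, identify_list = get_identify(get_uuid_list_all('movie'))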
Example #7
def get_uuid_not_exist(category):  # fetch every distinct uuid that is not yet in the refine table
    # create connection to crawling db
    rawdata_engine = db_setting.get_rawdata_engine()
    rawdata_db_session = db_setting.get_sesstion(
        rawdata_engine)  # create session
    refine_engine = db_setting.get_refineddata_engine()
    refine_db_session = db_setting.get_sesstion(refine_engine)

    refine_list = refine_db_session.query(RefinedData.uuid).all()
    if category == 'movie':
        raw_list = rawdata_db_session.query(Rawdata_movie.uuid).group_by(
            Rawdata_movie.uuid).all()
    else:
        raw_list = rawdata_db_session.query(RawdataRestaurant.uuid).group_by(
            RawdataRestaurant.uuid).all()
    # fetch only category-matching uuids from refine before the set difference;
    # the rawdata query below picks its table as rawdata_ + category
    new_uuid_list = list(set(raw_list) - set(refine_list))
    rawdata_db_session.close()
    refine_db_session.close()

    return [category, 'from get_uuid_not_exist', new_uuid_list]
Example #8
def create_table():
    engine = db_setting.get_rawdata_engine()
    # db_setting.create_rawdata_table(engine)
    db_setting.create_rawdata_table_restaurant(engine)