コード例 #1
0
def get_max_page_num(pid, timeout=10):
    """Return the ``maxPage`` value reported for product ``pid``.

    Builds the comment URL from the module-level ``web_json_url`` template
    (formatted with ``pid`` and page ``0``), fetches it from ``club.jd.com``
    and parses the GBK-encoded JSON response.

    Args:
        pid: Product id; interpolated into both the request URL and the
            ``Referer`` header.
        timeout: Seconds passed to ``requests`` as the request timeout, so a
            stalled server cannot block the caller indefinitely.

    Returns:
        The ``maxPage`` field of the response, converted to ``int``.

    Raises:
        requests.RequestException: On network failure or timeout.
        ValueError: If the response body is not valid JSON
            (``json.JSONDecodeError``) or ``maxPage`` is not int-like.
        KeyError: If the JSON payload has no ``maxPage`` key.
    """
    page_url = web_json_url % (pid, 0)
    headers1 = {
        # NOTE(review): 'GET' is not a real HTTP header; kept so the request
        # on the wire stays unchanged.
        'GET': '',
        'Host': "club.jd.com",
        'User-Agent':
        "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
        'Referer': 'http://item.jd.com/%s.html' % (pid)
    }
    Logger.info(page_url)

    response = requests.request(method='GET',
                                url=page_url,
                                headers=headers1,
                                timeout=timeout)
    # Undecodable bytes are dropped rather than raising.
    page_data = response.content.decode('gbk', 'ignore')

    page_dict = json.loads(page_data)
    return int(page_dict['maxPage'])
コード例 #2
0
def insertToMongo(commentID_list, mongo_collection, oneDict, indexCollection):
    """Insert one comment document unless its id has already been seen.

    Args:
        commentID_list: List of comment ids already stored; the new id is
            appended to it on a successful insert.
        mongo_collection: Collection object whose ``insert`` receives the
            document.
        oneDict: The comment document. Its ``created_at`` value is replaced
            in place by the result of ``convertTimeStringToDateTime``.
        indexCollection: Passed through to ``updateCommentID_list``.

    Returns:
        ``0`` when the comment was inserted, ``1`` when its ``comment_id``
        was already in ``commentID_list`` (nothing is done in that case).
    """
    comment_id = oneDict['comment_id']
    if comment_id in commentID_list:
        # The comment already exists: do nothing and report 1.
        return 1

    # Convert the creation-time string before the document is stored.
    created_at = convertTimeStringToDateTime(oneDict['created_at'])
    oneDict['created_at'] = created_at
    Logger.info(created_at)
    mongo_collection.insert(oneDict)

    # Record the new id and update the stored id list.
    commentID_list.append(comment_id)
    updateCommentID_list(oneDict['product_id'], commentID_list,
                         indexCollection)
    return 0
コード例 #3
0
class XpressEngine(object):
    """Selenium wrapper for a member session on an XpressEngine site.

    It opens a Chrome browser, logs in with the given credentials, loads the
    member's own document/comment listings and deletes individual documents
    or comments through the site's delete-confirmation forms.
    """
    logger = Logger("xe_crawler")

    # Site-relative paths appended to ``self.url``.
    LOGIN_PAGE = '/index.php?act=dispMemberLoginForm'
    MY_PAGE = '/index.php?act=dispMemberInfo'
    OWN_DOCUMENTS = '/index.php?act=dispMemberOwnDocument'
    OWN_COMMENTS = '/index.php?act=dispMemberOwnComment'
    DELETE_COMMENT = '/index.php?act=dispBoardDeleteComment&document_srl='
    DELETE_DOCUMENT = '/index.php?act=dispBoardDelete&document_srl='
    CSRL = '&comment_srl='

    # XPath of the submit button on the delete-confirmation forms.
    _DELETE_CONFIRM_XPATH = '//*[@id="content"]/div[1]/div/form/div/span/input'

    def __init__(self, url=None, user_id=None, password=None, headless=False):
        """Store the connection settings; no browser is started yet.

        Missing ``url``, ``user_id`` or ``password`` values are only logged
        as errors; construction still succeeds.

        Args:
            url: Base URL of the site (paths above are appended to it).
            user_id: Account name typed into the login form.
            password: Password typed into the login form.
            headless: When true, ``load_browser`` starts Chrome headless.
        """
        self.logger.debug("xe_crawler instance created")
        if url is None:
            self.logger.error("웹사이트 주소가 올바르지 않습니다.")
        if user_id is None:
            self.logger.error("계정명이 올바르지 않습니다.")
        if password is None:
            self.logger.error("패스워드가 올바르지 않습니다.")
        self.url = url
        self.user_id = user_id
        self.password = password
        self.isHeadless = headless
        self.browser = None

    def load_browser(self, executable_path):
        """Start Chrome via the driver at ``executable_path``."""
        options = webdriver.ChromeOptions()
        if self.isHeadless:
            options.add_argument('headless')
        options.add_argument('window-size=1920x1080')
        options.add_argument('lang=ko_KR')
        self.browser = webdriver.Chrome(executable_path,
                                        chrome_options=options)

    def load_xe(self):
        """Open the site's base URL."""
        self.browser.get(self.url)
        self.browser.implicitly_wait(3)

    def login(self):
        """Submit the login form and report whether login succeeded.

        Returns:
            ``True`` once the element at ``//*[@id="header_login"]/span[2]/a``
            becomes visible; ``False`` if an alert appears after submitting
            the form, or if the wait for that element fails.
        """
        self.browser.get(self.url + self.LOGIN_PAGE)
        self.insert_processing_overlay()
        self.browser.find_element_by_xpath('//*[@id="uid"]').send_keys(
            self.user_id)
        self.browser.find_element_by_xpath('//*[@id="upw"]').send_keys(
            self.password)
        # Submit the login form.
        self.browser.find_element_by_xpath(
            '//*[@id="commonLogin"]/fieldset/span[2]/input').click()
        try:
            WebDriverWait(self.browser,
                          2).until(EC.alert_is_present(),
                                   'Timed out waiting for PA creation')
            alert = self.browser.switch_to.alert
            if alert:
                # An alert after submitting is treated as a failed login.
                self.logger.error("로그인에 실패했습니다.")
                self.logger.error(alert.text.replace('\n', ' '))
                alert.accept()
                return False
        except TimeoutException:
            # No alert within 2 seconds: continue to the success check.
            pass

        try:
            if WebDriverWait(self.browser, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//*[@id="header_login"]/span[2]/a'))):
                self.logger.info("로그인 성공")
                self.insert_processing_overlay()
                return True
            self.logger.error("로그인에 실패했습니다.")
            self.browser.close()
        except Exception as e:
            self.logger.error(e)
        # Returned after the try block (not from ``finally``) so that
        # KeyboardInterrupt / SystemExit are no longer silently discarded.
        return False

    def insert_processing_overlay(self):
        """Inject a full-page "work in progress" overlay into the page."""
        self.browser.execute_script("""
        var style = document.createElement('style');
        style.type='text/css';
        style.innerHTML="#text{position:absolute;top:50%;left:50%;font-size:50px;color:#fff;transform:translate(-50%,-50%);-ms-transform:translate(-50%,-50%)} .dogdrip-remover-overlay{z-index:1;position:fixed;width:100%;height:100%;left:0;top:0;background-color:rgba(0,0,0,0.4);overflow-x:hidden;}.loader,.loader:after,.loader:before{top:20%;border-radius:50%;width:2.5em;height:2.5em;-webkit-animation:load7 1.8s infinite ease-in-out;animation:load7 1.8s infinite ease-in-out}.loader{color:#fff;font-size:10px;margin:80px auto;position:relative;text-indent:-9999em;-webkit-transform:translateZ(0);-ms-transform:translateZ(0);transform:translateZ(0);-webkit-animation-delay:-.16s;animation-delay:-.16s}.loader:after,.loader:before{content:'';position:absolute;top:0}.loader:before{left:-3.5em;-webkit-animation-delay:-.32s;animation-delay:-.32s}.loader:after{left:3.5em}@-webkit-keyframes load7{0%,100%,80%{box-shadow:0 2.5em 0 -1.3em}40%{box-shadow:0 2.5em 0 0}}@keyframes load7{0%,100%,80%{box-shadow:0 2.5em 0 -1.3em}40%{box-shadow:0 2.5em 0 0}}";
        document.head.appendChild(style);
        var div = document.createElement('div');
        div.className='dogdrip-remover-overlay';
        document.body.appendChild(div);
        var loader = document.createElement('div');
        loader.className='loader';
        document.getElementsByClassName('dogdrip-remover-overlay')[0].appendChild(loader);
        var text = document.createElement('div');
        text.id='text';
        text.textContent='현재 [개드립 리무버] 작업중입니다! 창을 닫으면 프로그램이 종료되니 닫지마세요';
        document.getElementsByClassName('dogdrip-remover-overlay')[0].appendChild(text);
        """)

    def load_mypage(self):
        """Open the member-info page."""
        self.browser.get(self.url + self.MY_PAGE)

    def load_my_documents(self, page=1):
        """Open page ``page`` of the member's own-documents listing."""
        self.browser.get(self.url + self.OWN_DOCUMENTS + '&page=' + str(page))
        self.insert_processing_overlay()
        self.browser.implicitly_wait(3)

    def load_my_documents_html(self, page=1):
        """Return the own-documents table of ``page`` as BeautifulSoup."""
        self.load_my_documents(page)
        return self._col_table_soup()

    def load_my_comments(self, page=1):
        """Open page ``page`` of the member's own-comments listing."""
        self.browser.get(self.url + self.OWN_COMMENTS + '&page=' + str(page))
        self.insert_processing_overlay()
        self.browser.implicitly_wait(3)

    def load_my_comments_html(self, page=1):
        """Return the own-comments table of ``page`` as BeautifulSoup."""
        self.load_my_comments(page)
        return self._col_table_soup()

    def _col_table_soup(self):
        """Parse the inner HTML of the first ``colTable`` element."""
        html = self.browser.execute_script(
            'return window.document.getElementsByClassName("colTable")[0].innerHTML'
        )
        return BeautifulSoup(html, 'html.parser')

    def click_by_xpath(self, element):
        """Click the element located by the XPath string ``element``."""
        self.browser.find_element_by_xpath(element).click()

    def _submit_delete_form(self):
        """Click the confirm button, then accept the alert that follows.

        Raises ``TimeoutException`` when no alert shows up within 2 seconds.
        """
        self.click_by_xpath(self._DELETE_CONFIRM_XPATH)
        self.insert_processing_overlay()
        WebDriverWait(self.browser,
                      2).until(EC.alert_is_present(),
                               'Timed out waiting for PA creation')
        self.browser.switch_to.alert.accept()

    def _wait_for_article_list(self):
        """Return whether an ``articleNum`` element appears within 10 s."""
        try:
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'articleNum')))
            return True
        except TimeoutException:
            self.logger.debug("could not find article list")
            return False

    def delete_comment(self, comment):
        """Delete one comment through its confirmation form.

        Args:
            comment: Indexable pair; ``comment[1]`` is appended as the
                ``document_srl`` value and ``comment[0]`` as ``comment_srl``.

        Returns:
            ``True`` if an ``articleNum`` element is present afterwards,
            ``False`` if waiting for it timed out. Errors while submitting
            the form are logged and do not abort the wait.
        """
        try:
            self.browser.get(self.url + self.DELETE_COMMENT + comment[1] +
                             self.CSRL + comment[0])
            self._submit_delete_form()
        except TimeoutException:
            self.logger.debug("alert not found")
        except Exception as e:
            self.logger.error(e)
        return self._wait_for_article_list()

    def delete_document(self, document):
        """Delete one document through its confirmation form.

        Args:
            document: Indexable; ``document[0]`` is appended as the
                ``document_srl`` value.

        Returns:
            ``True`` if an ``articleNum`` element is present afterwards,
            ``False`` if waiting for it timed out. Errors while submitting
            the form are logged and do not abort the wait.
        """
        try:
            self.browser.get(self.url + self.DELETE_DOCUMENT + document[0])
            self._submit_delete_form()
        except TimeoutException:
            self.logger.debug("alert not found")
        except Exception as e:
            self.logger.error(e)
        return self._wait_for_article_list()

    def quit(self):
        """Quit the browser if one is running; safe to call repeatedly."""
        browser = getattr(self, 'browser', None)
        if browser:
            # Drop the reference first so a later quit()/__del__ does not
            # try to quit the same driver a second time.
            self.browser = None
            browser.quit()

    def __del__(self):
        self.quit()
コード例 #4
0
def get_Max_Page_Nummber(item_id, shop_id, file_name, log_collection):
    """Fetch page 1 of an item's ratings and return its last page number.

    The URL comes from the module-level ``base_url`` template formatted with
    ``(item_id, shop_id, 1)``. The response is decoded as GBK and everything
    after the first ``"rateDetail":`` marker is parsed as JSON. If the marker
    is missing, the request is repeated once after a 5 second pause.

    Args:
        item_id: Item id interpolated into the URL.
        shop_id: Shop id interpolated into the URL.
        file_name: Stored as ``product_index`` in failure log entries.
        log_collection: Collection whose ``insert`` receives failure logs.

    Returns:
        ``paginator.lastPage`` from the parsed payload, or the string
        ``'Empty'`` when the page could not be fetched or neither attempt
        contained the ``"rateDetail":`` marker (a log entry is written in
        both failure cases).

    Raises:
        ValueError: If the text after the marker is not valid JSON.
        KeyError: If the parsed payload lacks ``paginator``/``lastPage``.
    """
    page_url = base_url % (item_id, shop_id, 1)
    Logger.info(page_url)

    def log_bail_out():
        # Same failure record the crawler writes elsewhere for page 1.
        log_collection.insert({
            'type': '跳出',
            'product_index': file_name,
            'current_page': 1,
            'log_time': time.time()
        })

    max_attempts = 2
    for attempt in range(max_attempts):
        if attempt:
            # Back off before retrying a response without rate details.
            time.sleep(5)

        try:
            # The network call is inside the try: it is the step that can
            # actually fail (decoding with 'ignore' cannot raise).
            response = requests.request(method='GET',
                                        url=page_url,
                                        headers=headers)
            page_data = response.content.decode('gbk', 'ignore')
        except (requests.RequestException, UnicodeError):
            log_bail_out()
            return 'Empty'

        temp = page_data.split('"rateDetail":')
        if len(temp) > 1:
            page_data_dict = json.loads(temp[1])
            myPrint(page_data_dict)
            myPrint(page_data_dict['paginator'])
            return page_data_dict['paginator']['lastPage']

    myPrint('max page Nummber failure')
    log_bail_out()
    return 'Empty'
コード例 #5
0
def get_Tmall_comment(commentID_list, requestDict, comments_collection,
                      product_index, start_num, maxPage_num_control,
                      log_collection,
                      indexCollection):
    """Crawl the comment pages of one product, from ``start_num`` onwards.

    The original inline note says: itemId is the product id, sellerId is the
    seller id.

    Args:
        commentID_list: Passed through to ``get_one_page_comment``.
        requestDict: Product record; reads ``original_id``, ``shop_id``,
            ``name`` and ``catagory``.
        comments_collection: Passed through to ``get_one_page_comment``.
        product_index: Stored in every log entry and logged per page.
        start_num: First page number to crawl.
        maxPage_num_control: Page limit used when the last page cannot be
            determined or is reported as greater than 99.
        log_collection: Collection whose ``insert`` receives progress logs.
        indexCollection: Passed through to ``get_one_page_comment``.

    Raises:
        RuntimeError: When the last page is below ``start_num``, when a page
            comes back as ``'Empty'``, or when three consecutive pages each
            have ``sum(contents) >= 12``. (The original used a bare ``raise``
            with no active exception, which also surfaces as RuntimeError.)
    """
    item_id = str(int(requestDict['original_id']))
    shop_id = str(round(float(requestDict['shop_id'])))
    title = str(requestDict['name'])
    catagory = str(requestDict['catagory'])

    def write_log(log_type, page):
        log_collection.insert({
            'type': log_type,
            'product_index': product_index,
            'current_page': page,
            'log_time': time.time()
        })

    maxPage = get_Max_Page_Nummber(item_id, shop_id, product_index,
                                   log_collection)
    time.sleep(2)

    Logger.info('maxPage' + str(maxPage))
    if maxPage == 'Empty' or int(maxPage) > 99:
        maxPage = maxPage_num_control
    # Normalise once so the comparison and the range below agree on type.
    maxPage = int(maxPage)

    cunzai_count = 0
    # Nothing to crawl when the last page is before the requested start.
    if maxPage < start_num:
        write_log('跳出', maxPage)
        raise RuntimeError(
            'max page %s is below start page %s for product %s' %
            (maxPage, start_num, product_index))

    # At most 99 pages of comments can be crawled.
    for page_num2 in range(start_num, maxPage + 1):
        contents = get_one_page_comment(commentID_list, item_id, shop_id,
                                        title, catagory, page_num2,
                                        comments_collection, product_index,
                                        log_collection, indexCollection)
        Logger.info('商品索引:' + str(product_index))
        Logger.info('页码:' + str(page_num2))
        Logger.info(contents)

        if contents == 'Empty':
            write_log('跳出', page_num2)
            raise RuntimeError('page %s of product %s returned no data' %
                               (page_num2, product_index))

        if len(contents) > 0:
            write_log('成功', page_num2)

            # presumably ``contents`` holds one 0/1 flag per comment where 1
            # means "already stored" -- TODO confirm against
            # get_one_page_comment.
            if sum(contents) >= 12:
                write_log('已经存在', page_num2)
                cunzai_count += 1
                if cunzai_count >= 3:
                    raise RuntimeError(
                        'three consecutive already-stored pages for '
                        'product %s (stopped at page %s)' %
                        (product_index, page_num2))
            else:
                cunzai_count = 0

        # Sleep briefly (0.5 to 1.0 s) between pages.
        time.sleep(random.randint(5, 10) / 10)
コード例 #6
0
ファイル: DB.py プロジェクト: wonkwangyeon/Slack-Lunch-Bot
class DB(object):
    """SQLite store for menu names (``menu``) and a menu log (``menu_log``).

    Most methods catch their own database errors, log them at debug level
    and return a status value instead of raising; see each method for the
    exact return values.
    """
    logger = Logger("DB")

    def __init__(self):
        """Open ``db/meal.db``, creating the tables if the file is new."""
        # Check before connecting: sqlite3.connect creates the file.
        is_new_database = not Path('db/meal.db').is_file()
        self.conn = sqlite3.connect("db/meal.db")
        self.cur = self.conn.cursor()
        if is_new_database:
            self.db_init_table()

    def db_init_table(self):
        """Create the ``menu`` and ``menu_log`` tables if they are missing."""
        self.logger.info("SQLite3 초기화")
        self.conn.execute("""
                   CREATE TABLE IF NOT EXISTS menu (
                       name TEXT NOT NULL PRIMARY KEY  /* 메뉴 이름 */
                   )
               """)
        self.conn.commit()

        self.conn.execute("""
                           CREATE TABLE IF NOT EXISTS menu_log (
                               id integer primary key autoincrement,  /* 로그 고유번호 */
                               name text not null,                     /* 메뉴 이름 */
                               time timestamp DATE DEFAULT(datetime('now', 'localtime'))   /* 날짜 */
                           )
                       """)
        self.conn.commit()

    def select_all(self):
        """Return all menu rows.

        Returns:
            A list of 1-tuples ``(name,)``, ``None`` when the table is
            empty, or the string ``"DB 에러발생."`` on a database error.
        """
        try:
            self.cur.execute("SELECT name FROM menu")
            rows = self.cur.fetchall()
            return rows if rows else None
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def insert_by_name(self, menu):
        """Add a menu name.

        Returns:
            ``"추가하였습니다."`` on success, ``None`` when the name already
            exists (primary-key violation), or ``True`` on any other error.
        """
        try:
            self.cur.execute("insert into menu(name) values(?)", (menu, ))
            self.conn.commit()
            return "추가하였습니다."
        except sqlite3.IntegrityError:
            self.logger.debug("메뉴 중복")
            return None
        except Exception as e:
            self.logger.debug(e)
            # NOTE(review): the other methods return "DB 에러발생." here;
            # True is kept because callers may rely on it.
            return True

    def delete_by_name(self, menu):
        """Delete a menu name.

        Returns:
            ``"삭제하였습니다."`` after the delete statement ran (whether or
            not a row matched), or ``"DB 에러발생."`` on a database error.
        """
        try:
            self.cur.execute("delete from menu where name = ?", (menu, ))
            self.conn.commit()
            return "삭제하였습니다."
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def select_random(self):
        """Return one randomly chosen menu name.

        Returns:
            The name, ``None`` when the table is empty, or
            ``"DB 에러발생."`` on a database error.
        """
        try:
            self.cur.execute("select name from menu order by random() limit 1")
            row = self.cur.fetchone()
            if row is None:
                return None
            (ret_value, ) = row
            return ret_value
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def insert_log(self, menu_name):
        """Append ``menu_name`` to the menu log.

        Returns:
            ``None`` on success, ``"DB 에러발생."`` on a database error.
        """
        try:
            self.cur.execute("insert into menu_log(name) values(?)",
                             (menu_name, ))
            self.conn.commit()
            self.logger.debug("로그 추가")
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def find_log_by_date(self, date):
        """Return the first log entry whose date equals ``date``.

        Returns:
            ``"<id> : <name> <time>"`` for the matching row, or
            ``"DB 에러발생."`` when no row matches or a database error occurs.
        """
        try:
            sql = "select * from menu_log where date(time) = date(?) limit 1"
            self.cur.execute(sql, (date, ))
            row = self.cur.fetchone()
            if row is None:
                # Previously reached via a TypeError from unpacking None;
                # the returned value is unchanged.
                self.logger.debug("no menu_log row for the given date")
                return "DB 에러발생."
            r1, r2, r3 = row
            return str(r1) + " : " + r2 + " " + r3
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def __del__(self):
        # __init__ may have failed before ``conn`` was assigned (for example
        # when the db/ directory is missing), so do not assume it exists.
        conn = getattr(self, 'conn', None)
        if conn is not None:
            conn.close()
コード例 #7
0
class SlackBot(object):
    """Slack RTM bot that maps mention commands onto a ``MenuManager``.

    Commands are read from messages that start with a mention of the bot,
    dispatched by ``handle_command`` and answered in the originating
    channel. ``start`` also posts a daily "alert" message at ``alarm_time``
    to the registered channel and calls ``menu_setting`` on Mondays at
    09:00:00.
    """
    logger = Logger("SlackBot")

    def __init__(self):
        self.slack_client = SlackClient(config.get("slack_api"))
        self.starterbot_id = None  # filled in by start() via auth.test
        self.RTM_READ_DELAY = 1  # seconds slept between RTM reads
        self.EXAMPLE_COMMAND = "do"
        self.MENTION_REGEX = "^<@(|[WU].+?)>(.*)"
        self.cook_slave = MenuManager()
        self.alarm_time = "110000"  # default time, HHMMSS
        self.channel_url = None  # channel registered through "등록"

    def parse_bot_commands(self, slack_events):
        """Return ``(command, channel)`` for the first message aimed at us.

        Only events of type ``message`` without a ``subtype`` are examined;
        the message must start with a mention of ``self.starterbot_id``.
        Returns ``(None, None)`` when no event qualifies.
        """
        for event in slack_events:
            if event["type"] == "message" and "subtype" not in event:
                user_id, message = self.parse_direct_mention(event["text"])
                if user_id == self.starterbot_id:
                    return message, event["channel"]
        return None, None

    def parse_direct_mention(self, message_text):
        """Split a leading ``<@ID>`` mention from the rest of the text.

        Returns:
            ``(user_id, stripped_remainder)`` when ``message_text`` starts
            with a mention, otherwise ``(None, None)``.
        """
        matches = re.search(self.MENTION_REGEX, message_text)
        if not matches:
            return (None, None)
        return (matches.group(1), matches.group(2).strip())

    @staticmethod
    def _argument(command):
        """Return the second space-separated token of ``command``.

        Raises IndexError when the command contains no space; the caller's
        generic handler turns that into the error reply.
        """
        return command.split(' ')[1]

    def handle_command(self, command, channel):
        """Execute one command and post the reply to ``channel``.

        Until a channel has been registered with "등록", every other command
        is answered with a request to register first. Any exception raised
        while handling a command is logged and answered with a generic
        error message.
        """
        response = None
        try:
            if self.channel_url is None:
                if command == "등록":
                    self.channel_url = channel
                    response = "등록되었습니다."
                else:
                    response = "채널등록 먼저 해주세요 슬랙에서 등록이라고 치면 됩니다. ex) 등록"

            elif command == "help" or command == "도움말":
                response = "※SQL인젝션 금지-----\n1. 밥추천/추천/메뉴추천/메뉴 추천 ex) 밥추천\n2. 추가/밥추가/메뉴추가 ex)밥추가 돈가스\n" \
                           "3. 삭제/밥삭제/메뉴삭제 ex) 삭제 돈가스\n4. 확인/메뉴기록/메뉴로그 ex) 확인 2018-11-07\n" \
                           "5. 초기화/재세팅 ex) 초기화\n 6. 모든메뉴 ex) 모든메뉴\n7. 알람설정 ex) 알람설정 110000\n8. 기타설명\n" \
                           "9. 채널변경 및 등록 ex) 등록"

            elif command == "alert":
                response = "오늘의 메뉴는 " + self.cook_slave.menu_rand_select(
                ) + " 입니다."

            elif command.startswith(("밥추천", "추천", "메뉴추천", "메뉴 추천")):
                response = self.cook_slave.menu_recommend()

            elif command.startswith("추가"):
                if len(command) > 2:
                    name = self._argument(command)
                    result = self.cook_slave.menu_insert(name)
                    response = "(" + name + ") " + result

            elif command.startswith(("밥추가", "메뉴추가")):
                if len(command) > 3:
                    name = self._argument(command)
                    # Fixed: this was ``self.self.cook_slave``, which raised
                    # AttributeError for every such command.
                    result = self.cook_slave.menu_insert(name)
                    response = "(" + name + ") " + result

            elif command.startswith("삭제"):
                if len(command) > 2:
                    name = self._argument(command)
                    result = self.cook_slave.menu_delete(name)
                    response = "(" + name + ") " + result

            elif command.startswith(("밥삭제", "메뉴삭제")):
                if len(command) > 3:
                    name = self._argument(command)
                    result = self.cook_slave.menu_delete(name)
                    response = "(" + name + ") " + result

            elif command.startswith("확인"):
                if len(command) > 2:
                    response = self.cook_slave.find_menu_log(
                        self._argument(command))

            elif command.startswith(("메뉴기록", "메뉴로그")):
                if len(command) > 3:
                    response = self.cook_slave.find_menu_log(
                        self._argument(command))

            elif command.startswith(("초기화", "재세팅")):
                response = self.cook_slave.menu_setting()

            elif command.startswith("알람설정"):
                if len(command) > 4:
                    alarm = self._argument(command)
                    result = self.cook_slave.set_alarm_time(alarm)
                    if result is True:
                        self.alarm_time = alarm
                        response = "매일 " + alarm + " 시간에 메뉴와 알림이 옵니다"
                    else:
                        response = result

            elif command.startswith("모든메뉴"):
                response = self.cook_slave.all_menu_check()

            elif command.startswith("기타설명"):
                response = "1. 초기화/재세팅은 매주마다 먹은 음식이 겹치지 않도록 하는 것이며, 월요일 오전 9시마다 자동으로 재세팅 되는 것이므로" \
                           "따로 입력을 하지 않아도 되나, 하는 것은 자유이다.\n2. 로그는 항상 YYYY-MM-DD로 입력해야 한다." \
                           "\n3. 알람설정은 매일 같은시간에 점심메뉴를 추천해주는 알림을 제공해주며 기본값은 110000이고" \
                           "153030은 3시30분30초라는 의미이다."
            elif command == "등록":
                self.channel_url = channel
                response = "등록되었습니다."
            else:
                response = "help 또는 도움말 라고 입력해주세요"

        except Exception as e:
            self.logger.debug(e)
            response = "※ SQL인젝션 금지------\nhelp 또는 도움말을 통해 설명을 다시 확인해주세요"

        if response is None:
            # A keyword arrived without its argument (or a handler returned
            # nothing): reply with the help hint instead of posting no text.
            response = "help 또는 도움말 라고 입력해주세요"

        # Sends the response back to the channel
        self.slack_client.api_call("chat.postMessage",
                                   channel=channel,
                                   text=response)

    def start(self):
        """Connect to Slack RTM and run the read/dispatch loop forever."""
        if self.slack_client.rtm_connect(with_team_state=False):
            self.logger.debug("Starter Bot connected and running!")
            # Read bot's user ID by calling Web API method `auth.test`
            self.starterbot_id = self.slack_client.api_call(
                "auth.test")["user_id"]
            self.cook_slave.menu_setting()
            while True:
                command, channel = self.parse_bot_commands(
                    self.slack_client.rtm_read())
                time_check = time.strftime('%H%M%S')
                r = datetime.datetime.today().weekday()

                # Reset the menu data on Mondays at 09:00:00.
                if r == 0 and time_check == "090000":
                    self.cook_slave.menu_setting()

                if time_check == self.alarm_time and self.channel_url is not None:
                    self.handle_command("alert", self.channel_url)

                if command:
                    self.handle_command(command, channel)
                time.sleep(self.RTM_READ_DELAY)
        else:
            self.logger.debug(
                "Connection failed. Exception traceback printed above.")
コード例 #8
0
ファイル: dogdrip.py プロジェクト: luiseok/dogdrip_remover
class DogdripRemover(object):
    """Collect and delete the logged-in user's own posts and comments.

    The instance downloads chromedriver if needed, keeps a local SQLite
    database (``my_dogdrip.db``) with the documents and comments scraped from
    the member pages, enriches those rows with per-page details fetched over
    HTTP, and finally asks the Selenium-driven ``XpressEngine`` browser to
    delete each item.

    Row layout relied on by the methods below (``SELECT *`` column order):

    * documents: document_srl, href, title, content, view_count,
      comment_count, vote_up, vote_down, created_at, target_board, is_deleted
    * comments: comment_srl, target_srl, href, content, created_at,
      target_board, has_child, is_deleted
    """
    dogdripBrowser: XpressEngine
    logger = Logger("DogdripRemover")
    # Chromedriver archives, keyed by the value of platform.system()
    CHROME_DRIVER_URL = dict(Linux='https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip',
                             Darwin='https://chromedriver.storage.googleapis.com/2.38/chromedriver_mac64.zip',
                             Windows='https://chromedriver.storage.googleapis.com/2.38/chromedriver_win32.zip')
    WEBSITE_URL = 'http://dogdrip.net'
    conn = None
    cur = None

    def __init__(self, arch=platform.system()):
        """Install chromedriver and open (creating if necessary) the database.

        Args:
            arch: Platform name used to pick the chromedriver download URL
                and to decide whether the executable bit must be set.

        Raises:
            SystemExit: with status 1 when the driver setup or the database
                initialisation fails (the error is logged first).
        """
        self.logger.debug("DogdripRemover instance created")
        self.driverPath = None
        self.conn = None
        self.cur = None
        try:
            # 1. Chromedriver: download and extract only when the archive is
            #    not already present in the working directory.
            needs_download = not Path('./chromedriver.zip').is_file()
            if needs_download:
                self.logger.info("크롬 드라이버를 다운로드합니다.")
                self.logger.debug("OS: %s", str(arch))
                self.logger.debug("URL: %s", str(self.CHROME_DRIVER_URL.get(arch)))
                wget.download(self.CHROME_DRIVER_URL.get(arch), './chromedriver.zip')
                self.logger.debug("크롬드라이버 다운로드 성공. 압축 해제")
            # The context manager closes the archive even if reading it fails.
            with zipfile.ZipFile('./chromedriver.zip') as chromedriver:
                if needs_download:
                    chromedriver.extractall('./')
                # The first archive member is taken as the driver binary.
                self.driverPath = os.path.realpath(chromedriver.namelist()[0])
            self.logger.debug("압축해제 완료. chromedriver 위치: %s", self.driverPath)

            # 2. Make the driver executable (not needed on Windows).
            if not arch == "Windows":
                st = os.stat(self.driverPath)
                os.chmod(self.driverPath, st.st_mode | stat.S_IEXEC)
                self.logger.debug("크롬 드라이버 실행 권한 변경 완료")

            # 3. SQLite: the schema is created only for a brand-new file, so
            #    the existence check must happen before connect() creates it.
            db_is_new = not Path('./my_dogdrip.db').is_file()
            self.conn = sqlite3.connect("my_dogdrip.db")
            self.cur = self.conn.cursor()
            if db_is_new:
                self.db_initialize()

        except sqlite3.Error as e:
            self.logger.error("데이터베이스 초기화에 실패했습니다.")
            self.logger.exception(e)
            raise SystemExit(1)
        except Exception as e:
            self.logger.error("크롬 드라이버 다운로드에 실패했습니다.")
            self.logger.exception(e)
            raise SystemExit(1)

    def db_initialize(self):
        """Create the ``documents`` and ``comments`` tables and their indexes.

        The database temporarily stores the user's own comments and posts so
        that filtering can be done with local queries instead of issuing an
        HTTP request for every condition.
        """
        self.logger.info("SQLite3 초기화중...")
        # Documents
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_srl TEXT PRIMARY KEY,  /* 게시물 고유번호 */
                href TEXT NOT NULL,             /* 게시물 고유주소 */
                title TEXT,                     /* 게시물 제목 */
                content TEXT,                   /* 게시물 내용 */
                view_count INT,                 /* 조회 수 */
                comment_count INT,              /* 댓글 수 */
                vote_up INT,                    /* 개드립 */
                vote_down INT,                  /* 붐업 */
                created_at TEXT,                /* 작성시간 */
                target_board TEXT,              /* 게시판 이름 */
                is_deleted INTEGER DEFAULT 0    /* 삭제여부 */
            )
        """)
        self.conn.execute("CREATE INDEX IF NOT EXISTS d_target_board_idx ON documents(`target_board`)")
        self.conn.commit()
        # Comments
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS comments (
                comment_srl TEXT PRIMARY KEY,  /* 댓글 고유번호 */
                target_srl  TEXT,              /* 댓글이 작성된 원래 문서의 고유번호 */
                href TEXT NOT NULL,            /* 고유 주소 */
                content TEXT,                  /* 댓글 내용 */
                created_at TEXT,               /* 작성 시간 */
                target_board TEXT,             /* 게시판 이름 */
                has_child INTEGER DEFAULT 0,    /* 대댓글 여부 */
                is_deleted INTEGER DEFAULT 0    /* 삭제여부 */
            )
        """)
        self.conn.execute("CREATE INDEX IF NOT EXISTS c_target_board_idx ON comments(`target_board`)")
        self.conn.commit()
        self.logger.info("SQLite3 초기화 완료!")

    def insert_comments(self, comments):
        """Insert or replace comment rows.

        Args:
            comments: Iterable of (comment_srl, target_srl, href, content,
                created_at) tuples.
        """
        insert_into_comments = "INSERT OR REPLACE INTO comments(comment_srl, target_srl, href, content, created_at) " \
                               "VALUES (?,?,?,?,datetime(?)) "
        self.cur.executemany(insert_into_comments, comments)
        self.conn.commit()

    def insert_documents(self, documents):
        """Insert or replace document rows.

        Args:
            documents: Iterable of (document_srl, href, title, comment_count,
                view_count, vote_up, created_at) tuples.
        """
        insert_into_documents = "INSERT OR REPLACE INTO documents(document_srl, href, title, comment_count, " \
                                "view_count, vote_up, created_at) VALUES (?,?,?,?,?,?,date(?)) "
        self.cur.executemany(insert_into_documents, documents)
        self.conn.commit()

    def login(self, user_id=config.get('user_id'), password=config.get('password')):
        """Create the browser wrapper, start it and log in.

        Returns:
            Whatever ``XpressEngine.login()`` returns.
        """
        self.dogdripBrowser = XpressEngine(url=self.WEBSITE_URL,
                                           user_id=user_id,
                                           password=password,
                                           headless=False)
        self.dogdripBrowser.load_browser(self.driverPath)
        return self.dogdripBrowser.login()

    def _fetch_all(self, query):
        """Run a read-only query and return every resulting row."""
        with self.conn:
            self.cur.execute(query)
            return self.cur.fetchall()

    def comments_find_all(self):
        """Return every row of the ``comments`` table."""
        return self._fetch_all("SELECT * FROM COMMENTS")

    def comments_find_not_deleted(self):
        """Return the comment rows that are not yet flagged as deleted."""
        # The original query had no WHERE clause and returned deleted rows too.
        return self._fetch_all("SELECT * FROM comments WHERE is_deleted = 0")

    def update_document_detail(self, results):
        """Store board name and content for documents.

        Args:
            results: Iterable of ``(document_row, board_name, content)``
                tuples as produced by ``request_document_info``; falsy
                entries (failed requests) are skipped.
        """
        self.logger.debug("게시물에 상세정보를 추가합니다.")
        updatesql = "UPDATE documents SET target_board=?, content=? WHERE document_srl=?"
        # result[0] is the original row; its first column is document_srl.
        new_infos = [(result[1], result[2], result[0][0])
                     for result in results if result]
        self.cur.executemany(updatesql, new_infos)
        self.conn.commit()

    def update_comment_detail(self, results):
        """Store board name and has-reply flag for comments.

        Args:
            results: Iterable of ``(comment_row, board_name, has_child)``
                tuples as produced by ``request_comment_info``; falsy
                entries (failed requests) are skipped.
        """
        self.logger.debug("댓글에 상세정보를 추가합니다.(소속 게사판, 대댓글여부)")
        updatesql = "UPDATE comments SET target_board=?, has_child=? WHERE comment_srl=?"
        # result[0] is the original row; its first column is comment_srl.
        new_infos = [(result[1], result[2], result[0][0])
                     for result in results if result]
        self.cur.executemany(updatesql, new_infos)
        self.conn.commit()

    def update_is_deleted(self, doc, type='comment'):
        """Flag a row as deleted.

        Args:
            doc: Row whose first column is the primary key.
            type: ``'comment'`` or ``'document'``; any other value is a
                no-op.
        """
        self.logger.debug("삭제여부를 변경합니다.")
        self.logger.debug(doc)
        statements = {
            'comment': "UPDATE comments SET is_deleted='1' WHERE comment_srl=?",
            'document': "UPDATE documents SET is_deleted='1' WHERE document_srl=?",
        }
        updatesql = statements.get(type)
        if updatesql is None:
            return
        self.cur.execute(updatesql, [doc[0]])
        self.conn.commit()

    def fetch_document_list(self):
        """Walk every page of the user's document list and store the rows.

        Collected per document: number, title, address, view count, vote
        count and creation time.
        """
        self.logger.info("작성한 게시물 목록을 불러오고 있습니다...")
        html = self.dogdripBrowser.load_my_documents_html()
        current_page, total_page = self.get_pagination_info(html.find('caption').get_text())
        self.logger.info("총 %s페이지의 문서가 있습니다.", total_page)
        for page in range(int(current_page), int(total_page) + 1):
            self.logger.info("%s페이지 중 %s페이지를 수집하고있습니다.", str(total_page), str(page))
            html = self.dogdripBrowser.load_my_documents_html(page)
            html_document_list = html.find_all('tr')
            documents = self.parse_document(html_document_list)
            self.insert_documents(documents)

    @staticmethod
    def _first_path_segment(href):
        """Return the first path component of ``href`` ('' if there is none).

        Replaces a repeated ``os.path.dirname`` loop that never terminated
        for URLs whose path is empty or relative.
        """
        return urlparse(href).path.strip('/').split('/')[0]

    def parse_document(self, html_document_list):
        """Extract document tuples from the ``<tr>`` rows of the list page.

        Args:
            html_document_list: Parsed table rows; rows without a
                ``td.wide`` cell containing a link are ignored.

        Returns:
            List of (document_srl, href, title, comment_count, view_count,
            vote_up, created_at) tuples; empty when there are no rows, so the
            result can always be passed to ``insert_documents``.
        """
        documents = []
        if not html_document_list:
            return documents

        # Trailing " [N]" marker after the title, read here as the comment count.
        pattern = re.compile(r'\s\[\d*.\]$')
        for document_list in html_document_list:
            # Information available directly on the member page
            content = document_list.find("td", {"class": "wide"})
            if not content:
                continue
            document = content.find("a")
            if not document:
                continue

            href = document['href']
            document_srl = self._first_path_segment(href)
            comment_count = pattern.findall(content.get_text().strip())
            if comment_count:
                comment_count = comment_count[0].replace('[', '').replace(']', '').strip()
            else:
                comment_count = "0"
            title = document.get_text()
            # The elements following the link are read, in order, as
            # view count, vote count and creation date.
            view_count = document.findNext()
            vote_up = view_count.findNext()
            created_at = vote_up.findNext().get_text()
            self.logger.debug("원본 게시물 번호: %s, 게시물 주소: %s, 제목: %s, 댓글 수: %s, 조회 수: %s, 개드립 수: %s, 작성일: %s",
                              document_srl, href, title, comment_count, view_count.get_text(),
                              vote_up.get_text(), created_at)

            documents.append((document_srl, href, title, comment_count, view_count.get_text(),
                              vote_up.get_text(), created_at))
        return documents

    def fetch_comment_list(self):
        """Walk every page of the user's comment list and store the rows.

        Collected per comment: text, comment number, parent document number,
        address and creation time.
        """
        self.logger.info("작성한 댓글 목록을 불러오고 있습니다...")
        html = self.dogdripBrowser.load_my_comments_html()
        current_page, total_page = self.get_pagination_info(html.find('caption').get_text())
        self.logger.info("총 %s페이지의 댓글이 있습니다.", total_page)
        for page in range(int(current_page), int(total_page) + 1):
            self.logger.info("총 %s페이지 중 %s페이지를 수집하고있습니다.", str(total_page), str(page))
            html = self.dogdripBrowser.load_my_comments_html(page)
            html_comment_list = html.find_all('tr')
            comments = self.parse_comment(html_comment_list)
            self.insert_comments(comments)

    def parse_comment(self, html_comment_list):
        """Extract comment tuples from the ``<tr>`` rows of the list page.

        Args:
            html_comment_list: Parsed table rows; rows without a ``td.wide``
                cell containing a link are ignored, as are comments whose
                text is the "deleted" placeholder.

        Returns:
            List of (comment_srl, target_srl, href, content, created_at)
            tuples; empty when there are no rows, so the result can always be
            passed to ``insert_comments``.
        """
        comments = []
        if not html_comment_list:
            return comments

        for comment_list in html_comment_list:
            # Information available directly on the member page
            content = comment_list.find("td", {"class": "wide"})
            if not content:
                continue
            comment = content.find("a")
            if not comment:
                continue

            target_srl = self._first_path_segment(comment['href'])
            # The comment number is the part after the first "_" of the URL
            # fragment.
            comment_srl = comment['href'].split("#")[1].split("_")[1]
            created_at = comment_list.find("td", {"class": "nowrap"}).get_text()
            self.logger.debug("원본 게시물 번호: %s, 댓글 고유번호: %s, 작성시간: %s", target_srl, comment_srl,
                              created_at.strip().replace('\n', ' '))
            if not comment.get_text() == "[삭제 되었습니다]":
                comments.append((comment_srl, target_srl, comment['href'], comment.get_text(), created_at))
        return comments

    @classmethod
    def get_pagination_info(cls, text):
        """Split the first "<current>/<total>" occurrence found in ``text``.

        Returns:
            Two-element list of strings (they may carry surrounding
            whitespace; callers convert them with ``int``).

        Raises:
            IndexError: when ``text`` contains no such pattern.
        """
        pattern = re.compile(r'\b[0-9]*.\/.[0-9]*')
        pagination = pattern.findall(text)
        return pagination[0].split('/')

    def add_comment_detail_job(self, comments):
        """Fetch details for ``comments`` concurrently and store them.

        Collected per comment: whether it has replies, and the board its
        document belongs to.
        """
        # imap_unordered is lazy, so the results must be consumed before the
        # pool is torn down at the end of the ``with`` block.
        with Pool(processes=config.get('process_concurrency')) as pool:
            results = pool.imap_unordered(self.request_comment_info, comments)
            self.update_comment_detail(results)

    def request_comment_info(self, comment):
        """Download a comment's page and derive its board and reply flag.

        Args:
            comment: Row from the ``comments`` table (index 0 is the comment
                number, index 2 the URL).

        Returns:
            ``(comment, board_name, has_child)`` on HTTP 200, where
            ``has_child`` is the string "1" or "0"; ``None`` otherwise.
        """
        start_time = millis()
        with requests.get(comment[2]) as res:
            if res.status_code != 200:
                return None
            self.logger.debug("댓글 로드완료. 시간: %sms, url: %s", str(millis() - start_time), comment[2])
            page = BeautifulSoup(res.text, 'html.parser')
            # Board address: path of the first link in the board header
            board_title = page.find_all("div", {"class": "boardHeaderBorder"})
            board_name = ''
            if board_title:
                board_name = urlparse(board_title[0].find("a")["href"]).path.replace('/', '')
            # A reply is any div whose parent_srl attribute is this comment
            has_child = "1" if page.find_all("div", {"parent_srl": comment[0]}) else "0"
            return comment, board_name, has_child

    def collect_comment_details(self):
        """Enrich every stored comment with board and reply information."""
        comments = self.comments_find_all()
        self.add_comment_detail_job(comments)

    def add_document_detail_job(self, documents):
        """Fetch details for ``documents`` concurrently and store them."""
        # See add_comment_detail_job: consume the lazy results inside the
        # ``with`` block.
        with Pool(processes=config.get('process_concurrency')) as pool:
            results = pool.imap_unordered(self.request_document_info, documents)
            self.update_document_detail(results)

    def request_document_info(self, document):
        """Download a document's page and derive its board and content.

        Args:
            document: Row from the ``documents`` table (index 1 is the URL).

        Returns:
            ``(document, board_name, content)`` on success; ``None`` when the
            response is not HTTP 200 or when either the board name or the
            content could not be extracted.
        """
        start_time = millis()
        with requests.get(document[1]) as res:
            if res.status_code != 200:
                return None
            self.logger.debug("페이지 로드완료. 시간: %sms, url: %s", str(millis() - start_time), document[1])
            page = BeautifulSoup(res.text, 'html.parser')
            # Board address: path of the first link in the board header
            board_title = page.find_all("div", {"class": "boardHeaderBorder"})
            board_name = ''
            if board_title:
                board_name = urlparse(board_title[0].find("a")["href"]).path.replace('/', '')
            content = page.find_all("div", {"class": "xe_content"})
            if content:
                content = content[0].get_text().strip()
            # Both failure modes log the same message and yield no result.
            if board_name == '' or not content:
                self.logger.debug("게시물 파싱 불가. 직접 삭제 요망. 주소: %s", document[1])
                return None

            return document, board_name, content

    def collect_document_details(self):
        """Enrich every stored document with board and content information."""
        documents = self.documents_find_all()
        self.add_document_detail_job(documents)

    def documents_find_all(self):
        """Return every row of the ``documents`` table."""
        return self._fetch_all("SELECT * FROM documents")

    def delete_all_documents_job(self):
        """Delete every stored document through the browser, one by one.

        A row is flagged as deleted only when the browser reports success.
        """
        for document in self.documents_find_all():
            if self.delete_selenium_document(document):
                self.update_is_deleted(document, type="document")

    def delete_selenium_document(self, document):
        """Ask the browser to delete ``document``; return its result."""
        self.logger.debug(document)
        return self.dogdripBrowser.delete_document(document)

    def delete_all_comments_job(self):
        """Delete every not-yet-deleted comment through the browser.

        A row is flagged as deleted only when the browser reports success.
        """
        for comment in self.comments_find_not_deleted():
            if self.delete_selenium_comment(comment):
                self.update_is_deleted(comment, type="comment")

    def delete_selenium_comment(self, comment):
        """Ask the browser to delete ``comment`` unless its board is 'temp'.

        Returns:
            The browser's result, or ``None`` when the comment is skipped.
        """
        # Column 5 is target_board.
        # NOTE(review): why 'temp' is excluded is not visible here - confirm.
        if comment[5] == 'temp':
            return None
        self.logger.debug(comment)
        return self.dogdripBrowser.delete_comment(comment)

    def __del__(self):
        """Release the browser, cursor and connection, each independently.

        ``dogdripBrowser`` only exists after ``login()``; its absence must
        not prevent the database handles from being closed.
        """
        self.logger.debug("DogdripRemover 인스턴스가 종료되었습니다.")
        cleanup_steps = (
            (getattr(self, 'dogdripBrowser', None), 'quit'),
            (self.cur, 'close'),
            (self.conn, 'close'),
        )
        for resource, method_name in cleanup_steps:
            if resource is None:
                continue
            try:
                getattr(resource, method_name)()
            except Exception as e:
                self.logger.exception(str(e))
コード例 #9
0
class MenuManager(object):
    """Keeps the in-memory list of menus still available for random picks.

    ``self.menu`` starts as a copy of every menu name in the database; each
    random pick removes the chosen entry so it is not drawn again until
    ``menu_setting`` reloads the list. All public methods return either a
    result value or a user-facing message string.
    """
    logger = Logger("MenuManager")

    def __init__(self):
        self.db = DB()
        self.menu = []

    def menu_setting(self):
        """Reload ``self.menu`` from the database.

        Returns:
            A status message. The list is left empty when the database
            returns ``None``.
        """
        self.menu.clear()
        rows = self.db.select_all()  # fetch every menu
        if rows is None:
            return "데이터베이스에 추가된 메뉴 없음"

        # Each row is a one-element tuple holding the menu name.
        self.menu.extend(name for (name, ) in rows)
        return "메뉴 세팅완료."

    def all_menu_check(self):
        """Return every menu name in the database joined by " / "."""
        rows = self.db.select_all()
        if rows is None:
            return "데이터베이스에 추가된 메뉴 없음"

        return " / ".join(name for (name, ) in rows)

    def menu_rand_select(self):
        """Pick a random remaining menu, remove it from the list and log it.

        When the list is exhausted it is reloaded once; if it is still empty
        a message is returned instead of a menu name.
        """
        if len(self.menu) == 0:
            self.menu_setting()
            if len(self.menu) == 0:
                return "데이터베이스에 추가된 메뉴 없음"

        # Draw an index and take that menu out of the list.
        today_menu = self.menu.pop(randrange(len(self.menu)))
        self.db.insert_log(today_menu)  # record what was eaten today
        return today_menu

    def menu_insert(self, menu_name):
        """Add a menu to the database and to the in-memory list.

        Returns:
            A message when the database reports ``None`` (already present)
            or ``True`` (error); otherwise the database's result.
        """
        result = self.db.insert_by_name(menu_name)
        if result is None:
            return "이미 추가되어있는 메뉴입니다."
        elif result is True:
            return "DB 에러"
        self.menu.append(menu_name)
        return result

    def menu_delete(self, menu_name):
        """Delete a menu from the database and from the in-memory list.

        Returns:
            The database's result, or a "does not exist" message when the
            name is not in ``self.menu``.
        """
        # NOTE(review): the database delete runs before the list removal, so
        # a menu already drawn this week (absent from self.menu) is deleted
        # from the database but still reported as non-existent - confirm
        # whether that is intended.
        try:
            result = self.db.delete_by_name(menu_name)
            self.menu.remove(menu_name)
            return result

        except ValueError:
            return "존재하지 않는 메뉴입니다"

    def menu_recommend(self):
        """Return a random menu straight from the database, or a message."""
        result = self.db.select_random()
        if result is None:
            return "추가된 메뉴가 없음."
        return result

    def find_menu_log(self, date):
        """Return the log entry for ``date``, or a message when none exists."""
        result = self.db.find_log_by_date(date)
        if result is None:
            return date + "에 먹은 음식이 존재하지 않습니다."

        return result

    def set_alarm_time(self, time):
        """Validate an alarm time given as a six-character HHMMSS string.

        Returns:
            ``True`` when the value is acceptable, otherwise a message
            describing the problem. Previously an input whose length was not
            six fell through and returned ``None``; it now gets a message
            like every other rejection.
        """
        try:
            if len(time) != 6:
                return "시간은 6자리 숫자(HHMMSS)로 입력해주세요."
            int_time = int(time)
            if int_time > 240000:
                return "시간이 24시를 초과할 수 없습니다."
            elif int_time < 0:
                return "시간이 음수가 될 수 없습니다."
            return True
        except ValueError:
            return "숫자가 아닙니다."
        except Exception as e:
            self.logger.debug(e)
            return "시간오류"
コード例 #10
0
def get_comments(commentID_list, pid, ptitle, item_catagory, page_num,
                 mongo_collection, proxiesIP,
                 indexCollection):
    """Fetch page ``page_num`` of the comments for product ``pid`` and store them.

    Args:
        commentID_list: Known comment ids; passed through to ``insertToMongo``.
        pid: Product id, compared against each comment's stringified
            ``referenceId``.
        ptitle: Product title stored with every comment.
        item_catagory: Category stored with every comment.
        page_num: Zero-based page index substituted into ``web_json_url``.
        mongo_collection: Target collection for the comments.
        proxiesIP: Unused.
        indexCollection: Passed through to ``insertToMongo``.

    Returns:
        A list with one ``insertToMongo`` result per stored comment, or the
        string ``'Empty'`` when the request, the JSON decoding or the
        processing of any comment fails. Comments handled before a failure
        stay inserted.
    """
    headers1 = {
        'GET': '',
        'Host': "club.jd.com",
        'User-Agent':
        "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
        'Referer': 'http://item.jd.com/%s.html' % (pid)
    }
    Logger.info(page_num)

    page_url = web_json_url % (pid, page_num)
    Logger.info(page_url)

    try:
        # Without a timeout a stalled connection would block the crawl
        # forever; a timeout is reported like any other request failure.
        response = requests.request(method='GET',
                                    url=page_url,
                                    headers=headers1,
                                    timeout=10)
    except Exception:
        Logger.info('ccc  跳过')
        # Audible alert through the `say` command.
        os.system('say "attention please,  your program has Exception  "')
        return 'Empty'

    try:
        html_content = response.content.decode('gbk', 'ignore')
        page_dict = json.loads(html_content)
        comments = page_dict['comments']
        myPrint(pid, len(comments))
    except Exception:
        return 'Empty'

    try:
        contents111 = []
        for comment in comments:
            content = deleteEscapeCharacter(comment['content'])
            product_id = comment['referenceId']
            if product_id == '':
                # Explicit error instead of a bare `raise` with no active
                # exception; the handler below still turns it into 'Empty'.
                raise ValueError('comment without referenceId')

            if 'commentTags' in comment:
                tagsList = [tag['name'] for tag in comment['commentTags']]
            else:
                tagsList = []

            commentDict = {
                'comment_id': comment['id'],
                'product_id': str(product_id),
                'content': content,
                'userLevelName': comment['userLevelName'],
                'commentTags': tagsList,
                'created_at': comment['creationTime'],
                'isMobile': comment['isMobile'],
                'score': comment['score'],
                'product_title': ptitle,
                'category': item_catagory,
                'store_id': 1,
            }

            # Only comments that belong to the requested product are stored.
            if commentDict['product_id'] == pid:
                insert_reslut = insertToMongo(commentID_list, mongo_collection,
                                              commentDict, indexCollection)
                contents111.append(insert_reslut)
    except Exception:
        Logger.info('ccc  跳过')
        os.system('say "attention please,  your program has Exception  "')
        return 'Empty'

    Logger.info(contents111)
    return contents111
コード例 #11
0
def scraw_web_json(commentID_list, pid, ptitle, item_catagory, file_name,
                   mongo_collection, proxiesIP, stnum, maxPage_num_control,
                   log_collection, indexCollection):
    """Crawl the comment pages of product ``pid`` and log the progress.

    Pages ``stnum`` .. ``max_page - 1`` are fetched with ``get_comments``;
    when the site reports 100 or more pages the upper bound is replaced by
    ``maxPage_num_control``. A record is written to ``log_collection`` for
    every successful page, for every page whose comments were (almost) all
    already stored, and for the page on which the crawl is aborted.

    Args:
        commentID_list, ptitle, item_catagory, mongo_collection, proxiesIP,
        indexCollection: Passed through to ``get_comments``.
        pid: Product id.
        file_name: Product index, used only in log output.
        stnum: First page to fetch (converted with ``int``).
        maxPage_num_control: Page limit applied to products with 100 or more
            pages.
        log_collection: Collection receiving the progress records.

    Raises:
        RuntimeError: when three consecutive pages each contain at least ten
            already-stored comments, or when ``get_comments`` returns
            ``'Empty'``.
    """
    max_page_num = int(get_max_page_num(pid))
    Logger.info('max_page_num =' + str(max_page_num))

    if max_page_num >= 100:
        max_page_num = maxPage_num_control

    def _write_log(log_type, page):
        # One progress record; log_time is taken at the moment of writing.
        log_collection.insert({
            'type': log_type,
            'product_index': file_name,
            'current_page': page,
            'log_time': time.time(),
            'product_id': pid
        })

    # Number of consecutive pages made up of already-stored comments.
    cunzai_count = 0
    for page_num2 in range(int(stnum), max_page_num):
        Logger.info('page_num:' + str(page_num2) + 'total' +
                    str(max_page_num) + 'pid =' + str(pid))
        contents = get_comments(commentID_list, pid, ptitle, item_catagory,
                                page_num2, mongo_collection, proxiesIP,
                                indexCollection)
        Logger.info('product Index' + str(file_name))
        # Was `Logger.info(comments)`, an undefined name in this function.
        Logger.info(contents)

        if contents != 'Empty' and len(contents) > 0:
            _write_log('成功', page_num2)
            Logger.info('Sum =' + str(sum(contents)))

            # Each entry is the insertToMongo result (1 = already stored).
            if sum(contents) >= 10:
                _write_log('已经存在', page_num2)
                cunzai_count += 1

                if cunzai_count >= 3:
                    raise RuntimeError(
                        'pid %s: three consecutive pages already stored'
                        % pid)
            else:
                cunzai_count = 0

        if contents == 'Empty':
            _write_log('跳出', page_num2)
            raise RuntimeError(
                'pid %s: page %s could not be fetched' % (pid, page_num2))

        time.sleep(random.randint(5, 10) / 2)  # pause between pages