def get_max_page_num(pid):
    """Return the number of comment pages for JD product ``pid``.

    Requests page 0 of the comment JSON feed and reads its ``maxPage``
    field. Relies on the module-level ``web_json_url`` template and the
    shared ``Logger``.
    """
    url = web_json_url % (pid, 0)
    request_headers = {
        'GET': '',
        'Host': "club.jd.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
        'Referer': 'http://item.jd.com/%s.html' % (pid)
    }
    Logger.info(url)
    resp = requests.request(method='GET', url=url, headers=request_headers)
    # JD serves this endpoint as GBK; ignore undecodable bytes.
    body = resp.content.decode('gbk', 'ignore')
    return int(json.loads(body)['maxPage'])
def insertToMongo(commentID_list, mongo_collection, oneDict, indexCollection):
    """Store one comment document in Mongo unless it is a duplicate.

    Returns 0 when the comment was inserted, 1 when ``oneDict['comment_id']``
    was already present in ``commentID_list`` (nothing is written).
    """
    if oneDict['comment_id'] in commentID_list:
        # Already crawled on an earlier page — skip.
        return 1
    # Normalise the creation timestamp before persisting.
    raw_created_at = oneDict['created_at']
    oneDict['created_at'] = convertTimeStringToDateTime(raw_created_at)
    Logger.info(oneDict['created_at'])
    # NOTE(review): Collection.insert is deprecated in modern pymongo
    # (insert_one) — kept as-is since the installed version is unknown.
    mongo_collection.insert(oneDict)
    # Remember this id so later pages can detect the duplicate.
    commentID_list.append(oneDict['comment_id'])
    updateCommentID_list(oneDict['product_id'], commentID_list, indexCollection)
    return 0
class XpressEngine(object):
    """Selenium-driven client for an XpressEngine (XE) bulletin-board site.

    Wraps login, listing of the member's own documents/comments, and
    deletion of individual documents/comments. All page addresses are the
    stock XE ``index.php?act=...`` endpoints relative to ``self.url``.
    """

    logger = Logger("xe_crawler")

    # Relative XE endpoints used by this client.
    LOGIN_PAGE = '/index.php?act=dispMemberLoginForm'
    MY_PAGE = '/index.php?act=dispMemberInfo'
    OWN_DOCUMENTS = '/index.php?act=dispMemberOwnDocument'
    OWN_COMMENTS = '/index.php?act=dispMemberOwnComment'
    DELETE_COMMENT = '/index.php?act=dispBoardDeleteComment&document_srl='
    DELETE_DOCUMENT = '/index.php?act=dispBoardDelete&document_srl='
    # Query-string fragment appended after the document srl to address a comment.
    CSRL = '&comment_srl='

    def __init__(self, url=None, user_id=None, password=None, headless=False):
        # Missing credentials are only logged, not fatal (exit(1) was
        # deliberately commented out by the original author).
        self.logger.debug("xe_crawler instance created")
        if url is None:
            self.logger.error("웹사이트 주소가 올바르지 않습니다.")
            # exit(1)
        if user_id is None:
            self.logger.error("계정명이 올바르지 않습니다.")
            # exit(1)
        if password is None:
            self.logger.error("패스워드가 올바르지 않습니다.")
            # exit(1)
        self.url = url
        self.user_id = user_id
        self.password = password
        self.isHeadless = headless
        self.browser = None  # set by load_browser()

    def load_browser(self, executable_path):
        """Start a Chrome WebDriver, optionally headless, sized 1920x1080."""
        options = webdriver.ChromeOptions()
        if self.isHeadless:
            options.add_argument('headless')
            options.add_argument('window-size=1920x1080')
            options.add_argument('lang=ko_KR')
        self.browser = webdriver.Chrome(executable_path, chrome_options=options)
        pass

    def load_xe(self):
        """Open the site's front page and wait up to 3s for it to settle."""
        self.browser.get(self.url)
        self.browser.implicitly_wait(3)
        pass

    # Log in to the XE site with the stored credentials.
    def login(self):
        """Submit the login form; return True on success, False otherwise.

        A JS alert after submit means the login was rejected; otherwise we
        wait for the logged-in header link to appear.
        """
        self.browser.get(self.url + self.LOGIN_PAGE)
        self.insert_processing_overlay()
        self.browser.find_element_by_xpath('//*[@id="uid"]').send_keys(
            self.user_id)
        self.browser.find_element_by_xpath('//*[@id="upw"]').send_keys(
            self.password)
        # Click the login submit button.
        self.browser.find_element_by_xpath(
            '//*[@id="commonLogin"]/fieldset/span[2]/input').click()
        result = False
        try:
            # An alert within 2s indicates a login failure message.
            WebDriverWait(self.browser, 2).until(
                EC.alert_is_present(),
                'Timed out waiting for PA creation')
            alert = self.browser.switch_to.alert
            if alert:
                self.logger.error("로그인에 실패했습니다.")
                self.logger.error(alert.text.replace('\n', ' '))
                alert.accept()
                return result
        except TimeoutException as e:
            # No alert appeared — assume the submit went through.
            pass
        try:
            # The header login area only shows this link when authenticated.
            if WebDriverWait(self.browser, 10).until(
                    EC.visibility_of_element_located(
                        (By.XPATH, '//*[@id="header_login"]/span[2]/a'))):
                self.logger.info("로그인 성공")
                self.insert_processing_overlay()
                result = True
            else:
                self.logger.error("로그인에 실패했습니다.")
                self.browser.close()
        except Exception as e:
            self.logger.error(e)
        finally:
            # NOTE: return inside finally swallows any in-flight exception.
            return result
        # exit(1)
        # time.sleep(1)

    def insert_processing_overlay(self):
        """Inject a full-page "working" overlay (spinner + warning text)
        into the current page so the user does not interact with it."""
        self.browser.execute_script("""
            var style = document.createElement('style');
            style.type='text/css';
            style.innerHTML="#text{position:absolute;top:50%;left:50%;font-size:50px;color:#fff;transform:translate(-50%,-50%);-ms-transform:translate(-50%,-50%)} .dogdrip-remover-overlay{z-index:1;position:fixed;width:100%;height:100%;left:0;top:0;background-color:rgba(0,0,0,0.4);overflow-x:hidden;}.loader,.loader:after,.loader:before{top:20%;border-radius:50%;width:2.5em;height:2.5em;-webkit-animation:load7 1.8s infinite ease-in-out;animation:load7 1.8s infinite ease-in-out}.loader{color:#fff;font-size:10px;margin:80px auto;position:relative;text-indent:-9999em;-webkit-transform:translateZ(0);-ms-transform:translateZ(0);transform:translateZ(0);-webkit-animation-delay:-.16s;animation-delay:-.16s}.loader:after,.loader:before{content:'';position:absolute;top:0}.loader:before{left:-3.5em;-webkit-animation-delay:-.32s;animation-delay:-.32s}.loader:after{left:3.5em}@-webkit-keyframes load7{0%,100%,80%{box-shadow:0 2.5em 0 -1.3em}40%{box-shadow:0 2.5em 0 0}}@keyframes load7{0%,100%,80%{box-shadow:0 2.5em 0 -1.3em}40%{box-shadow:0 2.5em 0 0}}";
            document.head.appendChild(style);
            var div = document.createElement('div');
            div.className='dogdrip-remover-overlay';
            document.body.appendChild(div);
            var loader = document.createElement('div');
            loader.className='loader';
            document.getElementsByClassName('dogdrip-remover-overlay')[0].appendChild(loader);
            var text = document.createElement('div');
            text.id='text';
            text.textContent='현재 [개드립 리무버] 작업중입니다! 창을 닫으면 프로그램이 종료되니 닫지마세요';
            document.getElementsByClassName('dogdrip-remover-overlay')[0].appendChild(text);
        """)

    def load_mypage(self):
        """Navigate to the member-info page."""
        self.browser.get(self.url + self.MY_PAGE)
        pass

    def load_my_documents(self, page=1):
        """Open page ``page`` of the member's own-documents list."""
        self.browser.get(self.url + self.OWN_DOCUMENTS + '&page=' + str(page))
        self.insert_processing_overlay()
        self.browser.implicitly_wait(3)
        pass

    def load_my_documents_html(self, page=1):
        """Return the documents table of page ``page`` as BeautifulSoup."""
        self.load_my_documents(page)
        html = self.browser.execute_script(
            'return window.document.getElementsByClassName("colTable")[0].innerHTML'
        )
        return BeautifulSoup(html, 'html.parser')

    def load_my_comments(self, page=1):
        """Open page ``page`` of the member's own-comments list."""
        self.browser.get(self.url + self.OWN_COMMENTS + '&page=' + str(page))
        self.insert_processing_overlay()
        self.browser.implicitly_wait(3)

    def load_my_comments_html(self, page=1):
        """Return the comments table of page ``page`` as BeautifulSoup."""
        self.load_my_comments(page)
        html = self.browser.execute_script(
            'return window.document.getElementsByClassName("colTable")[0].innerHTML'
        )
        return BeautifulSoup(html, 'html.parser')

    def click_by_xpath(self, element):
        """Click the first element matching the given XPath."""
        self.browser.find_element_by_xpath(element).click()

    def delete_comment(self, comment):
        """Delete one comment; ``comment`` is a row whose [0] is the
        comment srl and [1] the parent document srl. Returns True once the
        article list is visible again, False on timeout."""
        try:
            self.browser.get(self.url + self.DELETE_COMMENT + comment[1] +
                             self.CSRL + comment[0])
            self.click_by_xpath(
                '//*[@id="content"]/div[1]/div/form/div/span/input')
            self.insert_processing_overlay()
            # Confirm the "really delete?" alert if one appears.
            WebDriverWait(self.browser, 2).until(
                EC.alert_is_present(),
                'Timed out waiting for PA creation')
            alert = self.browser.switch_to.alert
            alert.accept()
            # self.browser.implicitly_wait(10)
        except TimeoutException as e:
            self.logger.debug("alert not found")
        except Exception as e:
            self.logger.error(e)
            pass
        try:
            # After deletion XE redirects back to the board list.
            if WebDriverWait(self.browser, 10).until(
                    EC.presence_of_element_located(
                        (By.CLASS_NAME, 'articleNum'))):
                return True
        except TimeoutException as e:
            self.logger.debug("could not find article list")
        return False

    def delete_document(self, document):
        """Delete one document (``document[0]`` is its srl). Returns True
        once the article list is visible again, False on timeout."""
        try:
            self.browser.get(self.url + self.DELETE_DOCUMENT + document[0])
            self.click_by_xpath(
                '//*[@id="content"]/div[1]/div/form/div/span/input')
            self.insert_processing_overlay()
            WebDriverWait(self.browser, 2).until(
                EC.alert_is_present(),
                'Timed out waiting for PA creation')
            alert = self.browser.switch_to.alert
            alert.accept()
            # self.browser.implicitly_wait(10)
        except TimeoutException as e:
            self.logger.debug("alert not found")
        except Exception as e:
            self.logger.error(e)
        try:
            WebDriverWait(self.browser, 10).until(
                EC.presence_of_element_located((By.CLASS_NAME, 'articleNum')))
            return True
        except TimeoutException as e:
            self.logger.debug("could not find article list")
        return False

    def quit(self):
        """Shut down the browser if one was started."""
        if self.browser:
            self.browser.quit()
        pass

    def __del__(self):
        self.quit()
def get_Max_Page_Nummber(item_id, shop_id, file_name, log_collection):
    """Return the last comment-page number for a Tmall item, or 'Empty'.

    Fetches the rate-detail JSON at most twice (one retry after a 5s
    back-off) and records a '跳出' document in ``log_collection`` whenever
    the response cannot be decoded or parsed.

    The original implementation copy-pasted the whole fetch/parse/bail-out
    sequence for the retry; this version runs the same sequence in a
    two-iteration loop and drops the unused ``rate_list`` local.
    """

    def _log_bailout():
        # Keep the caller's progress log complete on every failure path.
        log_collection.insert({
            'type': '跳出',
            'product_index': file_name,
            'current_page': 1,
            'log_time': time.time()
        })

    page_url = base_url % (item_id, shop_id, 1)
    Logger.info(page_url)
    for attempt in range(2):
        if attempt:
            # Back off briefly before the single retry.
            time.sleep(5)
        response = requests.request(method='GET', url=page_url,
                                    headers=headers)
        try:
            page_data = response.content.decode('gbk', 'ignore')
        except Exception:
            _log_bailout()
            return 'Empty'
        # The payload is JSONP-ish; the JSON object follows '"rateDetail":'.
        parts = page_data.split('"rateDetail":')
        if len(parts) > 1:
            page_data_dict = json.loads(parts[1])
            myPrint(page_data_dict)
            print('1111111', page_data_dict)
            myPrint(page_data_dict['paginator'])
            return page_data_dict['paginator']['lastPage']
    # Both attempts failed to yield a rateDetail section.
    myPrint('max page Nummber failure')
    _log_bailout()
    return 'Empty'
def get_Tmall_comment(commentID_list, requestDict, comments_collection,
                      product_index, start_num, maxPage_num_control,
                      log_collection, indexCollection):
    """Crawl comment pages ``start_num``..maxPage of one Tmall item.

    Raises RuntimeError to abort the crawl when the page range is invalid,
    a page comes back 'Empty', or three consecutive pages contain only
    already-stored comments.

    Fix: the original used bare ``raise`` with no active exception, which
    surfaces as an opaque ``RuntimeError("No active exception to
    re-raise")``; explicit ``RuntimeError(msg)`` keeps the same exception
    type with a readable message. The repeated log-dict construction is
    folded into ``_log``.
    """
    # itemId is the product id, sellerId the shop id.
    item_id = str(int(requestDict['original_id']))
    shop_id = str(round(float(requestDict['shop_id'])))
    title = str(requestDict['name'])
    catagory = str(requestDict['catagory'])
    # store_type = str(requestDict['store_type'])
    # spider_start_date = str(requestDict['spider_start_date'])

    def _log(log_type, current_page):
        # One progress/bail-out record per event.
        log_collection.insert({
            'type': log_type,
            'product_index': product_index,
            'current_page': current_page,
            'log_time': time.time()
        })

    maxPage = get_Max_Page_Nummber(item_id, shop_id, product_index,
                                   log_collection)
    time.sleep(2)
    Logger.info('maxPage' + str(maxPage))
    if maxPage == 'Empty':
        maxPage = maxPage_num_control
    if int(maxPage) > 99:
        # Tmall only exposes up to 99 pages of comments.
        maxPage = maxPage_num_control
    cunzai_count = 0  # consecutive pages whose comments all already existed
    if maxPage < start_num:
        _log('跳出', maxPage)
        raise RuntimeError('start_num is beyond the last comment page')
    for page_num2 in range(start_num, int(maxPage) + 1):
        contents = get_one_page_comment(commentID_list, item_id, shop_id,
                                        title, catagory, page_num2,
                                        comments_collection, product_index,
                                        log_collection, indexCollection)
        Logger.info('商品索引:' + str(product_index))
        Logger.info('页码:' + str(page_num2))
        Logger.info(contents)
        if contents != 'Empty' and len(contents) > 0:
            _log('成功', page_num2)
            # Each entry is 1 for a duplicate; >= 12 means the page was
            # almost entirely comments we already have.
            if sum(contents) >= 12:
                _log('已经存在', page_num2)
                cunzai_count += 1
                if cunzai_count >= 3:
                    raise RuntimeError(
                        'three consecutive pages of duplicates')
            else:
                cunzai_count = 0
        if contents == 'Empty':
            _log('跳出', page_num2)
            raise RuntimeError('empty comment page')
        time.sleep(random.randint(5, 10) / 10)  # polite crawl delay
class DB(object):
    """SQLite-backed storage for lunch menus and a menu-selection log.

    Table ``menu`` holds unique menu names; ``menu_log`` records each
    selected menu with a local timestamp.
    """

    logger = Logger("DB")

    def __init__(self):
        # Only initialise the schema when the database file is new.
        needs_init = not Path('db/meal.db').is_file()
        self.conn = sqlite3.connect("db/meal.db")
        self.cur = self.conn.cursor()
        if needs_init:
            self.db_init_table()

    def db_init_table(self):
        """Create the menu and menu_log tables."""
        self.logger.info("SQLite3 초기화")
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS menu (
                name TEXT NOT NULL PRIMARY KEY /* 메뉴 이름 */
            )
        """)
        self.conn.commit()
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS menu_log (
                id integer primary key autoincrement, /* 로그 고유번호 */
                name text not null, /* 메뉴 이름 */
                time timestamp DATE DEFAULT(datetime('now', 'localtime')) /* 날짜 */
            )
        """)
        self.conn.commit()

    def select_all(self):
        """Return all menu rows, None when empty, or an error string."""
        try:
            self.cur.execute("SELECT name FROM menu")
            rows = self.cur.fetchall()
            return rows if rows else None
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def insert_by_name(self, menu):
        """Insert a menu name. Returns a success string, None on a
        duplicate name, or True on any other DB error."""
        try:
            self.cur.execute("insert into menu(name) values(?)", (menu, ))
            self.conn.commit()
        except sqlite3.IntegrityError:
            # Primary-key clash: the menu already exists.
            self.logger.debug("메뉴 중복")
            return None
        except Exception as e:
            self.logger.debug(e)
            return True
        return "추가하였습니다."

    def delete_by_name(self, menu):
        """Delete a menu by name; returns a status string."""
        try:
            self.cur.execute("delete from menu where name = ?", (menu, ))
            self.conn.commit()
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."
        return "삭제하였습니다."

    def select_random(self):
        """Return one random menu name, None when the table is empty."""
        try:
            self.cur.execute(
                "select name from menu order by random() limit 1")
            row = self.cur.fetchone()
            return row[0] if row is not None else None
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def insert_log(self, menu_name):
        """Record that ``menu_name`` was selected today."""
        try:
            self.cur.execute("insert into menu_log(name) values(?)",
                             (menu_name,))
            self.conn.commit()
            self.logger.debug("로그 추가")
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def find_log_by_date(self, date):
        """Return the first log entry for ``date`` formatted as
        'id : name time', or an error string when none exists."""
        try:
            self.cur.execute(
                "select * from menu_log where date(time) = date(?) limit 1",
                (date,))
            log_id, menu_name, logged_at = self.cur.fetchone()
            return f"{log_id} : {menu_name} {logged_at}"
        except Exception as e:
            self.logger.debug(e)
            return "DB 에러발생."

    def __del__(self):
        self.conn.close()
class SlackBot(object):
    """Slack RTM bot that recommends and records lunch menus.

    Commands are given by mentioning the bot; ``handle_command`` dispatches
    on the command text and replies in the originating channel. A daily
    alarm posts a menu suggestion at ``self.alarm_time``.

    Fix: the "밥추가"/"메뉴추가" branch called ``self.self.cook_slave`` —
    an AttributeError at runtime — instead of ``self.cook_slave``.
    """

    logger = Logger("SlackBot")

    def __init__(self):
        self.slack_client = SlackClient(config.get("slack_api"))
        self.starterbot_id = None      # filled in by start() via auth.test
        self.RTM_READ_DELAY = 1        # seconds between RTM polls
        self.EXAMPLE_COMMAND = "do"
        self.MENTION_REGEX = "^<@(|[WU].+?)>(.*)"
        self.cook_slave = MenuManager()
        self.alarm_time = "110000"     # default daily-alarm time (HHMMSS)
        self.channel_url = None        # channel registered for alarms

    def parse_bot_commands(self, slack_events):
        """Return (command, channel) for the first event that mentions this
        bot, or (None, None)."""
        for event in slack_events:
            if event["type"] == "message" and "subtype" not in event:
                user_id, message = self.parse_direct_mention(event["text"])
                if user_id == self.starterbot_id:
                    return message, event["channel"]
        return None, None

    def parse_direct_mention(self, message_text):
        """Split '<@UID> text' into (UID, text); (None, None) if no mention."""
        matches = re.search(self.MENTION_REGEX, message_text)
        return (matches.group(1),
                matches.group(2).strip()) if matches else (None, None)

    def handle_command(self, command, channel):
        """Dispatch one command string and post the response to ``channel``."""
        response = None
        try:
            if self.channel_url is None:
                # Require channel registration before anything else.
                if command == "등록":
                    self.channel_url = channel
                    response = "등록되었습니다."
                else:
                    response = "채널등록 먼저 해주세요 슬랙에서 등록이라고 치면 됩니다. ex) 등록"
            elif command == "help" or command == "도움말":
                response = "※SQL인젝션 금지-----\n1. 밥추천/추천/메뉴추천/메뉴 추천 ex) 밥추천\n2. 추가/밥추가/메뉴추가 ex)밥추가 돈가스\n" \
                           "3. 삭제/밥삭제/메뉴삭제 ex) 삭제 돈가스\n4. 확인/메뉴기록/메뉴로그 ex) 확인 2018-11-07\n" \
                           "5. 초기화/재세팅 ex) 초기화\n 6. 모든메뉴 ex) 모든메뉴\n7. 알람설정 ex) 알람설정 110000\n8. 기타설명\n" \
                           "9. 채널변경 및 등록 ex) 등록"
            elif command == "alert":
                # Internal command used by the daily alarm in start().
                response = "오늘의 메뉴는 " + self.cook_slave.menu_rand_select(
                ) + " 입니다."
            elif command.startswith("밥추천") or command.startswith(
                    "추천") or command.startswith("메뉴추천") or command.startswith(
                        "메뉴 추천"):
                response = self.cook_slave.menu_recommend()
            elif command.startswith("추가"):
                if len(command) > 2:
                    result = self.cook_slave.menu_insert(command.split(' ')[1])
                    response = "(" + command.split(' ')[1] + ") " + result
            elif command.startswith("밥추가") or command.startswith("메뉴추가"):
                if len(command) > 3:
                    # Bug fix: was ``self.self.cook_slave`` (AttributeError).
                    result = self.cook_slave.menu_insert(
                        command.split(' ')[1])
                    response = "(" + command.split(' ')[1] + ") " + result
            elif command.startswith("삭제"):
                if len(command) > 2:
                    result = self.cook_slave.menu_delete(command.split(' ')[1])
                    response = "(" + command.split(' ')[1] + ") " + result
            elif command.startswith("밥삭제") or command.startswith("메뉴삭제"):
                if len(command) > 3:
                    result = self.cook_slave.menu_delete(command.split(' ')[1])
                    response = "(" + command.split(' ')[1] + ") " + result
            elif command.startswith("확인"):
                if len(command) > 2:
                    result = self.cook_slave.find_menu_log(
                        command.split(' ')[1])
                    response = result
            elif command.startswith("메뉴기록") or command.startswith("메뉴로그"):
                if len(command) > 3:
                    result = self.cook_slave.find_menu_log(
                        command.split(' ')[1])
                    response = result
            elif command.startswith("초기화") or command.startswith("재세팅"):
                result = self.cook_slave.menu_setting()
                response = result
            elif command.startswith("알람설정"):
                if len(command) > 4:
                    alarm = command.split(' ')[1]
                    result = self.cook_slave.set_alarm_time(alarm)
                    if result is True:
                        self.alarm_time = alarm
                        response = "매일 " + alarm + " 시간에 메뉴와 알림이 옵니다"
                    else:
                        response = result
            elif command.startswith("모든메뉴"):
                response = self.cook_slave.all_menu_check()
            elif command.startswith("기타설명"):
                response = "1. 초기화/재세팅은 매주마다 먹은 음식이 겹치지 않도록 하는 것이며, 월요일 오전 9시마다 자동으로 재세팅 되는 것이므로" \
                           "따로 입력을 하지 않아도 되나, 하는 것은 자유이다.\n2. 로그는 항상 YYYY-MM-DD로 입력해야 한다." \
                           "\n3. 알람설정은 매일 같은시간에 점심메뉴를 추천해주는 알림을 제공해주며 기본값은 110000이고" \
                           "153030은 3시30분30초라는 의미이다."
            elif command == "등록":
                # Re-register: move alarms to the current channel.
                self.channel_url = channel
                response = "등록되었습니다."
            else:
                response = "help 또는 도움말 라고 입력해주세요"
        except Exception as e:
            self.logger.debug(e)
            response = "※ SQL인젝션 금지------\nhelp 또는 도움말을 통해 설명을 다시 확인해주세요"
        # Send the response back to the channel.
        self.slack_client.api_call("chat.postMessage",
                                   channel=channel,
                                   text=response)

    def start(self):
        """Connect to Slack RTM and loop: handle commands, reset the menu
        pool every Monday 09:00:00, and fire the daily alarm."""
        if self.slack_client.rtm_connect(with_team_state=False):
            self.logger.debug("Starter Bot connected and running!")
            # Read bot's user ID by calling Web API method `auth.test`.
            self.starterbot_id = self.slack_client.api_call(
                "auth.test")["user_id"]
            self.cook_slave.menu_setting()
            while True:
                command, channel = self.parse_bot_commands(
                    self.slack_client.rtm_read())
                time_check = time.strftime('%H%M%S')
                r = datetime.datetime.today().weekday()
                if r == 0 and time_check == "090000":
                    # Monday 09:00 — reset the weekly menu pool.
                    self.cook_slave.menu_setting()
                if time_check == self.alarm_time and self.channel_url is not None:
                    self.handle_command("alert", self.channel_url)
                if command:
                    self.handle_command(command, channel)
                time.sleep(self.RTM_READ_DELAY)
        else:
            self.logger.debug(
                "Connection failed. Exception traceback printed above.")
class DogdripRemover(object):
    """Collects and deletes the logged-in member's documents and comments
    on dogdrip.net (an XpressEngine board), caching metadata in SQLite.

    Workflow: download/unpack chromedriver, open a local SQLite cache,
    drive the site through :class:`XpressEngine`, scrape the member's
    document/comment lists into SQLite, enrich each row with board/child
    info via plain HTTP requests, then delete rows one by one.
    """

    dogdripBrowser: XpressEngine
    logger = Logger("DogdripRemover")

    # Per-OS chromedriver download URLs (pinned to 2.38).
    CHROME_DRIVER_URL = dict(Linux='https://chromedriver.storage.googleapis.com/2.38/chromedriver_linux64.zip',
                             Darwin='https://chromedriver.storage.googleapis.com/2.38/chromedriver_mac64.zip',
                             Windows='https://chromedriver.storage.googleapis.com/2.38/chromedriver_win32.zip')
    WEBSITE_URL = 'http://dogdrip.net'
    conn = None
    cur = None

    # Constructor: install chromedriver, initialise the SQLite cache.
    # NOTE(review): ``arch=platform.system()`` is evaluated once at class
    # definition time, not per call — intentional here since the OS does
    # not change, but worth knowing.
    def __init__(self, arch=platform.system()):
        self.logger.debug("DogdripRemover instance created")
        self.driverPath = None
        self.conn = None
        self.cur = None
        try:
            if not Path('./chromedriver.zip').is_file():
                # 1. Download chromedriver for this OS.
                self.logger.info("크롬 드라이버를 다운로드합니다.")
                self.logger.debug("OS: %s", str(arch))
                self.logger.debug("URL: %s", str(self.CHROME_DRIVER_URL.get(arch)))
                wget.download(self.CHROME_DRIVER_URL.get(arch), './chromedriver.zip')
                self.logger.debug("크롬드라이버 다운로드 성공. 압축 해제")
                chromedriver = zipfile.ZipFile('./chromedriver.zip')
                chromedriver.extractall('./')
            else:
                # Archive already present — reuse it.
                chromedriver = zipfile.ZipFile('./chromedriver.zip')
            self.driverPath = os.path.realpath(chromedriver.namelist()[0])
            self.logger.debug("압축해제 완료. chromedriver 위치: %s", self.driverPath)
            chromedriver.close()
            # Make the chromedriver binary executable (non-Windows only).
            if not arch == "Windows":
                st = os.stat(self.driverPath)
                os.chmod(self.driverPath, st.st_mode | stat.S_IEXEC)
                self.logger.debug("크롬 드라이버 실행 권한 변경 완료")
            # SQLite3 setup: create the schema only on first run.
            if not Path('./my_dogdrip.db').is_file():
                self.conn = sqlite3.connect("my_dogdrip.db")
                self.cur = self.conn.cursor()
                self.db_initialize()
            else:
                self.conn = sqlite3.connect("my_dogdrip.db")
                self.cur = self.conn.cursor()
        except sqlite3.Error as e:
            self.logger.error("데이터베이스 초기화에 실패했습니다.")
            self.logger.exception(e)
            exit(1)
        except Exception as e:
            self.logger.error("크롬 드라이버 다운로드에 실패했습니다.")
            self.logger.exception(e)
            exit(1)
        # self.dogdripBrowser = XpressEngine(url=self.WEBSITE_URL,
        #                                    user_id=config.get('user_id'),
        #                                    password=config.get('password'),
        #                                    headless=False)

    def db_initialize(self):
        """
        Initialise the SQLite schema.

        SQLite is used as a local cache of the member's own comments and
        documents so that condition checks are fast, instead of issuing an
        HTTP request for every query.
        """
        # Documents table.
        self.logger.info("SQLite3 초기화중...")
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS documents (
                document_srl TEXT PRIMARY KEY, /* 게시물 고유번호 */
                href TEXT NOT NULL, /* 게시물 고유주소 */
                title TEXT, /* 게시물 제목 */
                content TEXT, /* 게시물 내용 */
                view_count INT, /* 조회 수 */
                comment_count INT, /* 댓글 수 */
                vote_up INT, /* 개드립 */
                vote_down INT, /* 붐업 */
                created_at TEXT, /* 작성시간 */
                target_board TEXT, /* 게시판 이름 */
                is_deleted INTEGER DEFAULT 0 /* 삭제여부 */
            )
        """)
        self.conn.execute("CREATE INDEX IF NOT EXISTS d_target_board_idx ON documents(`target_board`)")
        self.conn.commit()
        # Comments table.
        self.conn.execute("""
            CREATE TABLE IF NOT EXISTS comments (
                comment_srl TEXT PRIMARY KEY, /* 댓글 고유번호 */
                target_srl TEXT, /* 댓글이 작성된 원래 문서의 고유번호 */
                href TEXT NOT NULL, /* 고유 주소 */
                content TEXT, /* 댓글 내용 */
                created_at TEXT, /* 작성 시간 */
                target_board TEXT, /* 게시판 이름 */
                has_child INTEGER DEFAULT 0, /* 대댓글 여부 */
                is_deleted INTEGER DEFAULT 0 /* 삭제여부 */
            )
        """)
        self.conn.execute("CREATE INDEX IF NOT EXISTS c_target_board_idx ON comments(`target_board`)")
        self.conn.commit()
        self.logger.info("SQLite3 초기화 완료!")

    # Bulk-insert comment rows into SQLite (upsert on comment_srl).
    def insert_comments(self, comments):
        insert_into_comments = "INSERT OR REPLACE INTO comments(comment_srl, target_srl, href, content, created_at) " \
                               "VALUES (?,?,?,?,datetime(?)) "
        self.cur.executemany(insert_into_comments, comments)
        self.conn.commit()

    # Bulk-insert document rows into SQLite (upsert on document_srl).
    def insert_documents(self, documents):
        insert_into_documents = "INSERT OR REPLACE INTO documents(document_srl, href, title, comment_count, " \
                                "view_count, vote_up, created_at) VALUES (?,?,?,?,?,?,date(?)) "
        self.cur.executemany(insert_into_documents, documents)
        self.conn.commit()

    def login(self, user_id=config.get('user_id'), password=config.get('password')):
        """Open a browser session and log in to the site; returns the
        XpressEngine login result (True on success)."""
        self.dogdripBrowser = XpressEngine(url=self.WEBSITE_URL,
                                           user_id=user_id,
                                           password=password,
                                           headless=False)
        self.dogdripBrowser.load_browser(self.driverPath)
        return self.dogdripBrowser.login()

    def comments_find_all(self):
        """Return every cached comment row."""
        with self.conn:
            self.cur.execute("SELECT * FROM COMMENTS")
            comments = self.cur.fetchall()
            return comments

    def comments_find_not_deleted(self):
        """Return cached comment rows.

        NOTE(review): despite the name, this selects ALL comments — there
        is no ``is_deleted = 0`` filter; confirm whether that is intended.
        """
        with self.conn:
            self.cur.execute("SELECT * FROM comments")
            comments = self.cur.fetchall()
            return comments

    def update_document_detail(self, results):
        """Write board name and body text (from request_document_info
        results) back onto cached document rows."""
        self.logger.debug("게시물에 상세정보를 추가합니다.")
        updatesql = "UPDATE documents SET target_board=?, content=? WHERE document_srl=?"
        new_infos = []
        for result in results:
            if result:
                document_srl = result[0][0]
                target_board = result[1]
                content = result[2]
                new_infos.append((target_board, content, document_srl))
            else:
                # Detail fetch failed for this document — skip it.
                pass
        self.cur.executemany(updatesql, new_infos)
        self.conn.commit()

    def update_comment_detail(self, results):
        """Write board name and has-child flag (from request_comment_info
        results) back onto cached comment rows."""
        self.logger.debug("댓글에 상세정보를 추가합니다.(소속 게사판, 대댓글여부)")
        updatesql = "UPDATE comments SET target_board=?, has_child=? WHERE comment_srl=?"
        new_infos = []
        for result in results:
            if result:
                comment_srl = result[0][0]
                target_board = result[1]
                has_child = result[2]
                new_infos.append((target_board, has_child, comment_srl))
        self.cur.executemany(updatesql, new_infos)
        self.conn.commit()

    def update_is_deleted(self, doc, type='comment'):
        """Mark one cached comment or document row (``doc[0]`` is its srl)
        as deleted."""
        self.logger.debug("삭제여부를 변경합니다.")
        self.logger.debug(doc)
        if type == 'comment':
            updatesql = "UPDATE comments SET is_deleted='1' WHERE comment_srl=?"
            self.cur.execute(updatesql, [doc[0]])
            self.conn.commit()
        elif type == 'document':
            updatesql = "UPDATE documents SET is_deleted='1' WHERE document_srl=?"
            self.cur.execute(updatesql, [doc[0]])
            self.conn.commit()
        pass

    # Collect basic document info from the member's own-documents list:
    # srl, title, href, view count, vote count, creation time.
    def fetch_document_list(self):
        self.logger.info("작성한 게시물 목록을 불러오고 있습니다...")
        html = self.dogdripBrowser.load_my_documents_html()
        current_page, total_page = self.get_pagination_info(html.find('caption').get_text())
        self.logger.info("총 %s페이지의 문서가 있습니다.", total_page)
        for page in range(int(current_page), int(total_page) + 1):
            self.logger.info("%s페이지 중 %s페이지를 수집하고있습니다.", str(total_page), str(page))
            html = self.dogdripBrowser.load_my_documents_html(page)
            html_document_list = html.find_all('tr')
            documents = self.parse_document(html_document_list)
            self.insert_documents(documents)

    # Parse document rows out of the my-page table HTML.
    def parse_document(self, html_document_list):
        """Return a list of (srl, href, title, comment_count, view_count,
        vote_up, created_at) tuples, or None when the table is empty."""
        documents = []
        # Trailing " [n]" on the title cell is the comment count.
        pattern = re.compile(r'\s\[\d*.\]$')
        if html_document_list:
            for document_list in html_document_list:
                # Save the document info available on the my-page listing.
                content = document_list.find("td", {"class": "wide"})
                if content:
                    document = content.find("a")
                    if document:
                        href = document['href']
                        # Reduce the URL path to its first segment = srl.
                        document_srl = urlparse(href).path
                        while os.path.dirname(document_srl) != '/':
                            document_srl = os.path.dirname(document_srl)
                        document_srl = document_srl.replace('/', '')
                        comment_count = pattern.findall(content.get_text().strip())
                        if comment_count:
                            comment_count = comment_count[0].replace('[', '').replace(']', '').strip()
                        else:
                            comment_count = "0"
                        title = document.get_text()
                        # Sibling cells: views, then votes, then date.
                        view_count = document.findNext()
                        vote_up = view_count.findNext()
                        created_at = vote_up.findNext().get_text()
                        self.logger.debug("원본 게시물 번호: %s, 게시물 주소: %s, 제목: %s, 댓글 수: %s, 조회 수: %s, 개드립 수: %s, 작성일: %s",
                                          document_srl, href, title, comment_count,
                                          view_count.get_text(), vote_up.get_text(), created_at)
                        documents.append((document_srl, href, title, comment_count,
                                          view_count.get_text(), vote_up.get_text(), created_at))
            return documents
        else:
            return None

    # Collect basic comment info from the member's own-comments list:
    # content, comment srl, parent document srl, href, creation time.
    def fetch_comment_list(self):
        self.logger.info("작성한 댓글 목록을 불러오고 있습니다...")
        html = self.dogdripBrowser.load_my_comments_html()
        current_page, total_page = self.get_pagination_info(html.find('caption').get_text())
        self.logger.info("총 %s페이지의 댓글이 있습니다.", total_page)
        for page in range(int(current_page), int(total_page) + 1):
            self.logger.info("총 %s페이지 중 %s페이지를 수집하고있습니다.", str(total_page), str(page))
            html = self.dogdripBrowser.load_my_comments_html(page)
            html_comment_list = html.find_all('tr')
            comments = self.parse_comment(html_comment_list)
            self.insert_comments(comments)

    # Parse comment rows out of the my-page table HTML.
    def parse_comment(self, html_comment_list):
        """Return a list of (comment_srl, target_srl, href, content,
        created_at) tuples, or None when the table is empty. Rows whose
        text is "[삭제 되었습니다]" are skipped."""
        comments = []
        if html_comment_list:
            for comment_list in html_comment_list:
                # Save the comment info available on the my-page listing.
                content = comment_list.find("td", {"class": "wide"})
                if content:
                    comment = content.find("a")
                    if comment:
                        # Parent document srl = first URL path segment.
                        target_srl = urlparse(comment['href']).path
                        while os.path.dirname(target_srl) != '/':
                            target_srl = os.path.dirname(target_srl)
                        target_srl = target_srl.replace('/', '')
                        # Comment srl comes from the '#comment_<srl>' fragment.
                        comment_srl = comment['href'].split("#")[1].split("_")[1]
                        created_at = comment_list.find("td", {"class": "nowrap"}).get_text()
                        self.logger.debug("원본 게시물 번호: %s, 댓글 고유번호: %s, 작성시간: %s",
                                          target_srl, comment_srl,
                                          created_at.strip().replace('\n', ' '))
                        if not comment.get_text() == "[삭제 되었습니다]":
                            comments.append((comment_srl, target_srl, comment['href'],
                                             comment.get_text(), created_at))
            return comments
        else:
            return None

    @classmethod
    def get_pagination_info(cls, text):
        """Parse an XE table caption like '1 / 12' into ['1', '12']
        (current page, total pages)."""
        pattern = re.compile(r'\b[0-9]*.\/.[0-9]*')
        pagination = pattern.findall(text)
        return pagination[0].split('/')

    # Collect comment details (has-replies flag and owning board) with a
    # process pool of plain HTTP fetches.
    def add_comment_detail_job(self, comments):
        pool = Pool(processes=config.get('process_concurrency'))
        results = pool.imap_unordered(self.request_comment_info, comments)
        self.update_comment_detail(results)

    # HTTP fetch of one comment page to find its board and reply status.
    def request_comment_info(self, comment):
        """Return (comment, board_name, has_child) for one cached comment
        row; ``comment[2]`` is the comment's URL."""
        start_time = millis()
        with requests.get(comment[2]) as res:
            if res.status_code == 200:
                self.logger.debug("댓글 로드완료. 시간: %sms, url: %s", str(millis() - start_time), comment[2])
                page = res.text
                page = BeautifulSoup(page, 'html.parser')
                # Find the owning board's address.
                # board_name = page.find_all("li", {"class": "category"})[-1].find("a")["href"].replace("/", "")
                board_title = page.find_all("div", {"class": "boardHeaderBorder"})
                board_name = ''
                if board_title:
                    board_name = urlparse(board_title[0].find("a")["href"]).path.replace('/', '')
                # Check for replies to this comment.
                has_child = "1" if page.find_all("div", {"parent_srl": comment[0]}) else "0"
                return comment, board_name, has_child

    def collect_comment_details(self):
        """Enrich every cached comment with board/reply details."""
        comments = self.comments_find_all()
        self.add_comment_detail_job(comments)

    def add_document_detail_job(self, documents):
        """Collect document details (board, body text) with a process pool."""
        pool = Pool(processes=config.get('process_concurrency'))
        results = pool.imap_unordered(self.request_document_info, documents)
        self.update_document_detail(results)

    def request_document_info(self, document):
        """Return (document, board_name, content) for one cached document
        row (``document[1]`` is its URL), or None when the page cannot be
        parsed and must be deleted manually."""
        start_time = millis()
        with requests.get(document[1]) as res:
            if res.status_code == 200:
                self.logger.debug("페이지 로드완료. 시간: %sms, url: %s", str(millis() - start_time), document[1])
                page = res.text
                page = BeautifulSoup(page, 'html.parser')
                # Find the owning board's address.
                board_title = page.find_all("div", {"class": "boardHeaderBorder"})
                board_name = ''
                if board_title:
                    board_name = urlparse(board_title[0].find("a")["href"]).path.replace('/', '')
                content = page.find_all("div", {"class": "xe_content"})
                if content:
                    content = content[0].get_text().strip()
                if board_name == '':
                    self.logger.debug("게시물 파싱 불가. 직접 삭제 요망. 주소: %s", document[1])
                    return None
                if not content:
                    self.logger.debug("게시물 파싱 불가. 직접 삭제 요망. 주소: %s", document[1])
                    return None
                return document, board_name, content

    def collect_document_details(self):
        """Enrich every cached document with board/body details."""
        documents = self.documents_find_all()
        self.add_document_detail_job(documents)

    def documents_find_all(self):
        """Return every cached document row."""
        with self.conn:
            self.cur.execute("SELECT * FROM documents")
            documents = self.cur.fetchall()
            return documents

    def delete_all_documents_job(self):
        """Delete every cached document via the browser, marking each
        successful deletion in SQLite."""
        documents = self.documents_find_all()
        # pool = Pool(processes=config.get('process_concurrency'))
        # results = pool.imap_unordered(self.request_comment_info, documents)
        for document in documents:
            if self.delete_selenium_document(document):
                self.update_is_deleted(document, type="document")
        pass

    def delete_selenium_document(self, document):
        """Delete one document through the browser session."""
        self.logger.debug(document)
        return self.dogdripBrowser.delete_document(document)

    def delete_all_comments_job(self):
        """Delete every cached comment via the browser, marking each
        successful deletion in SQLite."""
        comments = self.comments_find_not_deleted()
        # pool = Pool(processes=config.get('process_concurrency'))
        # results = pool.imap_unordered(self.delete_selenium_comments, comments)
        for comment in comments:
            if self.delete_selenium_comment(comment):
                self.update_is_deleted(comment, type="comment")
        pass

    def delete_selenium_comment(self, comment):
        """Delete one comment through the browser session.

        NOTE(review): rows whose column 5 equals 'temp' are skipped and the
        method then implicitly returns None (treated as falsy by callers).
        """
        if not comment[5] == 'temp':
            self.logger.debug(comment)
            return self.dogdripBrowser.delete_comment(comment)

    def __del__(self):
        # Best-effort cleanup of the browser and DB handles.
        self.logger.debug("DogdripRemover 인스턴스가 종료되었습니다.")
        try:
            self.dogdripBrowser.quit()
            self.cur.close()
            self.conn.close()
        except Exception as e:
            self.logger.exception(str(e))
            pass
class MenuManager(object):
    """In-memory weekly menu pool backed by the :class:`DB` SQLite store.

    ``self.menu`` caches the menu names for the current week;
    ``menu_rand_select`` draws without replacement so the same menu is not
    repeated until the pool is re-seeded by ``menu_setting``.

    Fix: ``set_alarm_time`` used to fall through and return ``None`` when
    the input was not exactly 6 characters, which the SlackBot then posted
    as a null message; it now returns the existing "시간오류" error string.
    """

    logger = Logger("MenuManager")

    def __init__(self):
        self.db = DB()
        self.menu = []  # this week's remaining menu pool

    def menu_setting(self):
        """Re-seed the in-memory pool from the database."""
        self.menu.clear()
        rows = self.db.select_all()  # fetch every menu
        if rows is None:
            return "데이터베이스에 추가된 메뉴 없음"
        for (row, ) in rows:
            self.menu.append(row)
        return "메뉴 세팅완료."

    def all_menu_check(self):
        """Return every stored menu as a ' / '-separated string."""
        rows = self.db.select_all()
        if rows is None:
            return "데이터베이스에 추가된 메뉴 없음"
        all_menu = ""
        i = 0
        for (row, ) in rows:
            i += 1
            # No trailing separator after the last menu.
            if i == len(rows):
                all_menu += row
            else:
                all_menu += row + " / "
        return all_menu

    def menu_rand_select(self):
        """Draw one menu without replacement and log it as today's pick."""
        if len(self.menu) == 0:
            self.menu_setting()
            if len(self.menu) == 0:
                return "데이터베이스에 추가된 메뉴 없음"
        rand_num = randrange(len(self.menu))
        today_menu = self.menu[rand_num]
        del self.menu[rand_num]          # don't repeat it this week
        self.db.insert_log(today_menu)   # record today's pick
        return today_menu

    def menu_insert(self, menu_name):
        """Add a menu; returns a status string for the bot to post."""
        result = self.db.insert_by_name(menu_name)
        if result is None:
            return "이미 추가되어있는 메뉴입니다."
        elif result is True:
            # DB.insert_by_name signals a generic DB failure with True.
            return "DB 에러"
        self.menu.append(menu_name)
        return result

    def menu_delete(self, menu_name):
        """Remove a menu from the DB and the in-memory pool."""
        try:
            result = self.db.delete_by_name(menu_name)
            self.menu.remove(menu_name)
            return result
        except ValueError:
            # list.remove failed — the menu was not in the current pool.
            return "존재하지 않는 메뉴입니다"

    def menu_recommend(self):
        """Return a random menu straight from the DB (with replacement)."""
        result = self.db.select_random()
        if result is None:
            return "추가된 메뉴가 없음."
        return result

    def find_menu_log(self, date):
        """Return the menu logged on ``date`` (YYYY-MM-DD)."""
        result = self.db.find_log_by_date(date)
        if result is None:
            return date + "에 먹은 음식이 존재하지 않습니다."
        return result

    def set_alarm_time(self, time):
        """Validate an HHMMSS alarm string.

        Returns True when valid; otherwise a user-facing error string.
        """
        try:
            if len(time) == 6:
                int_time = int(time)
                if int_time > 240000:
                    return "시간이 24시를 초과할 수 없습니다."
                elif int_time < 0:
                    return "시간이 음수가 될 수 없습니다."
                return True
            # Fix: wrong-length input previously returned None implicitly.
            return "시간오류"
        except ValueError:
            return "숫자가 아닙니다."
        except Exception as e:
            self.logger.debug(e)
            return "시간오류"
def get_comments(commentID_list, pid, ptitle, item_catagory, page_num,
                 mongo_collection, proxiesIP, indexCollection):
    """Fetch page `page_num` of JD.com comments for product `pid` and store them.

    Returns the list of insertToMongo results for the page (0 = inserted,
    1 = already present), or the string 'Empty' when the HTTP request, the
    JSON parsing, or the per-comment processing fails.
    """
    headers1 = {
        'GET': '',
        'Host': "club.jd.com",
        'User-Agent': "Mozilla/5.0 (Windows NT 6.2; rv:29.0) Gecko/20100101 Firefox/29.0",
        'Referer': 'http://item.jd.com/%s.html' % (pid)
    }
    Logger.info(page_num)
    page_url = web_json_url % (pid, page_num)
    # req = urllib2.Request(url = page_url, headers = headers1)
    Logger.info(page_url)
    try:
        response = requests.request(method='GET', url=page_url, headers=headers1)
        # session =requests.session()
        # session.get(url=page_url,)
    except (Exception):
        # Request failed: warn audibly and skip this page.
        # myPrint(e)
        successtag = False
        # i+=1
        # myPrint(Exception)
        Logger.info('ccc 跳过')
        # import os
        os.system('say "attention please, your program has Exception "')
        # response=None
        return 'Empty'
    try:
        # page_data = urllib2.urlopen(req, timeout = 10).read()
        # JD serves GBK-encoded JSON; undecodable bytes are dropped.
        html_content = response.content.decode('gbk', 'ignore')
        page_dict = json.loads(html_content)
        # myPrint(page_dict)
        # raise
        comments = page_dict['comments']
        successtag = True
        myPrint(pid, len(comments))
    except (Exception):
        # Malformed or blocked response: skip this page.
        successtag = False
        # i+=1
        # myPrint('ccc 跳过')
        # os.system('say "attention please, your program has Exception "')
        return 'Empty'
    try:
        contents111 = []
        for comment in comments:
            # contents.append(comment['content'])
            # myPrint(comment)
            commentDict = {}
            content = deleteEscapeCharacter(comment['content'])
            # myPrint(content)
            created_at = comment['creationTime']
            userLevelName = comment['userLevelName']
            isMobile = comment['isMobile']
            score = comment['score']
            comment_id = comment['id']
            product_id = comment['referenceId']
            if product_id == '':
                # A missing product id aborts the whole page via the outer except.
                raise
            # myPrint(creationTime,userLevelName,isMobile)
            tagsList = []
            if 'commentTags' in comment.keys():
                commentTags = comment['commentTags']
                # myPrint('ccc', commentTags)
                for commenTag in commentTags:
                    tagsList.append(commenTag['name'])
                    # myPrint(commenTag['name'])
            commentDict['comment_id'] = comment_id
            commentDict['product_id'] = str(product_id)
            commentDict['content'] = content
            commentDict['userLevelName'] = userLevelName
            commentDict['commentTags'] = tagsList
            commentDict['created_at'] = created_at
            commentDict['isMobile'] = isMobile
            commentDict['score'] = score
            commentDict['product_title'] = ptitle
            commentDict['category'] = item_catagory
            commentDict['store_id'] = 1
            # commentDict['store_type'] = item_store_type
            # commentDict['spider_start_date']=spider_start_date
            # print(commentDict['product_title'])
            # myPrint(commentDict)
            # Only store comments that actually belong to this product.
            if commentDict['product_id'] == pid:
                insert_reslut = insertToMongo(commentID_list, mongo_collection,
                                              commentDict, indexCollection)
                contents111.append(insert_reslut)
    except Exception:
        Logger.info('ccc 跳过')
        # import os
        os.system('say "attention please, your program has Exception "')
        # response=None
        return 'Empty'
    Logger.info(contents111)
    return contents111
def scraw_web_json(commentID_list, pid, ptitle, item_catagory, file_name,
                   mongo_collection, proxiesIP, stnum, maxPage_num_control,
                   log_collection, indexCollection):
    """Crawl comment pages for product `pid` and store the comments in Mongo.

    Pages run from `stnum` up to the product's reported max page, capped at
    `maxPage_num_control` when the site reports 100 or more pages.  One log
    record per page is written to `log_collection`.

    Raises:
        RuntimeError: when a page fetch returns 'Empty', or when three
            consecutive pages contained only already-stored comments, so the
            caller can abort this product.  (The original code used bare
            `raise` here, which outside an except block raises RuntimeError
            anyway — the type callers see is unchanged.)
    """
    max_page_num = int(round(get_max_page_num(pid), 0))
    Logger.info('max_page_num =' + str(max_page_num))
    content_set = set()
    # max_page_num = 151
    # Cap runaway page counts reported by the site.
    if max_page_num >= 100:
        max_page_num = maxPage_num_control
    cunzai_count = 0  # consecutive pages whose comments all already existed
    for page_num2 in range(int(stnum), max_page_num):
        # if page_num%1==0:
        Logger.info('page_num:' + str(page_num2) + 'total' + str(max_page_num) + 'pid =' + str(pid))
        contents = get_comments(commentID_list, pid, ptitle, item_catagory,
                                page_num2, mongo_collection, proxiesIP,
                                indexCollection)
        Logger.info('product Index' + str(file_name))
        # BUGFIX: was Logger.info(comments) — `comments` is undefined in this
        # function (NameError); the page result is named `contents`.
        Logger.info(contents)
        if contents != 'Empty' and len(contents) > 0:
            log_dict = {
                'type': '成功',
                'product_index': file_name,
                'current_page': page_num2,
                'log_time': time.time(),
                'product_id': pid
            }
            log_collection.insert(log_dict)
            Logger.info('Sum =' + str(sum(contents)))
            # sum(contents) counts duplicates on this page (1 per existing comment).
            if sum(contents) >= 10:
                log_dict = {
                    'type': '已经存在',
                    'product_index': file_name,
                    'current_page': page_num2,
                    'log_time': time.time(),
                    'product_id': pid
                }
                log_collection.insert(log_dict)
                cunzai_count += 1
                if cunzai_count >= 3:
                    # Three duplicate-only pages in a row: stop crawling this product.
                    raise RuntimeError('three duplicate pages in a row for product %s' % pid)
            else:
                cunzai_count = 0
        if contents == 'Empty':
            log_dict = {
                'type': '跳出',
                'product_index': file_name,
                'current_page': page_num2,
                'log_time': time.time(),
                'product_id': pid
            }
            log_collection.insert(log_dict)
            # os.system('say "attention please, your program has Exception "')
            raise RuntimeError('page fetch failed for product %s' % pid)
        time.sleep(random.randint(5, 10) / 2)  # polite random delay between pages