def get_conn(self):
    """Open a MySQL connection from the ``Services`` settings and return it.

    The new connection is stored on ``self.conn`` with autocommit enabled.
    A connection failure is logged as a warning and is not re-raised.

    :return: ``self.conn``. If connecting failed, this is whatever
        connection was stored on the instance before the call, or ``None``
        when there was none.
    """
    logger.info("connecting to mysql...")
    # Log the connection target with the password masked so the credential
    # never ends up in the log output.
    logger.info((Services.host, Services.port, Services.username,
                 '***', Services.database))
    try:
        self.conn = mysql.connector.connect(host=Services.host,
                                            port=Services.port,
                                            user=Services.username,
                                            password=Services.password,
                                            database=Services.database,
                                            charset='utf8')
        self.conn.autocommit = True
    except Exception as e:
        logger.warning('Fail to connect database: %s' % e)
        logger.warning('stop')
    # ``self.conn`` does not exist yet when the very first connect fails;
    # fall back to None instead of raising AttributeError.
    return getattr(self, 'conn', None)
def _check_useful_url(self, url):
    """Check whether the driver can load ``url``.

    :param url: address passed to ``self.driver.get``
    :return: ``True`` when the page load raised no WebDriver error,
        ``False`` otherwise
    """
    try:
        logger.info("check url: " + str(url))
        self.driver.get(url=url)
        logger.info("useful url: " + str(url))
        return True
    # ``exceptions`` is a module, and an except clause must name an
    # exception class; catch Selenium's base exception class instead.
    except exceptions.WebDriverException:
        logger.warning("Unuseful url: " + str(url))
        return False
def _driver_open_url(self, url):
    """Open ``url`` in the driver and refresh the tracked current URL.

    A WebDriver error while loading is logged as a warning and is not
    re-raised.

    :param url: address passed to ``self.driver.get``
    :return: None
    """
    try:
        logger.info("open url: " + str(url))
        self.driver.get(url=url)
        logger.info("current url :" + self.driver.current_url)
    # ``exceptions`` is a module, and an except clause must name an
    # exception class; catch Selenium's base exception class instead.
    except exceptions.WebDriverException as e:
        logger.warning("open url: %s fail" % url)
        # The exception must be converted explicitly: str + exception
        # raises TypeError.
        logger.warning("error:" + str(e))
    self._update_current_url()
def select_id_text_point_url(self):
    """Fetch ``(id, text, point_url)`` rows joined from ``xpath`` and ``relation``.

    A query failure is logged as a warning and followed by a rollback; it
    is not re-raised.

    :return: the rows returned by ``fetchall()``, or an empty list when the
        query failed
    """
    logger.info("select_id_text_point_url...")
    sql = "select x.id, r.text, x.point_url from xpath as x,relation as r where x.id = r.id"
    # Bound up front so the failure path returns an empty result instead of
    # raising UnboundLocalError.
    data = []
    try:
        cur = self.conn.cursor()
        cur.execute(sql)
        data = cur.fetchall()
        logger.info("select text success")
    except Exception as e:
        logger.warning("select_text fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
    return data
def update_true_status_to_content(self, url):
    """Set ``status = TRUE`` on the ``content`` rows whose ``url`` matches.

    A failure is logged as a warning and followed by a rollback; it is not
    re-raised.

    :param url: value matched against the ``url`` column
    :return: None
    """
    logger.info("update_true_status_to_content" + ' url:' + str(url))
    # The URL is bound as a query parameter rather than formatted into the
    # statement, so quotes or SQL fragments in it cannot alter the query.
    sql = "UPDATE content SET status = TRUE WHERE url = %s"
    logger.debug('sql:' + sql)
    try:
        self.conn.cursor().execute(sql, (url,))
        logger.info("update_true_status_to_content success")
    except Exception as e:
        logger.warning("update_true_status_to_content fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
def select_text(self, url):
    """Fetch ``(relation.id, relation.text)`` rows for the xpaths stored for ``url``.

    A query failure is logged as a warning and followed by a rollback; it
    is not re-raised.

    :param url: value matched against ``xpath.url``
    :return: the rows returned by ``fetchall()``, or an empty list when the
        query failed
    """
    logger.info("select text...")
    # The URL is bound as a query parameter rather than formatted into the
    # statement, so quotes or SQL fragments in it cannot alter the query.
    sql = "select relation.id,relation.text from relation,xpath where relation.id = xpath.id and xpath.url = %s"
    # Bound up front so the failure path returns an empty result instead of
    # raising UnboundLocalError.
    data = []
    try:
        cur = self.conn.cursor()
        cur.execute(sql, (url,))
        data = cur.fetchall()
        logger.info("select text success")
    except Exception as e:
        logger.warning("select_text fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
    return data
def insert_one_to_xpath(self, params):
    """Insert a single row into the ``xpath`` table.

    A failure is logged as a warning and followed by a rollback; it is not
    re-raised.

    :param params: values for the columns ``(id, url, xpath, point_url)``
    :return: None
    """
    logger.info("insert_one_to_xpath " + str(params))
    statement = "INSERT INTO xpath (id, url, xpath, point_url) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        cursor = self.conn.cursor()
        cursor.execute(statement, params)
        logger.info("insert_one_to_xpath success")
    except Exception as err:
        logger.warning("insert_one_to_xpath fail: %s" % err)
        # Undo the failed statement.
        self.conn.rollback()
def first_data(self, urls):
    """Insert the initial ``content`` row (layer 0, status true) for ``urls``.

    A failure is logged as a warning and followed by a rollback; it is not
    re-raised.

    :param urls: value stored in the ``url`` column
    :return: None
    """
    logger.info("first_data")
    # Values are bound as query parameters rather than formatted into the
    # statement, so quotes or SQL fragments in the URL cannot alter it.
    sql = "INSERT INTO content (layer_number, url, status) VALUES (%s, %s, %s)"
    logger.debug('sql:' + sql)
    try:
        self.conn.cursor().execute(sql, (0, urls, True))
        logger.info("first_data success")
    except Exception as e:
        logger.warning("first_data fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
def __init__(self, a):
    """Create the parser and start the browser driver named by ``a``.

    :param a: browser name, compared case-insensitively; one of
        ``firefox``, ``chrome`` or ``ie``
    :raises ValueError: when ``a`` names any other browser
    """
    super(Parser, self).__init__()

    browser = a.lower()
    if browser not in ("firefox", "chrome", "ie"):
        logger.warning("Wrong driver!Only for: firefox, chrome, ie.")
        raise ValueError("Wrong driver!Only for: firefox, chrome, ie.")

    if browser == "firefox":
        self.driver = self._firefox_driver()
    elif browser == "chrome":
        self.driver = self._chrome_driver()
    else:
        self.driver = self._ie_driver()

    # Crawl state, filled in while pages are processed.
    self.current_url = None
    self.current_window_handle = None
    self.Xpath_list = list()
    self.number = 1
    self.new_url = list()
    self.mysql = MySQLSingle()
def select_url(self):
    """Return the distinct ``url`` values stored in the ``xpath`` table.

    A query failure is logged as a warning and followed by a rollback; it
    is not re-raised.

    :return: list holding the first column of every fetched row (empty when
        the query failed before any row was read)
    """
    logger.info("select_distinct_url_from_xpath...")
    query = "select distinct url from xpath"
    urls = []
    try:
        cursor = self.conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        # extend() appends row by row, exactly like an explicit loop.
        urls.extend(row[0] for row in rows)
        logger.info("select point_url from_xpath success")
    except Exception as err:
        logger.warning("select_point_url_from_content fail: %s" % err)
        # Undo the failed statement.
        self.conn.rollback()
    return urls
def select_url_from_content(self):
    """Return the ``url`` of every ``content`` row whose ``status`` is false.

    A query failure is logged as a warning and followed by a rollback; it
    is not re-raised.

    :return: list holding the first column of every fetched row (empty when
        the query failed before any row was read)
    """
    logger.info("select_url_from_content...")
    query = "select url from content WHERE status = false"
    logger.debug('sql:' + query)
    pending = []
    try:
        cursor = self.conn.cursor()
        cursor.execute(query)
        rows = cursor.fetchall()
        # extend() appends row by row, exactly like an explicit loop.
        pending.extend(row[0] for row in rows)
        logger.info("select_url_from_content success")
    except Exception as err:
        logger.warning("select_url_from_content fail: %s" % err)
        # Undo the failed statement.
        self.conn.rollback()
    return pending
def insert_many_to_relation(self, params):
    """Insert several rows into the ``relation`` table in one call.

    A failure is logged as a warning and followed by a rollback; it is not
    re-raised.

    :param params: sequence of ``(layer_number, id, title, text)`` value
        tuples handed to ``executemany``
    :return: None
    """
    logger.info("insert_many_to_relation " + str(params))
    statement = "INSERT INTO relation (layer_number, id, title, text) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        cursor = self.conn.cursor()
        cursor.executemany(statement, params)
        logger.info("insert_many_to_relation success")
    except Exception as err:
        logger.warning("insert_many_to_relation fail: %s" % err)
        # Undo the failed statement.
        self.conn.rollback()
def insert_many_to_content(self, params):
    """Insert several rows into the ``content`` table, skipping duplicates.

    The rows are first sent as one batch. If the batch violates an
    integrity constraint (for example a URL that is already stored), the
    rows are retried one at a time so that only the offending rows are
    dropped instead of the whole batch. Other database errors are logged
    as warnings and followed by a rollback; nothing is re-raised.

    :param params: iterable of ``(url, father_url, layer_number)`` tuples
    :return: None
    """
    logger.info("insert_many_to_content " + str(params))
    sql = "INSERT INTO content (url, father_url, layer_number) VALUES(%s, %s, %s)"
    logger.debug('sql:' + sql)
    # Materialise once: the rows may have to be walked a second time below.
    rows = list(params)
    try:
        self.conn.cursor().executemany(sql, rows)
        logger.info("insert_many_to_content success")
    except mysql.connector.IntegrityError as e:
        # De-duplication relies on the table's uniqueness constraint.
        logger.warning("have same url, lose it: %s" % e)
        # A single duplicate rejects the batched INSERT as a whole, so fall
        # back to per-row inserts and skip only the rows that collide.
        for row in rows:
            try:
                self.conn.cursor().execute(sql, row)
            except mysql.connector.IntegrityError:
                continue
            except mysql.connector.Error as err:
                logger.warning("insert_many_to_content fail: %s" % err)
                self.conn.rollback()
    except mysql.connector.Error as e:
        logger.warning("insert_many_to_content fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
def check_xpath(self, dr):
    """Check the xpaths of a page by clicking them and store them in the database.

    :param dr: browser name; must be one of 'firefox', 'chrome', 'ie',
        otherwise only a warning is logged
    :return: None
    """
    if dr in ['firefox', 'chrome', 'ie']:
        obj = Parser(dr)
        page = obj.parser_by_lxml()
        tag = obj.tag_a_has_href(page)
        obj.get_xpath(page=page, tag=tag)
        # Consume the collected xpaths from the end of the list.
        while len(obj.Xpath_list) > 0:
            xpath = obj.Xpath_list.pop()
            try:
                obj.driver.find_element_by_xpath(xpath=xpath).click()
                # A click that leaves the URL unchanged found nothing new.
                if obj.driver.current_url == obj.current_url:
                    continue
                else:
                    obj.new_url.append(obj.driver.current_url)
                all_handles = obj.driver.window_handles
                main_handle = obj.driver.current_window_handle
                # Close every window other than the current one, returning
                # to the main window after each close.
                for handle in all_handles:
                    if handle != obj.driver.current_window_handle:
                        obj.driver.switch_to_window(handle)
                        obj.driver.close()
                        obj.driver.switch_to_window(main_handle)
                        logger.info("back to " + obj.driver.current_url)
            except:
                # NOTE(review): `i` is not assigned anywhere in this method,
                # so this line raises NameError; the xpath was already
                # removed from the list by pop() above.
                del obj.Xpath_list[i]
                logger.warning('delete wrong xpath')
                pass
            obj.update_current_url()
            # NOTE(review): `i` is undefined here as well, and this tuple
            # holds three values while insert_one_to_xpath's INSERT names
            # four columns (id, url, xpath, point_url) -- confirm the
            # intended values before relying on this method.
            params = (gen_rand_str(length=8, s_type='digit'), obj.driver.current_url, obj.Xpath_list[i])
            self.insert_one_to_xpath(params)
    else:
        logger.warning('wrong driver, only for firefox, chrome, ie!')
def all_aa(self, url, layer_number):
    """Process one page: click each collected link xpath and record the results.

    The page is opened and its HTML stored, then every xpath gathered by
    ``_get_xpath`` is clicked in turn. For each click that changes the URL
    the xpath and the link text are written to the database and the new
    URL is remembered. Finally the reachable new URLs are stored as
    children of ``url`` and ``url`` itself is marked as done.

    :param url: page to open; also stored as ``father_url`` of the new URLs
    :param layer_number: stored with every relation row and every new URL
    :return: None
    """
    self._driver_open_url(url)
    page, html = self._parser_by_xml()
    self.mysql.update_html_to_content(url=url, html=html)
    tag = self._tag_a_has_href(page)
    # Link texts are popped in step with the xpaths below.
    # NOTE(review): this assumes _get_xpath adds one xpath per tag -- confirm.
    text = [a.text for a in tag]
    main_handle = self.driver.current_window_handle
    self._get_xpath(page=page, tag=tag)

    while len(self.Xpath_list) > 0:
        xp = self.Xpath_list.pop()
        te = text.pop()
        logger.debug("len_of_list: " + str(len(self.Xpath_list)))
        logger.debug("xpath:" + str(xp))
        try:
            element = self.driver.find_element_by_xpath(xpath=xp)
        except exceptions.NoSuchElementException as e:
            logger.warning(str(e))
            continue

        if not element.is_enabled():
            logger.info('throw wrong xpath')
            continue

        try:
            element.click()
            # A click that leaves the URL unchanged found nothing new.
            if self.driver.current_url == self.current_url:
                continue
            self.new_url.append(self.driver.current_url)
            all_handles = self.driver.window_handles
            if len(all_handles) > 1:
                # The click opened extra windows: close them all and
                # return to the main window.
                for handle in all_handles:
                    if handle != main_handle:
                        try:
                            self.driver.switch_to.window(handle)
                            self.driver.close()
                            logger.debug("close window: " + str(handle))
                        except exceptions.NoSuchWindowException as e:
                            logger.warning(
                                "selenium.common.exceptions.NoSuchWindowException: " + str(e))
                self.driver.switch_to.window(main_handle)
                logger.info("back to main_handle")
            elif len(all_handles) == 1:
                # Navigation happened in the same window: go back.
                self.driver.back()
            else:
                raise ValueError("window_handle wrong")
        # Exception rather than a bare except, so KeyboardInterrupt and
        # SystemExit are not swallowed.
        except Exception:
            logger.info('throw wrong xpath')
            continue

        uid = int(gen_rand_str(length=7, s_type='digit'))
        page_url = self.driver.current_url
        logger.debug(
            '(uid, self.driver.current_url, self.Xpath_list[i], self.driver.current_url):'
            + str(uid) + page_url + str(xp) + page_url)
        # NOTE(review): url and point_url receive the same value here,
        # read after navigating back -- confirm that this is intended.
        self.mysql.insert_one_to_xpath((uid, page_url, str(xp), page_url))
        self.mysql.insert_one_to_relation(
            (layer_number, uid, self.driver.title, te))

    # Keep only the URLs the driver can load. The list is filtered in
    # place: deleting by index while looping over range(len(...)) skips
    # entries and runs past the end of the shrinking list.
    self.new_url[:] = [candidate for candidate in self.new_url
                       if self._check_useful_url(candidate)]
    params = [(new_url, url, layer_number) for new_url in self.new_url]
    logger.debug(self.new_url)
    self.mysql.insert_many_to_content(params=params)
    self.mysql.update_true_status_to_content(url=url)
    self.get_back_to_init()