def get_conn(self):
    """Open a MySQL connection from the settings on ``Services``.

    The new connection is stored on ``self.conn`` with autocommit enabled.
    Connection errors are logged, not raised.

    :return: the connection held in ``self.conn``; ``None`` when connecting
        failed and no earlier connection is stored on the instance.
    """
    logger.info("connecting to mysql...")
    # Log the connection target, but never write the password to the log.
    logger.info((Services.host, Services.port, Services.username,
                 '******', Services.database))
    try:
        self.conn = mysql.connector.connect(host=Services.host,
                                            port=Services.port,
                                            user=Services.username,
                                            password=Services.password,
                                            database=Services.database,
                                            charset='utf8')
        self.conn.autocommit = True
    except Exception as e:
        logger.warning('Fail to connect database: %s' % e)
        logger.warning('stop')
    # ``self.conn`` may never have been assigned if the first connect failed.
    return getattr(self, 'conn', None)
def get(self):
    """Handle GET: fetch attributes from the backend service.

    ``type == 0`` queries the best attributes with paging (``pn`` defaults
    to 1, ``rn`` defaults to 5); ``type == 1`` queries synonym attributes.
    Any other ``type`` returns ``None``.

    :return: the backend JSON payload with ``message`` set to ``'success'``.
    """
    args = self.getAttributeParse.parse_args()
    query_type = args['type']

    if query_type == 0:
        # Best attributes: translate page number / page size into an offset.
        page_no = args['pn'] or 1
        page_size = args['rn'] or 5
        target = urls['getBestAttr'] % {
            'domain': args['domain'],
            'count': page_size,
            'offset': (page_no - 1) * page_size
        }
    elif query_type == 1:
        # Synonym attributes: the parsed arguments fill the URL template.
        target = urls['getOtherAttr'] % args
    else:
        return None

    data = requests.get(target).json()
    data['message'] = 'success'
    log_fields = {
        'ip': request.remote_addr,
        'url': request.url,
        'data': json.dumps(data)
    }
    logger.info(logging_format % log_fields)
    return data
def post(self):
    """Handle POST: forward the parsed arguments to the ``addOtherAttr`` URL.

    :return: the backend JSON payload with ``message`` set to ``'success'``.
    """
    args = self.parse.parse_args()
    # The backend is called with an HTTP GET on the formatted URL.
    response = requests.get(urls['addOtherAttr'] % args)
    data = response.json()
    data['message'] = 'success'
    log_fields = {
        'ip': request.remote_addr,
        'url': request.url,
        'data': json.dumps(data)
    }
    logger.info(logging_format % log_fields)
    return data
def add_edges_for_lost(self):
    """Add one edge per row returned by ``select_id_text_point_url``.

    Each edge runs from the node ``str(id) + str(text)`` to the third
    column of the row.
    """
    rows = self.mysql.select_id_text_point_url()
    edges = [(str(row[0]) + str(row[1]), row[2]) for row in rows]
    logger.info("add_edges_for_lost:" + str(edges))
    self.G.add_edges_from(edges)
def _check_useful_url(self, url):
    """Check whether the driver can load ``url``.

    :param url: address to open with ``self.driver``.
    :return: ``True`` when loading raised no WebDriver error, ``False``
        otherwise.
    """
    try:
        logger.info("check url: " + str(url))
        self.driver.get(url=url)
        logger.info("useful url: " + str(url))
        return True
    except exceptions.WebDriverException:
        # ``exceptions`` is a module (it is used elsewhere in this file as
        # ``exceptions.NoSuchElementException``); naming the module itself in
        # ``except`` raises TypeError as soon as a real error occurs, so the
        # base WebDriver exception class is caught instead.
        logger.warning("Unuseful url: " + str(url))
        return False
def make_data(self):
    """Add edges from every distinct URL to the texts selected for it.

    For each URL from ``select_url`` the rows of ``select_text`` become
    edges ``(url, str(id) + str(text))``, which are logged and added to
    ``self.G`` one URL at a time.
    """
    for page_url in self.mysql.select_url():
        rows = self.mysql.select_text(page_url)
        edges = [(page_url, str(row[0]) + str(row[1])) for row in rows]
        logger.info("add_edges:" + str(edges))
        self.G.add_edges_from(edges)
def select_id_text_point_url(self):
    """Select ``(id, text, point_url)`` by joining ``xpath`` and ``relation``.

    :return: the fetched rows; an empty list when the query fails.
    """
    logger.info("select_id_text_point_url...")
    sql = "select x.id, r.text, x.point_url from xpath as x,relation as r where x.id = r.id"
    # Pre-bind the result so a failed query returns an empty list instead of
    # raising UnboundLocalError at the ``return``.
    data = list()
    try:
        cur = self.conn.cursor()
        cur.execute(sql)
        data = cur.fetchall()
        logger.info("select text success")
    except Exception as e:
        logger.warning("select_text fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
    return data
def _driver_open_url(self, url):
    """Open ``url`` in the driver, then refresh the tracked current URL.

    WebDriver errors are logged and swallowed; ``_update_current_url`` is
    called whether or not the page loaded.

    :param url: address to open.
    :return: None
    """
    try:
        logger.info("open url: " + str(url))
        self.driver.get(url=url)
        logger.info("current url :" + self.driver.current_url)
    except exceptions.WebDriverException as e:
        # ``exceptions`` is a module, so it cannot be used directly in
        # ``except``; catch the base WebDriver exception class instead.
        logger.warning("open url: %s fail" % url)
        # The exception must be converted before string concatenation.
        logger.warning("error:" + str(e))
    self._update_current_url()
def select_text(self, url):
    """Select ``(id, text)`` rows from ``relation`` for one ``xpath.url``.

    :param url: value matched against ``xpath.url``.
    :return: the fetched rows; an empty list when the query fails.
    """
    logger.info("select text...")
    # The URL is passed as a bound parameter rather than interpolated into
    # the statement, so quotes in it cannot break or alter the query.
    sql = ("select relation.id,relation.text from relation,xpath "
           "where relation.id = xpath.id and xpath.url = %s")
    # Pre-bind the result so a failed query returns an empty list instead of
    # raising UnboundLocalError at the ``return``.
    data = list()
    try:
        cur = self.conn.cursor()
        cur.execute(sql, (url,))
        data = cur.fetchall()
        logger.info("select text success")
    except Exception as e:
        logger.warning("select_text fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
    return data
def update_true_status_to_content(self, url):
    """Set ``status = TRUE`` on the ``content`` rows whose ``url`` matches.

    Failures are logged and rolled back, not raised.

    :param url: value matched against ``content.url``.
    """
    logger.info("update_true_status_to_content" + ' url:' + str(url))
    # The URL is a bound parameter, not interpolated into the statement.
    sql = "UPDATE content SET status = TRUE WHERE url = %s"
    logger.debug('sql:' + sql)
    try:
        # Execute the statement; no explicit commit is issued here.
        self.conn.cursor().execute(sql, (url,))
        logger.info("update_true_status_to_content success")
    except Exception as e:
        logger.warning("update_true_status_to_content fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
def insert_one_to_xpath(self, params):
    """Insert a single row into the ``xpath`` table.

    Failures are logged and rolled back, not raised.

    :param params: values for ``(id, url, xpath, point_url)``.
    """
    logger.info("insert_one_to_xpath " + str(params))
    statement = "INSERT INTO xpath (id, url, xpath, point_url) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        # Run the statement; no explicit commit is issued here.
        cursor = self.conn.cursor()
        cursor.execute(statement, params)
        logger.info("insert_one_to_xpath success")
    except Exception as err:
        logger.warning("insert_one_to_xpath fail: %s" % err)
        # Undo on error.
        self.conn.rollback()
def select_url(self):
    """Return the distinct ``url`` values stored in the ``xpath`` table.

    :return: list of the first column of each row; whatever was collected
        before an error when the query fails.
    """
    logger.info("select_distinct_url_from_xpath...")
    statement = "select distinct url from xpath"
    found = []
    try:
        cursor = self.conn.cursor()
        cursor.execute(statement)
        rows = cursor.fetchall()
        # Keep only the first column of every row.
        found.extend(row[0] for row in rows)
        logger.info("select point_url from_xpath success")
    except Exception as err:
        logger.warning("select_point_url_from_content fail: %s" % err)
        # Undo on error.
        self.conn.rollback()
    return found
def first_data(self, urls):
    """Insert the initial row into ``content``.

    The row is written with ``layer_number`` 0 and ``status`` TRUE.
    Failures are logged and rolled back, not raised.

    :param urls: value stored in the ``url`` column.
    """
    logger.info("first_data")
    # The URL is a bound parameter, not interpolated into the statement.
    sql = "INSERT INTO content (layer_number, url, status) VALUES (%s, %s, TRUE)"
    logger.debug('sql:' + sql)
    try:
        # Execute the statement; no explicit commit is issued here.
        self.conn.cursor().execute(sql, (0, urls))
        logger.info("first_data success")
    except Exception as e:
        logger.warning("first_data fail: %s" % e)
        # Roll back when an error occurs.
        self.conn.rollback()
def select_url_from_content(self):
    """Return the ``url`` of every ``content`` row whose status is false.

    :return: list of the first column of each row; whatever was collected
        before an error when the query fails.
    """
    logger.info("select_url_from_content...")
    statement = "select url from content WHERE status = false"
    logger.debug('sql:' + statement)
    pending = []
    try:
        cursor = self.conn.cursor()
        cursor.execute(statement)
        rows = cursor.fetchall()
        # Keep only the first column of every row.
        pending.extend(row[0] for row in rows)
        logger.info("select_url_from_content success")
    except Exception as err:
        logger.warning("select_url_from_content fail: %s" % err)
        # Undo on error.
        self.conn.rollback()
    return pending
def insert_many_to_relation(self, params):
    """Insert several rows into the ``relation`` table.

    Failures are logged and rolled back, not raised.

    :param params: sequence of ``(layer_number, id, title, text)`` values.
    :return: None
    """
    logger.info("insert_many_to_relation " + str(params))
    statement = "INSERT INTO relation (layer_number, id, title, text) VALUES(%s, %s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        # Run the statement for every row; no explicit commit is issued here.
        cursor = self.conn.cursor()
        cursor.executemany(statement, params)
        logger.info("insert_many_to_relation success")
    except Exception as err:
        logger.warning("insert_many_to_relation fail: %s" % err)
        # Undo on error.
        self.conn.rollback()
def check_xpath(self, dr):
    """Click every collected xpath and store the ones that work.

    A ``Parser`` is created for the given browser, xpaths are collected
    through its helpers, and each one is clicked in turn. Xpaths that leave
    the URL unchanged or that fail are skipped; the rest are written through
    ``insert_one_to_xpath``.

    :param dr: browser name; only 'firefox', 'chrome' and 'ie' are accepted.
    """
    if dr not in ['firefox', 'chrome', 'ie']:
        logger.warning('wrong driver, only for firefox, chrome, ie!')
        return

    obj = Parser(dr)
    page = obj.parser_by_lxml()
    tag = obj.tag_a_has_href(page)
    obj.get_xpath(page=page, tag=tag)

    while len(obj.Xpath_list) > 0:
        xpath = obj.Xpath_list.pop()
        try:
            obj.driver.find_element_by_xpath(xpath=xpath).click()
            if obj.driver.current_url == obj.current_url:
                # The click did not navigate anywhere; nothing to record.
                continue
            obj.new_url.append(obj.driver.current_url)
            main_handle = obj.driver.current_window_handle
            # Close every window the click opened and return to the main one.
            for handle in obj.driver.window_handles:
                if handle != main_handle:
                    obj.driver.switch_to.window(handle)
                    obj.driver.close()
                    obj.driver.switch_to.window(main_handle)
            logger.info("back to " + obj.driver.current_url)
        except Exception:
            # The xpath was already removed from the list by ``pop()``, so
            # there is nothing left to delete; just skip it.
            logger.warning('delete wrong xpath')
            continue
        obj.update_current_url()
        # The INSERT in ``insert_one_to_xpath`` takes four values
        # (id, url, xpath, point_url). The current URL is used for both URL
        # columns, mirroring the tuple ``all_aa`` builds.
        # NOTE(review): confirm this is the intended ``point_url``.
        params = (gen_rand_str(length=8, s_type='digit'),
                  obj.driver.current_url,
                  xpath,
                  obj.driver.current_url)
        self.insert_one_to_xpath(params)
def insert_many_to_content(self, params):
    """Insert several rows into the ``content`` table.

    An integrity error (duplicate URL under the uniqueness constraint) is
    logged and ignored; any other connector error is logged and rolled back.

    :param params: sequence of ``(url, father_url, layer_number)`` values.
    :return: None
    """
    logger.info("insert_many_to_content " + str(params))
    statement = "INSERT INTO content (url, father_url, layer_number) VALUES(%s, %s, %s)"
    logger.debug('sql:' + statement)
    try:
        # Run the statement for every row; no explicit commit is issued here.
        cursor = self.conn.cursor()
        cursor.executemany(statement, params)
        logger.info("insert_many_to_content success")
    except mysql.connector.IntegrityError as duplicate:
        # The uniqueness constraint doubles as de-duplication.
        logger.warning("have same url, lose it: %s" % duplicate)
    except mysql.connector.Error as err:
        logger.warning("insert_many_to_content fail: %s" % err)
        # Undo on error.
        self.conn.rollback()
def end_conn(self):
    """Close the database connection held in ``self.conn`` and log it."""
    connection = self.conn
    connection.close()
    logger.info("close connect")
def all_aa(self, url, layer_number):
    """Process one page: click its collected xpaths and store the results.

    The page is opened and its HTML stored, then every collected xpath is
    clicked. Xpaths that navigate somewhere are written to the ``xpath`` and
    ``relation`` tables, and the URLs reached are written to ``content``
    with ``url`` as their parent. Finally ``url`` is marked as done.

    :param url: page to process.
    :param layer_number: value stored with every row written for this page.
    """
    self._driver_open_url(url)
    page, html = self._parser_by_xml()
    self.mysql.update_html_to_content(url=url, html=html)
    tag = self._tag_a_has_href(page)
    text = [a.text for a in tag]
    main_handle = self.driver.current_window_handle
    self._get_xpath(page=page, tag=tag)

    while len(self.Xpath_list) > 0:
        # NOTE(review): assumes ``Xpath_list`` and ``text`` are the same
        # length and order, so popping both keeps them paired.
        xp = self.Xpath_list.pop()
        te = text.pop()
        logger.debug("len_of_list: " + str(len(self.Xpath_list)))
        logger.debug("xpath:" + str(xp))
        try:
            element = self.driver.find_element_by_xpath(xpath=xp)
        except exceptions.NoSuchElementException as e:
            logger.warning(str(e))
            continue

        if element.is_enabled() is not True:
            logger.info('throw wrong xpath')
            continue

        try:
            element.click()
            if self.driver.current_url == self.current_url:
                # The click did not navigate anywhere; nothing to record.
                continue
            self.new_url.append(self.driver.current_url)
            all_handles = self.driver.window_handles
            if len(all_handles) > 1:
                # The click opened extra windows: close them all and return
                # to the main window.
                for handle in all_handles:
                    if handle != main_handle:
                        try:
                            self.driver.switch_to.window(handle)
                            self.driver.close()
                            logger.debug("close window: " + str(handle))
                        except exceptions.NoSuchWindowException as e:
                            logger.warning(
                                "selenium.common.exceptions.NoSuchWindowException: "
                                + str(e))
                self.driver.switch_to.window(main_handle)
                logger.info("back to main_handle")
            elif len(all_handles) == 1:
                # Navigation happened in the same window: go back.
                self.driver.back()
            else:
                raise ValueError("window_handle wrong")
        except Exception:
            # Any failure while clicking or restoring windows discards this
            # xpath. ``Exception`` replaces the bare ``except`` so that
            # KeyboardInterrupt and SystemExit still propagate.
            logger.info('throw wrong xpath')
            continue

        uid = int(gen_rand_str(length=7, s_type='digit'))
        logger.debug(
            '(uid, self.driver.current_url, self.Xpath_list[i], self.driver.current_url):'
            + str(uid) + self.driver.current_url + str(xp) +
            self.driver.current_url)
        self.mysql.insert_one_to_xpath((uid, self.driver.current_url,
                                        str(xp), self.driver.current_url))
        self.mysql.insert_one_to_relation(
            (layer_number, uid, self.driver.title, te))

    # Keep only the URLs the driver can load. The list is rebuilt in place
    # because deleting by index while iterating over ``range(len(...))``
    # skips elements and eventually raises IndexError.
    self.new_url[:] = [candidate for candidate in self.new_url
                       if self._check_useful_url(candidate)]
    params = [(Url, url, layer_number) for Url in self.new_url]
    logger.debug(self.new_url)
    self.mysql.insert_many_to_content(params=params)
    self.mysql.update_true_status_to_content(url=url)
    self.get_back_to_init()