コード例 #1
0
    def update_true_status_to_content(self, url):
        """Set ``status = TRUE`` on the ``content`` row(s) matching *url*.

        The URL is passed to the driver as a bound parameter instead of being
        interpolated into the SQL text, so quotes or other special characters
        in it cannot change the statement. Any exception raised while
        executing is logged as a warning and the connection is rolled back;
        nothing is re-raised to the caller.

        :param url: value compared against the ``url`` column.
        :return: None
        """
        logger.info("update_true_status_to_content" + ' url:' + str(url))
        sql = "UPDATE content SET status = TRUE WHERE url = %s"
        logger.debug('sql:' + sql)
        try:
            # Execute the statement; the driver quotes/escapes the value.
            self.conn.cursor().execute(sql, (url,))
            # NOTE(review): no commit is issued here (it was commented out in
            # the original) - presumably the connection autocommits or the
            # caller commits; confirm.
            logger.info("update_true_status_to_content success")

        except Exception as e:
            logger.warning("update_true_status_to_content fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
コード例 #2
0
    def insert_one_to_xpath(self, params):
        """Insert a single row into the ``xpath`` table.

        Errors are logged as warnings and followed by a rollback on the
        connection; they are not propagated.

        :param params: values bound, in order, to the
            ``(id, url, xpath, point_url)`` placeholders.
        :return: None
        """
        logger.info("insert_one_to_xpath " + str(params))
        statement = "INSERT INTO xpath (id, url, xpath, point_url) VALUES(%s, %s, %s, %s)"
        logger.debug('sql:' + statement)
        try:
            # Run the parameterized INSERT on a fresh cursor.
            cursor = self.conn.cursor()
            cursor.execute(statement, params)
            # No commit is issued from this method.
            logger.info("insert_one_to_xpath success")
        except Exception as err:
            logger.warning("insert_one_to_xpath fail: %s" % err)
            # Undo the pending work when the statement fails.
            self.conn.rollback()
コード例 #3
0
    def first_data(self, urls):
        """Seed the ``content`` table with the initial row.

        Inserts one row with ``layer_number = 0``, ``status = TRUE`` and the
        given URL. The URL is sent as a bound parameter rather than being
        formatted into the SQL string, so special characters in it cannot
        break or alter the statement. Any exception is logged as a warning
        and the connection is rolled back; nothing is re-raised.

        :param urls: value stored in the ``url`` column of the seed row.
        :return: None
        """
        logger.info("first_data")
        # The two constants stay in the SQL text; only the URL is bound.
        sql = "INSERT INTO content (layer_number, url, status) VALUES (0, %s, TRUE)"
        logger.debug('sql:' + sql)
        try:
            # Execute the statement; the driver quotes/escapes the value.
            self.conn.cursor().execute(sql, (urls,))
            # NOTE(review): no commit is issued here (it was commented out in
            # the original) - presumably the connection autocommits or the
            # caller commits; confirm.
            logger.info("first_data success")

        except Exception as e:
            logger.warning("first_data fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
コード例 #4
0
    def select_url_from_content(self):
        """Return the ``url`` of every ``content`` row whose status is false.

        On failure the error is logged as a warning, the connection is rolled
        back, and whatever was collected so far (possibly nothing) is
        returned.

        :return: list holding the first column of each fetched row.
        """
        logger.info("select_url_from_content...")
        query = "select url from content WHERE status = false"
        logger.debug('sql:' + query)
        pending_urls = []
        try:
            cursor = self.conn.cursor()
            cursor.execute(query)
            # extend() consumes the generator item by item, so rows gathered
            # before an unexpected failure are still kept.
            pending_urls.extend(row[0] for row in cursor.fetchall())
            logger.info("select_url_from_content success")
        except Exception as err:
            logger.warning("select_url_from_content fail: %s" % err)
            # Roll back when an error occurs.
            self.conn.rollback()
        return pending_urls
コード例 #5
0
    def insert_many_to_relation(self, params):
        """Insert several rows into the ``relation`` table in one call.

        Errors are logged as warnings and followed by a rollback on the
        connection; they are not propagated.

        :param params: sequence of value tuples, each bound in order to the
            ``(layer_number, id, title, text)`` placeholders.
        :return: None
        """
        logger.info("insert_many_to_relation " + str(params))
        statement = "INSERT INTO relation (layer_number, id, title, text) VALUES(%s, %s, %s, %s)"
        logger.debug('sql:' + statement)
        try:
            # Run the parameterized INSERT once per parameter tuple.
            cursor = self.conn.cursor()
            cursor.executemany(statement, params)
            # No commit is issued from this method.
            logger.info("insert_many_to_relation success")
        except Exception as err:
            logger.warning("insert_many_to_relation fail: %s" % err)
            # Undo the pending work when the batch fails.
            self.conn.rollback()
コード例 #6
0
    def insert_many_to_content(self, params):
        """Insert several rows into the ``content`` table in one call.

        An integrity error (used here to drop duplicate URLs through the
        table's uniqueness constraint) is only logged. Any other
        ``mysql.connector.Error`` is logged and followed by a rollback.
        Exceptions outside the ``mysql.connector.Error`` hierarchy propagate.

        :param params: sequence of value tuples, each bound in order to the
            ``(url, father_url, layer_number)`` placeholders.
        :return: None
        """
        logger.info("insert_many_to_content " + str(params))
        statement = "INSERT INTO content (url, father_url, layer_number) VALUES(%s, %s, %s)"
        logger.debug('sql:' + statement)
        try:
            # Run the parameterized INSERT once per parameter tuple.
            cursor = self.conn.cursor()
            cursor.executemany(statement, params)
            # No commit is issued from this method.
            logger.info("insert_many_to_content success")
        except mysql.connector.IntegrityError as dup_err:
            # De-duplication relies on the uniqueness constraint; no rollback
            # is performed on this path.
            logger.warning("have same url, lose it: %s" % dup_err)
        except mysql.connector.Error as db_err:
            logger.warning("insert_many_to_content fail: %s" % db_err)
            # Roll back when any other database error occurs.
            self.conn.rollback()
コード例 #7
0
    def all_aa(self, url, layer_number):
        """Click each ``<a href>`` element found on *url* and record the result.

        Steps, in order:

        1. Open *url*, parse the page and store its HTML through
           ``self.mysql.update_html_to_content``.
        2. Collect the anchors that have an ``href`` and their texts, then
           call ``self._get_xpath`` (presumably this fills
           ``self.Xpath_list`` with one XPath per anchor - confirm).
        3. Pop XPath/text pairs and click each element. If the browser URL
           after the click differs from ``self.current_url`` it is appended
           to ``self.new_url``; extra windows are closed, or the browser
           navigates back when only one window exists. Only for such clicks
           are rows written to the ``xpath`` and ``relation`` tables.
        4. Drop entries of ``self.new_url`` rejected by
           ``self._check_useful_url``, insert the rest into ``content`` with
           *url* as the parent, mark *url* as processed and call
           ``self.get_back_to_init``.

        :param url: page to open and crawl.
        :param layer_number: value stored as ``layer_number`` in the rows
            written to ``relation`` and ``content``.
        :return: None
        """
        self._driver_open_url(url)
        page, html = self._parser_by_xml()
        self.mysql.update_html_to_content(url=url, html=html)
        tag = self._tag_a_has_href(page)
        text = [a.text for a in tag]
        main_handle = self.driver.current_window_handle
        self._get_xpath(page=page, tag=tag)
        while self.Xpath_list:
            # XPaths and anchor texts are consumed in lockstep from the end.
            xp = self.Xpath_list.pop()
            te = text.pop()
            logger.debug("len_of_list: " + str(len(self.Xpath_list)))
            logger.debug("xpath:" + str(xp))
            try:
                element = self.driver.find_element_by_xpath(xpath=xp)
            except exceptions.NoSuchElementException as e:
                logger.warning(str(e))
                continue
            if not element.is_enabled():
                logger.info('throw wrong xpath')
                continue
            try:
                element.click()
                landed_url = self.driver.current_url
                if landed_url == self.current_url:
                    # The click did not navigate anywhere; nothing to record.
                    continue
                self.new_url.append(landed_url)
                all_handles = self.driver.window_handles
                if len(all_handles) > 1:
                    # The click opened extra windows: close every one except
                    # the main window, then return to the main window.
                    for handle in all_handles:
                        if handle == main_handle:
                            continue
                        try:
                            self.driver.switch_to.window(handle)
                            self.driver.close()
                            logger.debug("close window: " + str(handle))
                        except exceptions.NoSuchWindowException as e:
                            logger.warning(
                                "selenium.common.exceptions.NoSuchWindowException: "
                                + str(e))
                    self.driver.switch_to.window(main_handle)
                    logger.info("back to main_handle")
                elif len(all_handles) == 1:
                    self.driver.back()
                else:
                    raise ValueError("window_handle wrong")
            except Exception:
                # Best effort: any failure while clicking or cleaning up
                # (including the ValueError above) discards this XPath.
                # KeyboardInterrupt/SystemExit are deliberately not caught.
                logger.info('throw wrong xpath')
                continue
            uid = int(gen_rand_str(length=7, s_type='digit'))
            # Read the URL once so the log line and the inserted row agree.
            current_url = self.driver.current_url
            logger.debug(
                '(uid, self.driver.current_url, self.Xpath_list[i], self.driver.current_url):'
                + str(uid) + current_url + str(xp) + current_url)
            # NOTE(review): the same URL is stored as both ``url`` and
            # ``point_url`` - confirm this is intended.
            self.mysql.insert_one_to_xpath((uid, current_url, str(xp),
                                            current_url))
            self.mysql.insert_one_to_relation(
                (layer_number, uid, self.driver.title, te))
        # Rebuild the list in place instead of deleting by index while
        # iterating, which skipped elements and raised IndexError.
        self.new_url[:] = [
            candidate for candidate in self.new_url
            if self._check_useful_url(candidate)
        ]
        params = [(new_url, url, layer_number) for new_url in self.new_url]
        logger.debug(self.new_url)
        self.mysql.insert_many_to_content(params=params)
        self.mysql.update_true_status_to_content(url=url)

        self.get_back_to_init()