Example #1
0
    def get_conn(self):
        """Open a MySQL connection from the ``Services`` settings.

        On success the new connection is stored on ``self.conn`` with
        autocommit switched on. A failure is logged and not raised.

        :return: ``self.conn``; ``None`` when connecting failed and no
                 connection had been stored on the instance before
        """
        logger.info("connecting to mysql...")
        # Log where we connect to, but never write the password to the log.
        logger.info((Services.host, Services.port, Services.username,
                     Services.database))
        try:
            self.conn = mysql.connector.connect(host=Services.host,
                                                port=Services.port,
                                                user=Services.username,
                                                password=Services.password,
                                                database=Services.database,
                                                charset='utf8')
            self.conn.autocommit = True
        except Exception as e:
            logger.warning('Failed to connect database: %s' % e)
            logger.warning('stop')
        # ``self.conn`` does not exist yet when the very first attempt fails;
        # fall back to None instead of raising AttributeError.
        return getattr(self, 'conn', None)
    def get(self):
        """Forward an attribute query to the upstream service.

        ``args['type']`` picks the upstream endpoint:

        * ``0`` - best attributes, paginated with ``pn`` (page, default 1)
          and ``rn`` (page size, default 5)
        * ``1`` - synonymous attributes, the parsed args are passed as-is

        :return: the decoded JSON reply with ``message`` set to
                 ``'success'``; ``None`` for any other ``type``
        """
        args = self.getAttributeParse.parse_args()
        kind = args['type']

        if kind == 0:
            # Query the best attributes.
            page = args['pn'] or 1
            size = args['rn'] or 5
            skip = (page - 1) * size
            target = urls['getBestAttr'] % {
                'domain': args['domain'],
                'count': size,
                'offset': skip
            }
        elif kind == 1:
            # Query the synonymous attributes.
            target = urls['getOtherAttr'] % args
        else:
            return None

        data = requests.get(target).json()
        data['message'] = 'success'
        record = {
            'ip': request.remote_addr,
            'url': request.url,
            'data': json.dumps(data)
        }
        logger.info(logging_format % record)
        return data
 def post(self):
     """Forward the parsed arguments to the ``addOtherAttr`` endpoint.

     The upstream call is an HTTP GET built from ``urls['addOtherAttr']``.

     :return: the decoded JSON reply with ``message`` set to ``'success'``
     """
     args = self.parse.parse_args()
     endpoint = urls['addOtherAttr'] % args
     data = requests.get(endpoint).json()
     data['message'] = 'success'

     record = {
         'ip': request.remote_addr,
         'url': request.url,
         'data': json.dumps(data)
     }
     logger.info(logging_format % record)
     return data
Example #4
0
 def add_edges_for_lost(self):
     """Add one edge per row of ``select_id_text_point_url`` to ``self.G``.

     Each row yields the edge ``(str(row[0]) + str(row[1]), row[2])``.
     """
     rows = self.mysql.select_id_text_point_url()
     edges = [(str(row[0]) + str(row[1]), row[2]) for row in rows]
     logger.info("add_edges_for_lost:" + str(edges))
     self.G.add_edges_from(edges)
Example #5
0
 def _check_useful_url(self, url):
     """
     Check whether the driver can load a URL.

     :param url: address passed to ``self.driver.get``
     :return: True when the page loads without a driver error, else False
     """
     try:
         logger.info("check url: " + str(url))
         self.driver.get(url=url)
         logger.info("useful url: " + str(url))
         return True
     # ``exceptions`` is a module, and ``except <module>`` raises TypeError
     # as soon as any error occurs. Catch the driver's base error class.
     # NOTE(review): assumes ``exceptions`` is selenium.common.exceptions.
     except exceptions.WebDriverException as e:
         logger.warning("Unuseful url: " + str(url))
         logger.warning("error: " + str(e))
         return False
Example #6
0
 def make_data(self):
     """Add, for every known URL, edges from the URL to its texts.

     For each URL from ``self.mysql.select_url()`` the rows of
     ``self.mysql.select_text(url)`` become edges
     ``(url, str(row[0]) + str(row[1]))`` added to ``self.G``.
     """
     for page_url in self.mysql.select_url():
         rows = self.mysql.select_text(page_url)
         edges = [(page_url, str(row[0]) + str(row[1])) for row in rows]
         logger.info("add_edges:" + str(edges))
         self.G.add_edges_from(edges)
Example #7
0
    def select_id_text_point_url(self):
        """Fetch ``(id, text, point_url)`` rows joining ``xpath`` and ``relation``.

        :return: the rows from ``fetchall()``; an empty list when the query
                 fails (the failure is logged and a rollback is issued)
        """
        logger.info("select_id_text_point_url...")
        sql = "select x.id, r.text, x.point_url from xpath as x,relation as r where x.id = r.id"
        # Bind up front so the failure path returns an empty result instead
        # of raising UnboundLocalError at the return statement.
        data = []
        try:
            cur = self.conn.cursor()
            cur.execute(sql)
            data = cur.fetchall()
            logger.info("select_id_text_point_url success")

        except Exception as e:
            logger.warning("select_id_text_point_url fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
        return data
Example #8
0
 def _driver_open_url(self, url):
     """
     Open a URL in the driver, then refresh the tracked current URL.

     A driver error is logged and not raised; ``_update_current_url`` is
     called either way.

     :param url: address passed to ``self.driver.get``
     :return: None
     """
     try:
         logger.info("open url: " + str(url))
         self.driver.get(url=url)
         logger.info("current url :" + self.driver.current_url)
     # ``exceptions`` is a module, and ``except <module>`` raises TypeError
     # as soon as any error occurs. Catch the driver's base error class.
     # NOTE(review): assumes ``exceptions`` is selenium.common.exceptions.
     except exceptions.WebDriverException as e:
         logger.warning("open url: %s fail" % url)
         # str() is required: concatenating str and an exception raises.
         logger.warning("error:" + str(e))
     self._update_current_url()
Example #9
0
    def select_text(self, url):
        """Fetch ``(relation.id, relation.text)`` rows for one xpath URL.

        :param url: value matched against ``xpath.url``
        :return: the rows from ``fetchall()``; an empty list when the query
                 fails (the failure is logged and a rollback is issued)
        """
        logger.info("select text...")
        # The URL is bound as a parameter, not interpolated into the
        # statement, so quotes inside it cannot break or alter the SQL.
        sql = "select relation.id,relation.text from relation,xpath where relation.id = xpath.id and xpath.url = %s"
        # Bind up front so the failure path returns an empty result instead
        # of raising UnboundLocalError at the return statement.
        data = []
        try:
            cur = self.conn.cursor()
            cur.execute(sql, (url,))
            data = cur.fetchall()
            logger.info("select text success")

        except Exception as e:
            logger.warning("select_text fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
        return data
Example #10
0
    def update_true_status_to_content(self, url):
        """Set ``status = TRUE`` on the ``content`` rows matching a URL.

        A failure is logged and followed by a rollback; nothing is raised.

        :param url: value matched against ``content.url``
        """
        logger.info("update_true_status_to_content" + ' url:' + str(url))
        # The URL is bound as a parameter, not interpolated into the
        # statement, so quotes inside it cannot break or alter the SQL.
        sql = "UPDATE content SET status = TRUE WHERE url = %s"
        logger.debug('sql:' + sql)
        try:
            # Execute the statement.
            self.conn.cursor().execute(sql, (url,))
            logger.info("update_true_status_to_content success")

        except Exception as e:
            logger.warning("update_true_status_to_content fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
Example #11
0
    def insert_one_to_xpath(self, params):
        """Insert a single row into the ``xpath`` table.

        :param params: values for ``(id, url, xpath, point_url)``
        """
        logger.info("insert_one_to_xpath %s" % (params,))
        statement = ("INSERT INTO xpath (id, url, xpath, point_url) "
                     "VALUES(%s, %s, %s, %s)")
        logger.debug('sql:' + statement)
        try:
            cursor = self.conn.cursor()
            cursor.execute(statement, params)
            logger.info("insert_one_to_xpath success")
        except Exception as err:
            logger.warning("insert_one_to_xpath fail: %s" % err)
            # Undo on failure.
            self.conn.rollback()
Example #12
0
    def select_url(self):
        """Return the distinct ``url`` values stored in the ``xpath`` table.

        :return: list holding the first column of every row; whatever was
                 collected so far (possibly empty) when the query fails
        """
        logger.info("select_distinct_url_from_xpath...")
        statement = "select distinct url from xpath"
        found = []
        try:
            cursor = self.conn.cursor()
            cursor.execute(statement)
            found.extend(row[0] for row in cursor.fetchall())
            logger.info("select point_url from_xpath success")
        except Exception as err:
            logger.warning("select_point_url_from_content fail: %s" % err)
            # Undo on failure.
            self.conn.rollback()
        return found
Example #13
0
    def first_data(self, urls):
        """Insert the initial ``content`` row: layer 0, status true.

        A failure is logged and followed by a rollback; nothing is raised.

        :param urls: value stored in the ``url`` column
        """
        logger.info("first_data")
        # Values are bound as parameters, not interpolated into the
        # statement, so quotes inside the URL cannot break or alter the SQL.
        sql = "INSERT INTO content (layer_number, url, status) VALUES (%s, %s, %s)"
        logger.debug('sql:' + sql)
        try:
            # Execute the statement.
            self.conn.cursor().execute(sql, (0, urls, True))
            logger.info("first_data success")

        except Exception as e:
            logger.warning("first_data fail: %s" % e)
            # Roll back when an error occurs.
            self.conn.rollback()
Example #14
0
    def select_url_from_content(self):
        """Return the ``url`` of every ``content`` row whose status is false.

        :return: list holding the first column of every row; whatever was
                 collected so far (possibly empty) when the query fails
        """
        logger.info("select_url_from_content...")
        statement = "select url from content WHERE status = false"
        logger.debug('sql:' + statement)
        pending = []
        try:
            cursor = self.conn.cursor()
            cursor.execute(statement)
            pending.extend(row[0] for row in cursor.fetchall())
            logger.info("select_url_from_content success")
        except Exception as err:
            logger.warning("select_url_from_content fail: %s" % err)
            # Undo on failure.
            self.conn.rollback()
        return pending
Example #15
0
    def insert_many_to_relation(self, params):
        """
        Insert several rows into the ``relation`` table.

        :param params: sequence of ``(layer_number, id, title, text)`` values
        :return: None
        """
        logger.info("insert_many_to_relation %s" % (params,))
        statement = ("INSERT INTO relation (layer_number, id, title, text) "
                     "VALUES(%s, %s, %s, %s)")
        logger.debug('sql:' + statement)
        try:
            cursor = self.conn.cursor()
            cursor.executemany(statement, params)
            logger.info("insert_many_to_relation success")
        except Exception as err:
            logger.warning("insert_many_to_relation fail: %s" % err)
            # Undo on failure.
            self.conn.rollback()
Example #16
0
    def check_xpath(self, dr):
        """Click every collected xpath and store the ones that navigate.

        A ``Parser`` is built for the given driver name and its
        ``Xpath_list`` is drained. Each xpath is clicked; when the click
        changes the URL, the new URL is recorded in ``obj.new_url``, every
        window other than the main one is closed, and the xpath is stored
        through ``insert_one_to_xpath``. Xpaths that raise are skipped.

        :param dr: driver name; must be 'firefox', 'chrome' or 'ie'
        """
        if dr not in ['firefox', 'chrome', 'ie']:
            logger.warning('wrong driver, only for firefox, chrome, ie!')
            return

        obj = Parser(dr)
        page = obj.parser_by_lxml()
        tag = obj.tag_a_has_href(page)
        obj.get_xpath(page=page, tag=tag)

        while len(obj.Xpath_list) > 0:
            xpath = obj.Xpath_list.pop()
            try:
                obj.driver.find_element_by_xpath(xpath=xpath).click()
                if obj.driver.current_url == obj.current_url:
                    # The click did not navigate; nothing to record.
                    continue
                obj.new_url.append(obj.driver.current_url)
                all_handles = obj.driver.window_handles
                main_handle = obj.driver.current_window_handle
                for handle in all_handles:
                    # Compare with the saved main handle: once a window has
                    # been switched to and closed, the driver's "current"
                    # handle no longer refers to a live window.
                    if handle != main_handle:
                        obj.driver.switch_to_window(handle)
                        obj.driver.close()
                obj.driver.switch_to_window(main_handle)
                logger.info("back to " + obj.driver.current_url)
            except Exception:
                # The xpath was already removed by pop() above; the old
                # ``del obj.Xpath_list[i]`` used an undefined ``i`` and
                # raised NameError from inside the handler.
                logger.warning('delete wrong xpath')
                continue
            obj.update_current_url()
            # Store the xpath just clicked (previously ``Xpath_list[i]``
            # with ``i`` undefined).
            # NOTE(review): this tuple has three values; confirm whether
            # insert_one_to_xpath also needs a point_url value.
            params = (gen_rand_str(length=8, s_type='digit'),
                      obj.driver.current_url, xpath)
            self.insert_one_to_xpath(params)
0
    def insert_many_to_content(self, params):
        """
        Insert several rows into the ``content`` table.

        An integrity error is only logged; any other connector error is
        logged and followed by a rollback.

        :param params: sequence of ``(url, father_url, layer_number)`` values
        :return: None
        """
        logger.info("insert_many_to_content %s" % (params,))
        statement = ("INSERT INTO content (url, father_url, layer_number) "
                     "VALUES(%s, %s, %s)")
        logger.debug('sql:' + statement)
        try:
            cursor = self.conn.cursor()
            cursor.executemany(statement, params)
            logger.info("insert_many_to_content success")
        except mysql.connector.IntegrityError as err:
            # Duplicates are dropped by the uniqueness constraint.
            logger.warning("have same url, lose it: %s" % err)
        except mysql.connector.Error as err:
            logger.warning("insert_many_to_content fail: %s" % err)
            # Undo on failure.
            self.conn.rollback()
Example #18
0
 def end_conn(self):
     """Close the connection held on ``self.conn`` and log the shutdown."""
     connection = self.conn
     connection.close()
     logger.info("close connect")
Example #19
0
    def all_aa(self, url, layer_number):
        """Crawl one page: click its links and store what they lead to.

        Steps, in order:

        1. open ``url`` and store its HTML via ``update_html_to_content``;
        2. collect the ``<a href>`` tags and their xpaths;
        3. click each xpath; when the click changes the URL, record the new
           URL, close extra windows (or go back when there is only one) and
           store the xpath and relation rows;
        4. keep only the new URLs that ``_check_useful_url`` accepts, insert
           them into ``content`` and mark ``url`` as processed;
        5. call ``get_back_to_init``.

        :param url: page to crawl
        :param layer_number: depth value stored with the relation and
                             content rows
        """
        self._driver_open_url(url)
        page, html = self._parser_by_xml()
        self.mysql.update_html_to_content(url=url, html=html)
        tag = self._tag_a_has_href(page)
        text = [a.text for a in tag]
        main_handle = self.driver.current_window_handle
        self._get_xpath(page=page, tag=tag)
        while len(self.Xpath_list) > 0:
            xp = self.Xpath_list.pop()
            # NOTE(review): assumes ``text`` has one entry per xpath.
            te = text.pop()
            logger.debug("len_of_list: " + str(len(self.Xpath_list)))
            logger.debug("xpath:" + str(xp))
            try:
                element = self.driver.find_element_by_xpath(xpath=xp)
            except exceptions.NoSuchElementException as e:
                logger.warning(str(e))
                continue
            if not element.is_enabled():
                logger.info('throw wrong xpath')
                continue
            try:
                element.click()
                if self.driver.current_url == self.current_url:
                    # The click did not navigate; nothing to record.
                    continue
                self.new_url.append(self.driver.current_url)
                all_handles = self.driver.window_handles
                if len(all_handles) > 1:
                    # The click opened extra windows: close all but main.
                    for handle in all_handles:
                        if handle != main_handle:
                            try:
                                self.driver.switch_to.window(handle)
                                self.driver.close()
                                logger.debug("close window: " +
                                             str(handle))
                            except exceptions.NoSuchWindowException as e:
                                logger.warning(
                                    "selenium.common.exceptions.NoSuchWindowException: "
                                    + str(e))

                    self.driver.switch_to.window(main_handle)
                    logger.info("back to main_handle")
                elif len(all_handles) == 1:
                    self.driver.back()
                else:
                    raise ValueError("window_handle wrong")
            except Exception:
                # Best effort: a failing xpath is skipped. ``Exception``
                # rather than a bare except, so KeyboardInterrupt and
                # SystemExit still propagate.
                logger.info('throw wrong xpath')
                continue
            uid = int(gen_rand_str(length=7, s_type='digit'))
            logger.debug(
                '(uid, self.driver.current_url, self.Xpath_list[i], self.driver.current_url):'
                + str(uid) + self.driver.current_url + str(xp) +
                self.driver.current_url)
            self.mysql.insert_one_to_xpath((uid, self.driver.current_url,
                                            str(xp), self.driver.current_url))
            self.mysql.insert_one_to_relation(
                (layer_number, uid, self.driver.title, te))
        # Filter in place. Deleting by index while looping over the original
        # length skipped entries and could raise IndexError.
        self.new_url[:] = [
            candidate for candidate in self.new_url
            if self._check_useful_url(candidate)
        ]
        params = [(new, url, layer_number) for new in self.new_url]
        logger.debug(self.new_url)
        self.mysql.insert_many_to_content(params=params)
        self.mysql.update_true_status_to_content(url=url)

        self.get_back_to_init()