Example #1
 def get(self):
     if request.method != "GET":
         return "error, method not allowed !"
     # mysql_db = PyMysql(HOST, PORT, USER, PASSWORD, DATABASE)
     # print(type(mysql_db))
     #
     # connection, cursor = mysql_db.connectAndGetCursor()
     # current_app.logger.info(type(connection))
     # current_app.logger.info(type(cursor))
     #
     # # Query the table data
     # sql = """select CONCAT("alter table ",a.table_name," convert to character set utf8mb4 collate utf8mb4_bin;") from (select table_name from information_schema.`TABLES` where TABLE_SCHEMA = "contract") a;"""
     #
     # res = mysql_db.executeBySelect(connection, cursor, sql)
     # for sql_str in res:
     #     print(sql_str[0])
     # # current_app.logger.info(res)
     # res = re.search(r"\d+", "read12343123asd")
     res = {"a": 123}
     # cache.set("test", serialize(HtmlItem("test")))
     # cache.lpush("aaa", serialize(HtmlItem("zxcvbnm")))
     # res = cache.get('test')
     # # a = cache.rpop("aaa")
     # logger.info(f"{res, type(res)}")
     # logger.info(f"{a, type(a)}")
     logger.info("test")
     return {"code": 200, "mes": "OK", "data": [res, "a"]}
Example #2
    def uncompress(self, compressed_files_path):
        """
        对传入的压缩文件进行内部解压处理
        Args:
            compressed_files_path: 需要解压的文件地址

        Returns:

        """
        logger.info(f"start uncompress file: {compressed_files_path}")
        if compressed_files_path and os.path.exists(compressed_files_path):

            # 根据文件类型做解压操作
            uncompress_dir = self.un_pack(compressed_files_path)

            # 解压后的文件归一处理
            files_path = self.file_processing(
                uncompress_dir) if uncompress_dir else None

            # 遍历解压后的文件,判断是否还含有压缩包
            self.recursion_decompressing(files_path) if files_path else None

            # 此处表示当前文件夹中没有需要再次解压的压缩包,整理内部所有文件,转移至一处
            files_path = self.file_processing(uncompress_dir)

            # 删除文件夹中的隐藏文件
            # self.del_file(files_path)

        else:
            logger.info(f"文件不存在、或路径存在问题,请检查!")
            files_path = ""
        return files_path
Example #3
    def word_to_pdf_single(word_path: str, dir_path, delete=True):
        """
        On Ubuntu, use the command line to convert a doc/docx file to PDF and,
        depending on the exit status, optionally delete the source file.
        Args:
            word_path: path of the doc/docx file
            dir_path: directory where the converted file is saved
            delete: whether to delete the source file; defaults to True

        Returns:

        """
        try:
            if word_path and os.path.exists(word_path):
                try:
                    status = os.system(
                        f"soffice --headless -convert-to pdf {word_path} --outdir {dir_path}"
                    )
                    if status in [0, "0"] and delete:
                        try:
                            os.remove(word_path)
                            logger.info(f"rm {word_path}")
                        except Exception as e:
                            logger.exception(e)
                except Exception as e:
                    logger.exception(e)

        except Exception as e:
            logger.exception(e)
Example #4
    def download_page(self, item):
        """
        获取下一页的网页地址等
        根据当前状态标记新的状态
        0:初始页面
        1:全局列表页面
        2: 抽取下载连接
        """
        item = MyDict(item)
        try:
            print(f" download_page info: {item.url}")
            logger.info("开始下载网页!{}。类型:{}".format(item.url, item.type))
            # current_app.logger.info("开始下载网页!{}。类型:{}".format(item.url, item.type))
            old_type = item.type
            if item.url != "None" and not cache.get(item.url):
                # if item.url != "None":
                html_obj = requests.get(item.url, headers=self.headers)
                html_str = html_obj.content.decode("gbk")
                item.content = html_str
                print(len(html_str))

                # Cache the response in Redis
                cache.set(item.url, html_str)
                cache.expire(item.url, self.redis_key_timeout)

                # item.xpath_obj = lxml.html.fromstring(html_str)
            logger.info("下载前类型:{}, 下载后类型:{}".format(old_type, item.type))
            self.push_item_in_redis_list(self.message_b, item)
        except Exception as e:
            logger.error("download_page:  {}".format(e))
Example #5
 def _wrap(*args, **kwargs):
     st = time.time()
     rst = func(*args, **kwargs)
     et = time.time()
     output_str = "func: '{}' time: {}s".format(func.__name__, et - st)
     logger.info(output_str)
     return rst
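
The _wrap function above is only the inner closure of a timing decorator; func is supplied by an enclosing function that is not shown in the snippet. A minimal sketch of how such a decorator could be assembled and applied; the outer name timeit, the logger setup, and the sample function are hypothetical.

import functools
import logging
import time

logger = logging.getLogger(__name__)

def timeit(func):
    """Hypothetical outer decorator that supplies func to _wrap above."""
    @functools.wraps(func)
    def _wrap(*args, **kwargs):
        st = time.time()
        rst = func(*args, **kwargs)
        et = time.time()
        logger.info("func: '{}' time: {}s".format(func.__name__, et - st))
        return rst
    return _wrap

@timeit
def slow_add(a, b):
    time.sleep(0.1)
    return a + b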
Example #6
 def executeByInsOrUpdOrDel(self,
                            connection,
                            cursor,
                            sql: str,
                            values=None):
     """
     执行单个sql语句
     :param connection: 数据库连接对象
     :param cursor: mysql游标
     :param sql: 要执行的sql语句
     :param values: 需要插入的数据
     :return: 
     """
     try:
         res = []
         if connection and cursor and sql:
             if values:
                 if isinstance(values, tuple):
                     cursor.execute(sql, values)
                 elif isinstance(values, list):
                     cursor.executemany(sql, values)
                 connection.commit()
                 res = cursor.fetchall()
                 logger.info("操作成功")
                 logger.info("执行结果:{}".format(res))
                 return res
             logger.info("错误:")
             return res
         logger.info("None")
         return res
     except Exception as e:
         logger.error("操作失败:{}".format(e))
         connection.roback()
         return None
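
A minimal usage sketch, assuming the PyMysql wrapper and connectAndGetCursor() shown (commented out) in Example #1; the table and column names are hypothetical.

# Hypothetical usage; HOST, PORT, USER, PASSWORD, DATABASE come from configuration.
mysql_db = PyMysql(HOST, PORT, USER, PASSWORD, DATABASE)
connection, cursor = mysql_db.connectAndGetCursor()

sql = "insert into movie (name, url) values (%s, %s)"

# A tuple goes through cursor.execute() ...
mysql_db.executeByInsOrUpdOrDel(connection, cursor, sql, ("《Example》", "http://example.com"))

# ... while a list of tuples goes through cursor.executemany().
rows = [("《A》", "http://a.example"), ("《B》", "http://b.example")]
mysql_db.executeByInsOrUpdOrDel(connection, cursor, sql, rows)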
Example #7
    def file_processing(uncompress_dir):
        """
        移动转移文件到当前目录
        Args:
            uncompress_dir:

        Returns:

        """
        logger.info(f"start organize files : {uncompress_dir}")
        if os.path.exists(uncompress_dir):
            for path, dirs, files in os.walk(uncompress_dir):
                files_list = [os.path.join(path, file) for file in files]
                [
                    shutil.move(x, uncompress_dir) for x in files_list
                    if not os.path.exists(
                        os.path.join(uncompress_dir, os.path.basename(x)))
                ]
            logger.info(f"文件汇总转移完毕!")
            [[shutil.rmtree(os.path.join(path, son_dir)) for son_dir in dirs]
             for path, dirs, files in os.walk(uncompress_dir)]
            logger.info(f"空文件夹已清空!")

            [[
                os.remove(os.path.join(path, son_dir)) for son_dir in dirs
                if son_dir.startswith(".")
            ] for path, dirs, files in os.walk(uncompress_dir)]
            logger.info(f"无关文件已删除!")

        return uncompress_dir
Example #8
 def _word_to_pdf_batch(dir_path, transform=None, delete=True):
     if transform is None:
         transform = ["doc", "docx"]
     try:
         if dir_path and os.path.exists(dir_path) and transform:
             for file_type in transform:
                 if os.system(
                         f"soffice --headless -convert-to pdf {dir_path}/*.{file_type} --outdir {dir_path}"
                 ) in [0, "0"] and delete:
                     try:
                         os.system(rf"rm -f {dir_path}/*.{file_type}")
                         logger.info(f"rm -f {dir_path}/*.{file_type}")
                     except Exception as e:
                         logger.exception(e)
     except Exception as e:
         logger.exception(e)
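
os.system only reports the shell's exit status and relies on shell globbing for the *.{file_type} pattern. As an alternative sketch (not part of the original code), the same conversion step can be written with glob and subprocess.run, which makes the per-file exit code explicit; soffice and its flags are the same as above.

import glob
import os
import subprocess

def word_to_pdf_batch_subprocess(dir_path, file_type="docx", delete=True):
    # Alternative to the os.system call above: one soffice invocation per file.
    for word_file in glob.glob(os.path.join(dir_path, f"*.{file_type}")):
        result = subprocess.run(
            ["soffice", "--headless", "--convert-to", "pdf", word_file, "--outdir", dir_path]
        )
        if result.returncode == 0 and delete:
            os.remove(word_file)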
Example #9
 def get_message(self, key, func):
     try:
         index = 0
         while True:
             message = cache.rpop(key)
             if message:
                 logger.info(f"{key}, message{message}")
                 # message = deserialization(self.__cls__().Default, message, url="default")
                 merge_process = Thread(target=func, args=(eval(message), ))
                 merge_process.start()
                 index = 0
             else:
                 if index >= self.timeout:
                     exit()
                 index += 1
                 # logger.info(f"{key}: {index}")
                 time.sleep(1)
     except Exception as e:
         logger.exception(e)
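
get_message pops items with cache.rpop and rebuilds them with eval, so the producer side (push_item_in_redis_list, referenced in Examples #4 and #14 but not shown) presumably pushes a string that eval can parse. A minimal sketch of such a producer, assuming cache is a redis.Redis client; the real implementation may differ.

def push_item_in_redis_list(key, item):
    # repr() of a plain dict round-trips through eval(message) in get_message above.
    # This is only safe because the queue content is produced by this process itself;
    # eval must never be fed untrusted data.
    cache.lpush(key, repr(dict(item)))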
Example #10
 def get(self):
     """
     
     # @description: 
     # @param {type} 
     # @return: 
     
     """
     try:
         key = request.args.get("key", None)
         if key == "search":
             values = request.args.get("search", None)
             if values:
                 search_res = ZeroCrawler(values).start()
                 print(search_res)
                 res = {"code": 200, "msg": "OK", "data": search_res}
                 logger.info("请求成功:{}".format(res))
                 return res
             res = {"code": 1002, "msg": "error", "data": []}
             logger.info("请求失败:{}".format(res))
             return res
         res = {"code": 1000, "msg": "不做处理", "data": []}
         logger.info("请求成功:{}".format(res))
         return res
     except Exception as e:
         logger.error(e)
Example #11
    def word_to_pdf_batch(self, dir_path):
        """
        文件夹内的word文件批量转换为PDF,
        主要使用的是word中插入的图片,对文本没有要求
        Args:
            dir_path: 需要处理的文件夹

        Returns:

        """
        if dir_path:
            try:
                all_word_files = []
                for root, dirs, files in os.walk(dir_path):
                    [
                        self.word_to_pdf_single(os.path.join(root, x),
                                                dir_path) for x in files
                        if x.rsplit(".", 1)[-1] in ['doc', "docx"]
                        and not x.startswith(".")
                    ]
                logger.info(all_word_files)
            except Exception as e:
                logger.exception(e)
Example #12
    def processes(crawlerProcess, start_url):
        logger.info("start to get info with dytt")
        number, result = crawlerProcess(start_url).start()
        logger.info("start write info to mysql")
        # self.write_to_mysql(result, dyHeaven_fields, DyHeaven)
        print("数据库字段:{}".format(dyHeaven_fields))
        logger.info("抽取到的结果:{}".format(result))

        return number, result
Example #13
    def executeBySelect(self, connection, cursor, sql: str):
        try:

            res = []
            if connection and cursor and sql:
                cursor.execute(sql)
                res = cursor.fetchall()
                logger.info("操作成功")
                logger.info("执行结果:{}".format(res))
                return res
            logger.info("缺少关键参数")
            return res
        except Exception as e:
            logger.error(e)
            return []
Example #14
    def start(self):
        try:
            self.push_item_in_redis_list(self.message_a, self.start_item)
            thr_downloader = Thread(target=self.get_message, args=(self.message_a, self.download_page))
            thr_primary = Thread(target=self.get_message, args=(self.message_b, self.primary))
            thr_merge_result = Thread(target=self.get_message, args=(self.message_c, self.merger_result))
            thr_downloader.start()
            logger.info("下载线程开启!")
            time.sleep(3)

            thr_primary.start()
            logger.info("抽取线程开启!")

            thr_merge_result.start()
            logger.info("数据合并线程开启")

            thr_merge_result.join()
            return self.result
        except Exception as identifier:
            logger.exception(identifier)
Example #15
    def check_libre_office_status():
        """
        检查系统是否安装了libreoffice,没有安装的话,进行软件的安装
        Returns:

        """
        try:
            # os.system returns the command's exit status: 0 when libreoffice is on PATH
            office_info = os.system("libreoffice --version")
            logger.info(f"{office_info}")
            if office_info not in [0, "0"]:
                logger.info("LibreOffice is not installed on this system")
                logger.info("start install libreoffice")
                if os.system("yum install -y libreoffice") in [0, "0"]:
                    logger.info("installation succeeded")
                else:
                    logger.info("installation failed")
            else:
                logger.info("LibreOffice is already installed")
        except Exception as e:
            logger.exception(e)
Example #16
    def primary(self, item):
        """
        抽取主方法,抽取连接等所需要等内容
        """
        item = MyDict(item)
        try:
            logger.info("开始抽取:{}".format(item.url))
            xpath_obj = lxml.html.fromstring(item.content)
            # current_app.logger.info("开始抽取:{}".format(item.type))
            if item.type == "1":
                try:

                    tables = xpath_obj.xpath(
                        "//div[@class='bd3']//div[@class='co_content8']/ul//table"
                    )
                    print(len(tables))
                    for table in tables:
                        # Extract the URL
                        url = table.xpath(".//tr[2]//a/@href")
                        if url:
                            new_url = url[0]
                            if not new_url.startswith("http"):
                                new_url = dyttUrl + new_url
                            tr_item = MyDict()
                            tr_item.url = new_url
                            tr_item.type = "2"
                            title = table.xpath(".//tr[2]//a/text()")
                            tr_item.title = title[0] if title else "None"

                            name = table.xpath(
                                ".//tr[2]//a/text()")[0] if table.xpath(
                                    ".//tr[2]//a/text()") else "《None》"
                            tr_item.name = "《{}》".format("".join(
                                re.findall(r"《(.*?)》", name)))

                            date = table.xpath(".//tr[3]//font/text()")
                            tr_item.date = date[0] if date else "None"

                            introduction = table.xpath(".//tr[4]/td/text()")
                            tr_item.introduction = introduction[
                                0] if introduction else "None"
                            self.push_item_in_redis_list(
                                self.message_a, tr_item)
                            logger.info("{}".format({
                                "url":
                                tr_item.url[:30],
                                "name":
                                tr_item.name[:6],
                                "title":
                                tr_item.title[:6],
                                "introduction":
                                tr_item.introduction[:30],
                                "type":
                                tr_item.type[:6]
                            }))
                except Exception as e:
                    logger.error("列表页抽取:  {}".format(e))
                    # current_app.logger.error("列表页抽取:  {}".format(e))
            elif item.type == "2":
                try:
                    logger.info("开始抽取详情页信息:{}".format(item.url))
                    # current_app.logger.info("开始抽取详情页信息:{}".format(item.url))
                    down_lists = xpath_obj.xpath("//div[@id='downlist']/table")
                    if not down_lists:
                        down_lists = xpath_obj.xpath("//div[@id='Zoom']/table")
                        # print("再次处理得到的连接:{}".format(len(down_lists)))
                    # print("共有下载连接:{}".format(len(down_lists)))
                    # magnet_info = {}
                    magnet_info = []
                    for downloader in down_lists:
                        magnet = downloader.xpath(
                            ".//a/@href")[0] if downloader.xpath(
                                ".//a/@href") else ""
                        download_name = downloader.xpath(
                            ".//a/text()")[0] if downloader.xpath(
                                ".//a/text()") else "]"
                        magnet_name = re.split(r"[=\]/]", download_name)[-1]
                        # magnet_info.update({magnet_name: magnet})
                        magnet_info.append((magnet_name, magnet))
                    item.magnet = magnet_info
                    self.push_item_in_redis_list(self.message_c, item)
                except Exception as e:
                    logger.error("信息页抽取:  {}".format(e))
                    # current_app.logger.error("信息页抽取:  {}".format(e))
            elif item.type == "0":
                try:
                    area2s = xpath_obj.xpath(
                        "//div[@class='bd2']/div[@class='index_list']/div[@class='co_area2']"
                    )
                    for area in area2s:
                        url = area.xpath(
                            ".//div[@class='title_all']/p/span/a/@href")
                        if url:
                            new_url = url[0]
                            if not new_url.startswith("http"):
                                new_url = dyttUrl + new_url
                            logger.info("首页抽取抽取到的网址:{}".format(new_url))
                            # current_app.logger.info("首页抽取抽取到的网址:{}".format(new_url))
                            # tr_item = DyHeavenCrawler.Default(new_url)

                            tr_item = MyDict()
                            tr_item.url = new_url

                            tr_item.type = "1"
                            tr_item.classify = area.xpath(
                                ".//div[@class='title_all']/p/span/a/text()"
                            )[0]
                            # print("首页抽取抽取到的分类:{}".format(tr_item.classify))
                            self.push_item_in_redis_list(
                                self.message_a, tr_item)
                except Exception as e:
                    logger.error("首页抽取:  {}".format(e))
                    # current_app.logger.error("首页抽取:  {}".format(e))
        except Exception as e:
            logger.error("primary:  {}".format(e))