Ejemplo n.º 1
0
    def __add_item_to_db(self, items, update_items, requests, callbacks,
                         items_fingerprints):
        """Flush one batch of buffered work to the database.

        Steps, in order: optionally de-duplicate ``items``, group items and
        update-items by table, export each table's rows, run the queued
        callbacks, remove the finished requests from ``self._table_request``
        and finally record the fingerprints in the de-dup filter.

        Args:
            items: Items to insert.
            update_items: Items to write as updates; the update keys for each
                table are looked up in ``self._item_update_keys``.
            requests: Finished requests to remove from ``self._table_request``;
                skipped when falsy.
            callbacks: List of zero-argument callables. It is drained in place;
                an exception raised by one callback is logged and does not stop
                the remaining ones.
            items_fingerprints: Fingerprints belonging to ``items``.

        Fingerprints are only committed when at least one export ran and
        *every* export of this batch reported success. Previously only the
        result of the last exported table was considered, so items of an
        earlier, failed table could be marked as already seen without ever
        being stored.
        """
        export_results = []
        self._is_adding_to_db = True

        # The flag is reset in ``finally`` so an unexpected error in any step
        # cannot leave the buffer permanently marked as "adding to db".
        try:
            # De-duplicate
            if setting.ITEM_FILTER_ENABLE:
                items, items_fingerprints = self.__dedup_items(
                    items, items_fingerprints)

            # Group by table
            items_dict = self.__pick_items(items)
            update_items_dict = self.__pick_items(update_items,
                                                  is_update_item=True)

            # Batch insert
            while items_dict:
                tab_item, datas = items_dict.popitem()

                log.debug("""
                -------------- item 批量入库 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                export_results.append(self.__export_to_db(tab_item, datas))

            # Batch update
            while update_items_dict:
                tab_item, datas = update_items_dict.popitem()
                log.debug("""
                -------------- item 批量更新 --------------
                表名: %s
                datas: %s
                    """ % (tab_item, tools.dumps_json(datas, indent=16)))

                update_keys = self._item_update_keys.get(tab_item)
                export_results.append(
                    self.__export_to_db(tab_item,
                                        datas,
                                        is_update=True,
                                        update_keys=update_keys))

            # Run callbacks
            while callbacks:
                try:
                    callback = callbacks.pop(0)
                    callback()
                except Exception as e:
                    log.exception(e)

            # Remove the requests that have been processed
            if requests:
                self._db.zrem(self._table_request, requests)

            # Commit fingerprints to the de-dup filter only when every export
            # of this batch succeeded (and at least one export happened).
            export_success = bool(export_results) and all(export_results)
            if export_success and setting.ITEM_FILTER_ENABLE:
                if items_fingerprints:
                    self.__class__.dedup.add(items_fingerprints,
                                             skip_check=True)
        finally:
            self._is_adding_to_db = False
Ejemplo n.º 2
0
    def create(self):
        """Generate ``__init__.py`` in the current working directory.

        Every entry of the working directory whose name ends in ``.py`` and
        does not start with ``__init__`` contributes the part of its name
        before the first dot. The collected names are serialised with
        ``dumps_json`` and written as a single ``__all__ = ...`` assignment,
        overwriting any existing ``__init__.py``.
        """
        import os

        module_names = [
            entry.split(".")[0]
            for entry in os.listdir(os.getcwd())
            if entry.endswith(".py") and not entry.startswith("__init__")
        ]

        with open("__init__.py", "w") as init_file:
            init_file.write("__all__ = %s" % dumps_json(module_names))
Ejemplo n.º 3
0
    def create(self, sort_keys=False):
        """Turn ``key: value`` style lines into a JSON object and print it.

        The lines come from ``self.get_data()``. Each line is stripped; empty
        lines and lines starting with ``:`` are skipped. A line is split into a
        key (the leading run of characters that are neither ``:`` nor
        whitespace), a separator run made of ``:``, ``|`` and whitespace, and
        the remaining text as the value. When a key occurs more than once the
        values are concatenated with ``&``.

        Args:
            sort_keys: Forwarded to ``tools.dumps_json``.
        """
        # Raw string: "\s" inside a normal string literal is an invalid escape
        # sequence, which newer Python versions warn about. The pattern text
        # itself is unchanged. Defined once instead of on every iteration.
        regex = r"([^:\s]*)[:|\s]*(.*)"

        parsed = {}  # not named ``json`` so the stdlib module name is not shadowed
        for content in self.get_data():
            content = content.strip()
            if not content or content.startswith(":"):
                continue

            result = tools.get_info(content, regex, fetch_one=True)
            key = result[0]
            value = result[1]
            if key in parsed:
                # Repeated key: append to the value collected so far.
                parsed[key] = parsed[key] + "&" + value
            else:
                parsed[key] = value.strip()

        print(tools.dumps_json(parsed, sort_keys=sort_keys))
Ejemplo n.º 4
0
def main():
    """Convert the ``key: value`` / ``key=value`` lines of ``to_json.txt`` to JSON.

    The file is read through ``tools.read_file`` with ``readlines=True``. Each
    line is stripped; empty lines and lines starting with ``:`` are skipped. A
    line is split into a key (the leading run of characters that are none of
    ``:``, ``=`` or whitespace), a separator run made of ``:``, ``=`` and
    whitespace, and the remaining text as the value. When a key occurs more
    than once the values are concatenated with ``&``. The resulting object is
    printed via ``tools.dumps_json``.
    """
    # Raw string: "\s" inside a normal string literal is an invalid escape
    # sequence, which newer Python versions warn about. The pattern text itself
    # is unchanged. Defined once instead of on every iteration.
    regex = r"([^:\s=]*)[:=\s]*(.*)"

    parsed = {}  # not named ``json`` so the stdlib module name is not shadowed
    for content in tools.read_file('to_json.txt', readlines=True):
        content = content.strip()
        if not content or content.startswith(':'):
            continue

        result = tools.get_info(content, regex, fetch_one=True)
        key = result[0]
        value = result[1]
        if key in parsed:
            # Repeated key: append to the value collected so far.
            parsed[key] = parsed[key] + '&' + value
        else:
            parsed[key] = value.strip()

    print(tools.dumps_json(parsed))
Ejemplo n.º 5
0
 def __repr__(self):
     """Return ``<item_name: json>`` where json is ``to_dict`` dumped by ``tools.dumps_json``."""
     name = self.item_name
     payload = tools.dumps_json(self.to_dict)
     return "<{}: {}>".format(name, payload)
Ejemplo n.º 6
0
    def deal_requests(self, requests):
        """Download and parse every request in ``requests`` and route the results.

        Each element of ``requests`` is a dict with the keys ``"request_obj"``
        (the request to process) and ``"request_redis"`` (the stored form of
        that request; it is handed to the buffers for deletion once the work
        has been queued and is used to rebuild the original request for
        retries). ``"request_redis"`` is ``None`` for synchronous follow-ups
        created here.

        For the first parser in ``self._parsers`` whose ``name`` equals
        ``request.parser_name``:

        * the optional download middleware is applied and the response is
          fetched (only when ``request.auto_request`` is truthy);
        * ``request.callback`` (or ``parser.parser``) is called and each result
          is dispatched: ``Request`` objects with ``request_sync`` set are
          appended to ``requests`` and handled later in this same call, other
          ``Request`` objects go to the request buffer, ``Item`` objects go to
          the item buffer, and callables go to the item buffer when the
          previous result was an ``Item`` and to the request buffer otherwise.
          Results of any other type are ignored;
        * on any exception the failure is recorded and logged,
          ``parser.exception_request`` is consulted, and each returned request
          is either queued for retry or, once the retry limit is exceeded or
          it is abandoned, passed to ``parser.failed_request``.

        After all requests are handled, sleeps ``setting.PARSER_SLEEP_TIME``
        seconds when that setting is truthy.

        Args:
            requests: Mutable list of request dicts as described above. The
                list is extended in place with synchronous follow-up requests.

        Raises:
            Exception: When ``exception_request`` / ``failed_request`` return a
                non-iterable, or ``exception_request`` yields something that is
                neither callable nor a ``Request``. These are raised from
                inside the error handler and therefore propagate to the caller.
        """
        for request in requests:

            response = None
            request_redis = request["request_redis"]
            request = request["request_obj"]

            del_request_redis_after_item_to_db = False
            del_request_redis_after_request_to_db = False

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    used_download_midware_enable = False
                    try:
                        # Count one more document to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # Fetch the response for the request
                        if request.auto_request:
                            request_temp = None
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            if request_temp:
                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                used_download_midware_enable = True
                                response = (
                                    request_temp.get_response()
                                    if not setting.RESPONSE_CACHED_USED else
                                    request_temp.get_response_from_cached(
                                        save_cached=False))
                            else:
                                response = (request.get_response()
                                            if not setting.RESPONSE_CACHED_USED
                                            else
                                            request.get_response_from_cached(
                                                save_cached=False))

                            # Identity check against None (was ``== None``).
                            if response is None:
                                raise Exception(
                                    "连接超时 url: %s" %
                                    (request.url or request_temp.url))

                        else:
                            response = None

                        if request.callback:  # use the request's own callback when it has one
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # otherwise fall back to parser.parser
                            results = parser.parser(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parser"))

                        # Remembers the kind of the previous result:
                        # 0 = none yet, 1 = request, 2 = item
                        result_type = 0
                        # Dispatch each result depending on whether it is a
                        # request, an item or a callback
                        for result in results or []:
                            if isinstance(result, Request):
                                result_type = 1
                                # Default the request's parser_name to this parser
                                result.parser_name = result.parser_name or parser.name

                                # Synchronous or asynchronous follow-up?
                                if result.request_sync:  # synchronous
                                    request_dict = {
                                        "request_obj": result,
                                        "request_redis": None,
                                    }
                                    # Appended to the list being iterated, so it
                                    # is processed later in this same call.
                                    requests.append(request_dict)
                                else:  # asynchronous
                                    # Queue the next request for storage
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            elif isinstance(result, Item):
                                result_type = 2
                                # Queue the item for storage
                                self._item_buffer.put_item(result)
                                # The in-progress request must be deleted afterwards
                                del_request_redis_after_item_to_db = True

                            elif callable(result):  # result is a zero-argument callable
                                if (result_type == 2
                                    ):  # item callback: queued behind the buffered items
                                    self._item_buffer.put_item(result)
                                    del_request_redis_after_item_to_db = True

                                else:  # request callback (also when a parser yields a callback first): queued behind the buffered requests
                                    self._request_buffer.put_request(result)
                                    del_request_redis_after_request_to_db = True

                            # NOTE: results of any other type are silently ignored.

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # Count a failed download
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # Count a parser exception
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # traceback only in debug mode; timeouts are too noisy otherwise
                            log.exception(e)

                        log.error("""
                            -------------- %s.%s error -------------
                            error          %s
                            response       %s
                            deal request   %s
                            """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        # Kept in its own local: rebinding ``requests`` here
                        # would detach later ``requests.append(...)`` calls
                        # from the list the outer loop is iterating, losing
                        # synchronous follow-up requests.
                        exception_requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(exception_requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        # ``request`` is rebound below; it is not read again
                        # for the current outer iteration after this loop.
                        for request in exception_requests:
                            if callable(request):
                                self._request_buffer.put_request(request)
                                continue

                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需return request")

                            if (request.retry_times + 1 >
                                    setting.PARSER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # count the failed task

                                # failed_request may return requests, callables or items
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                for result in results:
                                    if isinstance(result, Request):
                                        if setting.SAVE_FAILED_REQUEST:
                                            if used_download_midware_enable:
                                                # Drop the attributes added by
                                                # download_midware by rebuilding
                                                # the request from its stored form.
                                                # NOTE(review): eval() on stored
                                                # data executes whatever that
                                                # store contains - confirm the
                                                # store is trusted.
                                                original_request = (
                                                    Request.from_dict(
                                                        eval(request_redis)) if
                                                    request_redis else result)
                                                original_request.error_msg = (
                                                    request.error_msg)
                                                original_request.response = (
                                                    request.response)

                                                self._request_buffer.put_failed_request(
                                                    original_request)
                                            else:
                                                self._request_buffer.put_failed_request(
                                                    result)

                                    elif callable(result):
                                        self._request_buffer.put_request(
                                            result)

                                    elif isinstance(result, Item):
                                        self._item_buffer.put_item(result)

                                del_request_redis_after_request_to_db = True

                            else:
                                # Queue the request again for another attempt
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                    入库 等待重试
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))
                                if used_download_midware_enable:
                                    # Drop the attributes added by
                                    # download_midware: retry with the request
                                    # rebuilt from its stored form.
                                    # NOTE(review): eval() on stored data -
                                    # confirm the store is trusted.
                                    original_request = (Request.from_dict(
                                        eval(request_redis)) if request_redis
                                                        else request)
                                    if hasattr(request, "error_msg"):
                                        original_request.error_msg = request.error_msg
                                    if hasattr(request, "response"):
                                        original_request.response = request.response
                                    original_request.retry_times = request.retry_times
                                    original_request.filter_repeat = (
                                        request.filter_repeat)

                                    self._request_buffer.put_request(
                                        original_request)
                                else:
                                    self._request_buffer.put_request(request)
                                del_request_redis_after_request_to_db = True

                    else:
                        # Count a successful download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # Count a successful task
                        self.__class__._success_task_count += 1

                        # Cache the successfully downloaded document
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    # Only the first parser with a matching name handles the request
                    break

            # Delete the in-progress request; following the items takes priority
            if request_redis:
                if del_request_redis_after_item_to_db:
                    self._item_buffer.put_item(request_redis)

                elif del_request_redis_after_request_to_db:
                    self._request_buffer.put_del_request(request_redis)

                else:
                    self._request_buffer.put_del_request(request_redis)

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)
Ejemplo n.º 7
0
    def deal_requests(self, requests):
        """Download and parse every request in ``requests`` (in-memory queue variant).

        For the first parser in ``self._parsers`` whose ``name`` equals
        ``request.parser_name``:

        * when ``request.auto_request`` is truthy the optional download
          middleware is applied (its returned request replaces ``request``)
          and the response is fetched;
        * ``request.callback`` (or ``parser.parser``) is called; every
          ``Request`` among the results is either appended to ``requests``
          (``request_sync`` set, handled later in this same call) or added to
          ``self._memory_db``. Results of any other type are ignored;
        * on any exception the failure is recorded and logged,
          ``parser.exception_request`` is consulted, and each returned request
          is either added back to ``self._memory_db`` for a retry or, once the
          retry limit is exceeded or it is abandoned, passed to
          ``parser.failed_request`` and dropped.

        After all requests are handled, sleeps ``setting.PARSER_SLEEP_TIME``
        seconds when that setting is truthy.

        Args:
            requests: Mutable list of requests. The list is extended in place
                with synchronous follow-up requests.

        Raises:
            Exception: When ``exception_request`` / ``failed_request`` return a
                non-iterable, or ``exception_request`` yields something that is
                not a ``Request``. These are raised from inside the error
                handler and therefore propagate to the caller.
        """
        for request in requests:

            response = None

            for parser in self._parsers:
                if parser.name == request.parser_name:
                    try:
                        # Count one more document to download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_TOTAL, parser.name)

                        # Fetch the response for the request
                        if request.auto_request:
                            request_temp = None
                            if request.download_midware:
                                download_midware = (
                                    request.download_midware
                                    if callable(request.download_midware)
                                    else tools.get_method(
                                        parser, request.download_midware))
                                request_temp = download_midware(request)
                            elif request.download_midware != False:
                                request_temp = parser.download_midware(request)

                            if request_temp:
                                if not isinstance(request_temp, Request):
                                    raise Exception(
                                        "download_midware need return a request, but received type: {}"
                                        .format(type(request_temp)))
                                request = request_temp

                            response = (request.get_response()
                                        if not setting.RESPONSE_CACHED_USED
                                        else request.get_response_from_cached(
                                            save_cached=False))

                        else:
                            response = None

                        if request.callback:  # use the request's own callback when it has one
                            callback_parser = (request.callback if callable(
                                request.callback) else tools.get_method(
                                    parser, request.callback))
                            results = callback_parser(request, response)
                        else:  # otherwise fall back to parser.parser
                            results = parser.parser(request, response)

                        if results and not isinstance(results, Iterable):
                            raise Exception(
                                "%s.%s返回值必须可迭代" %
                                (parser.name, request.callback or "parser"))

                        # Only Request results are handled here
                        for result in results or []:
                            if isinstance(result, Request):
                                # Default the request's parser_name to this parser
                                result.parser_name = result.parser_name or parser.name

                                # Synchronous or asynchronous follow-up?
                                if result.request_sync:  # synchronous
                                    # Appended to the list being iterated, so it
                                    # is processed later in this same call.
                                    requests.append(result)
                                else:  # asynchronous
                                    # Queue the next request
                                    self._memory_db.add(result)

                    except Exception as e:
                        exception_type = (str(type(e)).replace("<class '",
                                                               "").replace(
                                                                   "'>", ""))
                        if exception_type.startswith("requests"):
                            # Count a failed download
                            self.record_download_status(
                                PaserControl.DOWNLOAD_EXCEPTION, parser.name)

                        else:
                            # Count a parser exception
                            self.record_download_status(
                                PaserControl.PAESERS_EXCEPTION, parser.name)

                        if setting.LOG_LEVEL == "DEBUG":  # traceback only in debug mode; timeouts are too noisy otherwise
                            log.exception(e)

                        log.error("""
                                -------------- %s.%s error -------------
                                error          %s
                                response       %s
                                deal request   %s
                                """ % (
                            parser.name,
                            (request.callback and callable(request.callback)
                             and getattr(request.callback, "__name__")
                             or request.callback) or "parser",
                            str(e),
                            response,
                            tools.dumps_json(request.to_dict, indent=28)
                            if setting.LOG_LEVEL == "DEBUG" else request,
                        ))

                        request.error_msg = "%s: %s" % (exception_type, e)
                        request.response = str(response)

                        if "Invalid URL" in str(e):
                            request.is_abandoned = True

                        # Kept in its own local: rebinding ``requests`` here
                        # would detach later ``requests.append(...)`` calls
                        # from the list the outer loop is iterating, losing
                        # synchronous follow-up requests.
                        exception_requests = parser.exception_request(
                            request, response) or [request]
                        if not isinstance(exception_requests, Iterable):
                            raise Exception("%s.%s返回值必须可迭代" %
                                            (parser.name, "exception_request"))
                        # ``request`` is rebound below; it is not read again
                        # for the current outer iteration after this loop.
                        for request in exception_requests:
                            if not isinstance(request, Request):
                                raise Exception(
                                    "exception_request 需return request")

                            if (request.retry_times + 1 >
                                    setting.PARSER_MAX_RETRY_TIMES
                                    or request.is_abandoned):
                                self.__class__._failed_task_count += 1  # count the failed task

                                # failed_request is called for its side effects;
                                # its return value is only validated here.
                                results = parser.failed_request(
                                    request, response) or [request]
                                if not isinstance(results, Iterable):
                                    raise Exception(
                                        "%s.%s返回值必须可迭代" %
                                        (parser.name, "failed_request"))

                                log.info("""
                                    任务超过最大重试次数,丢弃
                                    url     %s
                                    重试次数 %s
                                    最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))

                            else:
                                # Queue the request again for another attempt
                                request.retry_times += 1
                                request.filter_repeat = False
                                log.info("""
                                        入库 等待重试
                                        url     %s
                                        重试次数 %s
                                        最大允许重试次数 %s""" % (
                                    request.url,
                                    request.retry_times,
                                    setting.PARSER_MAX_RETRY_TIMES,
                                ))
                                self._memory_db.add(request)

                    else:
                        # Count a successful download
                        self.record_download_status(
                            PaserControl.DOWNLOAD_SUCCESS, parser.name)
                        # Count a successful task
                        self.__class__._success_task_count += 1

                        # Cache the successfully downloaded document
                        if setting.RESPONSE_CACHED_ENABLE:
                            request.save_cached(
                                response=response,
                                expire_time=setting.
                                RESPONSE_CACHED_EXPIRE_TIME,
                            )

                    # Only the first parser with a matching name handles the request
                    break

        if setting.PARSER_SLEEP_TIME:
            time.sleep(setting.PARSER_SLEEP_TIME)