Example #1
0
def get_status(sku):
    """
    Return the processing status of a product together with its details.

    Reads the first status record stored for ``sku`` in ``db_status`` and,
    when a matching document exists in the ``product_details`` collection,
    the product name, URL and image URL.

    :param sku: identifier of the product to look up
    :return: a dict with ``status``, ``product_name``, ``product_url`` and
        ``image_url`` when a status record exists;
        ``{"status": __in_queue__}`` when no status record exists yet;
        ``{"status": __error__}`` on any other failure
    """
    try:
        # Indexing the empty result list raises IndexError when the sku has
        # no status record; that case is handled below.
        status = list(db_status.find({"sku": sku}))[0].get("msg")
        db_details = DB.init_db(config.get("details_db")).product_details
        product = list(db_details.find({"sku": sku}))
        product_url, product_name, image_url = "", "", ""
        if product:
            product_name = product[0].get("product_name")
            product_url = product[0].get("url")
            image_url = product[0].get("img")
        logger.info("Status for {}: {}".format(sku, status))
        return {
            "status": status,
            "product_name": product_name,
            "product_url": product_url,
            "image_url": image_url,
        }
    except IndexError:
        # This happens due to a race condition because the sku hasn't been
        # added to the database yet or because it simply doesn't exist. The
        # second case is only true if the URL has been typed in manually or
        # bookmarked but the sku is missing from the URL.
        logger.warning(
            "Product status not yet available for sku {}".format(sku))
        _set_status(__in_queue__, sku)
        # Report the status that was just written instead of the still-unset
        # local value, so the caller and the database agree.
        return {"status": __in_queue__}
    except Exception as e:
        logger.exception(e)
        return {"status": __error__}
Example #2
0
def get_answer(question, sku):
    """
    Answer ``question`` about the product ``sku`` and attach its votes.

    Runs ``inference.Inference(sku).infer(question)``, builds a
    ``qna_clustering.Cluster`` for the question/answer pair and merges the
    result of ``cluster.get_votes`` into the response.

    NOTE(review): an earlier version of this docstring described a websocket
    kept open for the session; nothing in this function opens one.

    :param question: the question text to run inference on
    :param sku: identifier of the product whose model is used
    :return: a dict with ``confidence``, ``answer`` and ``status`` ("OK")
        plus whatever ``get_votes`` returns (presumably a mapping -- verify
        against ``Cluster.get_votes``); ``{"status": __error__}`` on failure
    """
    response = {"status": __error__}
    try:
        db_votes = DB.init_db(config.get("votes_db"))
        inf = inference.Inference(sku)
        answer, confidence = inf.infer(question)
        cluster = qna_clustering.Cluster({
            "question": question,
            "answer": answer,
            "sku": sku
        })
        votes = cluster.get_votes(db_votes)
        response = {
            "confidence": confidence,
            "answer": answer,
            "status": "OK",
        }
        response.update(votes)
    except Exception as e:
        # str(e) is safe for exceptions raised without arguments, whereas
        # e.args[0] would raise IndexError inside this handler.
        msg = '{}: {}'.format(type(e).__name__, e)
        logger.exception(msg)
    return response
Example #3
0
    def start(self):
        """
        Start the engine.

        1. The engine may be started several times in a row, in a fixed order.
        2. The ``start_requests`` task (always present) runs first, followed
           by ``end_requests_1``, ``end_requests_2``, ``end_requests_3`` and
           so on.
        3. ``end_requests_*`` tasks are optional; whether they exist depends
           on the business builders that were loaded successfully.
        4. Once a round finds no ``end_requests`` task in any builder, the
           engine stops restarting and really finishes.
        """

        try:
            # The start_requests round always runs first.
            self.__start_engine('start_requests')
            # At most 10 follow-up rounds: the cap protects against an unknown
            # bug causing an endless loop; adjust it to real needs.
            for round_no in range(1, 11):
                request_type = 'end_requests_%s' % round_no
                self.__start_engine(request_type)
                if not self.__while_run or round_no >= 10:
                    cf.print_log('%s没有请求任务,引擎循环启动结束!' % request_type)
                    break
        except CheckUnPass as e:
            logger.exception(e)
        except Exception as e:
            logger.ding_exception(self.__f_exception, e, self.framework_key)

        # Final summary: builders, requests added, responses done, errors.
        totals = (self.__builders_num, self.total_request_nums,
                  self.total_response_nums, self.total_error_nums)
        cf.print_log('总共完成业务%s个!添加请求%s个,完成响应%s个,其中错误响应%s个!' % totals)
Example #4
0
    def _amazon_detail_parser(self, soup):
        """
        Extract product name, review count, page count and image URL from an
        Amazon product detail page.

        :param soup: parsed detail page; it must support ``select``
            (presumably a BeautifulSoup object -- confirm with the caller)
        :return: a dict with ``product_name``, ``review_count``,
            ``page_count`` and ``img_url``, or an empty dict when the review
            count cannot be parsed
        """
        logger.info("Started parsing Amazon product detail page for {}".format(self.sku))
        site_selectors = selectors.get(self.source)
        outer_count_sel = site_selectors.get("review_count_outer")
        inner_count_sel = site_selectors.get("review_count_inner")
        name_selector = site_selectors.get("product_name")
        image_selector = site_selectors.get("product_image")

        try:
            review_count = self._amazon_review_count(soup, outer_count_sel, inner_count_sel)
        except Exception as e:
            # Without a review count there is nothing to work with.
            logger.exception(e)
            return {}

        page_count = self._get_page_count(review_count, divisor=10)

        # Product name comes from the first node matching the name selector.
        name_nodes = soup.select(name_selector)
        name = name_nodes[0].text.strip()
        img_url = self._amazon_image_url(soup, image_selector)
        logger.info("Finished parsing Amazon product detail page for {}".format(self.sku))

        details = {
            "product_name": name,
            "review_count": review_count,
            "page_count": page_count,
            "img_url": img_url,
        }
        return details
Example #5
0
    def get_request(self, url, init=True):
        '''
        Fetch ``url`` through the configured proxies and return its body.

        For non-initial requests (``init`` is false) the call is throttled
        first; afterwards the response is handed to the review parser and the
        URL is removed from the queue collection. All exceptions are logged
        and swallowed.

        :param url: address to request
        :param init: True for the initial request (product detail or image
            download), False for a review page taken from the queue
        :return: the response text when the response is truthy, otherwise
            None (also None when the request or the processing failed)
        '''
        try:
            response = None
            params = {
                'headers': self._set_headers(),
                'timeout': settings.get("http_timeout"),
            }
            proxies = {
                'http': settings.get("proxies").get("http"),
                'https': settings.get("proxies").get("https"),
            }
            logger.info("About to make a request on url: " + url)
            try:
                if not init:
                    t = self._get_duration()
                    logger.info('Throttling by {} second/s'.format(t))
                    time.sleep(t)
                # For local debugging without proxies, drop the ``proxies``
                # argument from this call.
                response = requests.get(url, proxies=proxies, **params)
                msg = 'New response: status_code={} url={}'
                logger.info(msg.format(response.status_code, response.url))
            except Exception as e:
                # Exceptions raised without arguments have empty ``args``;
                # fall back to the exception itself so the handler cannot
                # fail with IndexError.
                detail = e.args[0] if e.args else e
                msg = '{}: {} url={}'.format(type(e).__name__, detail, url)
                logger.exception(msg)
                if init:
                    # If exception raised while fetching the detail page,
                    # we're in trouble so need to update the status of the job
                    main._set_status(__error__, self.sku)

            if not init and response is not None:
                # parse the reviews and save them to the database
                logger.info("About to call review parser for url " + url)
                pr = parser.Parser(sku=self.sku, prod_name=self.prod_name, source=self.source)
                pr.parse(response.text)
                logger.info("Parser has finished parsing (may or may not have succeeded) url " + url)

                # remove it from the queue; Log the counts for some sanity check
                logger.info("Attempting to remove from the queue: " + url)
                before = len(list(q_db.find({"sku": self.sku})))
                q_db.delete_one({"sku": self.sku, "url": url})
                after = len(list(q_db.find({"sku": self.sku})))
                logger.info("Count in db before: {} after: {}".format(before, after))

            # NOTE(review): a requests.Response is truthy only when its
            # status code is below 400, so error responses fall through and
            # yield None even though they were passed to the parser above.
            if response:
                logger.info("HTTP status code: " + str(response.status_code))
                return response.text

        except Exception as e:
            detail = e.args[0] if e.args else e
            msg = '{}: {} url={}'.format(type(e).__name__, detail, url)
            logger.exception(msg)
        return None
Example #6
0
 def parse(self, response, init=False):
     """
     Dispatch a response to the detail parser or to the review parser.

     :param response: the content handed unchanged to the chosen parser
     :param init: True to parse a product detail page, False (default) to
         parse a review page
     :return: whatever ``_parse_detail`` returns when ``init`` is true;
         None after review parsing or when the parser raised (the exception
         is logged, not propagated)
     """
     try:
         if init:
             return self._parse_detail(response)
         self._parse_reviews(response)
     except Exception as e:
         logger.exception(e)
     return None
Example #7
0
def _workflow(decoded, url):
    """
    Run the whole data scraping, processing, and analysis in new threads.
    Each thread, beginning with this one, will make its calls in a try-except
    block. Why do it this way? Because the parent thread that launched this
    thread dies immediately. Therefore, when an exception is raised in the
    child thread, there's no one to receive it. This is bad for the client.
    The client relies on the status of the job. If an exception is raised in
    the child thread, the thread would die and the status would no longer be
    updated. This would cause the client to stall forever with a progress
    animation. If an exception is raised, we want to update the status right
    away so that the user doesn't have to wait.

    All operations down the line like scraping or some other launch their own
    child threads. Those operations also need to update the status before
    exiting.

    :param decoded: sequence holding the source, the sku and the product URL
        at indexes 0, 1 and 2
    :param url: ignored; it is overwritten by ``decoded[2]``
    :return: None
    """
    logger.info("Running a new thread for scraping and data processing")
    source = decoded[0]
    sku = decoded[1]
    url = decoded[2]
    try:
        # The initial status update and the details lookup are inside the
        # try block so that a failure in either one is reported through the
        # job status, as the docstring promises.
        _set_status(__in_queue__, sku)
        parsed = _db_product_details(sku)

        # Has the detail page been parsed?
        if not parsed:
            logger.info(
                "Detail page not available for {}. Proceeding to download...".
                format(sku))
            parsed = _get_product_details(source, url, sku)
            if not parsed:
                logger.error("Error while parsing product detail page for " +
                             sku)
                logger.error("Aborting process")
                _set_status(__error__, sku)
                return
        else:
            logger.info(
                "Detail page for {} already parsed. Skipping download...".
                format(sku))

        _set_status("Gathering data", sku)
        prod_name = parsed.get("product_name")
        review_count = parsed.get("review_count")
        page_count = parsed.get("page_count")

        # Do we have enough data to train on?
        if review_count <= config.get("misc").get("min_review_count"):
            logger.warning("Not enough reviews for " + sku)
            logger.error("Aborting process")
            _set_status("Not Enough Data", sku)
            return
        # If it's not in the queue, add it
        if not _is_in_queue(sku):
            logger.info(sku +
                        " not in queue. Checking if it's been scraped before")
            if not _reviews_scraped(sku):
                logger.info(sku +
                            " has not been scraped. Adding to the queue...")
                sc_helper.add_to_queue(source, sku, page_count)
        # If it's in the queue, scrape it
        if _is_in_queue(sku):
            logger.info(sku + " is in the queue. Launching the scraper")
            sc_helper.scrape(sku, prod_name, source)

        # If it hasn't been trained, train it
        # NOTE(review): when the sku is already trained nothing below runs,
        # so the status stays at "Gathering data" -- confirm that callers
        # never start the workflow for an already trained sku.
        if not _is_trained(sku):
            _nlp_reset(sku)
            logger.info("Starting NLP preprocessing")
            _set_status("Analyzing language", sku)
            preprocess.NLPreprocessor(sku).tokenize()
            logger.info("Finished NLP preprocessing")
            logger.info("Starting model trianing")
            _set_status("Building knowledge base", sku)
            training.Document2Vector(sku).train()
            logger.info("Finished model training")
            _update_details_db(sku)
            _set_status("Ready", sku)
    except Exception as e:
        logger.exception(e)
        _set_status(__error__, sku)
Example #8
0
    def __init_all(self):
        """
        Turn the businesses enabled through script arguments into the
        registries the engine works with.

        For every business code the builder class is validated and
        instantiated into ``self.__builders``; its pipeline, builder
        middleware and downloader middleware are then registered under the
        same business name, falling back to shared default instances when
        the component is missing, does not inherit the built-in base class,
        or fails to initialise. A business whose builder fails any
        validation step is skipped entirely.
        """

        # Start loading
        cf.print_log('开始加载所有业务模块...')

        # Default objects.
        # They all behave identically, so they are created once before the
        # loop instead of allocating a new one per business.
        pipeline = Pipeline()  # default pipeline
        builder_mw = BuilderMiddleware()  # default builder middleware
        downloader_mw = DownloaderMiddleware()  # default downloader middleware

        # 1. Validate and add the business builders.
        # The builder is the most important component of a business; if any
        # validation step fails, the business is skipped.
        # A main argument of * enables every business under the module.
        # Note: with *, the engine uses the name of every file (folder) under
        # the module as a code, so keep unrelated files (folders) out of it.
        for type_, codes in argv.main_key_dict.items():
            type_conf = p_parser[pk_main][type_]
            module_name = type_conf[pk_module]
            code_list = os.listdir(
                '%s/%s' % (business_name,
                           module_name)) if codes == '*' else codes.split(',')

            # 1.1 Validate the module
            for code in set(code_list):
                cf.print_log('加载%s模块下的%s...' % (module_name, code))
                try:
                    m = import_module('%s.%s.%s.%s' %
                                      (business_name, module_name, code, code))
                except ModuleNotFoundError:
                    logger.exception(
                        '%s为%s的模块不存在,请检查目录结构是否正确!如使用*,请不要把无关文件(夹)放在对应模块下!' %
                        (type_, code))
                    continue

                # 1.2 Validate the builder class
                try:
                    obj = getattr(m, type_conf[pk_builder])
                except AttributeError:
                    logger.exception(
                        '%s为%s的模块下没找到根据config配置的建造器类名%s,请规范写法。' %
                        (type_, code, type_conf[pk_builder]))
                    continue
                # Builder name (business name). A missing attribute is
                # treated like None so that it is reported by the checks
                # below instead of aborting the whole load.
                obj_name = getattr(obj, 'name', None)
                if not issubclass(obj, Builder):  # must inherit the built-in builder
                    logger.exception('业务建造器(%s)没有继承内置建造器,请规范写法。' % obj_name)
                    continue

                # 1.3 Validate the business name: present, a string, unique.
                # The type check runs before the membership test so that an
                # unhashable name is reported rather than raising TypeError.
                if obj_name is None:
                    logger.exception(
                        '%s为%s的模块下的建造器对象%s没有name属性,name属性识别对应业务,必须有且是唯一值!' %
                        (type_, code, type_conf[pk_builder]))
                    continue
                if not isinstance(obj_name, str):
                    logger.exception(
                        '%s为%s的模块下的建造器对象%s的name属性不为字符串类型!请规范写法。' %
                        (type_, code, type_conf[pk_builder]))
                    continue
                if obj_name in self.__builders:
                    logger.exception(
                        '%s为%s的模块下的建造器对象%s的name属性(%s)与其他业务重复,name属性识别对应业务,必须有且是唯一值!'
                        % (type_, code, type_conf[pk_builder], obj_name))
                    continue

                # 1.4 All checks passed: add the business builder
                try:
                    self.__builders[obj_name] = obj()
                except Exception as e:
                    logger.ding_exception('业务建造器(%s)初始化失败!' % obj_name, e,
                                          obj_name)
                    continue

                # 2. Add the business pipeline.
                # Pipelines and middlewares are optional add-ons; when one is
                # absent, does not inherit the matching built-in base class,
                # or fails to initialise, the default instance is used.
                try:
                    obj = getattr(m, type_conf[pk_pipeline])
                except AttributeError:
                    self.__pipelines[obj_name] = pipeline
                else:
                    if issubclass(obj, Pipeline):
                        try:
                            self.__pipelines[obj_name] = obj()
                        except Exception as e:
                            self.__pipelines[obj_name] = pipeline
                            logger.ding_exception(
                                '业务管道(%s)初始化失败,已更换成默认管道!' % obj_name, e,
                                obj_name)
                    else:
                        self.__pipelines[obj_name] = pipeline
                        logger.exception('业务管道(%s)没有继承内置管道,已更换成默认管道!请规范写法。' %
                                         obj_name)

                # 3. Add the business builder middleware
                try:
                    obj = getattr(m, type_conf[pk_builder_mw])
                except AttributeError:
                    self.__builder_mws[obj_name] = builder_mw
                else:
                    if issubclass(obj, BuilderMiddleware):
                        try:
                            self.__builder_mws[obj_name] = obj()
                        except Exception as e:
                            self.__builder_mws[obj_name] = builder_mw
                            logger.ding_exception(
                                '业务建造器中间件(%s)初始化失败,已更换成默认建造器中间件!' % obj_name,
                                e, obj_name)
                    else:
                        self.__builder_mws[obj_name] = builder_mw
                        # The business name is interpolated here, matching
                        # the pipeline message above.
                        logger.exception(
                            '业务建造器中间件(%s)没有继承内置建造器中间件,已更换成默认建造器中间件!请规范写法。' %
                            obj_name)

                # 4. Add the business downloader middleware
                try:
                    obj = getattr(m, type_conf[pk_downloader_mw])
                except AttributeError:
                    self.__downloader_mws[obj_name] = downloader_mw
                else:
                    if issubclass(obj, DownloaderMiddleware):
                        try:
                            self.__downloader_mws[obj_name] = obj()
                        except Exception as e:
                            self.__downloader_mws[obj_name] = downloader_mw
                            logger.ding_exception(
                                '业务下载器中间件(%s)初始化失败,已更换成默认下载器中间件!' % obj_name,
                                e, obj_name)
                    else:
                        self.__downloader_mws[obj_name] = downloader_mw
                        logger.exception(
                            '业务下载器中间件(%s)没有继承内置下载器中间件,已更换成默认下载器中间件!请规范写法。' %
                            obj_name)

        # Loading finished
        cf.print_log('所有业务模块加载完成!')
Example #9
0
    def __execute_request_response_item(self):
        """
        Process one follow-up request and its response.

        Takes a request from the scheduler, runs it through the downloader
        middleware and the downloader, passes the response through the
        builder middleware and the builder's parse function, and then either
        re-queues each result (``Request``) or hands it to the pipeline
        (``Item``). Every failure is logged and recorded through
        ``__statistics_lock('error')``; ``__statistics_lock('response')`` is
        called once per processed request, including on failure.

        Returns None in all cases.
        """

        # 3. Ask the scheduler for a request object.
        # If fetching the request fails, a framework-level error is reported.
        # After the error the completed-response count is still recorded
        # (otherwise, per the original author, the engine would loop forever
        # and could not shut down) and this task ends immediately.
        try:
            request = self.__scheduler.get_request()
            if request is None:  # no request available: end right away
                return
            builder_name = request.builder_name  # business name
            parse_name = request.parse  # parse function
        except Exception as e:
            self.__statistics_lock('error')
            self.__statistics_lock('response')
            logger.ding_exception(self.__f_exception, e, self.framework_key)
            return

        # 4. Call the downloader to obtain the response object
        try:
            builder = self.__builders[builder_name]  # business builder object
            downloader_mw = self.__downloader_mws[builder_name]
            request = self.__check_return(self.__check_argument(
                downloader_mw.process_request, request),
                                          right_obj=Request)  # downloader request processing
            try:
                response = self.__downloader.get_response(request)
            except Exception as e:  # download failed: hand the raw error and the request back to the builder
                result = builder.downloader_error_callback(e, request)
                if isinstance(result, Request):  # a returned request object is added to the scheduler again
                    self.__add_request(result, builder_name)
                return
            response = self.__check_return(self.__check_argument(
                downloader_mw.process_response, response),
                                           right_obj=Response)  # downloader response processing
            response.meta = request.meta  # pass information (data) from request to response

            # 5. Call the builder to parse the response object
            response = self.__check_return(self.__check_argument(
                self.__builder_mws[builder_name].process_response, response),
                                           right_obj=Response)  # builder response processing
            response_list = self.__check_return(
                self.__check_argument(self.__check_parse(builder, parse_name),
                                      response))

            # 6. Depending on the result type, add it to the scheduler or hand it to the pipeline
            for result in response_list:
                pipeline_result = None
                if isinstance(result, Request):
                    self.__add_request(result, builder_name)
                elif isinstance(result, Item):
                    pipeline_result = self.__check_argument(
                        self.__check_parse(self.__pipelines[builder_name],
                                           result.parse), result)
                    if pipeline_result is not None:  # a non-None result must also be validated as a yield generator
                        self.__check_return(pipeline_result)
                else:
                    raise TypeDifferent([Request, Item])

                # 7. After the pipeline handled the item, add whatever it returned to the
                # scheduler as requests, or finish this response task when it returned None
                if pipeline_result is not None:
                    for one_request in pipeline_result:
                        self.__add_request(one_request, builder_name)

        # 8. One response completed: record it.
        # Whether processing succeeded or failed, the response must be recorded;
        # otherwise, per the original author, the engine would hang forever.
        except FaultReturn:
            self.__statistics_lock('error')
            logger.exception(self.__fr_warning.format(builder_name,
                                                      parse_name))
        except TypeDifferent:
            self.__statistics_lock('error')
            logger.exception(self.__td_warning.format(builder_name))
        except ArgumentNumError:
            self.__statistics_lock('error')
            logger.exception(self.__ane_warning.format(builder_name))
        except ParameterError:
            self.__statistics_lock('error')
            logger.exception(self.__pe_warning.format(builder_name))
        except ParseUnExist:
            self.__statistics_lock('error')
            logger.exception(self.__pue_warning.format(builder_name))
        except Exception as e:
            self.__statistics_lock('error')
            logger.ding_exception(self.__b_warning.format(builder_name), e,
                                  builder_name)
        finally:
            self.__statistics_lock('response')
Example #10
0
    def __start_request(self, request_type):
        """
        Handle the initial requests of one engine start round.

        For every loaded builder the matching start function is validated
        and called, and each request it yields is added to the scheduler.
        When a builder has no such function, yields nothing, or fails, a
        placeholder request is queued instead. If no builder provided the
        function at all, ``self.__while_run`` is set to False.

        :param request_type: (type=str) kind of round the engine is starting
        """

        def queue_placeholder(target_builder):
            # Easter-egg request that gives the engine a chance to shut down
            # when a builder contributes no real request.
            self.__add_request(Request('test', parse='_funny'),
                               target_builder)

        # 1. Call the builder to obtain its list of request objects
        found_task = False  # whether any builder had a task for this round
        for builder_name, builder in self.__builders.items():
            # Marks the generic game-data collection flow
            use_auto_flow = bool(builder.auto_gc
                                 and request_type == 'start_requests')
            try:
                try:
                    if use_auto_flow:
                        producer = builder.auto_game_collection
                    else:  # everything else follows the normal flow
                        producer = getattr(builder, request_type)
                    start_list = self.__check_return(
                        self.__check_argument(producer))
                except AttributeError:
                    queue_placeholder(builder_name)
                    continue
                else:
                    found_task = True

                # 2. Add the request objects to the scheduler
                queued = 0
                for start_request in start_list:
                    queued += 1
                    self.__add_request(start_request, builder_name)

                # If every builder's start function yields nothing, the
                # engine would hang; an empty list therefore gets a
                # placeholder request as well.
                if queued == 0:
                    queue_placeholder(builder_name)

            # Error handling
            except FaultReturn:  # yield validation failed
                queue_placeholder(builder_name)
                failed_name = ('auto_game_collection'
                               if use_auto_flow else request_type)
                logger.exception(
                    self.__fr_warning.format(builder_name, failed_name))
            except TypeDifferent:  # type validation failed
                queue_placeholder(builder_name)
                logger.exception(self.__td_warning.format(builder_name))
            except ArgumentNumError:  # function argument validation failed
                queue_placeholder(builder_name)
                logger.exception(self.__ane_warning.format(builder_name))
            except CheckUnPass:  # start validation failed
                queue_placeholder(builder_name)
                logger.exception(self.__cu_warning.format(builder_name))
            except Exception as e:  # any other business-level error
                queue_placeholder(builder_name)
                logger.ding_exception(self.__b_warning.format(builder_name), e,
                                      builder_name)

        # 3. After visiting every builder: with no task in this round, mark
        # the engine so that it is not started again
        if not found_task:
            self.__while_run = False