Ejemplo n.º 1
0
def page_url_generator(conf_dict, global_dict, results):
    """Build one JSON-encoded request descriptor per result page.

    ``results[0]`` is a dict of query parameters that also carries the total
    record count under ``conf_dict['count_key']``.  The count is removed from
    the parameters, merged with the optional ``static_params`` and then
    replaced, for every page, by the page offset ``i * index_count``.

    :param conf_dict: step configuration; reads ``count_key``, ``start_page``,
        ``page_count``, ``index_count``, ``url`` and, optionally,
        ``static_params``.
    :param global_dict: shared context; only ``global_dict['log']`` is used.
    :param results: sequence whose first item is the parameter dict.  It is
        not modified.
    :return: list of JSON strings of the form
        ``{"url": ..., "params": {...}}``, one per page.
    :raises KeyError: if the count key is missing from ``results[0]``.
    :raises Exception: if the count cannot be converted to ``int``.
    """
    log = global_dict['log']
    count_key = conf_dict['count_key']

    # Work on a copy so the caller's result dict is left untouched.
    page_params = dict(results[0])
    count_temp = page_params.pop(count_key)
    try:
        count = int(count_temp)
    except (TypeError, ValueError) as e:
        log.error(u'页数转int错误\n%s', traceinfo(e))
        time.sleep(1)
        raise Exception('invalid page count: %r' % (count_temp,))

    # Static parameters first so that values from the result override them.
    base_params = dict(conf_dict.get('static_params') or {})
    base_params.update(page_params)

    # Floor division keeps the bound an int on every Python version.
    last_page = count // conf_dict['page_count'] + 1

    page_urls = []
    for i in range(conf_dict['start_page'], last_page):
        params_all = dict(base_params)
        params_all[count_key] = i * conf_dict['index_count']
        url_dict = json.dumps({'url': conf_dict['url'], 'params': params_all})
        page_urls.append(url_dict)
        log.debug(url_dict)

    return page_urls
Ejemplo n.º 2
0
def record_success(pinyin, yzm, img_path, holder, count=10000):
    """
    Archive a successfully recognised captcha image together with its answer.

    The image is copied into ``../yzm_success/<pinyin>/`` under a unique
    ``<uuid1>.jpg`` name and a ``"<image name> <yzm>"`` line is appended to
    ``ans.txt`` in the same directory.  Nothing is stored once the directory
    tree already holds more than ``count`` files.  Any failure is logged
    through ``holder.logging`` and swallowed (best effort).

    :param pinyin: name of the sub-directory under ``yzm_success``
    :param yzm: captcha text that was accepted
    :param img_path: path of the captcha image to copy
    :param holder: object exposing a ``logging`` logger
    :param count: maximum number of stored files, 10000 by default
    :return: (None)
    """
    try:
        dir_path = os.path.abspath('../')
        yzm_dir = os.path.join(dir_path, "yzm_success", pinyin)
        if not fileutil.isdir(yzm_dir):
            # Create the target directory on first use.
            fileutil.mkdirs(yzm_dir)
        pics = sum(len(files) for _, _, files in os.walk(yzm_dir))
        # The walk also counts ans.txt once it exists, hence the -1; clamp at
        # zero so a freshly created directory does not report "-1" images.
        holder.logging.info("已存放%d张验证码图片" % max(pics - 1, 0))
        if pics > count:
            holder.logging.warn("已存放超%d张验证码图片,不再存储" % count)
            return
        # Unique file name for the captcha image.
        img = "%s.jpg" % str(uuid.uuid1())
        # Record which answer belongs to which image; the context manager
        # closes the file even if the write fails.
        text_file_name = os.path.join(yzm_dir, "ans.txt")
        with open(text_file_name, "a") as answers:
            answers.write(img + ' ' + yzm + '\n')
        # Store the captcha image itself.
        img_name = os.path.join(yzm_dir, img)
        fileutil.copyfile(img_path, img_name)
    except Exception as e:
        holder.logging.error(u"记录发生异常.错误信息:%s" % exceptutil.traceinfo(e))
Ejemplo n.º 3
0
def run_function(function, args, log=None):
    '''
    Call ``function`` with the entries of ``args`` that match its parameters.

    The parameter names of ``function`` are read by introspection; every name
    that is also a key of ``args`` is passed as a keyword argument.  Passing
    by keyword (instead of evaluating a positional argument string) keeps
    each value bound to its own parameter even when an earlier, defaulted
    parameter is absent from ``args``.

    :param function: the callable to run; anything rejected by
        ``is_function`` yields ``None``
    :param args: the args dictionary, only accept the string as the key value
    :param log: optional log object
    :return: the function's return value, or ``None`` if ``function`` is not
        callable per ``is_function`` or if anything raised (the error is
        logged when ``log`` is given)
    '''
    if not is_function(function):
        return None
    try:
        # getargspec no longer exists on recent Python 3 releases; prefer
        # getfullargspec whenever the interpreter provides it.
        get_spec = getattr(inspect, 'getfullargspec', None) or inspect.getargspec
        varnames = get_spec(function).args
        call_kwargs = {}
        for varname in varnames:
            # Skip the implicit ``self`` of instance methods.
            if varname == 'self':
                continue
            # Parameters missing from the intermediate results keep their
            # default value, e.g. page_no.
            if varname not in args:
                continue
            call_kwargs[varname] = args[varname]
        if log:
            log.info("使用参数 %s 执行方法 %s" %
                     (','.join(varnames),
                      getattr(function, '__name__', repr(function))))
        return function(**call_kwargs)
    except Exception as e:
        if log:
            log.error(traceinfo(e))
        return None
Ejemplo n.º 4
0
def parse_yzm(img_url,
              img_src,
              typecode,
              yzm_max_len=4,
              type=None,
              holder=None):
    """
    Recognise a captcha, by machine when ``type`` is given and manually
    otherwise.

    The image bytes are written to ``./yzm/<host>_<pid>_<holder.pinyin>.png``
    and then handed either to ``holder.recChar`` (created on first use) or to
    ``parseYzmManual``.

    NOTE(review): ``holder`` defaults to None but is dereferenced
    unconditionally (including in the error handler), so callers effectively
    have to pass it.  ``yzm_max_len`` is not used by this function.

    :param img_url:  URL of the captcha image (only its host name is used)
    :param img_src:  content of the captcha image
    :param typecode: passed through to ``parseYzmManual``
    :param yzm_max_len:  maximum captcha length
    :param type: recogniser type passed to ``RecChar``; None or empty means
        manual recognition
    :param holder: object providing ``pinyin``, ``logging`` and ``recChar``
    :return: (unicode,unicode,bool,RecChar,unicode)(captcha text,
        recognition-system id, whether normal, recogniser object, captcha
        image path) on the machine path; otherwise whatever
        ``parseYzmManual`` returns
    :raises Exception: any failure is logged and re-raised wrapped in a
        plain ``Exception``
    """
    img_path = None
    try:
        # Reject payloads that are too small or larger than 1 MiB.
        if len(img_src) <= 50 or len(img_src) > 1024 * 1024:
            raise Exception(u'img_src len error!')
        if not os.path.exists('yzm'):
            os.mkdir('yzm')
        pid = str(os.getpid())
        dir_path = os.path.abspath('.')
        urlpret = urlparse.urlparse(img_url)
        # The file name combines host, process id and holder.pinyin.
        img_path = os.path.join(
            dir_path, 'yzm',
            "%s_%s.png" % (urlpret.hostname, pid + '_' + holder.pinyin))
        print "img_path:", img_path, "type:", type
        fileutil.write(img_path, img_src)
        holder.logging.info(u"请求验证码")
        # Send to the captcha-solving company, or use machine recognition.
        if type != None and len(type) > 0:
            # Create the recogniser lazily and keep it on the holder.
            if holder.recChar == None:
                holder.recChar = RecChar(type=type, log=holder.logging)
            ret = holder.recChar.rec(img_path)
            yzm = None
            if ret != None and len(ret) > 0:
                # Only the first element of the recogniser result is used.
                yzm = str(ret[0])
                print "yzm:", yzm
                if chardet.detect(yzm)['encoding'] == "utf-8":
                    yzm = yzm.decode("utf-8")
                # A literal "none" answer (any case) counts as no answer.
                if yzm != None and yzm.lower() == "none":
                    yzm = None
                holder.logging.info("机器打码结果:yzm=%s" % (yzm if yzm else ''))
            else:
                holder.logging.info("机器打码结果:yzm为None或长度为0.")
            if yzm and yzm != '-9999':
                return yzm, "0", False, holder.recChar, img_path
            # '-9999' is treated as a hard failure; an empty answer falls
            # back to manual recognition.
            if yzm == '-9999':
                raise Exception(u'yzm -9999 error...')
            else:
                return parseYzmManual(img_path, typecode, holder)
        else:
            return parseYzmManual(img_path, typecode, holder)
    except Exception as e1:
        holder.logging.error(u"验证码处理异常,error:%s" % exceptutil.traceinfo(e1))
        # The original exception is wrapped, so callers only ever see the
        # base Exception type.
        raise Exception(e1)
Ejemplo n.º 5
0
 def error_handler(self, e, error_message=None):
     """
     Central exception handler.

     Stores ``error_message`` on ``self.error_prompt_message`` and writes it,
     followed by the trace information of ``e``, to the holder's error log.

     :param e: the exception being handled
     :param error_message: prompt message describing the error
     :return: None
     """
     self.error_prompt_message = error_message
     report = self.holder.logging.error
     details = (self.error_prompt_message, exceptutil.traceinfo(e))
     report(u"%s:%s" % details)
Ejemplo n.º 6
0
def json_parse(conf_dict, global_dict, html):
    """Decode ``html`` as JSON and extract ``conf_dict['key']`` from it.

    :param conf_dict: step configuration; ``conf_dict['key']`` is handed to
        ``recursive_keys``.
    :param global_dict: shared context; only ``global_dict['log']`` is used.
    :param html: JSON text to decode.
    :return: whatever ``recursive_keys`` returns for the decoded document.
    :raises Exception: (without message) if ``html`` cannot be decoded; the
        cause is logged and the call pauses for one second first.
    """
    logger = global_dict['log']

    try:
        decoded = json.loads(html)
    except Exception as err:
        logger.error(u'页数text转json失败 \n%s', traceinfo(err))
        time.sleep(1)
        raise Exception()
    else:
        return recursive_keys(decoded, conf_dict['key'], logger)
Ejemplo n.º 7
0
 def getPageNo(self, html, xpath_pattern):
     """
     Read the page number out of a pager such as "1/12".

     ``&nbsp;``, ``<<``, ``>>`` and newlines are stripped from ``html``, the
     nodes selected by ``xpath_pattern`` are joined into one string, and the
     part after the last "/" is converted to ``int``.

     :param html: page source
     :param xpath_pattern: XPath selecting the text pieces of the pager
     :return: the parsed number, or 1 when the pager text is empty or
         anything fails (failures are logged as warnings)
     """
     try:
         cleaned = html
         for noise in ('&nbsp;', '<<', '>>', '\n'):
             cleaned = cleaned.replace(noise, '')
         nodes = etree.HTML(cleaned).xpath(xpath_pattern)
         pager_text = ''.join(nodes).strip()
         if pager_text:
             return int(pager_text.split("/")[-1])
     except Exception as e:
         self.holder.logging.warning(traceinfo(e))
     return 1
Ejemplo n.º 8
0
def bbd_yzm(img_src, holder=None):
    """
    Recognise a captcha by posting the image to the spider7 service.

    The service reply is expected to consist of exactly two
    whitespace-separated tokens: the captcha text and an image name.

    :param img_src: image content, uploaded as the ``files`` form field
    :param holder: object providing ``pinyin`` (sent as the ``type`` form
        field) and ``logging``.  Without a holder the request cannot be
        built, so the failure tuple is returned.
    :return: ``(yzm, img_name, "no erro", img_name)`` on success,
        ``(None, None, None, None)`` on any failure
    """
    try:
        response = post('http://spider7:5678/form',
                        files={'files': img_src},
                        data={'type': holder.pinyin})
        response.encoding = 'utf-8'
        parts = response.content.split()
        # Explicit check instead of ``assert`` so that validation survives
        # running the interpreter with -O.
        if len(parts) != 2:
            raise ValueError('unexpected captcha service reply: %r' % (parts,))
        yzm, img_name = parts
        print("%s %s" % (yzm, img_name))
        return yzm, img_name, "no erro", img_name
    except Exception as e:
        # ``holder`` is optional: never let the error report itself raise.
        if holder is not None:
            holder.logging.error(exceptutil.traceinfo(e))
        return None, None, None, None
Ejemplo n.º 9
0
    def request(self, url, method='GET', **kwargs):
        """
        Download module: build a request, send it through the session
        ``self.ss`` and retry on failure.

        Only the keyword arguments listed below are read; the timeout is not
        configurable here (30 seconds while ``self.retry_flag`` is set, 120
        seconds otherwise).

        :param url: URL for the request.
        :param method: HTTP method, upper-cased before use.
        :param params: (optional) query-string parameters.
        :param data: (optional) request body.
        :param headers: (optional) HTTP headers.  When the key is absent, a
            ``Host`` header derived from ``url`` and ``self.ua`` as
            ``User-Agent`` are sent.
        :param ua: (optional) User-Agent overriding the one in ``headers``.
        :param use_proxy: (optional, default True) pass ``False`` to bypass
            ``self.proxy`` for this call.
        :param allow_redirects: (optional, default True) follow redirects.
        :type allow_redirects: bool
        :param verify: (optional, default False) SSL verification setting.
        :param stream: (optional, default False) stream the response body.
        :param cert: (optional) client certificate setting.
        :param retry: (optional) number of retries after the first attempt.
            When omitted, 30 attempts are made while ``self.retry_flag`` is
            set and 2 otherwise; zero or negative values mean one attempt.
        :return: the response when its status is 200 or is listed in
            ``self.status_code_ok``.
        :raises DownLoaderException: when every attempt failed.  ``exception``
            holds the concatenated error texts, ``res`` the response of the
            last failed attempt (``None`` if none was received) and
            ``time_out`` is True only when no response was received.
        """
        self.outputLog("DownLoader内部,进入request方法", "debug")
        self.mail()

        use_proxy = kwargs.get('use_proxy', True)
        params = kwargs.get('params')
        data = kwargs.get('data')
        if 'headers' in kwargs:
            headers = kwargs['headers']
        else:
            headers = {
                "Host": urlparse.urlparse(url).netloc,
                "User-Agent": self.ua
            }
        ua = kwargs.get('ua')
        if ua:
            # Copy before overriding so the caller's dict is not modified and
            # an explicit ``headers=None`` does not break the override.
            headers = dict(headers or {})
            headers["User-Agent"] = ua
        allow_redirects = kwargs.get('allow_redirects', True)
        verify = kwargs.get('verify', False)
        stream = kwargs.get('stream', False)
        cert = kwargs.get('cert')
        de = DownLoaderException()
        exception = ''

        # Number of attempts: ``retry`` retries plus the initial try.
        requested_retry = kwargs.get('retry')
        if requested_retry is None:
            attempts = 30 if self.retry_flag else 2
        elif requested_retry > 0:
            attempts = requested_retry + 1
        else:
            attempts = 1

        if self.retry_flag == True:
            timeout = 30
        else:
            timeout = 120

        while attempts > 0:
            attempts -= 1
            # Reset per attempt so the error handler can tell "no response"
            # apart from "response with an error status".
            resp = None
            try:
                self.outputLog("使用代理 %s" % self.proxy)
                req = Request(method.upper(),
                              url,
                              data=data or {},
                              headers=headers or {},
                              params=params or {})
                self.outputLog("DownLoader内部,即将prepare request", "debug")
                prepped = self.ss.prepare_request(req)
                self.outputLog("DownLoader内部,即将调用session.send", "debug")
                if use_proxy == False:
                    self.outputLog('该步代理设置为空')
                    proxies = {}
                elif self.proxy:
                    proxies = {
                        'http': 'http://' + self.proxy,
                        'https': 'http://' + self.proxy
                    }
                else:
                    proxies = {}
                settings = self.ss.merge_environment_settings(
                    prepped.url, proxies, stream, verify, cert)
                send_kwargs = {
                    'timeout': timeout,
                    'allow_redirects': allow_redirects,
                }
                send_kwargs.update(settings)
                resp = self.ss.send(prepped, **send_kwargs)
                self.outputLog("DownLoader内部,从session.send返回", "debug")
                if resp.status_code == requests.codes.ok:
                    self.correct_http += 1
                    self.retry_flag = False
                    self.outputLog("DownLoader内部,调用成功,即将从request方法返回", "debug")
                    return resp
                if resp.status_code in self.status_code_ok:
                    return resp
                self.error_http += 1
                # 5xx means the server is in trouble, so back off for a
                # minute before the next attempt.
                if 500 <= resp.status_code < 600:
                    self.outputLog(u'返回码为%d,休眠一分钟' % resp.status_code)
                    time.sleep(60)
                resp.raise_for_status()

            except Exception as e:
                self.outputLog("获取页面内容异常:%s" % traceinfo(e), "error")
                exception += str(e)
                de.exception = exception
                # Identity test on purpose: a response with an error status
                # is falsy but still means the server answered.
                de.res = resp
                de.time_out = resp is None
                if self.retry_flag:
                    self.proxy = self.proxySet

        if self.proxy:
            self.proxy = self.proxySet

        self.outputLog("DownLoader内部,调用失败,即将从request方法返回,同时抛出异常", "debug")
        raise de
Ejemplo n.º 10
0
def request(downloader, **kwargs):
    """
    Fetch a page through ``downloader`` and wrap the outcome in a
    ``WebContent``.

    :param downloader: the downloader whose ``request`` method is called
    :param url: target URL
    :param headers: request headers (the passed dict is not modified)
    :param method: HTTP method
    :param data: request body
    :param encoding: when set, assigned to the response before its raw
        content is stored as the body
    :param ua: User-Agent, added to the headers unless they already carry
        one
    :param is_pic: when true, store the raw response content (image data)
    :param use_proxy: whether to use the proxy
    :param holder: optional holder of non-business objects; only its
        ``logging`` is used
    :param accept_code: when set, passed to ``downloader.setNotRaise``
    :return: a ``WebContent``.  On success it carries body, status code,
        reason and elapsed seconds; after a ``DownLoaderException`` it
        carries the status (403 when the error text mentions it, 800
        otherwise, or the real one when a response was received), the
        time-out flag and ``WebAccessType.EXCEPTION``.
    """
    start_time = time.time()
    url = kwargs.get('url')
    headers = kwargs.get('headers')
    method = kwargs.get('method')
    data = kwargs.get('data')
    encoding = kwargs.get('encoding')
    ua = kwargs.get('ua')
    is_pic = kwargs.get('is_pic')
    use_proxy = kwargs.get('use_proxy')
    holder = kwargs.get('holder')
    accept_code = kwargs.get('accept_code')

    web = WebContent(url=url,
                     headers=headers,
                     method=method,
                     data=data,
                     encoding=encoding,
                     use_proxy=use_proxy)
    try:
        # Set the User-Agent on a copy, leaving the caller's dict untouched.
        if ua:
            headers = dict(headers or {})
            headers.setdefault("User-Agent", ua)
        if accept_code:
            downloader.setNotRaise(accept_code)
        if holder:
            holder.logging.info(u"开始调用download获取页面内容")
        response = downloader.request(url=url,
                                      headers=headers,
                                      method=method,
                                      data=data,
                                      use_proxy=use_proxy)
        if holder:
            holder.logging.info(u"通过download获取页面内容结束")
        if is_pic:
            web.body = response.content
        elif encoding:
            response.encoding = encoding
            web.body = response.content
        else:
            web.body = response.text
        web.status_code = response.status_code
        # Keep the reason in case of an error status.
        web.reason = response.reason
        # Elapsed time in seconds.  ``microseconds`` is only the sub-second
        # component of the timedelta, so use total_seconds().
        web.elapsed = response.elapsed.total_seconds()
        return web
    except DownLoaderException as e:
        # ``holder`` is optional here just as in the success path.
        if holder:
            holder.logging.error(exceptutil.traceinfo(e))
        # Record the web status and whether the request timed out.  Compare
        # with None explicitly: an error response evaluates as false.
        if e.res is not None:
            web.status_code = e.res.status_code
            web.reason = e.res.reason
            web.elapsed = e.res.elapsed.total_seconds()
        elif '403' in e.exception:
            web.status_code = 403
        else:
            web.status_code = 800
        web.time_out = e.time_out
        web.access_type = WebAccessType.EXCEPTION
        if not web.elapsed:
            web.elapsed = time.time() - start_time
        return web
Ejemplo n.º 11
0
def _xpath_select(tree, xpath):
    """Evaluate ``xpath`` on ``tree`` and return the list of matches.

    For paths ending in ``/text()`` (except ``.//script/text()``) the owning
    elements are selected instead and each contributes the concatenation of
    all of its descendant text nodes.
    """
    if xpath.endswith('/text()') and xpath != './/script/text()':
        return [
            ''.join(x.xpath('.//text()'))
            for x in tree.xpath(xpath.replace('/text()', ''))
        ]
    return tree.xpath(xpath)


def xpath_parse_single(results, xpath, index, log):
    """Apply ``xpath`` to ``results`` and pick the part chosen by ``index``.

    ``results`` may be markup text (parsed with ``etree.HTML``), an object
    with an ``xpath`` method, or a list of such objects.  ``xpath`` and
    ``index`` may be lists as well; lists are processed element-wise and
    produce a list of results.

    ``index`` selects from the matches:

    * ``None`` -- all matches, as a list;
    * ``"start:end"`` -- matches ``start`` up to, but excluding, ``end``; a
      negative ``end`` counts from the end, with ``-1`` meaning "through the
      last match".  Byte and unicode strings are both accepted;
    * ``'texts'`` / ``'all_texts'`` -- delegate to ``gettexts`` /
      ``get_all_text``;
    * ``'raw'`` -- the first match serialised with ``etree.tostring``;
    * a negative int -- match counted from the end, ``''`` if nothing
      matched;
    * any other int -- that match, ``''`` if nothing matched.

    :param results: text, tree or list of trees to query
    :param xpath: XPath expression or list of expressions
    :param index: selector as described above (or a list of selectors)
    :param log: logger used for debug and error output
    :return: the selected value(s); ``''`` when evaluation fails (the error
        is logged and the call pauses for one second)
    """
    try:
        text_types = (str, unicode)  # Python 2: byte and unicode strings
    except NameError:
        text_types = (str,)  # Python 3: unicode no longer exists

    log.debug('xpath=%s index=%s', xpath, index)

    if isinstance(results, text_types):
        tree = etree.HTML(results)
    else:
        tree = results

    # Lists of expressions and/or trees are handled element-wise.
    if isinstance(xpath, list) and isinstance(tree, list):
        return [
            xpath_parse_single(tree[i], xpath[i], index[i], log)
            for i in range(len(xpath))
        ]
    if isinstance(xpath, list):
        return [
            xpath_parse_single(tree, xpath[i], index[i], log)
            for i in range(len(xpath))
        ]
    if isinstance(tree, list):
        return [xpath_parse_single(node, xpath, index, log) for node in tree]

    returns = None
    try:
        log.debug(u'xpath is %s' % (xpath))
        if index is None:
            raw = _xpath_select(tree, xpath)
            returns = []
            log.debug(u'result of xpath is:')
            for r in raw:
                log.debug(u'%s', r)
                returns.append(r)
        elif isinstance(index, text_types) and ':' in index:
            raw = _xpath_select(tree, xpath)
            returns = []
            log.debug(u'result of xpath is:')
            bounds = index.split(':')
            start_index = int(bounds[0])
            end_index = int(bounds[1])
            if end_index < 0:
                # -1 maps to len(raw), i.e. the last match is included.
                end_index = len(raw) + end_index + 1
            for i in range(start_index, end_index):
                log.debug(u'%s', raw[i])
                returns.append(raw[i])
        elif index == 'texts':
            returns = gettexts(tree, xpath)
            log.debug(u'result of xpath is:%s', returns)
        elif index == 'all_texts':
            returns = get_all_text(tree, xpath)
            log.debug(u'result of xpath is:%s', returns)
        elif index == 'raw':
            raw = tree.xpath(xpath)[0]
            # unicode on Python 2, str on Python 3 -- the text type of the
            # running interpreter.
            returns = etree.tostring(raw, encoding=text_types[-1])
            log.debug(u'result of xpath is:%s', returns)
        elif index < 0:
            raw = tree.xpath(xpath)
            log.debug(u'result of xpath is:')
            if len(raw) < 1:
                returns = ''
            else:
                returns = raw[len(raw) + index]
                log.debug(returns)
        else:
            raw = tree.xpath(xpath)
            if raw:
                log.debug(u'result of xpath is:%s', raw[index])
                returns = raw[index]
            else:
                returns = ""
    except Exception as e:
        log.error(u'xpath路径%s错误\n%s', xpath, traceinfo(e))
        time.sleep(1)
        returns = ''

    return returns