def get_json_by_requests(url,
                         params=None,
                         headers=None,
                         data=None,
                         proxies=None):
    '''
    @summary: request a URL and decode the response body as JSON
              (POST when data is given, otherwise GET)
    ---------
    @result: the parsed JSON (dict), or {} on failure
    '''
    json_data = {}
    response = None
    try:
        if data:
            response = requests.post(url,
                                     headers=headers,
                                     data=data,
                                     params=params,
                                     timeout=TIME_OUT,
                                     proxies=proxies)
        else:
            response = requests.get(url,
                                    headers=headers,
                                    params=params,
                                    timeout=TIME_OUT,
                                    proxies=proxies)
        response.encoding = 'utf-8'
        json_data = response.json()
    except Exception as e:
        log.error(e)
    finally:
        response and response.close()

    return json_data
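
# Usage sketch (added for illustration, not from the original source): the URL
# and parameters are hypothetical, and the call assumes this module's TIME_OUT
# and log are configured as in the functions above.
def _example_get_json_by_requests():
    result = get_json_by_requests('https://httpbin.org/get',
                                  params={'q': 'python'},
                                  headers={'User-Agent': 'Mozilla/5.0'})
    # An empty dict is returned when the request or JSON decoding fails
    print(result)
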
def get_html_by_requests(url,
                         headers=None,
                         code='utf-8',
                         data=None,
                         proxies=None,
                         max_retries=1):
    html = None
    r = None
    if not url.lower().endswith('.exe'):
        s = requests.Session()
        # max_retries counts the first attempt, so the adapter retries one less
        s.mount('http://', HTTPAdapter(max_retries=max_retries - 1))
        s.mount('https://', HTTPAdapter(max_retries=max_retries - 1))
        try:
            if data:
                r = s.post(url,
                           headers=headers,
                           timeout=TIME_OUT,
                           data=data,
                           proxies=proxies)
            else:
                r = s.get(url,
                          headers=headers,
                          timeout=TIME_OUT,
                          proxies=proxies)

            if code:
                r.encoding = code
            html = r.text

        except Exception as e:
            log.error(e)
        finally:
            r and r.close()
            s.close()

    # Drop empty or oversized (>1 MB) pages; also return the response so
    # callers can inspect headers or the status code
    return (html if html and len(html) < 1024 * 1024 else None), r
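
# Usage sketch (added for illustration): get_html_by_requests returns a
# (html, response) tuple; html is None when the request fails or the body
# exceeds 1 MB. The URL below is hypothetical.
def _example_get_html_by_requests():
    html, response = get_html_by_requests('https://example.com',
                                          headers={'User-Agent': 'Mozilla/5.0'},
                                          max_retries=3)
    if html:
        print(html[:200])
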
def get_html_by_urllib(url, code='utf-8', headers=None, proxies=None):
    # headers is accepted for interface symmetry but not used by the urllib path
    html = None
    if not url.lower().endswith('.exe'):
        page = None
        is_timeout = False
        try:

            def timeout_handler(response):
                # Closes the connection if read() hangs past TIMER_TIME
                nonlocal is_timeout
                is_timeout = True
                if response:
                    response.close()

            if proxies:
                proxy_support = request.ProxyHandler(proxies)
                opener = request.build_opener(proxy_support)
                page = opener.open(quote(url, safe='/:?=&'), timeout=TIME_OUT)
            else:
                page = request.urlopen(quote(url, safe='/:?=&'),
                                       timeout=TIME_OUT)

            # Watchdog timer so a stalled read() cannot block forever
            t = Timer(TIMER_TIME, timeout_handler, [page])
            t.start()
            html = page.read().decode(code, 'ignore')
            t.cancel()

        except Exception as e:
            log.error(e)
        finally:
            if page and not is_timeout:
                page.close()

    # Drop empty or oversized (>1 MB) pages
    return html if html and len(html) < 1024 * 1024 else None
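
# Usage sketch (added for illustration): the urllib variant returns the decoded
# page body or None; proxies use the urllib ProxyHandler mapping form. The
# proxy address below is hypothetical.
def _example_get_html_by_urllib():
    html = get_html_by_urllib('https://example.com',
                              code='utf-8',
                              proxies={'http': 'http://127.0.0.1:8888'})
    print(bool(html))
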
def get_json_obj_by_requests(url, headers=None, proxies=None):
    json_obj = {}
    response = None
    try:
        response = requests.get(url,
                                headers=headers,
                                timeout=TIME_OUT,
                                proxies=proxies)
        response.encoding = 'utf-8'
        json_obj = json.loads(response.text)
    except Exception as e:
        log.error(e)
    finally:
        response and response.close()

    return json_obj
def get_tag(html, name=None, attrs=None, find_all=True):
    try:
        if html:
            # Accept either raw HTML or an existing BeautifulSoup node
            soup = BeautifulSoup(html, "html.parser") if isinstance(
                html, str) else html
            result = soup.find_all(name, attrs or {}) if find_all else soup.find(
                name, attrs or {})
            return result if result else []
        else:
            return []
    except Exception as e:
        log.error(e)
        return []
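
# Usage sketch (added for illustration): get_tag accepts raw HTML or an
# existing BeautifulSoup node and returns a list of matching tags (a single
# tag when find_all=False).
def _example_get_tag():
    html = '<div><a class="link" href="/a">A</a><a class="link" href="/b">B</a></div>'
    for link in get_tag(html, name='a', attrs={'class': 'link'}):
        print(link['href'], link.get_text())
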
def get_json(json_str):
    '''
    @summary: parse a JSON string
    ---------
    @param json_str: a string in JSON format
    ---------
    @result: the parsed JSON object, or {} on failure
    '''

    try:
        return json.loads(json_str) if json_str else {}
    except Exception as e:
        log.error(e)
        return {}
def del_file(path, ignore=[]):
    files = get_file_list(path, ignore)
    for file in files:
        try:
            os.remove(file)
        except Exception as e:
            log.error('''
                failed to delete: %s
                Exception : %s
                ''' % (file, str(e)))
        else:
            log.debug(file + " deleted")
def is_html(url):
    if not url:
        return False

    try:
        with request.urlopen(url) as page:
            content_type = page.info().get('Content-Type', '')
        return 'text/html' in content_type
    except Exception as e:
        log.error(e)
        return False
    def delete(self, table, condition={}):
        '''
        @summary: delete documents
        ---------
        @param table: collection name
        @param condition: delete condition; {} deletes everything
        ---------
        @result: True / False
        '''
        try:
            self._db[table].remove(condition)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True
    def add(self, table, key_value):
        '''
        @summary: insert a document; the collection is created automatically
                  if it does not exist
        ---------
        @param table: collection name
        @param key_value: the value to insert, as a dict
        ---------
        @result: True / False
        '''

        try:
            self._db[table].save(key_value)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True
    def inner_run_safe_model(func):
        try:

            @functools.wraps(func)  # keep the wrapped function's metadata
            def run_func(*args, **kw):
                callfunc = ''
                try:
                    callfunc = func(*args, **kw)
                except Exception as e:
                    log.error(module_name + ": " + func.__name__ + " - " +
                              str(e))
                return callfunc

            return run_func
        except Exception as e:
            log.error(module_name + ": " + func.__name__ + " - " + str(e))
            return func
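
# Note (added for illustration): inner_run_safe_model is the inner layer of
# what appears to be a decorator factory closing over module_name, presumably
# along the lines of
#
#     def run_safe_model(module_name):
#         def inner_run_safe_model(func):
#             ...
#         return inner_run_safe_model
#
# so that it can be applied as @run_safe_model('some_module') to swallow and
# log exceptions raised by the wrapped function. The factory name and usage
# here are an assumption; they are not shown in this snippet.
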
def read_file(filename, readlines=False, encoding='utf-8'):
    '''
    @summary: read a file
    ---------
    @param filename: file name (with path)
    @param readlines: read line by line (default False)
    ---------
    @result: a list of lines when readlines is True, otherwise a string
    '''

    content = ''
    try:
        with open(filename, 'r', encoding=encoding) as file:
            content = file.readlines() if readlines else file.read()
    except Exception as e:
        log.error(e)

    return content
    def update(self, table, old_value, new_value, multi=True):
        '''
        @summary: update documents
        ---------
        @param table: collection name
        @param old_value: query matching the documents to update
        @param new_value: new field values
        @param multi: update all matching documents (default True)
        ---------
        @result: True / False
        '''

        try:
            self._db[table].update(old_value, {'$set': new_value}, multi=multi)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True
    def add(self, table, data, data_id=None, doc_type=''):
        '''
        @summary: index a document
        ---------
        @param table: index name
        @param data: the document, as JSON/dict
        @param doc_type: document type; empty means the index name is used.
            doc_type can distinguish documents that share a structure but differ
            in meaning, e.g. for a url index, doc_type could be the site name
        @param data_id: if omitted, an id is generated; if it already exists,
            the document is updated
        ---------
        @result: True / False
        '''
        try:
            table = table.lower()
            self._es.index(index=table,
                           doc_type=doc_type or table,
                           id=data_id,
                           body=data)
        except Exception as e:
            log.error(e)
            return False
        else:
            return True
def dumps_json(json_):
    '''
    @summary: pretty-print JSON (for logging/debugging)
    ---------
    @param json_: a JSON-formatted string or a JSON object
    ---------
    @result: the formatted string
    '''
    try:
        if isinstance(json_, str):
            json_ = get_json(json_)

        json_ = json.dumps(json_, ensure_ascii=False, indent=4, skipkeys=True)

    except Exception as e:
        log.error(e)
        json_ = pformat(json_)

    return json_
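
# Usage sketch (added for illustration): dumps_json accepts either a JSON
# string or a Python object and returns an indented, non-ASCII-escaped string.
def _example_dumps_json():
    print(dumps_json({'name': '测试', 'values': [1, 2, 3]}))
    print(dumps_json('{"nested": {"ok": true}}'))
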
    def search(self, table, body=None):
        '''
        @summary: run a query
        ---------
        @param table: index name
        @param body: query body
        ---------
        @result: the response as a dict
        '''

        datas = {}

        try:
            table = table.lower()
            datas = self._es.search(index=table, body=body, request_timeout=30)

        except Exception as e:
            log.error(e)

        return datas
def format_date(date, old_format='', new_format='%Y-%m-%d %H:%M:%S'):
    '''
    @summary: reformat a date string
    ---------
    @param date: the date, e.g. 2017年4月17日 3时27分12秒
    @param old_format: the original format, e.g. '%Y年%m月%d日 %H时%M分%S秒'
        %y two-digit year (00-99)
        %Y four-digit year (0000-9999)
        %m month (01-12)
        %d day of the month (01-31)
        %H hour, 24-hour clock (00-23)
        %I hour, 12-hour clock (01-12)
        %M minute (00-59)
        %S second (00-59)
    @param new_format: the output format
    ---------
    @result: the reformatted date as a string, e.g. 2017-04-17 03:27:12
    '''
    if not date:
        return ''

    if not old_format:
        # Infer the format: map each run of digits to %Y %m %d %H %M %S in order
        regex = r'(\d+)'
        numbers = get_info(date, regex, allow_repeat=True)
        formats = ['%Y', '%m', '%d', '%H', '%M', '%S']
        old_format = date
        for i, number in enumerate(numbers):
            if i == 0 and len(number) == 2:  # the year may have two digits, use %y
                old_format = old_format.replace(number, formats[i].lower(), 1)
            else:
                # replace only the first occurrence, so for '2017年11月30日 11:49'
                # replacing the month 11 does not also hit the hour 11
                old_format = old_format.replace(number, formats[i], 1)

    try:
        date_obj = datetime.datetime.strptime(date, old_format)
        date_str = datetime.datetime.strftime(date_obj, new_format)
    except Exception as e:
        log.error('failed to format date: old_format = %s does not match %s' %
                  (old_format, date))
        date_str = date
    return date_str
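
# Usage sketch (added for illustration): with an explicit old_format the date
# is parsed directly; without one, format_date infers the format by mapping
# each run of digits to %Y %m %d %H %M %S in order (get_info is defined
# elsewhere in this module).
def _example_format_date():
    print(format_date('2017年4月17日 3时27分12秒',
                      old_format='%Y年%m月%d日 %H时%M分%S秒'))
    print(format_date('2017-4-17 3:27:12'))  # format inferred automatically
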
    def add_batch(self, datas, primary_key, table, doc_type=''):
        try:
            actions = [
                {
                    '_op_type': 'index',
                    '_index': table,
                    '_type': doc_type or table,
                    '_id': data[primary_key],
                    '_source': data
                }
                for data in datas
            ]

            elasticsearch.helpers.bulk(self._es, actions)

        except Exception as e:
            log.error(e)
            return False
        else:
            return True
def get_html_by_webdirver(url, proxies=''):
    html = None
    driver = None
    try:
        driver = webdriver.PhantomJS()

        if proxies:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = proxies  # e.g. '220.248.229.45:3128'
            # Add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)

        driver.get(url)
        html = driver.page_source
        # driver.save_screenshot('1.png')   # save a screenshot
    except Exception as e:
        log.error(e)
    finally:
        driver and driver.quit()
    return html if html and len(html) < 1024 * 1024 else None
def download_file(url, base_path, filename='', call_func=''):
    file_path = base_path + filename
    directory = os.path.dirname(file_path)
    mkdir(directory)

    # progress bar
    def progress_callfunc(blocknum, blocksize, totalsize):
        '''urlretrieve callback
        @blocknum : number of blocks downloaded so far
        @blocksize : block size
        @totalsize: size of the remote file
        '''
        percent = 100.0 * blocknum * blocksize / totalsize if totalsize else 100
        if percent > 100:
            percent = 100
        sys.stdout.write('progress %.2f%%' % percent + "\r")
        sys.stdout.flush()

    if url:
        try:
            log.debug('''
                         downloading %s
                         saving to   %s
                      ''' % (url, file_path))

            request.urlretrieve(url, file_path, progress_callfunc)

            log.debug('''
                         download finished %s
                         file path         %s
                      ''' % (url, file_path))

            call_func and call_func()
            return 1
        except Exception as e:
            log.error(e)
            return 0
    else:
        return 0
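
# Usage sketch (added for illustration): the URL and paths are hypothetical;
# download_file returns 1 on success and 0 on failure, and relies on mkdir()
# defined elsewhere in this module.
def _example_download_file():
    ok = download_file('https://example.com/files/report.pdf',
                       base_path='./downloads/',
                       filename='report.pdf',
                       call_func=lambda: print('download finished'))
    print('success' if ok else 'failed')
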
    def set_ensure_index(self, table, key):
        try:
            self._db[table].ensure_index(key, unique=False)
        except Exception as e:
            log.error(e)
    def set_unique_key(self, table, key):
        try:
            self._db[table].ensure_index(key, unique=True)
        except Exception:
            log.error("collection %s has duplicate values for %s; "
                      "deduplicate it first" % (table, key))
    def clear(self, table):
        try:
            self._redis.delete(table)
        except Exception as e:
            log.error(e)
def get_text(soup, *args):
    try:
        return soup.get_text()
    except Exception as e:
        log.error(e)
        return ''