def get_json_by_requests(url, params=None, headers='', data=None, proxies={}):
    json = {}
    response = None
    try:
        # response = requests.get(url, params = params)
        if data:
            response = requests.post(url, headers=headers, data=data, params=params,
                                     timeout=TIME_OUT, proxies=proxies)
        else:
            response = requests.get(url, headers=headers, params=params,
                                    timeout=TIME_OUT, proxies=proxies)
        response.encoding = 'utf-8'
        json = response.json()
    except Exception as e:
        log.error(e)
    finally:
        response and response.close()

    return json
def get_html_by_requests(url, headers='', code='utf-8', data=None, proxies={}, max_retries=1):
    html = None
    r = None  # defined up front so the return below is valid even for skipped (.exe) urls
    if not url.endswith('.exe') and not url.endswith('.EXE'):
        s = requests.Session()
        s.mount('http://', HTTPAdapter(max_retries=max_retries - 1))
        s.mount('https://', HTTPAdapter(max_retries=max_retries - 1))
        try:
            if data:
                r = s.post(url, headers=headers, timeout=TIME_OUT, data=data, proxies=proxies)
            else:
                r = s.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)

            if code:
                r.encoding = code
            html = r.text
        except Exception as e:
            log.error(e)
        finally:
            r and r.close()

    # pages of 1 MB or more are treated as invalid; the response object is returned alongside the html
    return (html if html and len(html) < 1024 * 1024 else None), r
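# Minimal usage sketch for get_html_by_requests (hypothetical caller; assumes TIME_OUT and log
# are configured at module level). The function returns an (html, response) tuple, and pages
# of 1 MB or more come back as None:
# html, response = get_html_by_requests('http://example.com',
#                                       headers={'User-Agent': 'Mozilla/5.0'},  # example header
#                                       max_retries=3)  # mounts HTTPAdapter(max_retries=2) on the session
# if html:
#     print(html[:100])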
def get_html_by_urllib(url, code='utf-8', headers={}, proxies={}):
    html = None
    if not url.endswith('.exe') and not url.endswith('.EXE'):
        page = None
        is_timeout = False
        try:
            def timeout_handler(response):
                nonlocal is_timeout
                is_timeout = True
                if response:
                    response.close()

            if proxies:
                proxy_support = request.ProxyHandler(proxies)
                opener = request.build_opener(proxy_support)
                page = opener.open(quote(url, safe='/:?=&'), timeout=TIME_OUT)
            else:
                page = request.urlopen(quote(url, safe='/:?=&'), timeout=TIME_OUT)

            # start a timer so a read() that hangs gets its connection closed
            t = Timer(TIMER_TIME, timeout_handler, [page])
            t.start()
            # charset = chardet.detect(page.read())['encoding']
            html = page.read().decode(code, 'ignore')
            t.cancel()
        except Exception as e:
            log.error(e)
        finally:
            # page and page.close()
            if page and not is_timeout:
                page.close()

    return html if html and len(html) < 1024 * 1024 else None
def run_func(*args, **kw):
    # inner wrapper: func and module_name come from the enclosing decorator scope
    # (see inner_run_safe_model below)
    callfunc = ''
    try:
        callfunc = func(*args, **kw)
    except Exception as e:
        log.error(module_name + ": " + func.__name__ + " - " + str(e))

    return callfunc
def get_json_obj_by_requests(url, headers='', proxies={}):
    json_obj = {}  # initialized so the return is valid when the request fails
    try:
        response = requests.get(url, headers=headers, timeout=TIME_OUT, proxies=proxies)
        response.encoding = 'utf-8'
        json_obj = json.loads(response.text)
    except Exception as e:
        log.error(e)

    return json_obj
def get_tag(html, name=None, attrs={}, find_all=True):
    try:
        if html:
            soup = BeautifulSoup(html, "html.parser") if isinstance(html, str) else html
            result = soup.find_all(name, attrs) if find_all else soup.find(name, attrs)
            return result if result else []
        else:
            return []
    except Exception as e:
        log.error(e)
        return []
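# Usage sketch for get_tag (the html snippet below is made up for illustration):
# html = '<div class="news"><a href="/a">first</a><a href="/b">second</a></div>'
# links = get_tag(html, name='a')                                            # list of two <a> Tags
# div = get_tag(html, name='div', attrs={'class': 'news'}, find_all=False)   # a single Tag, or [] if absent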
def get_json(json_str):
    '''
    @summary: Parse a JSON string
    ---------
    @param json_str: a string in JSON format
    ---------
    @result: the parsed JSON object, or {} on failure
    '''
    try:
        return json.loads(json_str) if json_str else {}
    except Exception as e:
        log.error(e)
        return {}
def del_file(path, ignore=[]):
    files = get_file_list(path, ignore)
    for file in files:
        try:
            os.remove(file)
        except Exception as e:
            log.error('''
                failed to delete: %s
                Exception : %s
                ''' % (file, str(e)))
        else:
            log.debug(file + " deleted")
def is_html(url):
    if not url:
        return False

    try:
        content_type = request.urlopen(url).info().get('Content-Type', '')
        return 'text/html' in content_type
    except Exception as e:
        log.error(e)
        return False
def delete(self, table, condition={}):
    '''
    @summary: Delete documents
    ---------
    @param table: collection name
    @param condition: delete condition; {} deletes everything
    ---------
    @result: True / False
    '''
    try:
        self._db[table].remove(condition)
    except Exception as e:
        log.error(e)
        return False
    else:
        return True
def add(self, table, key_value):
    '''
    @summary: Insert a document; the collection is created automatically if it does not exist
    ---------
    @param table: collection name
    @param key_value: the value to insert, as a dict
    ---------
    @result: True / False
    '''
    try:
        self._db[table].save(key_value)
    except Exception as e:
        log.error(e)
        return False
    else:
        return True
def inner_run_safe_model(func):
    try:
        @functools.wraps(func)  # carry the wrapped function's name and docstring over to the wrapper
        def run_func(*args, **kw):
            callfunc = ''
            try:
                callfunc = func(*args, **kw)
            except Exception as e:
                log.error(module_name + ": " + func.__name__ + " - " + str(e))
            return callfunc

        return run_func
    except Exception as e:
        log.error(module_name + ": " + func.__name__ + " - " + str(e))
        return func
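# Sketch of how the wrapper behaves (hypothetical function; assumes module_name is supplied by
# the surrounding decorator scope):
# @inner_run_safe_model
# def parse_title(html):
#     return html.split('<title>')[1]
# parse_title(None)   # the exception is logged instead of propagating, and '' is returned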
def read_file(filename, readlines=False, encoding='utf-8'):
    '''
    @summary: Read a file
    ---------
    @param filename: file name (with path)
    @param readlines: read line by line (default False)
    ---------
    @result: a list of lines when readlines is True, otherwise the whole file as a string
    '''
    content = ''
    try:
        with open(filename, 'r', encoding=encoding) as file:
            content = file.readlines() if readlines else file.read()
    except Exception as e:
        log.error(e)

    return content
def update(self, table, old_value, new_value, multi=True):
    '''
    @summary: Update documents
    ---------
    @param table: collection name
    @param old_value: old value (query condition)
    @param new_value: new value
    @param multi: whether to update multiple documents, default True
    ---------
    @result: True / False
    '''
    try:
        self._db[table].update(old_value, {'$set': new_value}, multi=multi)
    except Exception as e:
        log.error(e)
        return False
    else:
        return True
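# Usage sketch for update (hypothetical collection and fields). new_value is wrapped in
# {'$set': ...} internally, so only the listed fields change rather than the whole document:
# db.update('news', {'url': 'http://example.com/1'}, {'status': 2})                 # updates every match
# db.update('news', {'url': 'http://example.com/1'}, {'status': 2}, multi=False)    # first match only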
def add(self, table, data, data_id=None, doc_type=''):
    '''
    @summary: Index a document
    ---------
    @param table: index name
    @param data: the data, JSON-typed
    @param doc_type: type; defaults to the index name when empty. doc_type can be understood as the same
        data structure carrying a different meaning, e.g. for a url index, doc_type could be the site name
    @param data_id: if data_id is not given one is generated; if the data_id already exists the document is updated
    ---------
    @result: True / False
    '''
    try:
        table = table.lower()
        self._es.index(index=table, doc_type=doc_type or table, id=data_id, body=data)
    except Exception as e:
        log.error(e)
        return False
    else:
        return True
def dumps_json(json_):
    '''
    @summary: Pretty-print JSON, for logging / printing
    ---------
    @param json_: a JSON string or a JSON object
    ---------
    @result: the formatted string
    '''
    try:
        if isinstance(json_, str):
            json_ = get_json(json_)
        json_ = json.dumps(json_, ensure_ascii=False, indent=4, skipkeys=True)
    except Exception as e:
        log.error(e)
        json_ = pformat(json_)

    return json_
def search(self, table, body={}):
    '''
    @summary: Search the index
    ---------
    @param table: index name
    @param body: query conditions
    ---------
    @result: json
    '''
    datas = {}
    try:
        table = table.lower()
        datas = self._es.search(index=table, body=body, request_timeout=30)
    except Exception as e:
        log.error(e)

    return datas
def format_date(date, old_format='', new_format='%Y-%m-%d %H:%M:%S'):
    '''
    @summary: Normalize a date string
    ---------
    @param date: the date, e.g. 2017年4月17日 3时27分12秒
    @param old_format: the original date format, e.g. '%Y年%m月%d日 %H时%M分%S秒'
        %y  two-digit year (00-99)
        %Y  four-digit year (000-9999)
        %m  month (01-12)
        %d  day of the month (0-31)
        %H  hour, 24-hour clock (0-23)
        %I  hour, 12-hour clock (01-12)
        %M  minute (00-59)
        %S  second (00-59)
    @param new_format: the output date format
    ---------
    @result: the reformatted date as a string, e.g. 2017-4-17 3:27:12
    '''
    if not date:
        return ''

    if not old_format:
        regex = r'(\d+)'
        numbers = get_info(date, regex, allow_repeat=True)
        formats = ['%Y', '%m', '%d', '%H', '%M', '%S']
        old_format = date
        for i, number in enumerate(numbers):
            if i == 0 and len(number) == 2:  # the year may be two digits; use lowercase %y
                old_format = old_format.replace(
                    number, formats[i].lower(), 1)  # replace once: in '2017年11月30日 11:49' this keeps the month 11 from clobbering the hour 11
            else:
                old_format = old_format.replace(number, formats[i], 1)  # replace once

    try:
        date_obj = datetime.datetime.strptime(date, old_format)
        date_str = datetime.datetime.strftime(date_obj, new_format)
    except Exception as e:
        log.error('date formatting failed, old_format = %s does not match %s' % (old_format, date))
        date_str = date

    return date_str
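# Worked example of the old_format inference above (illustrative values; assumes get_info
# returns the matched digit groups in order, as the loop expects):
# for date = '2017年11月30日 11:49' the extracted numbers are '2017', '11', '30', '11', '49';
# replacing each once, left to right, turns the date string itself into the format
# '%Y年%m月%d日 %H:%M', and replacing only the first occurrence is what keeps the month '11'
# from also consuming the hour '11'.
# format_date('2017年4月17日 3时27分12秒')            # -> '2017-04-17 03:27:12'
# format_date('2017-4-17', new_format='%Y/%m/%d')     # -> '2017/04/17'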
def add_batch(self, datas, primary_key, table, doc_type=''):
    try:
        actions = [
            {
                '_op_type': 'index',
                '_index': table,
                '_type': doc_type or table,
                # '_score': 1,
                '_id': data[primary_key],
                '_source': data
            }
            for data in datas
        ]

        elasticsearch.helpers.bulk(self._es, actions)
    except Exception as e:
        log.error(e)
        return False
    else:
        return True
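# Usage sketch for add_batch (hypothetical data; es stands for an instance of this class):
# datas = [
#     {'url': 'http://example.com/1', 'title': 'first'},
#     {'url': 'http://example.com/2', 'title': 'second'},
# ]
# es.add_batch(datas, primary_key='url', table='news')
# Each dict becomes one bulk 'index' action with data[primary_key] as the document _id, so
# re-running the same batch overwrites existing documents instead of duplicating them.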
def get_html_by_webdirver(url, proxies=''):
    html = None
    try:
        driver = webdriver.PhantomJS()
        if proxies:
            proxy = webdriver.Proxy()
            proxy.proxy_type = ProxyType.MANUAL
            proxy.http_proxy = proxies  # e.g. '220.248.229.45:3128'
            # add the proxy settings to webdriver.DesiredCapabilities.PHANTOMJS
            proxy.add_to_capabilities(webdriver.DesiredCapabilities.PHANTOMJS)
            driver.start_session(webdriver.DesiredCapabilities.PHANTOMJS)

        driver.get(url)
        html = driver.page_source
        # driver.save_screenshot('1.png')  # save a screenshot
        driver.close()
    except Exception as e:
        log.error(e)

    return html if html and len(html) < 1024 * 1024 else None
def download_file(url, base_path, filename='', call_func=''):
    file_path = base_path + filename
    directory = os.path.dirname(file_path)
    mkdir(directory)

    # progress bar
    def progress_callfunc(blocknum, blocksize, totalsize):
        '''callback
        @blocknum : number of blocks downloaded so far
        @blocksize : size of each block
        @totalsize: size of the remote file
        '''
        percent = 100.0 * blocknum * blocksize / totalsize
        if percent > 100:
            percent = 100
        # print ('progress %.2f%%' % percent, end = '\r')
        sys.stdout.write('progress %.2f%%' % percent + "\r")
        sys.stdout.flush()

    if url:
        try:
            log.debug('''
                downloading %s
                save path   %s
                ''' % (url, file_path))

            request.urlretrieve(url, file_path, progress_callfunc)

            log.debug('''
                download finished %s
                file path         %s
                ''' % (url, file_path))

            call_func and call_func()
            return 1
        except Exception as e:
            log.error(e)
            return 0
    else:
        return 0
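# Example of the progress arithmetic above: with blocksize = 8192 bytes and
# totalsize = 1048576 bytes, blocknum = 64 gives 100.0 * 64 * 8192 / 1048576 = 50.00%,
# and the cap keeps the final block from reporting more than 100%.
# Hypothetical call (the directory is created by mkdir() defined elsewhere in this module):
# download_file('http://example.com/file.zip', '/tmp/downloads/', 'file.zip')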
def set_ensure_index(self, table, key):
    try:
        self._db[table].ensure_index(key, unique=False)
    except Exception as e:
        log.error(e)
def set_unique_key(self, table, key):
    try:
        self._db[table].ensure_index(key, unique=True)
    except Exception:
        log.error("collection %s has duplicate values for %s, deduplicate it first" % (table, key))
def clear(self, table):
    try:
        self._redis.delete(table)
    except Exception as e:
        log.error(e)
def get_text(soup, *args):
    try:
        return soup.get_text()
    except Exception as e:
        log.error(e)
        return ''