def start_yingyongbao(**kwargs):
    """Crawl yingyongbao listing pages and fan out save tasks.

    Reads the category string from redis, then walks up to 2000 pages; each
    non-empty page spawns one thread to persist details and one to persist
    comments. Stops early after 10 empty pages in a row.
    """
    hr = HandleRedis(1)
    type2 = hr.get_data_redis("yingyongbao_types")
    # First comma-separated token, with the leading "b'" and trailing "'"
    # stripped — presumably a bytes repr stored in redis; TODO confirm.
    t = type2.split(',')[0][2:-1]
    empty_pages = 0
    for i in range(0, 2000):
        # NOTE: original wrapped this in `try: ... except Exception as e: raise e`,
        # which only re-raises and adds nothing — removed.
        kwargs = dict(module="yingyongbao_data", data=dict(t=t, i=i))
        dicts = start(**kwargs)
        if dicts:
            log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(dicts)))
            details = dict(module="yingyongbao_save_details", data=dicts)
            comment = dict(module="yingyongbao_save_comment", data=dicts)
            # Fire-and-forget workers (original's `for _ in range(1)` wrapper
            # around these two starts was a no-op and is removed).
            threading.Thread(target=start, kwargs=details, name=None).start()
            threading.Thread(target=start, kwargs=comment, name=None).start()
        else:
            empty_pages += 1
            if empty_pages == 10:
                break
def main():
    """Poll redis forever for credit-FJ URLs and dispatch each to get_detail."""
    global hr
    hr = HandleRedis(7)
    while True:
        url = hr.get_data_redis("TB_CREDIT_FJ_URL")
        # Nothing queued yet — poll again.
        if not url:
            continue
        get_detail(url=url)
def start_wenshu_peichang(**kwargs):
    """Crawl court-judgment ("wenshu") search results for one redis keyword.

    Pages through search results; for every document on a page a daemon
    worker thread fetches the detail, throttled to 10 live workers.
    Stops when the result list holds only the RunEval header entry.
    """
    r = HandleRedis(1)
    name = r.get_data_redis("wenshu_keys")
    if not name:
        log.crawler.info('裁判文书关键词遍历完毕.....')
        return
    print(name)
    index = 1
    while True:
        log.crawler.info("*" * 80)
        log.crawler.info("start crawler wenshu page is:%d" % index)
        kwargs = dict(module="wenshu_peichang_data",
                      data=dict(key=name, index=index), proxies=True)
        items = start(**kwargs)
        log.crawler.info("获取的文件ID长度为:%d" % (len(items) - 1))
        # items[0] is a header record carrying RunEval; only the header left
        # (or nothing at all) means we are done.
        if len(items) == 1:
            break
        if items:
            run_eval = items[0]['RunEval']
        else:
            break
        threads = []
        for item in items[1:]:
            # Throttle: block until fewer than 10 workers are alive.
            # BUG FIX: the original never appended to `threads`, so the
            # throttle and the final join were dead code, and it called
            # t.join() right after t.start(), serializing everything; it also
            # `continue`d past (i.e. dropped) the current item when the limit
            # was hit, and mutated `threads` while iterating it.
            while True:
                threads = [w for w in threads if w.is_alive()]
                if len(threads) < 10:
                    break
                time.sleep(3)
            data = {}
            data["docid"] = item["文书ID"]
            data["CASE_TYPE"] = item["案件类型"]
            data["CASE_TIME"] = item["裁判日期"]
            data["CASE_NAME"] = item["案件名称"]
            data["CASE_NUM"] = item["案号"]
            data["COURT_NAME"] = item["法院名称"]
            data['runeval'] = run_eval
            d = dict(module="wenshu_peichang_detail", data=data, proxies=True)
            t = threading.Thread(target=start, kwargs=d, name=None)
            t.daemon = True  # setDaemon() is deprecated
            t.start()
            threads.append(t)
        # Drain remaining workers before moving to the next page.
        for t in threads:
            t.join()
        index += 1
def baidu_shixin(**kwargs):
    """Scheduled-task entry for the Baidu "shixin" (dishonest-debtor) crawler.

    Pulls one keyword from redis, pages through Baidu results 10 at a time
    (up to pn == 2000), and caches the enterprise / person hits into redis
    lists. Errors are logged rather than propagated.
    """
    r = HandleRedis(1)
    name = r.get_data_redis("shixin_words")
    # Guard clause replaces the original's redundant `if name: name = name`.
    if not name:
        log.crawler.info('百度失信关键词遍历完毕.....')
        return
    try:
        pn = 0
        hr = HandleRedis(7)
        # pn becomes the string "finished" (or another sentinel) when the
        # source is exhausted, which also terminates this loop.
        while isinstance(pn, int):
            kwargs = dict(module='baidu', data=dict(name=name, pn=pn))
            log.crawler.info("crawler name is:{},pn is:{}".format(name, pn))
            result_dict = start(**kwargs)
            qiye = result_dict['enterprise']
            person = result_dict['person']
            if qiye:
                hr.cache_list_redis('TB_SHIXIN_ENTERPRISE', qiye)
                log.crawler.info(
                    "cache qiye shixin into redis success length is:%s" % len(qiye))
            if person:
                hr.cache_list_redis('TB_SHIXIN_PERSON', person)
                log.crawler.info(
                    "cache person shixin into redis success length is:%s" % len(person))
            pn = result_dict["pn"]
            if pn == "finished":
                log.crawler.info("数据请求完毕name:{},pn:{}".format(name, pn))
                break
            elif pn == 2000:
                break
            else:
                pn += 10
    except Exception as err:
        log.error.info('百度失信爬虫发生异常,信息为:\n%s' % err)
def get_page(**kwargs):
    """Fetch result pages 1..100 for a keyword from fjcredit.gov.cn.

    Each page's decoded HTML is handed straight to get_detail together
    with a redis handle. Optional `proxies` are passed through to requests.
    """
    redis_handle = HandleRedis(7)
    proxy_conf = kwargs.get("proxies", None)
    term = kwargs.get('keyword')
    page_no = 1
    while page_no <= 100:
        url = 'http://www.fjcredit.gov.cn/creditsearch.redlist.dhtml?source_id=100&kw={}&page={}'.format(term, page_no)
        resp = requests.get(url, headers=headers, proxies=proxy_conf)
        # Let requests guess the real encoding before reading .text.
        resp.encoding = resp.apparent_encoding
        get_detail(response=resp.text, hr=redis_handle)
        page_no += 1
def start_sougou(**kwargs):
    """Crawl sougou content pages for the first configured category.

    Walks up to 20000 pages; each non-empty page spawns one save-worker
    thread. Stops early after 10 empty pages. Per-page errors are logged
    and crawling continues.
    """
    hr = HandleRedis(1)
    # First comma-separated token with the "b'...'" wrapper stripped —
    # presumably a bytes repr stored in redis; TODO confirm.
    type2 = hr.get_data_redis("sougou_type").split(',')[0][2:-1]
    empty_pages = 0
    for i in range(0, 20000):
        try:
            kwargs = dict(module="sougou_content", data=dict(type1=type2, i=i))
            content = start(**kwargs)
            dicts = {"content": content}
            if content:
                # BUG FIX: original logged len(dicts), which is always 1
                # (a one-key dict); the message clearly means content length.
                log.crawler.info("获取%s的第%d页内容长度为:%d" % (type2, i, len(content)))
                details = dict(module="sougou_save_data", data=dicts)
                # Original's `for _ in range(1)` wrapper was a no-op — removed.
                threading.Thread(target=start, kwargs=details, name=None).start()
            else:
                empty_pages += 1
                if empty_pages == 10:
                    break
        except Exception as e:
            log.crawler.error(e)
# @Author : liuyd # @Site : # @File : tasks.py # @Software: PyCharm import time from gevent import monkey import threading from gevent.pool import Pool import ast from tax.monitor.save_mysql import Mysql from tax.util import log from tax.control_spider import start from tax.model.handle_redis import HandleRedis, RedisPool import multiprocessing hr = HandleRedis(1) def enterprise_list(**kwargs): k = kwargs.get("k", None) if not k: raise ValueError("k 参数存在错误......") item = hr.get_data_redis(k) if item: url = item item = ast.literal_eval(item) if isinstance(item, dict): url = item['url'] city = item['city'] prov = item['prov'] log.crawler.info("start crawler prov:%s,city:%s,url is:%s" %
#!/usr/bin/env python # -*- coding: utf-8 -*- """ @Time : 2018/10/9 14:06 @Author : liuyd @File : persist_task.py @desc : 从redis里取数据存入mysql """ import ast from tax.config.conf import * from tax.PublicSpider.get_content_static import log, HandDb, DbHandle from tax.model.handle_redis import HandleRedis from tax.PublicSpider.common import getmd5 hr = HandleRedis(7) db = DbHandle() def persis_data_into_mysql(table, datas): hd = HandDb(table) for data in datas: sql = hd.generate_sql_dict(data) db.insert_db_func(sql=sql) def get_result_data(table): keys_name = table pop_data_list = [] # 定义一个列表用来保存反馈的数据 feedback_data_list = [] # 一次最大传递数据量