def save_data_list_to_disk2(page_index):

    page_size = 1000
    print("page_size=%s,page_index=%s" % (page_size, page_index))
    # print config_dict

    data_type = int(config_dict["data_type"])
    data_list_folder_name = config_dict["data_list_folder_name"]

    # data_list output filename (one file per page, PAGE_SIZE records each)
    data_list_filename = "data_list_%s_%s.json" % (data_type, page_index)
    data_list_filename = data_list_folder_name + data_list_filename

    # List-page URL (read from the config file)
    data_list_url = cf.get("access_url", "data_list_url")
    data_list_url = data_list_url.format(data_type, page_index, page_size)
    logging.debug("Fetching data from: %s" % data_list_url)

    # Fetch the data and save it locally
    data_list_data = access_data_utils.get_data(data_list_url)
    file_utils.write_file(data_list_filename, data_list_data)
    logging.debug("File written: %s" % data_list_filename)
    # total_page_no is expected to be a module-level global here
    logging.info("Page %s collected, %s pages remaining, saved to: %s" %
                 (page_index,
                  (total_page_no - page_index), data_list_filename))
    time.sleep(2)
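This variant reads config_dict, cf, and total_page_no from module scope, so the caller must populate them first. A minimal driver sketch, assuming that setup has already happened (it is not shown in the source):

# Hypothetical driver loop; pages are 1-indexed, matching page_index above.
for page_index in range(1, total_page_no + 1):
    save_data_list_to_disk2(page_index)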
def save_data_list_to_disk():
    # Records per page (i.e. each file holds 1000 records);
    # total_count is expected to be a module-level global here
    page_size = 1000
    if total_count < page_size:
        page_size = total_count

    # Compute the total number of pages (integer division plus a partial page)
    if total_count % page_size == 0:
        total_page_no = total_count // page_size
    else:
        total_page_no = total_count // page_size + 1

    for index in range(total_page_no):
        page_index = index + 1

        # data_list output filename
        data_list_filename = "data_list_%s_%s.json" % (
            dataTypeConfig.get_data_type(), page_index)
        data_list_filename = DATA_LIST_PATH + data_list_filename

        # List-page URL
        app_list_url = dataTypeConfig.get_data_list_url()
        app_list_url = app_list_url.format(dataTypeConfig.get_data_type(),
                                           page_index, page_size)

        # Fetch the data and save it locally
        data_list_data = access_data_utils.get_data(app_list_url)
        file_utils.write_file(data_list_filename, data_list_data)

        time.sleep(2)
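The two-branch page count above is plain ceiling division; for positive integers it can be collapsed into a single expression, as in this small equivalent sketch:

# Equivalent to the if/else computation above for positive integers.
total_page_no = (total_count + page_size - 1) // page_size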
Example 3
def get_curr_nmpa_total_count(data_type):

    try:
        # Build the request URL
        data_list_url = cf.get("access_url", "data_list_url")
        data_list_url = data_list_url.format(data_type, 1, 1)

        # Fetch the data and parse the JSON response
        data_list_data = access_data_utils.get_data(data_list_url)
        jsonData = json.loads(data_list_data)

        return int(jsonData[0]["COUNT"])
    except BaseException:
        raise  # re-raise unchanged, preserving the original traceback
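A hypothetical usage sketch: the function requests page 1 with a page size of 1 purely to read the COUNT field, so the call is cheap. The data_type value below is illustrative only:

total_count = get_curr_nmpa_total_count(25)  # 25 is an arbitrary example value
total_page_no = (total_count + 999) // 1000  # number of 1000-row pages
print("total_count=%s, total_page_no=%s" % (total_count, total_page_no))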
def save_data_list_to_disk(config_dict):
    data_type = int(config_dict["data_type"])
    get_type = int(config_dict["get_type"])
    data_list_folder_name = config_dict["data_list_folder_name"]
    if file_utils.clear_folder(data_list_folder_name):
        logging.info("清空文件夹文件:%s" % (data_list_folder_name))

    begin_time = time.time()
    logging.info("数据采集开始时间:%s" %
                 (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    # Current total record count on the official NMPA site
    total_count = comm_utils.get_curr_nmpa_total_count(data_type)
    # Records per page (i.e. each file holds 1000 records)
    page_size = 1000
    if total_count < page_size:
        page_size = total_count

    # Compute the total number of pages (integer division plus a partial page)
    if total_count % page_size == 0:
        total_page_no = total_count // page_size
    else:
        total_page_no = total_count // page_size + 1
    logging.info("NMPA site data: data_type=%s, total=%s, page_size=%s, %s pages in total" %
                 (data_type, total_count, page_size, total_page_no))
    for index in range(total_page_no):
        page_index = index + 1

        # data_list output filename (one file per page, PAGE_SIZE records each)
        data_list_filename = "data_list_%s_%s.json" % (data_type, page_index)
        data_list_filename = data_list_folder_name + data_list_filename

        # List-page URL (read from the config file)
        data_list_url = cf.get("access_url", "data_list_url")
        data_list_url = data_list_url.format(data_type, page_index, page_size)
        logging.debug("Fetching data from: %s" % data_list_url)

        # Fetch the data and save it locally
        data_list_data = access_data_utils.get_data(data_list_url)
        file_utils.write_file(data_list_filename, data_list_data)
        logging.debug("File written: %s" % data_list_filename)
        logging.info("Page %s collected, %s pages remaining, saved to: %s" %
                     (page_index,
                      (total_page_no - page_index), data_list_filename))
        time.sleep(2)
    end_time = time.time()
    logging.info("数据采集结束时间:%s" %
                 (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    logging.info("数据采集共计耗时:%s秒" % (end_time - begin_time))
Example 5
def get_data_info(thread_name, config_dict):
    # Folder where newly collected data is saved
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    while not DATA_LIST.empty():
        try:
            data_id = DATA_LIST.get()
            info = "线程名:%s,没有获取到数据还有 %d个" % (thread_name, DATA_LIST.qsize())
            logging.info(info)

            # data_info output filename (one file per thread)
            save_filename = "data_info_thread_%s.json" % (thread_name)
            save_filename = data_info_save_folder_name + save_filename

            # Detail-page URL
            data_info_url = cf.get("access_url", "data_info_url")
            data_info_url = data_info_url.format(config_dict["data_type"],
                                                 data_id)

            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace(
                "\\n\\r", "").decode("gbk").encode("utf-8")

            # Store the data id together with the payload
            data = str(data_id) + "==" + data_info_data
            with open(save_filename, 'a') as f:
                f.writelines(data + "\n")

            info = save_filename + "写入成功! id: " + str(data_id)
            logging.debug(info)

            # Sleep 2 seconds so the server does not flag this as an attack
            time.sleep(2)
        except urllib2.URLError as e:
            DATA_LIST.put(data_id)  # requeue the id so another attempt can retry it
            print("URLError")
            logging.error("URLError")
            logging.error(str(e))
        except UnboundLocalError as e:
            print("UnboundLocalError")
            logging.error("UnboundLocalError")
            logging.error(str(e))
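The thread_name parameter and the shared DATA_LIST queue imply a multi-threaded consumer. A minimal launcher sketch, assuming DATA_LIST is a pre-filled, thread-safe Queue.Queue of data ids and config_dict is built as in the earlier example (the worker count is arbitrary):

import threading

threads = []
for n in range(4):  # 4 workers is an illustrative choice
    t = threading.Thread(target=get_data_info, args=(str(n), config_dict))
    t.start()
    threads.append(t)
for t in threads:
    t.join()  # wait until the queue has been drained

Because each worker appends to its own data_info_thread_<name>.json file, no file-level locking is needed.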
Example 6
def get_data_info(NEW_DATA_LIST, config_dict):

    # Folder where newly collected data is saved
    data_info_save_folder_name = config_dict["data_info_save_folder_name"]
    # Clear the data_info save folder
    if file_utils.clear_folder(data_info_save_folder_name):
        logging.info("Cleared folder: %s" % data_info_save_folder_name)


    while not NEW_DATA_LIST.empty():
        try:
            data_id = NEW_DATA_LIST.get()
            info = "没有获取到数据还有 %d个" % (NEW_DATA_LIST.qsize())
            logging.info(info)

            # data_info output filename
            save_filename = "get_new_data.json"
            save_filename = data_info_save_folder_name + save_filename

            # Detail-page URL
            data_info_url = cf.get("access_url", "data_info_url")
            data_info_url = data_info_url.format(config_dict["data_type"], data_id)

            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace("\\n\\r", "").decode("gbk").encode("utf-8")

            # Store the data id together with the payload
            data = str(data_id) + "==" + data_info_data
            with open(save_filename, 'a') as f:
                f.writelines(data + "\n")

            info = save_filename + "写入成功! id: " + str(data_id)
            logging.debug(info)

            # Sleep 2 seconds so the server does not flag this as an attack
            time.sleep(2)
        except urllib2.URLError as e:
            logging.error("Failed to collect new data! %s" % str(e))
            raise  # re-raise, preserving the original traceback
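Here NEW_DATA_LIST is passed in rather than shared globally. A hypothetical setup that fills the queue with ids before calling the function (the ids are illustrative):

import Queue  # the Python 2 name of the thread-safe queue module

NEW_DATA_LIST = Queue.Queue()
for data_id in ["id_001", "id_002"]:  # illustrative ids only
    NEW_DATA_LIST.put(data_id)
get_data_info(NEW_DATA_LIST, config_dict)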
Example 7
def get_data_info(thread_name):
    while not id_list.empty():
        try:
            data_id = id_list.get()
            info = "线程名:%s,没有获取到数据还有 %d个" % (thread_name, id_list.qsize())
            print(info)

            # data_info output filename (one file per thread)
            data_info_filename = "data_info_thread_%s.json" % (thread_name)
            data_info_filename = DATA_INFO_PATH + data_info_filename

            # Detail-page URL
            data_info_url = dataTypeConfig.get_data_info_url()
            data_info_url = data_info_url.format(
                dataTypeConfig.get_data_type(), data_id)

            # Fetch the data and save it locally
            data_info_data = access_data_utils.get_data(data_info_url)
            data_info_data = data_info_data.replace(
                "\\n\\r", "").decode("gbk").encode("utf-8")

            with open(data_info_filename, 'a') as f:
                f.writelines(data_info_data + "\n")

            info = data_info_filename + " written, id: " + str(data_id)
            print(info)

            # Sleep 1 second so the server does not flag this as an attack
            time.sleep(1)
        except urllib2.URLError as e:
            id_list.put(data_id)  # requeue the id so another attempt can retry it
            print("URLError")
            logging.error("URLError")
            logging.error(str(e))
        except UnboundLocalError as e:
            print("UnboundLocalError")
            logging.error("UnboundLocalError")
            logging.error(str(e))
Example 8
from utils import access_data_utils

print(access_data_utils.get_data(
    "http://mobile.cfda.gov.cn/datasearch/QueryList?pageIndex=15701&pageSize=10&tableId=26&searchF=Quick%20"
))
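For context, a minimal sketch of what access_data_utils.get_data might look like, assuming it is a thin urllib2 GET wrapper; the real implementation is not shown in these examples:

import urllib2

def get_data(url, timeout=30):
    # Plain GET with a browser-like User-Agent; the timeout default is an assumption.
    request = urllib2.Request(url, headers={"User-Agent": "Mozilla/5.0"})
    response = urllib2.urlopen(request, timeout=timeout)
    try:
        return response.read()
    finally:
        response.close()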