コード例 #1
0
ファイル: crawler.py プロジェクト: vunt-0906/Crawler
                    (json.dumps(x, ensure_ascii=False) + '\n').encode('utf-8'))


# Module-level success/error counters shared by all crawler workers;
# updated inside get_data_response().
# NOTE(review): incremented without a lock — under multithread_helper's
# 50-thread pool the totals may drift slightly; confirm if exact counts matter.
n_success = 0
n_error = 0


def get_data_response(url, timeout=30):
    """Download *url* and return its extracted content, or None on failure.

    Runs as a worker inside multithread_helper's thread pool, so a
    per-request timeout is essential: without it a single stalled
    connection can hang a worker for the pool's entire 720 s budget.

    Args:
        url: Page URL to fetch.
        timeout: Per-request timeout in seconds passed to requests.get
            (applies to both connect and read). Defaults to 30.

    Returns:
        dict produced by extract_content() with an added 'url' key on
        HTTP 200, otherwise None (non-200 status or request failure).
    """
    global n_success, n_error
    try:
        resp = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Previously any network error propagated out of the worker
        # uncounted; treat it as a failed fetch so the crawl keeps going.
        n_error += 1
        return None
    if resp.status_code == 200:
        data = extract_content(resp.content)
        # NOTE(review): counter update is not lock-protected; totals may
        # drift slightly under the concurrent pool.
        n_success += 1
        print("Success: {}, Error: {}".format(n_success, n_error))
        data['url'] = url
        return data
    n_error += 1
    return None


if __name__ == '__main__':
    # Crawl URL files 12..35. For each file, fetch the first 100 URLs in
    # batches of 10 through the thread-pool helper and append the results
    # (one JSON object per line) to that file's output.
    for file_idx in range(12, 36):
        urls = get_data_from_file(url_file.format(file_idx))
        for start in range(0, 100, 10):
            batch = urls[start:start + 10]
            data = multithread_helper(batch,
                                      get_data_response,
                                      timeout_concurrent_by_second=720,
                                      max_workers=50,
                                      debug=False)
            store_json_perline_to_file(data, data_file.format(file_idx), True)
コード例 #2
0
            # Intended batch size for splitting the candidate-ID list.
            # NOTE(review): unused below — the sublist count hard-codes 5000
            # instead of referencing batch_sbd; confirm and unify.
            batch_sbd = 5000

            # Highest candidate number (so bao danh) for this province code.
            max_sbd = get_min_max_by_code(provide_id)
            # logger.info(max_sbd)
            # max_sbd = 5743
            # Build candidate IDs 1 .. max_sbd-1 for this province.
            # NOTE(review): range(1, max_sbd) excludes max_sbd itself —
            # confirm whether the last candidate number should be included.
            lst_sbd = []
            for pos in range(1, max_sbd):
                sbd = build_sbd(provide_id=provide_id, post_sbd=pos)
                lst_sbd.append(sbd)

            # Split into ~5000-item parts, one output file per part, so a
            # re-run can skip parts that were already written (resumable).
            for idx, sub_lst_sbd in enumerate(
                    get_sublists(lst_sbd,
                                 int(len(lst_sbd) / 5000) + 1)):
                file_diemthi_path = ConfigUniversityProject(
                ).file_diemthi_2019_path(provide_id=provide_id, part=idx)
                if os.path.exists(file_diemthi_path):
                    logger.info(f'skip: {file_diemthi_path}')
                    continue
                # Fetch exam scores for every candidate ID in this part
                # concurrently (generous 10-hour overall budget).
                obj_sbd = multithread_helper(
                    items=sub_lst_sbd,
                    method=get_info,
                    timeout_concurrent_by_second=36000,
                    max_workers=50,
                    debug=False)
                store_jsons_perline_in_file(jsons_obj=obj_sbd,
                                            file_output_path=file_diemthi_path)
                logger.info(f'write: {file_diemthi_path}')
        except Exception as e:
            # Best-effort per province: log the failure and continue with
            # the next one rather than aborting the whole crawl.
            logger.error(e)
    logger.info('done')
コード例 #3
0
        }
    return None


def method_univerisy_data(university_obj):
    """Fetch the benchmark-score (diem chuan) data for one university.

    Thin adapter so multithread_helper can map a single university dict
    onto extract_data_diemchuan, using the dict's 'url' entry as the
    page to scrape and the dict itself as attached metadata.
    """
    return extract_data_diemchuan(
        url_diemchuan=university_obj.get('url'),
        university_meta=university_obj)


if __name__ == '__main__':
    # Load the university list, scrape each university's benchmark scores
    # concurrently, then persist the results one JSON object per line.
    universities = load_jsonl_from_gz(
        ConfigUniversityProject().file_university_path)

    # logger.info(universities)
    diemchuan_records = multithread_helper(
        items=universities,
        method=method_univerisy_data,
        timeout_concurrent_by_second=360,
        debug=False,
        max_workers=20)

    output_path = ConfigUniversityProject().file_university_diemchuan_path
    store_jsons_perline_in_file(
        jsons_obj=diemchuan_records,
        file_output_path=output_path)
    logger.info(
        f'stored file_university_diemchuan_path: {output_path}'
    )