# Crawler entry points collected from several site-specific scripts; each
# start_crawl/start_crawler/final_run below writes its results out through the
# project's local write_to_json helper (imported explicitly further down).
import asyncio
import json

import aiohttp

from write_to_json import write_to_json


async def start_crawl(base_url):
    session = aiohttp.ClientSession()
    category_list = await extract_categories(base_url, session)
    course_list = await extract_courses(category_list, session)
    detail_list = await extract_details(course_list, session)
    write_to_json(detail_list, './detail/outputfiles/comprehensive_details.json')
    faculty_list = await extract_all_faculty_info(detail_list, session)
    cleaned_faculties = delete_repeat_faculties_for_faculty_list(faculty_list)
    write_to_json(cleaned_faculties, './final_files/faculty_8888_EUR_XW_0226.json')
    # the masters programmes live on a separate page and are merged in afterwards
    url = 'https://www.insead.edu/master-programmes'
    info = await extract_masters_detail(url, url, url, cleaned_faculties, session)
    category_list += filter_out_masters_category_list(url)
    write_to_json(category_list, './final_files/category_8888_EUR_XW_0226.json')
    modified_course_list = modify_course_keys(course_list)
    modified_course_list += filter_out_masters_course_list(info)
    write_to_json(modified_course_list, './final_files/course_8888_EUR_XW_0226.json')
    comprehensive_detail = final_detail(detail_list)
    comprehensive_detail += filter_out_masters_detail_list(info)
    write_to_json(comprehensive_detail, './final_files/detail_8888_EUR_XW_0226.json')
    await session.close()
    return
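# The async entry points in these scripts need an event loop to run. A minimal
# driver sketch -- the base URL argument is an assumption, inferred from the
# INSEAD masters URL used inside start_crawl above:
if __name__ == '__main__':
    asyncio.run(start_crawl('https://www.insead.edu'))  # base_url is assumed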
def final_run():
    with open('faculty/outputfiles/faculty_6110_CBUS_XW_0316.json', 'r') as f:
        data = json.load(f)
    for fac in data:
        print(f'{fac["name"]}, {len(fac["name"])}, {len(fac["title"])}, {len(fac["intro_desc"])}')
        # scraped titles sometimes pick up page-footer text; blank those out
        if 'Copyright' in fac["title"]:
            fac["title"] = ''
    write_to_json(data, './faculty/outputfiles/faculty_6110_CBUS_XW_0316.json')
async def start_crawler(url, online_url):
    session = aiohttp.ClientSession()
    category_list = await extract_categories(url, online_url, session)
    cate_page_detail = await extract_details(category_list, session)
    comprehensive_details = await integrate_details(cate_page_detail, session)
    final_categories = filter_out_final_categories(comprehensive_details)
    final_courses = filter_out_final_courses(comprehensive_details)
    final_faculties = filter_out_final_faculties(comprehensive_details)
    final_details = filter_out_final_details(comprehensive_details)
    # masters and mbas
    comprehensive_mbas_masters_details = await get_comprehensive_master_mba_detail(session)
    final_categories += add_masters_mbas_categories(comprehensive_mbas_masters_details)
    final_courses += add_masters_mbas_courses(comprehensive_mbas_masters_details)
    final_faculties += filter_out_final_faculties(comprehensive_mbas_masters_details)
    final_faculties = delete_repeat_faculties(final_faculties)
    final_details += modify_mbas_masters_faculty_and_other_attr(comprehensive_mbas_masters_details)
    write_to_json(final_categories, './final_outputfiles/category_3388_EUR_XW_0228.json')
    write_to_json(final_courses, './final_outputfiles/course_3388_EUR_XW_0228.json')
    write_to_json(final_faculties, './final_outputfiles/faculty_3388_EUR_XW_0228.json')
    write_to_json(final_details, './final_outputfiles/detail_3388_EUR_XW_0228.json')
    await session.close()
    return
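# delete_repeat_faculties is not defined in this excerpt. A minimal sketch of
# what it presumably does -- dedupe faculty dicts, assuming "name" is the key
# field (the faculty records elsewhere in these scripts all carry a "name"):
def _delete_repeat_faculties_sketch(faculties):
    seen = set()
    unique = []
    for fac in faculties:
        if fac["name"] not in seen:
            seen.add(fac["name"])
            unique.append(fac)
    return unique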
def start_crawl(base_url):
    partial_details = extract_detail_from_cate_page(base_url)
    write_to_json(partial_details, './files/origianl_detail_partial.json')
    category = filter_category(partial_details, base_url)
    write_to_json(category, './files/category.json')
    cleaned_courses = delete_repeating_courses(partial_details)
    write_to_json(cleaned_courses, './files/cleaned_detail_partial.json')
async def start_crawl(base_url):
    session = aiohttp.ClientSession()
    category_list = await extract_categories(base_url, session)
    write_to_json(category_list, 'category/outputfiles/category_2222_EUR_XW_0226.json')
    course_list = await extract_courses(base_url, session, category_list)
    print(f'total {len(course_list)} courses')
    write_to_json(course_list, 'course/outputfiles/course_2222_EUR_XW_0226.json')
    details = await extract_details(course_list, session)
    write_to_json(details, './detail/outputfiles/comprehensive_details.json')
    await session.close()
    return
async def start_crawl(base_url, special_version_url):
    session = aiohttp.ClientSession()
    category_list = await extract_categories(base_url, session)
    write_to_json(category_list, './category/outputfiles/categories.json')
    course_list = await extract_courses(category_list, session)
    write_to_json(course_list, './course/outputfiles/courses.json')
    detail_list = await extract_details(course_list, session, special_version_url)
    write_to_json(detail_list, './detail/outputfiles/comprehensive_details.json')
    await session.close()
    return
def final_run():
    with open("./detail/outputfiles/comprehensive_details.json", "r") as read_file:
        details = json.load(read_file)
    new_details = arrange_detail(details)
    write_to_json(new_details, './detail/outputfiles/comprehensive_details_with_version.json')
    faculty_details = filter_out_faculties(new_details)
    write_to_json(faculty_details, './final_files/faculty_3399_EUR_XW_0226.json')
    final_details = check_attrs(new_details)
    write_to_json(final_details, './final_files/detail_3399_EUR_XW_0226.json')
    with open("./category/outputfiles/categories.json", "r") as read_file:
        cates = json.load(read_file)
    new_cates = delete_repeating_cates(cates)
    write_to_json(new_cates, './final_files/category_3399_EUR_XW_0226.json')
    return
async def start_crawl(base_url):
    session = aiohttp.ClientSession()
    category_list = await extract_categories(base_url, session)
    write_to_json(category_list, './category/outputfiles/categories.json')
    course_list = extract_courses()  # synchronous here and takes no arguments
    write_to_json(course_list, './course/outputfiles/courses.json')
    detail_list = await extract_details(course_list, session)
    write_to_json(detail_list, './detail/outputfiles/origin_details.json')
    partial_detail = rename_keys(detail_list)
    write_to_json(partial_detail, './detail/outputfiles/first_partial_detail.json')
    # fetch every course page concurrently
    coroutines = [course_page_detail(detail, session) for detail in partial_detail]
    final_details = await asyncio.gather(*coroutines)
    write_to_json(final_details, './detail/outputfiles/detail_6110_CBUS_XW_0226.json')
    # then fetch every faculty page concurrently
    fac_urls_with_names = get_faculty_urls_with_name(final_details)
    coroutines = [
        get_one_fac_info(url_with_name[0], url_with_name[1], session)
        for url_with_name in fac_urls_with_names
    ]
    faculties = await asyncio.gather(*coroutines)
    write_to_json(faculties, 'faculty/outputfiles/faculty_6110_CBUS_XW_0316.json')
    await session.close()
    return
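# Firing one request per course/faculty page through a bare asyncio.gather, as
# above, can overwhelm the target server. A sketch of the same fan-out with
# bounded concurrency -- the limit of 10 and the helper name are assumptions,
# not part of the original scripts:
async def _gather_bounded_sketch(coroutines, limit=10):
    semaphore = asyncio.Semaphore(limit)

    async def _run(coro):
        # at most `limit` coroutines hold the semaphore (i.e. run) at once
        async with semaphore:
            return await coro

    return await asyncio.gather(*(_run(c) for c in coroutines))

# usage: final_details = await _gather_bounded_sketch(coroutines)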
# (fragment: tail of the per-detail loop in a detail-normalization function;
# the enclosing function is not shown in this excerpt)
        detail["course_faculties"] = course_faculties
        print(f'before: {detail["languages"]}')
        detail["languages"] = language_map(detail["languages"])
        print(f'after: {detail["languages"]}')
    return details


# FINAL RUN
with open('./detail/outputfiles/comprehensive_details.json') as f:
    d = json.load(f)
print(len(d))
new_details = check_attrs(d)
faculties = filter_out_faculties(new_details)
final_details = modify_faculty_in_detail(new_details)
write_to_json(faculties, 'detail/outputfiles/faculty_2222_EUR_XW_0316.json')
write_to_json(final_details, './detail/outputfiles/detail_2222_EUR_XW_0226.json')
for detail in final_details:
    if detail["version"] == 2:
        print(detail['url'])

# get all cities
# with open('./detail/outputfiles/detail_2222_EUR_XW_0226.json') as f:
#     d = json.load(f)
#
# city_set = set()
# for course in d:
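# language_map is not shown in this excerpt. Judging from the before/after
# prints in the fragment above, it normalizes scraped language labels. A
# hypothetical sketch -- the mapping entries are illustrative assumptions,
# not taken from the original data:
def _language_map_sketch(raw):
    mapping = {
        'Inglés': 'English',          # assumed example entry
        'Español': 'Spanish',         # assumed example entry
    }
    return mapping.get(raw, raw)  # fall back to the raw label when unmapped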
import json
from pprint import pprint

from write_to_json import write_to_json

with open('detail/outputfiles/faculty_2222_EUR_XW_0316.json', 'r') as f:
    data = json.load(f)

# strip job titles (English and Spanish variants) that leaked into scraped names
for fac in data:
    if len(fac["name"]) > 20 and "Academic Director" in fac["name"]:
        print(f'---------{fac["name"]}')
        fac["name"] = fac["name"].replace("Academic Director", '').strip()
    if len(fac["name"]) > 20 and "Directora académica" in fac["name"]:
        print(f'---------{fac["name"]}')
        fac["name"] = fac["name"].replace("Directora académica", '').strip()

for fac in data:
    print(f'{fac["name"]}: {len(fac["name"])}, {len(fac["title"])}, {len(fac["intro_desc"])}')
    # manual fix: keep only the first of several semicolon-separated titles
    if fac["name"] == "Nuria Chinchilla":
        fac["title"] = fac["title"].split(';')[0]

write_to_json(data, "detail/outputfiles/faculty_2222_EUR_XW_0316.json")
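# write_to_json is imported from a local module throughout these scripts but
# its body is not part of this excerpt. A minimal sketch consistent with how
# it is called everywhere (data first, output path second) -- the indentation
# and encoding choices are assumptions:
def _write_to_json_sketch(data, path):
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)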