Exemple #1
0
async def start_crawl(base_url):
    """Run the crawl pipeline and write the intermediate and final JSON files.

    The steps run strictly in sequence: categories, courses and details are
    extracted starting from ``base_url``; faculty records are extracted from
    the details and passed through
    ``delete_repeat_faculties_for_faculty_list``; master-programme data from
    a fixed URL is then appended to the category, course and detail lists
    before each list is written out.

    Args:
        base_url: Starting URL handed to ``extract_categories``.

    Returns:
        None. All results are persisted through ``write_to_json``.
    """
    # ``async with`` guarantees the HTTP session is closed even when one of
    # the steps below raises; a trailing ``await session.close()`` would be
    # skipped on any exception and leak the session.
    async with aiohttp.ClientSession() as session:
        category_list = await extract_categories(base_url, session)
        course_list = await extract_courses(category_list, session)
        detail_list = await extract_details(course_list, session)

        write_to_json(detail_list,
                      './detail/outputfiles/comprehensive_details.json')

        faculty_list = await extract_all_faculty_info(detail_list, session)
        cleaned_faculties = delete_repeat_faculties_for_faculty_list(
            faculty_list)
        write_to_json(cleaned_faculties,
                      './final_files/faculty_8888_EUR_XW_0226.json')

        # The master-programme page is handled separately; the same URL is
        # passed for all three URL arguments of ``extract_masters_detail``.
        url = 'https://www.insead.edu/master-programmes'
        info = await extract_masters_detail(url, url, url, cleaned_faculties,
                                            session)
        category_list += filter_out_masters_category_list(url)
        write_to_json(category_list,
                      './final_files/category_8888_EUR_XW_0226.json')

        modified_course_list = modify_course_keys(course_list)
        modified_course_list += filter_out_masters_course_list(info)
        write_to_json(modified_course_list,
                      './final_files/course_8888_EUR_XW_0226.json')

        comprehensive_detail = final_detail(detail_list)
        comprehensive_detail += filter_out_masters_detail_list(info)
        write_to_json(comprehensive_detail,
                      './final_files/detail_8888_EUR_XW_0226.json')
Exemple #2
0
def final_run():
    """Clear faculty titles that contain 'Copyright' and rewrite the file.

    Loads the faculty JSON file, prints each record's name followed by the
    lengths of its name, title and intro description, replaces any title
    containing the text 'Copyright' with an empty string, and writes the
    records back out.
    """
    with open('faculty/outputfiles/faculty_6110_CBUS_XW_0316.json', 'r') as source:
        faculty_records = json.load(source)

    for member in faculty_records:
        name = member["name"]
        title = member["title"]
        print(f'{name}, {len(name)}, {len(title)}, {len(member["intro_desc"])}')
        if 'Copyright' in title:
            member["title"] = ''

    write_to_json(faculty_records,
                  './faculty/outputfiles/faculty_6110_CBUS_XW_0316.json')
Exemple #3
0
async def start_crawler(url, online_url):
    """Run the crawl pipeline and write the four final JSON output files.

    Categories are extracted from ``url`` and ``online_url``, details are
    extracted and integrated, and the result is split into category, course,
    faculty and detail lists.  Masters/MBA data is fetched separately and
    appended to each list; the combined faculty list is passed through
    ``delete_repeat_faculties`` before everything is written out.

    Args:
        url: First URL handed to ``extract_categories``.
        online_url: Second URL handed to ``extract_categories``.

    Returns:
        None. All results are persisted through ``write_to_json``.
    """
    # ``async with`` guarantees the HTTP session is closed even when one of
    # the steps below raises; a trailing ``await session.close()`` would be
    # skipped on any exception and leak the session.
    async with aiohttp.ClientSession() as session:
        category_list = await extract_categories(url, online_url, session)
        cate_page_detail = await extract_details(category_list, session)
        comprehensive_details = await integrate_details(cate_page_detail,
                                                        session)

        final_categories = filter_out_final_categories(comprehensive_details)
        final_courses = filter_out_final_courses(comprehensive_details)
        final_faculties = filter_out_final_faculties(comprehensive_details)
        final_details = filter_out_final_details(comprehensive_details)

        # masters and mbas
        comprehensive_mbas_masters_details = (
            await get_comprehensive_master_mba_detail(session))
        final_categories += add_masters_mbas_categories(
            comprehensive_mbas_masters_details)
        final_courses += add_masters_mbas_courses(
            comprehensive_mbas_masters_details)
        final_faculties += filter_out_final_faculties(
            comprehensive_mbas_masters_details)
        # Runs on the merged list, i.e. after the masters/MBA faculties have
        # been appended to the regular ones.
        final_faculties = delete_repeat_faculties(final_faculties)
        final_details += modify_mbas_masters_faculty_and_other_attr(
            comprehensive_mbas_masters_details)

        write_to_json(final_categories,
                      './final_outputfiles/category_3388_EUR_XW_0228.json')
        write_to_json(final_courses,
                      './final_outputfiles/course_3388_EUR_XW_0228.json')
        write_to_json(final_faculties,
                      './final_outputfiles/faculty_3388_EUR_XW_0228.json')
        write_to_json(final_details,
                      './final_outputfiles/detail_3388_EUR_XW_0228.json')
Exemple #4
0
def start_crawl(base_url):
    """Extract partial details from the category page and write three files.

    The raw partial details are written first, then the result of
    ``filter_category``, then the result of ``delete_repeating_courses``.

    Args:
        base_url: URL handed to ``extract_detail_from_cate_page`` and
            ``filter_category``.
    """
    raw_details = extract_detail_from_cate_page(base_url)
    write_to_json(raw_details, './files/origianl_detail_partial.json')

    write_to_json(filter_category(raw_details, base_url),
                  './files/category.json')

    write_to_json(delete_repeating_courses(raw_details),
                  './files/cleaned_detail_partial.json')
Exemple #5
0
async def start_crawl(base_url):
    """Extract categories, courses and details, writing each list to JSON.

    Each stage's output is written to its own file as soon as it is
    available, and the number of extracted courses is printed.

    Args:
        base_url: URL handed to ``extract_categories`` and
            ``extract_courses``.

    Returns:
        None. All results are persisted through ``write_to_json``.
    """
    # ``async with`` guarantees the HTTP session is closed even when one of
    # the steps below raises; a trailing ``await session.close()`` would be
    # skipped on any exception and leak the session.
    async with aiohttp.ClientSession() as session:
        category_list = await extract_categories(base_url, session)
        write_to_json(category_list,
                      'category/outputfiles/category_2222_EUR_XW_0226.json')

        course_list = await extract_courses(base_url, session, category_list)
        print(f'total {len(course_list)} courses')
        write_to_json(course_list,
                      'course/outputfiles/course_2222_EUR_XW_0226.json')

        details = await extract_details(course_list, session)
        write_to_json(details,
                      './detail/outputfiles/comprehensive_details.json')
Exemple #6
0
async def start_crawl(base_url, special_version_url):
    """Extract categories, courses and details, writing each list to JSON.

    Args:
        base_url: URL handed to ``extract_categories``.
        special_version_url: Extra argument forwarded unchanged to
            ``extract_details``.

    Returns:
        None. All results are persisted through ``write_to_json``.
    """
    # ``async with`` guarantees the HTTP session is closed even when one of
    # the steps below raises; a trailing ``await session.close()`` would be
    # skipped on any exception and leak the session.
    async with aiohttp.ClientSession() as session:
        category_list = await extract_categories(base_url, session)
        write_to_json(category_list, './category/outputfiles/categories.json')

        course_list = await extract_courses(category_list, session)
        write_to_json(course_list, './course/outputfiles/courses.json')

        detail_list = await extract_details(course_list, session,
                                            special_version_url)
        write_to_json(detail_list,
                      './detail/outputfiles/comprehensive_details.json')
Exemple #7
0
def final_run():
    """Post-process the saved details and categories into the final files.

    Reads the comprehensive details file, passes it through
    ``arrange_detail`` and writes the result; then writes the outputs of
    ``filter_out_faculties`` and ``check_attrs`` computed from that result.
    Finally reads the saved categories, passes them through
    ``delete_repeating_cates`` and writes the final category file.
    """
    with open("./detail/outputfiles/comprehensive_details.json",
              "r") as details_file:
        raw_details = json.load(details_file)

    versioned_details = arrange_detail(raw_details)
    write_to_json(
        versioned_details,
        './detail/outputfiles/comprehensive_details_with_version.json')

    # Faculties are derived before the attribute check, matching the order
    # in which the output files are produced.
    write_to_json(filter_out_faculties(versioned_details),
                  './final_files/faculty_3399_EUR_XW_0226.json')
    write_to_json(check_attrs(versioned_details),
                  './final_files/detail_3399_EUR_XW_0226.json')

    with open("./category/outputfiles/categories.json", "r") as cates_file:
        raw_cates = json.load(cates_file)

    write_to_json(delete_repeating_cates(raw_cates),
                  './final_files/category_3399_EUR_XW_0226.json')
Exemple #8
0
async def start_crawl(base_url):
    """Run the crawl pipeline, writing every intermediate stage to JSON.

    Categories, courses and details are extracted in sequence; the details
    are passed through ``rename_keys``, then ``course_page_detail`` is run
    concurrently for every record.  Faculty URL/name pairs are taken from the
    final details and ``get_one_fac_info`` is run concurrently for each pair.

    Args:
        base_url: URL handed to ``extract_categories``.

    Returns:
        None. All results are persisted through ``write_to_json``.
    """
    # ``async with`` guarantees the HTTP session is closed even when one of
    # the steps below raises; a trailing ``await session.close()`` would be
    # skipped on any exception and leak the session.
    async with aiohttp.ClientSession() as session:
        category_list = await extract_categories(base_url, session)
        write_to_json(category_list, './category/outputfiles/categories.json')

        # ``extract_courses`` is called synchronously and without arguments
        # here, unlike the other extraction steps.
        course_list = extract_courses()
        write_to_json(course_list, './course/outputfiles/courses.json')

        detail_list = await extract_details(course_list, session)
        write_to_json(detail_list, './detail/outputfiles/origin_details.json')

        partial_detail = rename_keys(detail_list)
        write_to_json(partial_detail,
                      './detail/outputfiles/first_partial_detail.json')

        # gather() returns results in the same order as the awaitables given.
        final_details = await asyncio.gather(
            *(course_page_detail(detail, session)
              for detail in partial_detail))
        write_to_json(final_details,
                      './detail/outputfiles/detail_6110_CBUS_XW_0226.json')

        fac_urls_with_names = get_faculty_urls_with_name(final_details)
        # The first two items of each entry are forwarded as the first two
        # positional arguments of ``get_one_fac_info``.
        faculties = await asyncio.gather(
            *(get_one_fac_info(url_with_name[0], url_with_name[1], session)
              for url_with_name in fac_urls_with_names))
        write_to_json(faculties,
                      'faculty/outputfiles/faculty_6110_CBUS_XW_0316.json')
Exemple #9
0
        detail["course_faculties"] = course_faculties
        print(f'before: {detail["languages"]}')
        detail["languages"] = language_map(detail["languages"])
        print(f'after: {detail["languages"]}')
    return details


# FINAL RUN
# Module-level script: load the saved comprehensive details, derive the
# faculty and final detail lists from them, and write both to JSON.
with open('./detail/outputfiles/comprehensive_details.json') as f:
    d = json.load(f)

# Report how many detail records were loaded.
print(len(d))
new_details = check_attrs(d)
# Both outputs below are computed from the same checked detail list.
faculties = filter_out_faculties(new_details)
final_details = modify_faculty_in_detail(new_details)
write_to_json(faculties, 'detail/outputfiles/faculty_2222_EUR_XW_0316.json')
write_to_json(final_details,
              './detail/outputfiles/detail_2222_EUR_XW_0226.json')

# Print the URL of every record whose "version" field equals 2.
for detail in final_details:
    if detail["version"] == 2:
        print(detail['url'])
####

### get all cities

# with open('./detail/outputfiles/detail_2222_EUR_XW_0226.json') as f:
#     d = json.load(f)
#
# city_set = set()
# for course in d:
Exemple #10
0
import json
from pprint import pprint

from write_to_json import write_to_json

# Clean up the saved faculty records in place: strip role labels that were
# captured as part of a name, trim one specific title, and rewrite the file.
with open('detail/outputfiles/faculty_2222_EUR_XW_0316.json', 'r') as f:
    data = json.load(f)

# First pass: remove a role label from any name longer than 20 characters.
# The labels are checked in this fixed order, and the second check sees the
# name as already modified by the first.
for fac in data:
    for role_label in ("Academic Director", "Directora académica"):
        if len(fac["name"]) > 20 and role_label in fac["name"]:
            print(f'---------{fac["name"]}')
            fac["name"] = fac["name"].replace(role_label, '').strip()

# Second pass: print the field lengths of every record and keep only the part
# of Nuria Chinchilla's title that precedes the first ';'.
for fac in data:
    name_len = len(fac["name"])
    title_len = len(fac["title"])
    intro_len = len(fac["intro_desc"])
    print(f'{fac["name"]}: {name_len},{title_len}, {intro_len}')
    if fac["name"] == "Nuria Chinchilla":
        title = fac["title"].partition(';')[0]
        fac["title"] = title

write_to_json(data, "detail/outputfiles/faculty_2222_EUR_XW_0316.json")