def extract_globle_ceo_detail(page):
    """Gather the scraped fields of a "globle_ceo" course page into one dict.

    Every piece of content is produced by one of the ``get_globle_ceo_*``
    helpers, each of which receives ``page``.  The language is obtained by
    running ``detect_language`` on the description.

    Args:
        page: Page object forwarded unchanged to each helper (its type is not
            visible here; presumably a parsed HTML document).

    Returns:
        dict: Scraped fields followed by fixed metadata for this course.
        Note that the overview dict (description plus video title/URL) is
        stored under the ``"desc"`` key.
        NOTE(review): ``extract_trans_detail`` stores the same structure under
        ``"overview"``; confirm the ``"desc"`` key is intended here.
    """
    description = get_globle_ceo_desc(page)
    versions = get_globle_ceo_version_info(page)
    total_versions = len(versions)
    quotes = get_globle_ceo_testimonials(page)
    key_takeaways = get_globle_ceo_takeaways(page)
    video = get_globle_ceo_video(page)
    audience = get_globle_ceo_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Each version entry is stamped with the total number of versions found,
    # not with its own position in the list.
    for entry in versions:
        entry["version"] = total_versions

    result = {
        "desc": overview,
        "version_info_list": versions,
        "testimonials": quotes,
        "course_takeaways": key_takeaways,
        "who_attend_desc": audience,
        "languages": detected_language,
    }
    # Constant metadata that is not read from the page.
    result.update({
        "duration_consecutive": True,
        "university_school": "2222_EUR",
        "type": "Onsite",
        "category_tags": [],
        'credential': '',
    })
    return result
def extract_trans_detail(page):
    """Gather the scraped fields of a "trans" course page into one dict.

    Content is produced by the ``get_trans_*`` helpers, each called with
    ``page``; the language comes from ``detect_language`` applied to the
    description.

    Args:
        page: Page object forwarded unchanged to each helper (elsewhere in
            this file ``get_trans_desc`` is called with a BeautifulSoup
            document).

    Returns:
        dict: Keys ``"overview"`` (description plus video title/URL),
        ``"version_info_list"``, ``"testimonials"``, ``"course_takeaways"``,
        ``"who_attend_desc"`` and ``"languages"``.
    """
    description = get_trans_desc(page)
    versions = get_trans_version_info(page)
    total_versions = len(versions)
    quotes = get_trans_testimonials(page)
    key_takeaways = get_trans_takeaways(page)
    video = get_trans_video(page)
    audience = get_trans_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Each version entry is stamped with the total number of versions found,
    # not with its own position in the list.
    for entry in versions:
        entry["version"] = total_versions

    result = {}
    result["overview"] = overview
    result["version_info_list"] = versions
    result["testimonials"] = quotes
    result["course_takeaways"] = key_takeaways
    result["who_attend_desc"] = audience
    result["languages"] = detected_language
    return result
def extract_bap_detail(page):
    """Gather the scraped fields of a "bap" course page into one dict.

    Content is produced by the ``get_bap_*`` helpers, each called with
    ``page``; the language comes from ``detect_language`` applied to the
    description.

    Args:
        page: Page object forwarded unchanged to each helper (its type is not
            visible here; presumably a parsed HTML document).

    Returns:
        dict: Scraped fields plus an empty ``"credential"`` and
        ``"duration_consecutive"`` set to ``True``.  The overview dict
        (description plus video title/URL) is stored under the ``"desc"`` key.
        NOTE(review): ``extract_trans_detail`` stores the same structure under
        ``"overview"``; confirm the ``"desc"`` key is intended here.
    """
    description = get_bap_desc(page)
    versions = get_bap_version_info(page)
    total_versions = len(versions)
    quotes = get_bap_testimonials(page)
    key_takeaways = get_bap_takeaways(page)
    video = get_bap_video(page)
    audience = get_bap_who_attend_desc(page)
    detected_language = detect_language(description)

    overview = dict(
        desc=description,
        video_title=video["video_title"],
        video_url=video["video_url"],
    )

    # Each version entry is stamped with the total number of versions found,
    # not with its own position in the list.
    for entry in versions:
        entry["version"] = total_versions

    scraped = {
        "desc": overview,
        "version_info_list": versions,
        "testimonials": quotes,
        "course_takeaways": key_takeaways,
        "who_attend_desc": audience,
        "languages": detected_language,
    }
    fixed = {"credential": "", "duration_consecutive": True}
    return {**scraped, **fixed}
def final_format_detail(details):
    """Apply the last in-place normalisation pass to every course detail.

    For each detail dict this:

    * resets ``"category_tags"`` to an empty string;
    * maps an existing ``"languages"`` value through ``language_map``, or
      fills it in with ``detect_language`` on the course name when absent;
    * removes ``"mailto:"`` from ``"exec_ed_inquiry_cc_emails"`` when the
      value starts with that prefix;
    * prints the URL of any detail lacking ``"course_takeaways"``;
    * resets ``"credential"`` to an empty string.

    Args:
        details: Iterable of detail dicts; each is mutated in place.

    Returns:
        The same ``details`` object that was passed in.
    """
    email_key = "exec_ed_inquiry_cc_emails"

    for record in details:
        record["category_tags"] = ''

        if "languages" not in record:
            record["languages"] = detect_language(record["name"])
        else:
            record["languages"] = language_map(record["languages"])

        address = record[email_key]
        if address.startswith("mailto:"):
            record[email_key] = address.replace("mailto:", '')

        # Report courses that came back without takeaways.
        if "course_takeaways" not in record:
            print(record["url"])

        # NOTE(review): treated as a per-record reset (loop level), not as
        # part of the missing-takeaways branch -- confirm against the
        # originally indented source.
        record["credential"] = ''

    return details
# Attributes every finished detail is expected to carry; a missing one is
# reported (printed) by check_attrs, in this order.
_EXPECTED_ATTRS = (
    'name', 'url', 'university_school', 'category', 'desc', 'active', 'type',
    'category_tags', 'priority', 'publish', 'version', 'location', 'currency',
    'tuition_number', 'tuition_note', 'Repeatable', 'effective_date_start',
    'effective_date_end', 'duration_consecutive', 'languages', 'credential',
    'course_takeaways', 'course_faculties', 'who_attend_desc', 'overview',
    'testimonials', 'exec_ed_inquiry_cc_emails', 'schedule',
)

# Courses that are fetched again and emitted twice (onsite + online version).
_RESCRAPE_URLS = frozenset((
    "https://execedprograms.iese.edu/strategic-management/getting-things-done/",
    "https://execedprograms.iese.edu/strategic-management/artificial-intelligence/",
    "https://execedprograms.iese.edu/leadership-people-management/communication-skills/",
))

# Hand-maintained start dates, keyed by course URL.  These courses also get a
# fixed version, location and type (see check_attrs).
_MANUAL_START_DATES = {
    'https://execedprograms.iese.edu/leadership-people-management/high-performance-negotiator':
        '2021-06-05',
    "https://execedprograms.iese.edu/strategic-management/value-creation-effective-boards/":
        '2021-05-24',
    "https://execedprograms.iese.edu/leadership-people-management/positive-leader/":
        '2021-10-25',
}

# The only course whose missing description is back-filled from its page.
_TRANS_DESC_URL = (
    'https://executiveeducation.iese.edu/es/consejeros-directivos-seniors/'
    'transformacion-digital/'
)


def _fetch_page(url):
    """Download ``url`` and parse the response body with the lxml parser."""
    source = requests.get(url).content
    return bs4.BeautifulSoup(source, 'lxml')


def check_attrs(details):
    """Normalise scraped course details and report missing attributes.

    Each detail whose URL is in ``_RESCRAPE_URLS`` is deep-copied, fetched
    again and emitted twice: once merged with ``onsite_version_detail(page)``
    and once with ``online_version_detail(page)``.  These entries are appended
    after all others and skip the normalisation below.

    Every other detail is mutated in place:

    * courses listed in ``_MANUAL_START_DATES`` get a fixed version, location,
      type, start date and schedule;
    * ``tuition_note`` defaults to ``''`` and ``tuition`` is copied to
      ``tuition_number``;
    * ``effective_start_date`` / ``effective_end_date`` are renamed to
      ``effective_date_start`` / ``effective_date_end`` (``''`` when absent)
      and a one-row ``schedule`` is built from them;
    * ``overview`` and ``languages`` are filled in when absent;
    * ``location`` and ``type`` are passed through ``format_location`` /
      ``format_type``;
    * a missing description (one specific URL) and missing tuition data are
      fetched from the course page;
    * ``mailto:`` is removed from ``exec_ed_inquiry_cc_emails``.

    Finally every expected attribute that is missing is printed as
    ``"<url> no <attr>"``, ``version_info_list`` is dropped, and the number of
    resulting details is printed.

    Args:
        details: Iterable of detail dicts.  Each needs at least ``"url"`` and
            ``"exec_ed_inquiry_cc_emails"``; ``"desc"`` is required when
            ``"overview"`` is absent and ``"name"`` when ``"languages"`` is
            absent.

    Returns:
        list: The processed details, re-scraped courses last.

    Raises:
        KeyError: If one of the required keys above is missing.
    """
    final_details = []
    rescrape_queue = []

    for detail in details:
        url = detail["url"]

        if url in _RESCRAPE_URLS:
            # Work on a copy so the caller's dict is not shared with the
            # two merged results built later.
            rescrape_queue.append(copy.deepcopy(detail))
            continue

        manual_start = _MANUAL_START_DATES.get(url)
        if manual_start is not None:
            detail['version'] = 1
            detail['location'] = 'Barcelona, ----, Spanish'
            detail['type'] = 'Onsite'
            detail["effective_date_start"] = manual_start
            detail["schedule"] = [[manual_start, "", "", "formal"]]

        if 'tuition_note' not in detail:
            detail['tuition_note'] = ''
        duration_number = get_duration_number(detail)
        if 'tuition' in detail:
            detail['tuition_number'] = detail['tuition']

        # Rename the scraped date keys to their output names.
        scraped_start = detail.pop('effective_start_date', '')
        scraped_end = detail.pop('effective_end_date', '')
        detail['effective_date_end'] = scraped_end
        # Keep the hand-maintained start date and schedule when the page gave
        # no start date; otherwise they would be replaced by empty values.
        if scraped_start or manual_start is None:
            detail['effective_date_start'] = scraped_start
            detail["schedule"] = [[
                scraped_start, scraped_end, duration_number, 'formal'
            ]]

        if 'overview' not in detail:
            detail["overview"] = {
                'desc': detail['desc'],
                'video_url': detail.get('video_url', ''),
                'video_title': detail.get('video_title', ''),
            }
        if 'languages' not in detail:
            detail['languages'] = detect_language(detail["name"])

        # format_type deliberately receives the raw (unformatted) location.
        location = detail.get('location', '')
        detail["location"] = format_location(location, url)
        course_type = detail.get('type', '')
        detail['type'] = format_type(course_type, location, url)

        if 'desc' not in detail and url == _TRANS_DESC_URL:
            detail["desc"] = get_trans_desc(_fetch_page(url)).strip()

        if 'currency' not in detail and 'tuition_number' not in detail:
            info = extract_tuition_fee_info(_fetch_page(url))[0]
            detail['currency'] = info['currency']
            detail['tuition_number'] = info['tuition_number']
            detail['tuition_note'] = info['tuition_note']

        email = detail["exec_ed_inquiry_cc_emails"]
        if 'mailto:' in email:
            detail["exec_ed_inquiry_cc_emails"] = email.replace(
                'mailto:', '').strip()

        final_details.append(detail)

    for detail in rescrape_queue:
        page = _fetch_page(detail['url'])
        onsite = onsite_version_detail(page)
        online = online_version_detail(page)
        final_details.append({**detail, **onsite})
        final_details.append({**detail, **online})

    for detail in final_details:
        for attr in _EXPECTED_ATTRS:
            if attr not in detail:
                print(f'{detail["url"]} no {attr}')
        if 'version_info_list' in detail:
            del detail["version_info_list"]

    print(len(final_details))
    return final_details