def menu_item(str_item, mode):
    """Display one menu entry after the standard line-pacing delay.

    :param str_item: text of the menu entry
    :param mode: 1 -> plain indented item; anything else -> blue section title
    """
    sleep(style.line_sleep)
    if mode != 1:
        # Section-title style: leading newline, small indent, blue color.
        print('\n' + style.string_color((' ' * 4) + str_item, "blue"))
        return
    # Regular entry: deeper indent, default color.
    print((' ' * 8) + str_item)
def get_user_input(options):
    """Prompt repeatedly until the user enters one of the valid options.

    :param options: list of valid options
    :return: validated user input, ready to use
    """
    # BUG FIX: the original re-prompted via recursion but discarded the
    # recursive call's return value, so entering an invalid option first and
    # a valid one afterwards returned None. A loop returns the value directly
    # (and avoids unbounded recursion on repeated bad input).
    while True:
        sleep(style.line_sleep)
        option = input(style.string_color("\n => Option number: ", "cyan"))
        if option in options:
            return option
        print(
            style.string_color((' ' * 7) + f"Option {option} not valid",
                               "red"))
        sleep(style.end_line_sleep)
def init():
    """Interactive setup: ask for the base URL, save a preview page, then
    dispatch to the single-course or multi-course scraper.

    NOTE(review): relies on a module-level `mode` flag — presumably set via
    update_mode() and elsewhere (2 appears to mean "scrape all courses");
    confirm against the rest of the module.
    """
    header(" scraper setup", "", "cyan", "left", 1, "-", "green", 0)
    print(style.string_color(" Base URL:", "blue"))
    while True:
        try:
            # response is presumably (url, requests_response) — indexed [0]/[1] below.
            response = url_tool.get_base_url()
            if "/clases/" in response[0]:
                # A "/clases/" URL points at a single course.
                update_mode(1)
            # Save a preview of the fetched page for the user to inspect.
            with open("000 - Preview.html", "w") as page:
                page.writelines(response[1].content.decode("UTF-8"))
            # NOTE(review): a truthy continue_dialog() aborts setup here —
            # confirm the dialog's polarity ("continue" vs "cancel").
            if tools.continue_dialog():
                return
            break
        except (KeyboardInterrupt, EOFError):
            if tools.exit_dialog():
                return
    if mode == 2:
        # Multi-course mode: scrape every course listed on the page.
        try:
            scrape_courses(response[1])
        except (KeyboardInterrupt, EOFError):
            if tools.exit_dialog():
                return
    else:
        # Single-course mode: scrape the page that was just fetched.
        try:
            scrape_course("", response[1], response[0])
        except (KeyboardInterrupt, EOFError):
            if tools.exit_dialog():
                return
def download_page(url, target_path, name):
    """Download *url* and save it as *name* inside *target_path*.

    Skips both the HTTP request and the write when the target file already
    exists.

    :param url: page URL to fetch
    :param target_path: directory the file is written into (ensured to exist)
    :param name: file name for the saved page
    """
    tools.check_path(target_path)
    p_print.line_char("-", "yellow", 2)
    print(style.string_color(" Step 1: Downloading Webpage\n", "blue"))
    target_file = os.path.join(target_path, name)
    if os.path.exists(target_file):
        # FIX: check before downloading — the original issued the HTTP
        # request even when the file existed and the body was thrown away.
        print_dialog("\n", "File already exists, skipping", 2)
        return
    r = make_requests(url)
    # FIX: write with an explicit encoding. The body is decoded as UTF-8,
    # so relying on the platform default encoding could raise
    # UnicodeEncodeError (e.g. cp1252 on Windows).
    with open(target_file, "w", encoding="UTF-8") as page:
        page.write(r.content.decode("UTF-8"))
    print_dialog("\n", "Webpage downloaded", 3)
def print_dialog(pre_char, message, m_type):
    """Print a colored program dialog.

    :param pre_char: Character to put at begin or message
    :param message: Text to show in dialog
    :param m_type: 1=error dialog, 2=warning dialog, 3=success dialog
    :return:
    """
    # Map the dialog type to its label and color; anything unknown is
    # treated as success, matching the original if/elif/else chain.
    styles = {
        1: (" Error: ", "red"),
        2: (" Warning: ", "yellow"),
    }
    label, color = styles.get(m_type, (" Success", "green"))
    body = pre_char + label + "\n" + " " + message
    print(style.string_color(body, color))
def line_char(char, color, spaces):
    """Print a terminal-wide separator line built from one character.

    :param color: Color for chars
    :param char: Char to fill line
    :param spaces: Spaces at begin and the end (0 = edge to edge)
    """
    # Refresh TERMINAL_COLS in case the terminal was resized.
    update_terminal_cols()
    pad = ' ' * spaces
    if spaces:
        fill = char * (TERMINAL_COLS - spaces * 2)
    else:
        fill = char * TERMINAL_COLS
    print(style.string_color(pad + fill + pad, color))
def header(title, extra, color, pos, str_case, decoration, d_color, d_spaces):
    """Print a special header for current task.

    :param title: Task title
    :param extra: Extra information about task (str or iterable of lines)
    :param color: Color title
    :param pos: Title position
    :param str_case: 1 letter-spaced upper case, all other is lower case
    :param decoration: Character to fill borders
    :param d_color: Borders color
    :param d_spaces: Spaces at begin and end of border
    :return:
    """
    if str_case == 1:
        # Letter-space and upper-case the title: "ab cd" -> "A B   C D".
        str_title = ' '.join(title).upper()
    else:
        # BUG FIX: the original left str_title as "" here, silently dropping
        # the title. The documented contract says non-1 means lower case.
        str_title = title.lower()
    str_title = style.string_position(str_title, pos, 0)
    str_title = style.string_color(str_title, color)
    line_char(decoration, d_color, d_spaces)
    print(str_title)
    if len(extra) > 0:
        # A plain string is printed as-is; any other sequence line by line.
        # (isinstance replaces the original's `type(extra) == str` check.)
        if isinstance(extra, str):
            print(extra)
        else:
            for line in extra:
                print(line)
    line_char(decoration, d_color, d_spaces)
def scrape_courses(r_object):
    """List every course found on the page, let the user exclude some by
    index, then fetch and scrape each remaining course.

    :param r_object: HTTP response whose body is the course-listing page
    """
    tools.clear_screen()
    em = " Load and select courses to process"
    header(" scraper setup", em, "cyan", "left", 1, "-", "green", 0)
    print(style.string_color(" Course list:", "blue"))
    target_page = html.fromstring(r_object.content)
    course_list = []
    course_count = 0
    # Collect name/url/index for every course link on the listing page.
    for course in target_page.xpath('//div[@class="RoutesList-items"]'):
        for link in course.xpath('a[@class="RoutesList-item"]/@href'):
            name = style.format_name_string(
                course.xpath('a[@href="' + link + '"]/h4/text()')[0])
            url = url_tool.base_url + link
            course_list.append({
                "name": name,
                "url": url,
                "index": course_count,
                "active": True  # may be flipped off by the exclusion step
            })
            course_count += 1
    # print info about data loaded from r_object
    sleep(0.2)
    print(string_color(f"\n Total courses: {course_count}", "cyan"))
    print()
    p_print.line_char("-", "cyan", 2)
    for data in course_list:
        print(f" Name: {string_color(data['name'], 'green')}")
        sleep(style.line_sleep)
        print(f" URL: {string_color(data['url'], 'green')}")
        sleep(style.line_sleep)
        print(f" Index: {string_color(data['index'], 'cyan')}")
        sleep(style.line_sleep)
        print()
    # Ask for a space-separated list of indexes to exclude.
    # NOTE(review): int(index) raises ValueError on non-numeric input (not
    # caught here), and negative indexes pass the bound check and exclude
    # from the end of the list — confirm whether that is intended.
    while True:
        try:
            exclude_list = input("\n Courses to exclude (index) => ")
            if len(exclude_list) > 0:
                for index in exclude_list.split(" "):
                    if int(index) <= len(course_list) - 1:
                        course_list[int(index)]["active"] = False
                print(
                    string_color("\n Courses excluded from list: \n",
                                 "blue"))
                p_print.line_char("-", "red", 2)
                # Echo back the excluded entries in red for confirmation.
                for i in course_list:
                    if not i["active"]:
                        print(f" Name: {string_color(i['name'], 'red')}")
                        sleep(style.line_sleep)
                        print(f" URL: {string_color(i['url'], 'red')}")
                        sleep(style.line_sleep)
                        print(f" Index: {string_color(i['index'], 'red')}")
                        sleep(style.line_sleep)
                        print()
            break
        except (KeyboardInterrupt, EOFError):
            if tools.exit_dialog():
                return
    # NOTE(review): a truthy continue_dialog() aborts here — confirm polarity.
    if tools.continue_dialog():
        return
    course_count = 1
    for i in course_list:
        # Zero-pad the running number to three digits ("001", "010", ...).
        if course_count > 9:
            cn = "0"
        else:
            cn = "00"
        if i["active"]:
            # Retry the course page until it loads or the user gives up.
            while True:
                r = url_tool.make_requests(i["url"])
                if r.status_code == 200:
                    scrape_course(f"{cn}{course_count}", r,
                                  i["url"])
                    course_count += 1
                    break
                else:
                    if not tools.retry_dialog():
                        break
def scrape_course(numeration, r_object, course_url):
    """Scrape a single course page: save its preview/detail pages, then walk
    every section and download each unlocked lesson.

    :param numeration: zero-padded course number prefix ("" in single mode)
    :param r_object: HTTP response whose body is the course page
    :param course_url: URL the course page was fetched from
    """
    data = {}
    tools.clear_screen()
    target_page = html.fromstring(r_object.content)
    course_title = style.format_name_string(
        target_page.xpath('//h1[@class="CourseDetail-left-title"]/text()')[0])
    # NOTE(review): `mode` is a module-level flag (2 appears to mean the
    # multi-course run, where titles get a number prefix) — confirm.
    if mode == 2:
        course_title = f"{numeration} - {course_title}"
    em = style.format_string(f" Processing: {course_title}", 16)
    header(" downloading", em, "cyan", "left", 1, "-", "green", 0)
    print(style.string_color(" Course information: \n", "blue"))
    sleep(style.line_sleep)
    print(style.format_string(f" Course: {course_title}", 12))
    # Download course page
    helper.download_page(course_url, course_title, "000 - Preview.html")
    # The "/cursos/" variant of the URL holds the course-details page.
    course_url = course_url.replace("/clases/", "/cursos/")
    helper.download_page(course_url, course_title, "Course Details.html")
    sections = target_page.xpath('//div[@class="Material-concept"]')
    s_count = 1
    for s in sections:
        # Zero-pad the section number to three digits.
        sn = "00"
        if s_count > 9:
            sn = "0"
        s_title = s.xpath('div[@class="Material-concept-edit"]'
                          '/h3[@class="Material-title"]/text()')[0]
        s_title = f"{sn}{s_count} - {style.format_name_string(s_title)}"
        # print()
        # p_print.line_char("-", "cyan", 2)
        # print(style.format_string(f" Section: {s_title}\n", 13))
        l_count = 1
        for lesson in s.xpath('div[@class="MaterialItem-content"]'):
            # Zero-pad the lesson number to three digits.
            ln = "00"
            if l_count > 9:
                ln = "0"
            # Presence of this lock icon marks a lesson as inaccessible.
            lock_element = 'div/div[@class="MaterialItem-copy"]' \
                           '/div[@class="MaterialItem-copy-actions"]/div[' \
                           '@class="MaterialItem-copy-actions-anchor"]/i/@class '
            if not len(lesson.xpath(lock_element)) > 0:
                # Classify the lesson by whether it embeds a video block.
                if len(lesson.xpath(
                        'div/div[@class="MaterialItem-video"]')) > 0:
                    # lesson_type = "[VIDEO_NAME]"
                    # course_data["type"] = "video"
                    l_type = "video"
                else:
                    # lesson_type = "[MATERIAL_NAME]"
                    # course_data["type"] = "material"
                    l_type = "material"
                l_title = lesson.xpath('div/div[@class="MaterialItem-copy"]'
                                       '/p[@class="MaterialItem-copy-title"]'
                                       '/text()')
                l_title = f"{ln}{l_count} - {style.format_name_string(l_title[0])}"
                # print(style.format_string(f" Lesson: {l_title}", 14))
                l_url = lesson.xpath('a[@class="MaterialItem-anchor"]'
                                     '/@href')[0]
                l_url = url_tool.base_url + l_url
                # print(style.format_string(f" URL: {l_url}", 13))
                # set data and start lesson download action
                # NOTE(review): `data` is reused across lessons, so a
                # "material" lesson that follows a "video" one still carries
                # the previous extra_path/webpage keys — confirm that
                # helper.download_lesson ignores them for material lessons.
                data["path"] = course_title + '/' + s_title
                data["name"] = l_title
                data["url"] = l_url
                data["type"] = l_type
                if l_type == "video":
                    # Videos get a side folder for attachments plus a saved
                    # copy of the lesson webpage.
                    data["extra_path"] = f"{course_title}/{s_title}/"
                    data["extra_path"] += f"{ln}{l_count} - extra_files"
                    data["webpage"] = f"{ln}{l_count} - webpage.html"
                helper.download_lesson(data)
                tools.clear_screen()
            l_count += 1
        s_count += 1
    input("\n\n Press enter to continue")