def remove_category(self, category):
    """Remove the installed templates of one category from the WPS tree.

    The remote listing for ``category`` is walked page by page, starting at
    page 0, until a page yields no matches for ``self.template_pattern``.
    Every listed file name is then removed (via ``sudo rm``) from the
    category's directory under the templates directory of
    ``self.template_type`` and, for the ``'wpp'`` template type, from the
    shared ``presentationdesigns`` directory as well. Finally ``rmdir`` is
    attempted on both directories.

    Args:
        category (str): category name as it appears in the site URL,
            optionally ending in ``-ppt``.

    Returns:
        bool: always True. Output of the removal commands is discarded and
        their exit status is not inspected.
    """
    PPT_SUFFIX = '-ppt'
    PRESENTATION_DESIGNS_PATH = '/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'

    # Strip only the trailing marker; str.find() would cut at the first
    # occurrence of '-ppt', which is wrong if it also appears earlier.
    if category.endswith(PPT_SUFFIX):
        category_pretty_format = category[:-len(PPT_SUFFIX)]
    else:
        category_pretty_format = category

    DIRECTORY_NAME = category_pretty_format.title()
    DIRECTORY_PATH = os.path.join(
        '/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}/'.format(self.template_type),
        DIRECTORY_NAME)

    # Collect the file names of every template listed for this category.
    url_template = 'http://www.ksosoft.com/{}-template?page={{}}'.format(category)
    files_to_be_deleted = []
    page_number = 0
    templates_url_list = self.template_pattern.findall(
        web.get_source(url_template.format(page_number)))
    while templates_url_list:
        for _, filename_without_suffix, suffix in templates_url_list:
            files_to_be_deleted.append(filename_without_suffix + '.' + suffix)
        page_number += 1
        templates_url_list = self.template_pattern.findall(
            web.get_source(url_template.format(page_number)))

    # The context manager guarantees the devnull handle is closed again.
    with open(os.devnull, 'w') as FNULL:
        # Skip 'rm' entirely when nothing was listed: calling it without
        # operands only produces an error.
        if files_to_be_deleted:
            subprocess.call(
                ['sudo', 'rm'] + [os.path.join(DIRECTORY_PATH, filename)
                                  for filename in files_to_be_deleted],
                stdout=FNULL, stderr=subprocess.STDOUT)
            if self.template_type == 'wpp':
                # A list comprehension (instead of map) keeps the list
                # concatenation valid where map() returns an iterator.
                subprocess.call(
                    ['sudo', 'rm'] + [PRESENTATION_DESIGNS_PATH + filename
                                      for filename in files_to_be_deleted],
                    stdout=FNULL, stderr=subprocess.STDOUT)

        subprocess.call(['sudo', 'rmdir', DIRECTORY_PATH],
                        stdout=FNULL, stderr=subprocess.STDOUT)
        # NOTE(review): this rmdir is attempted for every template type, not
        # only 'wpp' -- confirm that is intended.
        subprocess.call(['sudo', 'rmdir', PRESENTATION_DESIGNS_PATH],
                        stdout=FNULL, stderr=subprocess.STDOUT)
    return True
def download_category(self, category, call_from_parent=False):
    """Download every template of one category into a local directory.

    The remote listing for ``category`` is walked page by page, starting at
    page 0, until a page yields no matches for ``self.template_pattern``.
    Each listed template is downloaded and written into the directory
    returned by ``self._create_directory``.

    Unless ``call_from_parent`` is true, the directory is then copied (via
    ``sudo cp -r``) into the templates directory of ``self.template_type``;
    for the ``'wpp'`` template type its files are additionally copied into
    the shared ``presentationdesigns`` directory; and the local directory
    is removed afterwards.

    Args:
        category (str): category name as it appears in the site URL,
            optionally ending in ``-ppt``.
        call_from_parent (bool): when true, skip the copy and clean-up
            steps and leave the downloaded directory in place.

    Returns:
        The path returned by ``self._create_directory``. When
        ``call_from_parent`` is false that directory has already been
        removed by the time this method returns.

    Raises:
        AssertionError: if ``self.not_empty`` rejects a downloaded file.
    """
    PPT_SUFFIX = '-ppt'

    # Strip only the trailing marker; str.find() would cut at the first
    # occurrence of '-ppt', which is wrong if it also appears earlier.
    if category.endswith(PPT_SUFFIX):
        category_pretty_format = category[:-len(PPT_SUFFIX)]
    else:
        category_pretty_format = category

    DESTINATION_DIRECTORY = self._create_directory(category_pretty_format)

    url_template = 'http://www.ksosoft.com/{}-template?page={{}}'.format(category)
    page_number = 0
    templates_url_list = self.template_pattern.findall(
        web.get_source(url_template.format(page_number)))
    while templates_url_list:
        for url_component, filename_without_suffix, suffix in templates_url_list:
            filename = filename_without_suffix + '.' + suffix
            file_content = self._download_template(self.homepage + url_component)
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under -O; the exception type is unchanged.
            if not self.not_empty(file_content):
                raise AssertionError(
                    'Error: Unable to download template, {}'.format(filename))
            with interrupt.KeyboardInterruptBlocked():
                DESTINATION = os.path.join(DESTINATION_DIRECTORY, filename)
                self._write(DESTINATION, file_content)
        page_number += 1
        templates_url_list = self.template_pattern.findall(
            web.get_source(url_template.format(page_number)))

    if not call_from_parent:
        subprocess.call([
            'sudo', 'cp', '-r', DESTINATION_DIRECTORY,
            '/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}'.format(self.template_type)])
        if self.template_type == 'wpp':
            # The files live in DESTINATION_DIRECTORY; the name `directory`
            # used here before is not defined in this method.
            downloaded_files = glob.glob('{}/*'.format(DESTINATION_DIRECTORY))
            # 'cp' with a destination but no sources would only error out.
            if downloaded_files:
                subprocess.call(
                    ['sudo', 'cp'] + downloaded_files +
                    ['/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'])
        subprocess.call(['rm', '-rf', DESTINATION_DIRECTORY])
    return DESTINATION_DIRECTORY
def download_all_categories(self):
    """Download and install the templates of every category on the site.

    The category names are extracted from the template homepage of
    ``self.url_reference``. Each category is downloaded through
    ``self.download_category(..., call_from_parent=True)``. Afterwards
    every downloaded directory is copied (via ``sudo cp -r``) into the
    templates directory of ``self.template_type``; for the ``'wpp'``
    template type its files are additionally copied into the shared
    ``presentationdesigns`` directory; and the local directory is removed.

    Returns:
        bool: always True.
    """
    PPT_SUFFIX = '-ppt'
    TEMPLATE_SUFFIX = '-template'

    HOMEPAGE = 'http://www.ksosoft.com/{}-template'.format(self.url_reference)
    homepage_content = web.get_source(HOMEPAGE)
    # Raw string so that \w reaches the regex engine as written.
    pattern = re.compile(r'[\w -]+?-template(?!s)(?!-)(?=>)')

    # Every match ends in '-template', so strip exactly that suffix.
    # De-duplicate after lowercasing so that sections differing only in
    # case are not processed twice; sort for a reproducible order.
    category_list = sorted({
        section[:-len(TEMPLATE_SUFFIX)].lower()
        for section in pattern.findall(homepage_content)})

    # Single-argument print(...) is valid as both the Python 2 statement
    # and the Python 3 function, with identical output.
    print('Starting to download templates for {}:'.format(self.template_type.upper()))
    category_directory_listing = []
    for category in category_list:
        if category.endswith(PPT_SUFFIX):
            category_pretty_format = category[:-len(PPT_SUFFIX)]
        else:
            category_pretty_format = category
        print(' Downloading {} templates for {}...'.format(
            category_pretty_format, self.template_type.upper()))
        category_url_format = '-'.join(category.split())
        category_directory_listing.append(
            self.download_category(category_url_format, call_from_parent=True))

    print('Finishing up...')
    for directory in category_directory_listing:
        subprocess.call([
            'sudo', 'cp', '-r', directory,
            '/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}'.format(self.template_type)])
        if self.template_type == 'wpp':
            downloaded_files = glob.glob('{}/*'.format(directory))
            # 'cp' with a destination but no sources would only error out.
            if downloaded_files:
                subprocess.call(
                    ['sudo', 'cp'] + downloaded_files +
                    ['/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'])
        subprocess.call(['rm', '-rf', directory])
    print('Done!')
    return True
def _get_adjusted_total_pages(self, homepage):
    """Compute the number of pages to work through, given past progress.

    The content returned by ``web.get_source(homepage)`` is searched for
    the "Last Page" marker, and the number captured there is taken as the
    site's current total. The result is that total, minus the
    ``'total_pages'`` value held in ``self.config``, plus the held
    ``'current_pages'`` value. The held ``'total_pages'`` is then
    overwritten with the freshly read total.

    Args:
        homepage (str): link to the homepage of a website

    Returns:
        int: the adjusted number of pages
    """
    source = web.get_source(homepage)
    last_page_match = re.search('title="Last Page.*>(\d+)<', source)

    # The check, message and callback are handed to the project helper;
    # presumably it aborts when the marker is missing -- TODO confirm.
    interrupt.assert_extended(
        last_page_match is not None,
        'Marker for finding total number of pages is not found!',
        self._save_progress)

    site_total = int(last_page_match.group(1))

    # Read the stored values before 'total_pages' is overwritten below.
    previous_total = self.config.get('total_pages')
    pages_done = self.config.get('current_pages')
    adjusted = site_total - previous_total + pages_done

    self.config.set('total_pages', site_total)
    return adjusted
def remove_all_categories(self):
    """Remove the installed templates of every category on the site.

    The category names are extracted from the template homepage of
    ``self.url_reference`` and each one is passed to
    ``self.remove_category``.

    Returns:
        bool: always True.
    """
    PPT_SUFFIX = '-ppt'
    TEMPLATE_SUFFIX = '-template'

    HOMEPAGE = 'http://www.ksosoft.com/{}-template'.format(self.url_reference)
    homepage_content = web.get_source(HOMEPAGE)
    # Raw string so that \w reaches the regex engine as written.
    pattern = re.compile(r'[\w -]+?-template(?!s)(?!-)(?=>)')

    # Every match ends in '-template', so strip exactly that suffix.
    # De-duplicate after lowercasing so that sections differing only in
    # case are not processed twice; sort for a reproducible order.
    category_list = sorted({
        section[:-len(TEMPLATE_SUFFIX)].lower()
        for section in pattern.findall(homepage_content)})

    # Single-argument print(...) is valid as both the Python 2 statement
    # and the Python 3 function, with identical output.
    print('Starting to removing templates for {}:'.format(self.template_type.upper()))
    for category in category_list:
        if category.endswith(PPT_SUFFIX):
            category_pretty_format = category[:-len(PPT_SUFFIX)]
        else:
            category_pretty_format = category
        print(' Removing {} templates for {}...'.format(
            category_pretty_format, self.template_type.upper()))
        category_url_format = '-'.join(category.split())
        # The return value was previously bound to an unused local.
        self.remove_category(category_url_format)
    print('Done!')
    return True
def get_list_of_books_page(self, page):
    """Retrieve the links to the individual book pages listed on ``page``.

    The page content is scanned for ``"entry-title"`` markers. For each
    marker, the first ``<a href="...">`` found at or after the marker's
    position supplies the link, and the scan resumes after that link. The
    collected links are then reversed. If the ``'url'`` value held in
    ``self.config`` is among them, it and every link before it (in the
    reversed order) are dropped.

    Args:
        page (str): link listing a set of books

    Returns:
        list: a list of links each of which leads to a webpage for a
        particular book
    """
    BOOK_SECTION_MARKER = '"entry-title"'
    book_section_assert_message = 'Marker for finding book section not found!'
    book_page_assert_message = 'Marker for finding book link not found!'
    book_section_pattern = re.compile(BOOK_SECTION_MARKER)
    book_page_pattern = re.compile('<a href="(.+?)"')

    page_content = web.get_source(page)
    list_of_books_page = []
    book_section_match = book_section_pattern.search(page_content)

    # NOTE(review): the loop condition is the return value of the project
    # helper interrupt.assert_extended, so how the loop ends once no further
    # marker is found depends on that helper -- confirm against its
    # definition. Left unchanged here.
    while interrupt.assert_extended(book_section_match is not None,
                                    book_section_assert_message,
                                    self._save_progress):
        book_section_start_index = book_section_match.start()
        # Search for the link from the section marker onwards. The offset
        # was previously passed as `marker_index`, a name that is never
        # defined in this method.
        book_page_match = book_page_pattern.search(page_content,
                                                   book_section_start_index)
        interrupt.assert_extended(book_page_match is not None,
                                  book_page_assert_message,
                                  self._save_progress)
        link = str(book_page_match.group(1))
        list_of_books_page.append(link)
        book_section_match = book_section_pattern.search(page_content,
                                                         book_page_match.end())

    list_of_books_page.reverse()
    try:
        index_of_last_processed_book_page = list_of_books_page.index(
            self.config.get('url'))
    except ValueError:
        # The stored url is not on this page: keep the whole list.
        pass
    else:
        list_of_books_page = list_of_books_page[index_of_last_processed_book_page + 1:]
    return list_of_books_page