Esempi in Python per get_source, esempi in Python per lib.utils.web.get_source

Esempio n. 1

0

Mostra file

    def remove_category(self, category):
        """
        Remove the installed templates of a single category

        Walk through the paginated online listing of the category (starting at page 0
        and stopping at the first page that yields no template matches) to collect the
        template filenames, then delete those files from the WPS Office templates
        directory with sudo and try to remove the directories left behind.

        Args:
            category (str): category name in its URL form (may end with '-ppt')

        Returns:
            bool: always True; the exit statuses of the removal commands are not checked
        """
        PPT_SUFFIX = '-ppt'
        PRESENTATION_DESIGNS_PATH = '/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'

        # Strip only the trailing suffix; slicing at find() would cut at an earlier '-ppt'.
        category_pretty_format = category[:-len(PPT_SUFFIX)] if category.endswith(PPT_SUFFIX) else category
        DIRECTORY_NAME = category_pretty_format.title()
        DIRECTORY_PATH = os.path.join('/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}/'.format(self.template_type), DIRECTORY_NAME)
        url_template = 'http://www.ksosoft.com/{}-template?page={{}}'.format(category)

        files_to_be_deleted = []
        page_number = 0
        templates_url_list = self.template_pattern.findall(web.get_source(url_template.format(page_number)))
        while templates_url_list:
            # Each match is a (url component, filename without suffix, suffix) triple.
            for _url_component, filename_without_suffix, suffix in templates_url_list:
                files_to_be_deleted.append(filename_without_suffix + '.' + suffix)

            page_number += 1
            templates_url_list = self.template_pattern.findall(web.get_source(url_template.format(page_number)))

        # The context manager guarantees the devnull handle is closed again.
        with open(os.devnull, 'w') as FNULL:
            # Skip 'rm' entirely when nothing was found; it would only fail without operands.
            if files_to_be_deleted:
                subprocess.call(['sudo', 'rm'] + [os.path.join(DIRECTORY_PATH, filename) for filename in files_to_be_deleted], stdout=FNULL, stderr=subprocess.STDOUT)
                if self.template_type == 'wpp':
                    # A list comprehension (instead of map) keeps the concatenation valid on Python 3 too.
                    subprocess.call(['sudo', 'rm'] + [PRESENTATION_DESIGNS_PATH + filename for filename in files_to_be_deleted], stdout=FNULL, stderr=subprocess.STDOUT)
            # rmdir only removes empty directories, so these calls are best-effort cleanup.
            subprocess.call(['sudo', 'rmdir', DIRECTORY_PATH], stdout=FNULL, stderr=subprocess.STDOUT)
            subprocess.call(['sudo', 'rmdir', PRESENTATION_DESIGNS_PATH], stdout=FNULL, stderr=subprocess.STDOUT)
        return True

Esempio n. 2

0

Mostra file

    def download_category(self, category, call_from_parent=False):
        """
        Download every template of a single category

        Walk through the paginated online listing of the category (starting at page 0
        and stopping at the first page that yields no template matches), download each
        template and write it into a directory created for the category.  Unless the
        call comes from a parent routine that installs the directories itself, the
        directory is then copied into the WPS Office templates directory with sudo
        and the local copy is removed.

        Args:
            category (str): category name in its URL form (may end with '-ppt')
            call_from_parent (bool): when True, skip the install and clean-up step and
                leave the downloaded directory in place for the caller

        Returns:
            str: path of the directory the templates were downloaded into (already
                deleted again when call_from_parent is False)

        Raises:
            AssertionError: if the content downloaded for a template is empty
        """
        PPT_SUFFIX = '-ppt'

        # Strip only the trailing suffix; slicing at find() would cut at an earlier '-ppt'.
        category_pretty_format = category[:-len(PPT_SUFFIX)] if category.endswith(PPT_SUFFIX) else category
        DESTINATION_DIRECTORY = self._create_directory(category_pretty_format)
        url_template = 'http://www.ksosoft.com/{}-template?page={{}}'.format(category)

        page_number = 0
        templates_url_list = self.template_pattern.findall(web.get_source(url_template.format(page_number)))
        while templates_url_list:
            # Each match is a (url component, filename without suffix, suffix) triple.
            for url_component, filename_without_suffix, suffix in templates_url_list:
                filename = filename_without_suffix + '.' + suffix
                file_content = self._download_template(self.homepage + url_component)
                assert self.not_empty(file_content), 'Error: Unable to download template, {}'.format(filename)

                # NOTE(review): presumably defers Ctrl-C so a file is not left half-written -- confirm.
                with interrupt.KeyboardInterruptBlocked():
                    DESTINATION = os.path.join(DESTINATION_DIRECTORY, filename)
                    self._write(DESTINATION, file_content)

            page_number += 1
            templates_url_list = self.template_pattern.findall(web.get_source(url_template.format(page_number)))

        if not call_from_parent:
            subprocess.call(['sudo', 'cp', '-r', DESTINATION_DIRECTORY, '/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}'.format(self.template_type)])
            if self.template_type == 'wpp':
                # Copy the files of this category's directory (the original referenced an undefined name here).
                subprocess.call(['sudo', 'cp'] + glob.glob('{}/*'.format(DESTINATION_DIRECTORY)) + ['/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'])
            subprocess.call(['rm', '-rf', DESTINATION_DIRECTORY])
        return DESTINATION_DIRECTORY

Esempio n. 3

0

Mostra file

    def download_all_categories(self):
        """
        Download and install the templates of every category

        Fetch the template overview page, extract the distinct section names from it,
        download each corresponding category and finally copy every downloaded
        directory into the WPS Office templates directory with sudo, removing the
        local copies afterwards.

        Returns:
            bool: always True
        """
        def to_category(section):
            # Drop the trailing '-template' part (and its separator) and lowercase the rest.
            return (section[:section.find('template') - 1]).lower()

        overview_url = 'http://www.ksosoft.com/{}-template'.format(self.url_reference)
        overview_html = web.get_source(overview_url)

        section_pattern = re.compile('[\w -]+?-template(?!s)(?!-)(?=>)')
        # A set removes sections that are linked more than once on the page.
        distinct_sections = set(section_pattern.findall(overview_html))
        categories = [to_category(name) for name in distinct_sections]

        downloaded_directories = []
        print('Starting to download templates for {}:'.format(self.template_type.upper()))
        for category in categories:
            if category.endswith('-ppt'):
                display_name = category[:category.find('-ppt')]
            else:
                display_name = category
            print('    Downloading {} templates for {}...'.format(display_name, self.template_type.upper()))
            # Whitespace-separated words are joined with dashes to form the URL name.
            url_name = '-'.join(category.split())
            downloaded_directories.append(self.download_category(url_name, call_from_parent=True))

        print('Finishing up...')
        for directory in downloaded_directories:
            install_target = '/opt/kingsoft/wps-office/office6/mui/en_US/templates/{}'.format(self.template_type)
            subprocess.call(['sudo', 'cp', '-r', directory, install_target])
            if self.template_type == 'wpp':
                directory_files = glob.glob('{}/*'.format(directory))
                subprocess.call(['sudo', 'cp'] + directory_files + ['/opt/kingsoft/wps-office/office6/mui/en_US/templates/presentationdesigns/'])
            subprocess.call(['rm', '-rf', directory])
        print('Done!')
        return True

Esempio n. 4

0

Mostra file

    def _get_adjusted_total_pages(self, homepage):
        """
        Compute the page count adjusted for the progress already made

        Read the number shown for the "Last Page" link on the homepage, combine it
        with the 'total_pages' and 'current_pages' values stored in the config, and
        store the freshly read total back into the config.

        Args:
            homepage (str): link to the homepage of a website

        Returns:
            int: the new total minus the stored total plus the stored current page count
        """
        last_page_match = re.search('title="Last Page.*>(\d+)<', web.get_source(homepage))
        # Hand the failure over to the shared assert helper together with the progress saver.
        interrupt.assert_extended(
            last_page_match is not None,
            'Marker for finding total number of pages is not found!',
            self._save_progress
        )

        latest_total = int(last_page_match.group(1))
        # Both stored values are read before the stored total is overwritten below.
        stored_total = self.config.get('total_pages')
        stored_current = self.config.get('current_pages')
        adjusted_pages_count = latest_total - stored_total + stored_current

        self.config.set('total_pages', latest_total)
        return adjusted_pages_count

Esempio n. 5

0

Mostra file

    def remove_all_categories(self):
        """
        Remove the installed templates of every category

        Fetch the template overview page, extract the distinct section names from it
        and call remove_category for each corresponding category.

        Returns:
            bool: always True
        """
        def to_category(section):
            # Drop the trailing '-template' part (and its separator) and lowercase the rest.
            return (section[:section.find('template') - 1]).lower()

        overview_url = 'http://www.ksosoft.com/{}-template'.format(self.url_reference)
        overview_html = web.get_source(overview_url)

        section_pattern = re.compile('[\w -]+?-template(?!s)(?!-)(?=>)')
        # A set removes sections that are linked more than once on the page.
        distinct_sections = set(section_pattern.findall(overview_html))
        categories = [to_category(name) for name in distinct_sections]

        print('Starting to removing templates for {}:'.format(self.template_type.upper()))
        for category in categories:
            if category.endswith('-ppt'):
                display_name = category[:category.find('-ppt')]
            else:
                display_name = category
            print('    Removing {} templates for {}...'.format(display_name, self.template_type.upper()))
            # Whitespace-separated words are joined with dashes to form the URL name.
            self.remove_category('-'.join(category.split()))
        print('Done!')
        return True

Esempio n. 6

0

Mostra file

    def get_list_of_books_page(self, page):
        """
        Retrieve a list of books page

        From the given page, extract the links leading to each book and store the data in a list.
        Every occurrence of the book section marker is located and the first link that follows it
        is collected.  The list is then reversed, and if the url stored in the config is found in
        it, only the links after that url are kept.

        Args:
            page (str): link listing a set of books

        Returns:
            list: a list of links each of which leads to a webpage for a particular book
        """
        BOOK_SECTION_MARKER = '"entry-title"'

        list_of_books_page = []
        page_content = web.get_source(page)

        book_section_pattern = re.compile(BOOK_SECTION_MARKER)
        book_page_pattern = re.compile('<a href="(.+?)"')
        book_section_assert_message = 'Marker for finding book section not found!'
        book_page_assert_message = 'Marker for finding book link not found!'

        book_section_match = book_section_pattern.search(page_content)
        # NOTE(review): the loop can only end normally if interrupt.assert_extended returns a
        # falsy value (instead of raising) once no further marker is found -- confirm against
        # the helper.
        while interrupt.assert_extended(book_section_match is not None, book_section_assert_message, self._save_progress):
            book_section_start_index = book_section_match.start()
            # Look for the book link starting at the section marker (the original passed an
            # undefined name as the start position).
            book_page_match = book_page_pattern.search(page_content, book_section_start_index)

            interrupt.assert_extended(book_page_match is not None, book_page_assert_message, self._save_progress)
            link = str(book_page_match.group(1))
            list_of_books_page.append(link)
            # Continue searching after the link that was just consumed.
            book_section_match = book_section_pattern.search(page_content, book_page_match.end())

        list_of_books_page.reverse()
        try:
            index_of_last_processed_book_page = list_of_books_page.index(self.config.get('url'))
        except ValueError:
            # The stored url is not on this page, so every link is returned.
            pass
        else:
            list_of_books_page = list_of_books_page[index_of_last_processed_book_page + 1:]
        return list_of_books_page