Esempio n. 1
0
    def add_file(self,
                 path,
                 title,
                 download_url,
                 write_data=True,
                 license=None,
                 **node_data):
        """ add_file: Writes the downloaded file to the zip and optionally records it in the csv
            Args:
                path: (str) where in zip to write file
                title: (str) content's title (also used as the file name in the zip)
                download_url: (str) url or local path to download from
                write_data: (boolean) indicates whether to add as a csv entry (optional)
                license (str): content's license (required)
                **node_data: extra fields forwarded unchanged to self._commit, e.g.
                    source_id, description, author, language, license_description,
                    copyright_holder, thumbnail (all optional)
            Returns: path to file in zip, or None when download_url is empty
            Raises: AssertionError if no license is given
        """
        assert license, "Files must have a license"

        self._parse_path(path)

        # The file keeps the extension of its source; no source means no extension
        extension = os.path.splitext(download_url or "")[1]
        zip_path = "{}/{}{}".format(path, title, extension)

        # Nothing to download: nothing is written and no csv entry is made
        if not download_url:
            return None

        self._write_to_zip(zip_path, read(download_url))
        if write_data:
            self._commit(zip_path, title, license=license, **node_data)
        return zip_path
Esempio n. 2
0
 def write_url(self, url, filename, directory="."):
     """ write_url: Download url and store what was read under filename in the zip
         Args:
             url: (str) url to file to download
             filename: (str) name of file in zip
             directory: (str) directory in zipfile to write file to (optional)
         Returns: value returned by self.write_contents (documented as the path to file in zip)
     """
     store = self.write_contents
     downloaded = read(url)
     return store(filename, downloaded, directory=directory)
def read_source(base, endpoint=None, loadjs=False):
    """ read_source: Read the page at base, or at endpoint resolved against base
        Args:
            base: (str) url of the current page
            endpoint: (str) absolute url, root-relative path ('/...') or path
                relative to the directory of base (optional)
            loadjs: (boolean) passed through to downloader.read (optional)
        Returns: whatever downloader.read returns for the resolved url
    """
    from urllib.parse import urljoin

    if base.count('http://') > 1:
        # Special case: http://web.archive.org/web/.../http://2012books.lardbucket.org/books/...
        # Only the last embedded url is read.
        url = "http://{}".format(base.split('http://')[-1])
    elif not endpoint:
        url = base
    elif endpoint.startswith('http'):
        url = endpoint
    elif endpoint.startswith('/'):
        # Root-relative path: resolve against the scheme and host of base.
        # Gluing dirname(base) to the slash-stripped endpoint dropped the
        # separator and produced urls such as "http://host/books/xfoo".
        url = urljoin(base, endpoint)
    else:
        # Relative path: resolve against the directory that contains base
        url = os.path.dirname(base).rstrip("/") + "/" + endpoint
    return downloader.read(url, loadjs=loadjs)
    def add_file(self,
                 path,
                 title,
                 download_url,
                 write_data=True,
                 ext=None,
                 license=None,
                 copyright_holder=None,
                 **node_data):
        """ add_file: Creates file in csv and writes file to zip
            Args:
                path: (str) where in zip to write file
                title: (str) content's title
                download_url: (str) url or local path to download from
                write_data: (boolean) indicates whether to add as a csv entry (optional)
                ext: (str) extension to use for file (defaults to the extension of download_url)
                license (str): content's license (required when write_data is set)
                copyright_holder (str): holder of content's license (required except for
                    licenses in NO_COPYRIGHT_HOLDER_REQUIRED; blank values count as missing)
                license_description (str): description of content's license (optional)
                source_id: (str) content's original id (optional)
                description: (str) description of content (optional)
                author (str): who created the content (optional)
                language (str): language of content (optional)
                thumbnail (str):  path to thumbnail in zip (optional)
            Returns: path to file in zip, or None when download_url is empty
            Raises: AssertionError if write_data is set and the license or a
                required copyright holder is missing
        """
        if write_data:
            assert license, "Files must have a license"
            # copyright_holder defaults to None, so it must be checked before
            # .strip() is called; a whitespace-only holder is treated as missing.
            if copyright_holder is not None and copyright_holder.strip() == '':
                copyright_holder = None
            assert copyright_holder or license in NO_COPYRIGHT_HOLDER_REQUIRED, "Licenses must have a copyright holder if they are not public domain"

        self._parse_path(path)
        if not ext:
            _name, ext = os.path.splitext(download_url or "")
        filepath = "{}/{}{}".format(path, title, ext)

        # Nothing to download: nothing is written and no csv entry is made
        if not download_url:
            return None

        self._write_to_zip(filepath, read(download_url))
        if write_data:
            self._commit(filepath,
                         title,
                         license=license,
                         copyright_holder=copyright_holder,
                         **node_data)
        return filepath
    def open(self):
        """ open: Opens pdf file to read from, downloading it first if it is not on disk
            Args: None
            Returns: None
            Sets self.path (local path of the pdf), self.file (open binary handle)
            and self.pdf (CustomPDFReader over that handle).
        """
        filename = os.path.basename(self.download_url)
        folder, _ext = os.path.splitext(filename)
        # Each pdf lives in its own folder named after the file (without extension)
        self.path = os.path.sep.join([self.directory, folder, filename])
        parent = os.path.dirname(self.path)
        if not os.path.exists(parent):
            os.makedirs(parent)

        # Download full pdf if it hasn't already been downloaded
        if not os.path.isfile(self.path):
            # Fetch before creating the file: if the download fails, no empty
            # file is left behind to be mistaken for a finished download later.
            contents = read(self.download_url)
            with open(self.path, "wb") as fobj:
                fobj.write(contents)

        self.file = open(self.path, 'rb')
        try:
            self.pdf = CustomPDFReader(self.file)
        except Exception:
            # Do not leak the handle when the reader cannot be created
            self.file.close()
            raise
Esempio n. 6
0
def read_source(url):
    """ read_source: Fetch url with downloader.read and parse the result
        with BeautifulSoup's 'html.parser'
        Args:
            url: (str) page to read
        Returns: BeautifulSoup object for the page
    """
    page = downloader.read(url)
    soup = BeautifulSoup(page, 'html.parser')
    return soup
Esempio n. 7
0
 def read(self, path, loadjs=False):
     """ read: Read path with the module-level read, using this object's session and driver
         Args:
             path: (str) what to read (passed through unchanged)
             loadjs: (boolean) passed through unchanged (optional)
         Returns: whatever the module-level read returns
     """
     shared = {"session": self.session, "driver": self.driver}
     return read(path, loadjs=loadjs, **shared)
def read_source(endpoint="books"):
    """ read_source: Read BASE_URL/endpoint with downloader.read and decode it as json
        Args:
            endpoint: (str) path appended to BASE_URL (optional, defaults to "books")
        Returns: the decoded json data
    """
    url = "{baseurl}/{endpoint}".format(baseurl=BASE_URL, endpoint=endpoint)
    raw = downloader.read(url)
    return json.loads(raw)  # Open Stax url returns json object