Beispiel #1
0
 def extract_body(self, fp, basefile):
     """Return the parseable body of the document *basefile*.

     If a downloaded ``index.pdf`` attachment exists, the incoming *fp*
     is replaced by whatever ``self.parse_open(basefile)`` returns. That
     stream is read with ``StreamingPDFReader`` (using the ``ocr`` parser
     when its file name contains ``.hocr.``, otherwise ``xml``), and
     every page's ``src`` is set to the path of the PDF.

     Without a PDF, the content of *fp* is treated as HTML and parsed
     with BeautifulSoup.

     :param fp: Open file-like object; a hOCR file, a pdf2xml file, a
                HTML file or an in-memory stream containing HTML taken
                from index.xml
     :param basefile: The basefile of the document being processed
     :returns: The page reader (PDF case) or a ``BeautifulSoup`` tree
               (HTML case)
     :raises errors.DocumentRemovedError: If the content consists solely
         of the "never published" placeholder text
     """
     pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
     if os.path.exists(pdffile):
         fp = self.parse_open(basefile)
         parser = "ocr" if ".hocr." in util.name_from_fp(fp) else "xml"
         reader = StreamingPDFReader().read(fp, parser=parser)
         for page in reader:
             page.src = pdffile
         return reader

     # fp points to a HTML file, which we can use directly.
     try:
         filename = util.name_from_fp(fp)
     except ValueError:
         # presumably raised for streams that have no file name (eg
         # in-memory placeholders) -- verify against util.name_from_fp
         self.log.debug("%s: Loading placeholder soup" % (basefile))
     else:
         self.log.debug("%s: Loading soup from %s" % (basefile, filename))

     text = fp.read()
     placeholder = "Propositionen ej utgiven"
     if isinstance(text, bytes):
         # fp may be a raw bitstream of a latin-1 file. Comparing bytes
         # with str is always False on Python 3, so compare like with
         # like or the placeholder would never be detected.
         placeholder = placeholder.encode("latin-1")
     if text == placeholder:
         raise errors.DocumentRemovedError("%s was never published" % basefile)
     return BeautifulSoup(text, "lxml")
Beispiel #2
0
 def extract_body(self, fp, basefile):
     """Produce the body of *basefile* in a form the parser can consume.

     When the document has a downloaded ``index.pdf`` attachment, *fp*
     is reopened through ``self.parse_open(basefile)`` and read with
     ``StreamingPDFReader``; the ``ocr`` parser is chosen if the name of
     the opened file contains ``.hocr.``, the ``xml`` parser otherwise.
     Each page of the result gets its ``src`` set to the PDF path.

     When there is no PDF, *fp* is read in full and parsed as HTML.

     :param fp: Open file-like object; a hOCR file, a pdf2xml file, a
                HTML file or an in-memory stream containing HTML taken
                from index.xml
     :param basefile: The basefile of the document being processed
     :returns: The page reader, or a ``BeautifulSoup`` tree for HTML
     :raises errors.DocumentRemovedError: If the content is nothing but
         the placeholder text for a never-published document
     """
     pdffile = self.store.downloaded_path(basefile,
                                          attachment="index.pdf")
     if os.path.exists(pdffile):
         fp = self.parse_open(basefile)
         parser = "ocr" if ".hocr." in util.name_from_fp(fp) else "xml"
         reader = StreamingPDFReader().read(fp, parser=parser)
         for page in reader:
             page.src = pdffile
         return reader

     # No PDF available: fp holds HTML, which we can use directly.
     try:
         filename = util.name_from_fp(fp)
     except ValueError:
         # presumably the stream has no usable file name -- verify
         # against util.name_from_fp
         self.log.debug("%s: Loading placeholder soup" % (basefile))
     else:
         self.log.debug("%s: Loading soup from %s" %
                        (basefile, filename))

     text = fp.read()
     never_published = "Propositionen ej utgiven"
     if isinstance(text, bytes):
         # A raw (latin-1) bitstream yields bytes, and bytes never
         # compare equal to str on Python 3. Encode the marker so that
         # the placeholder is recognised for both kinds of stream.
         never_published = never_published.encode("latin-1")
     if text == never_published:
         raise errors.DocumentRemovedError("%s was never published" %
                                           basefile)
     return BeautifulSoup(text, "lxml")
Beispiel #3
0
    def _process_file(self, filename, buf, destdir, origin=""):
        """
        Helper function to concatenate or copy CSS/JS (optionally
        processing them with e.g. Scss) or other files to correct place
        under the web root directory.

        External ``http://``/``https://`` URLs are returned untouched
        (and are rejected if combineresources is set). The opened
        resource is always closed before this method returns or raises.

        :param filename: The name (relative to the ferenda package) of the file
        :param buf: A buffer into which the contents of the file is written
                    (if combineresources == True)
        :param destdir: The directory into which the file will be copied
                        (unless combineresources == True)
        :param origin: The source of the configuration that specifies this file
        :returns: The URL path of the resulting file, relative to the web root
                  (or None if combineresources == True, or if the
                  resource could not be found)
        :raises errors.ConfigurationError: If an external URL is used
            together with combineresources
        :rtype: str
        """
        if filename.startswith(("http://", "https://")):
            if self.config.combineresources:
                raise errors.ConfigurationError(
                    "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)"
                    % filename)
            self.log.debug("Using external url %s" % filename)
            return filename
        try:
            fp = self.resourceloader.openfp(filename, binary=True)
        except errors.ResourceNotFound:
            self.log.warning("file %(filename)s (specified in %(origin)s)"
                             " doesn't exist"
                             % {"filename": filename, "origin": origin})
            return None

        # From here on fp is open; the finally clause guarantees that
        # it is released on every path, including the "won't
        # overwrite" branch and any read/write failure.
        try:
            if self.config.combineresources:
                self.log.debug("combining %s into buffer" % filename)
                buf.write(fp.read())
                return None

            # FIXME: don't copy (at least not log) if the outfile
            # already exists.
            outfile = destdir + os.sep + os.path.basename(filename)
            # Only resolve the link (and ask for the name of fp) when
            # outfile really is a symlink.
            links_to_source = (
                os.path.islink(outfile)
                and os.path.relpath(
                    os.path.join(os.path.dirname(outfile),
                                 os.readlink(outfile)))
                == util.name_from_fp(fp))
            if links_to_source:
                # Writing through the link would clobber the source
                # file itself.
                self.log.warning(
                    "%s is a symlink to source file %s, won't overwrite" %
                    (outfile, util.name_from_fp(fp)))
            else:
                util.ensure_dir(outfile)
                with open(outfile, "wb") as fp2:
                    fp2.write(fp.read())
            return self._filepath_to_urlpath(outfile, 2)
        finally:
            fp.close()
Beispiel #4
0
 def close(self, *args, **kwargs):
     """Close the wrapped file object, committing written data if needed.

     In write mode the data lives in a temporary file (whose name is
     taken from ``self.fp``). After closing it, the temporary file
     replaces ``self.filename`` only if that file is missing or its
     content differs; otherwise the temporary file is deleted and the
     existing file is left untouched.

     In read mode any file object wrapped by ``self.fp`` is closed as
     well. Calling this method again on an already closed read-mode
     object is harmless.

     :param args: Accepted for compatibility; ignored
     :param kwargs: Accepted for compatibility; ignored
     :returns: Whatever ``self.fp.close()`` returns
     """
     if "w" in self.mode:
         tempname = util.name_from_fp(self.fp)
         ret = self.fp.close()
         # shallow=False forces a comparison of the actual content. The
         # default (shallow) mode regards two files with the same size
         # and modification time as equal without reading them, which
         # could silently discard changed data.
         unchanged = (os.path.exists(self.filename)
                      and filecmp.cmp(tempname, self.filename,
                                      shallow=False))
         if unchanged:
             os.unlink(tempname)
         else:
             util.ensure_dir(self.filename)
             shutil.move(tempname, self.filename)
             # since _open uses NamedTemporaryFile, which creates
             # files only readable by the creating user, we need to
             # set more liberal permissions. FIXME: This should
             # respect os.umask()
             os.chmod(
                 self.filename, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                 | stat.S_IWGRP | stat.S_IROTH)
         return ret

     # This is needed sometimes since Bzip2File/LZMAFile/GzipFile
     # doesn't close the open file objects that they wrap: "_fp" is
     # for Bzip2File/LZMAFile with IOBufferedReader, "fileobj" for
     # GzipFile in the same situation. These attributes may be None
     # once the wrapper has been closed, hence the None check.
     for attr in ("_fp", "fileobj"):
         wrapped = getattr(self.fp, attr, None)
         if wrapped is not None:
             wrapped.close()
     return self.fp.close()
Beispiel #5
0
    def _process_file(self, filename, buf, destdir, origin=""):
        """
        Helper function to concatenate or copy CSS/JS (optionally
        processing them with e.g. Scss) or other files to correct place
        under the web root directory.

        A filename that is an external ``http://``/``https://`` URL is
        returned as-is (and is an error in combination with
        combineresources). The opened resource is closed on every exit
        path of this method.

        :param filename: The name (relative to the ferenda package) of the file
        :param buf: A buffer into which the contents of the file is written
                    (if combineresources == True)
        :param destdir: The directory into which the file will be copied
                        (unless combineresources == True)
        :param origin: The source of the configuration that specifies this file
        :returns: The URL path of the resulting file, relative to the web root
                  (or None if combineresources == True, or if the
                  resource could not be found)
        :raises errors.ConfigurationError: If an external URL is used
            together with combineresources
        :rtype: str
        """
        if filename.startswith(("http://", "https://")):
            if self.config.combineresources:
                raise errors.ConfigurationError(
                    "makeresources: Can't use combineresources=True in combination with external js/css URLs (%s)" % filename)
            self.log.debug("Using external url %s" % filename)
            return filename
        try:
            fp = self.resourceloader.openfp(filename, binary=True)
        except errors.ResourceNotFound:
            self.log.warning("file %(filename)s (specified in %(origin)s)"
                             " doesn't exist"
                             % {"filename": filename, "origin": origin})
            return None

        # Release the resource no matter how we leave: previously it
        # stayed open in the symlink case and when copying failed.
        try:
            if self.config.combineresources:
                self.log.debug("combining %s into buffer" % filename)
                buf.write(fp.read())
                return None

            # FIXME: don't copy (at least not log) if the outfile
            # already exists.
            outfile = destdir + os.sep + os.path.basename(filename)
            if os.path.islink(outfile):
                link_target = os.path.relpath(
                    os.path.join(os.path.dirname(outfile),
                                 os.readlink(outfile)))
                is_link_to_source = link_target == util.name_from_fp(fp)
            else:
                is_link_to_source = False

            if is_link_to_source:
                # Copying would write through the link, onto the
                # source file itself.
                self.log.warning("%s is a symlink to source file %s, won't overwrite" % (outfile, util.name_from_fp(fp)))
            else:
                util.ensure_dir(outfile)
                with open(outfile, "wb") as fp2:
                    fp2.write(fp.read())
            return self._filepath_to_urlpath(outfile, 2)
        finally:
            fp.close()
Beispiel #6
0
 def extract_body(self, fp, basefile):
     """Read *fp* into a page reader and give every page an image URI.

     Each page's ``src`` is set to ``<canonical uri>/sid<page number>.png``.

     :param fp: Open file-like object holding hOCR HTML or PDF2XML data
     :param basefile: The basefile of the document being processed
     :returns: The populated page reader
     :raises DocumentRemovedError: If the reader reports that it is empty
     """
     # If we can assume that the fp is a hOCR HTML file and not a
     # PDF2XML file, use the alternate parser. FIXME: There ought to
     # be a cleaner way than guessing based on filename
     if ".hocr." in util.name_from_fp(fp):
         flavour = "ocr"
     else:
         flavour = "xml"
     pages = StreamingPDFReader().read(fp, parser=flavour)
     doc_uri = self.canonical_uri(basefile)
     for page in pages:
         page.src = "%s/sid%s.png" % (doc_uri, page.number)
     if not pages.is_empty():
         return pages
     raise DocumentRemovedError(dummyfile=self.store.parsed_path(basefile))
Beispiel #7
0
 def extract_body(self, fp, basefile):
     """Return a reader for the body of *basefile*.

     Plain text sources (file name ending in ``.txt`` or ``.txt.bz2``)
     are read in full and wrapped in a ``TextReader``. Everything else
     is delegated to the parent class, after which each page's ``src``
     is set to the path of the downloaded ``index.pdf`` attachment.

     :param fp: Open file-like object to read the body from
     :param basefile: The basefile of the document being processed
     :returns: A ``TextReader`` or the reader produced by the parent class
     """
     is_plaintext = util.name_from_fp(fp).endswith((".txt", ".txt.bz2"))
     if not is_plaintext:
         pages = super(PropTrips, self).extract_body(fp, basefile)
         pdf_path = self.store.downloaded_path(basefile,
                                               attachment="index.pdf")
         for page in pages:
             page.src = pdf_path
         return pages

     content = fp.read()
     # fp may have been opened in bytestream mode, in which case the
     # content must be decoded before use
     text = content.decode("utf-8") if isinstance(content, bytes) else content
     return TextReader(string=text)
Beispiel #8
0
 def close(self, *args, **kwargs):
     """Close the wrapped file object, committing written data if needed.

     In write mode, the file that ``self.fp`` refers to is treated as a
     temporary file: once closed, it replaces ``self.filename`` if that
     file is missing or has different content, and is deleted
     otherwise (leaving the existing file untouched).

     :param args: Accepted for compatibility; ignored
     :param kwargs: Accepted for compatibility; ignored
     :returns: Whatever ``self.fp.close()`` returns
     """
     if "w" not in self.mode:
         return self.fp.close()

     tempname = util.name_from_fp(self.fp)
     ret = self.fp.close()
     # shallow=False forces a comparison of the actual content. The
     # default (shallow) mode regards two files with the same size and
     # modification time as equal without reading them, which could
     # silently discard changed data.
     unchanged = (os.path.exists(self.filename)
                  and filecmp.cmp(tempname, self.filename, shallow=False))
     if unchanged:
         os.unlink(tempname)
     else:
         util.ensure_dir(self.filename)
         shutil.move(tempname, self.filename)
     return ret
Beispiel #9
0
 def extract_body(self, fp, basefile):
     """Parse *fp* into a page reader whose pages point at page images.

     Every page gets ``src`` set to
     ``<canonical uri>/sid<page number>.png``.

     :param fp: Open file-like object holding hOCR HTML or PDF2XML data
     :param basefile: The basefile of the document being processed
     :returns: The populated page reader
     :raises DocumentRemovedError: If the reader reports that it is empty
     """
     # A ".hocr." part in the file name is taken to mean hOCR HTML
     # rather than PDF2XML, which needs the alternate parser. FIXME:
     # There ought to be a cleaner way than guessing based on filename
     looks_like_hocr = ".hocr." in util.name_from_fp(fp)
     body = StreamingPDFReader().read(
         fp, parser="ocr" if looks_like_hocr else "xml")
     uri_prefix = self.canonical_uri(basefile)
     for page in body:
         page.src = "%s/sid%s.png" % (uri_prefix, page.number)
     if body.is_empty():
         raise DocumentRemovedError(
             dummyfile=self.store.parsed_path(basefile))
     return body
Beispiel #10
0
 def close(self, *args, **kwargs):
     """Close the wrapped file object, committing written data if needed.

     Write mode: the data has been written to a temporary file (named
     by ``self.fp``). Once that is closed, it is moved to
     ``self.filename`` if no such file exists or if the content
     differs, and given rw-rw-r-- permissions; if the content is the
     same, the temporary file is removed and the existing file is not
     touched.

     Read mode: file objects wrapped by ``self.fp`` are closed along
     with it. Closing an already closed read-mode object is a no-op.

     :param args: Accepted for compatibility; ignored
     :param kwargs: Accepted for compatibility; ignored
     :returns: Whatever ``self.fp.close()`` returns
     """
     if "w" in self.mode:
         tempname = util.name_from_fp(self.fp)
         ret = self.fp.close()
         # Compare content, not just os.stat() signatures: with the
         # default shallow=True, files of equal size and modification
         # time count as identical without being read.
         same_content = (os.path.exists(self.filename)
                         and filecmp.cmp(tempname, self.filename,
                                         shallow=False))
         if same_content:
             os.unlink(tempname)
         else:
             util.ensure_dir(self.filename)
             shutil.move(tempname, self.filename)
             # since _open uses NamedTemporaryFile, which creates
             # files only readable by the creating user, we need to
             # set more liberal permissions. FIXME: This should
             # respect os.umask()
             os.chmod(self.filename,
                      stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP
                      | stat.S_IWGRP | stat.S_IROTH)
         return ret

     # This is needed sometimes since
     # Bzip2File/LZMAFile/GzipFile doesn't close the open file
     # objects that they wrap
     inner = getattr(self.fp, '_fp', None)  # for Bzip2File/LZMAFile with IOBufferedReader
     if inner is not None:
         inner.close()
     inner = getattr(self.fp, 'fileobj', None)  # for GzipFile in the same situation
     if inner is not None:
         # fileobj is None after the GzipFile has been closed, so the
         # check above keeps a repeated close() from failing
         inner.close()
     return self.fp.close()