Example #1
    def download(self, basefile=None):
        if basefile:
            return self.download_single(basefile)

        if not self.config.mediawikidump:
            raise ConfigurationError("config.mediawikidump not set")
        resp = requests.get(self.config.mediawikidump)
        # the dump arrives as a bytestring; wrap it in BytesIO so
        # that lxml can parse it
        xml = etree.parse(BytesIO(resp.content))

        # the MediaWiki XML export puts everything in a single
        # default namespace; pick it up from the root element
        MW_NS = "{%s}" % xml.getroot().nsmap[None]
        wikinamespaces = []
        for ns_el in xml.findall("//" + MW_NS + "namespace"):
            wikinamespaces.append(ns_el.text)

        # Get list of currently downloaded pages - if any of those
        # does not appear in the XML dump, remove them afterwards
        basefiles = list(self.store.list_basefiles_for("parse"))

        for page_el in xml.findall(MW_NS + "page"):
            basefile = page_el.find(MW_NS + "title").text
            if basefile == "Huvudsida":
                continue
            if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
                (namespace, localtitle) = basefile.split(":", 1)
                if namespace not in self.config.mediawikinamespaces:
                    continue
            with self.store.open_downloaded(basefile, "w") as fp:
                fp.write(etree.tostring(page_el, encoding="utf-8"))

            if basefile in basefiles:
                del basefiles[basefiles.index(basefile)]

        for b in basefiles:
            self.log.debug("Removing stale %s" % b)
            util.robust_remove(self.store.downloaded_path(b))
Example #2
 def wrapper(self, doc):
     try:
         return f(self, doc)
     except DocumentRemovedError as e:
         self.log.info(
             "Document has been removed (%s)", e)
         util.robust_remove(self.parsed_path(doc.basefile))
         return False
     except ParseError as e:
         self.log.error("ParseError %s", e)
         # FIXME: we'd like to use the shorter
         # "if 'fatalexceptions' in self.config", but a Mock we're
         # using in testDecorators.Decorators.test_handleerror does
         # not emulate that way of using the LayeredConfig
         # object. Until we rewrite the testcase, this is what we
         # have to do.
         if (hasattr(self.config, 'fatalexceptions') and
                 self.config.fatalexceptions):
             raise
         else:
             return False
     except Exception:
         self.log.exception("parse failed")
         # FIXME: see above
         if (hasattr(self.config, 'fatalexceptions') and
                 self.config.fatalexceptions):
             raise
         else:
             return False
Example #3
    def GenerateMapAll(self):
        mapfile = os.path.sep.join(
            [self.baseDir, 'dv', 'generated', 'uri.map'])
        util.robust_remove(mapfile + ".new")

        parsed_dir = os.path.sep.join([self.baseDir, 'dv', 'parsed'])
        self._do_for_all(parsed_dir, '.xht2', self.GenerateMap)
        util.robustRename(mapfile + ".new", mapfile)
Example #4
 def test_loadgraphs(self):
     with open("graph_a.ttl", "w") as fp:
         fp.write(self.graph_a)
     with open("graph_a.nt", "w") as fp:
         fp.write(self.graph_a_nt)
     self.tester.assertEqualGraphs("graph_a.ttl", "graph_a.nt")
     util.robust_remove("graph_a.ttl")
     util.robust_remove("graph_a.nt")
Example #5
 def transform(self, indata, config=None, parameters={}):
     strparams = {}
     if config:
         # paths to be used with the document() function
         # must use unix path separators
         if os.sep == "\\":
             config = config.replace(os.sep, "/")
         # print("Tranform: Using config %s. Contents:" % config)
         # print(util.readfile(config))
         config_fullpath = os.path.abspath(config)
         strparams['configurationfile'] = XSLT.strparam(config_fullpath)
     removefiles = []
     for key, value in parameters.items():
         if key.endswith("file") and value:
             if all(ord(c) < 128 and c != " " for c in value):
                 # IF the file name contains ONLY ascii chars and
                 # no spaces, we can use it directly. However, we
                 # need to relativize the path of the file relative
                 # to the XSL file we'll be using. The mechanism could be
                 # clearer...
                 value = os.path.relpath(value, self.templdir)
             else:
                 # If the filename contains non-ascii characters or
                 # space, any attempt to eg
                 # "document($annotationfile)" in the XSLT document
                 # will silently fail. Seriously, f**k lxml's error
                 # handling. In this case, copy it to a temp file
                 # (in the temporary templdir, with ascii filename)
                 # and use that.
                 contents = util.readfile(value)
                 value = os.path.basename(value)
                 value = "".join(c for c in value
                                 if ord(c) < 128 and c != " ")
                 removefiles.append(self.templdir + os.sep + value)
                 util.writefile(self.templdir + os.sep + value, contents)
             if os.sep == "\\":
                 value = value.replace(os.sep, "/")
         strparams[key] = XSLT.strparam(value)
     try:
         return self._transformer(indata, **strparams)
     except etree.XSLTApplyError as e:
         # the exception will only contain the last error. Errors
         # emanating from the xhtml file will not have file/line
         # number information. Errors emanating from the xslt file
         # do have file/line number info, and are probably more
         # useful to deal with.
         for error in self._transformer.error_log:
             if error.line:
                 log.error("%s: %s (line %s)" %
                           (error.filename, error.message, error.line))
         raise errors.TransformError(str(e))
     finally:
         for f in removefiles:
             util.robust_remove(f)
     # FIXME: This can never be reached, if _transformer() does not
     # raise an error, the above returns immediately.
     if len(self._transformer.error_log) > 0:
         raise errors.TransformError(str(self._transformer.error_log))
Example #6
    def textreader_from_basefile(self, basefile, encoding):
        infile = self.store.downloaded_path(basefile)
        tmpfile = self.store.path(basefile, "intermediate", ".pdf")
        outfile = self.store.path(basefile, "intermediate", ".txt")
        util.copy_if_different(infile, tmpfile)
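        # pdftotext writes its output next to its input file, swapping
        # the .pdf suffix for .txt -- which is exactly outfile above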
        util.runcmd("pdftotext %s" % tmpfile, require_success=True)
        util.robust_remove(tmpfile)

        return TextReader(outfile, encoding=encoding, linesep=TextReader.UNIX)
Example #7
 def tearDown(self):
     manager.config_loaded = False
     manager.shutdown_logger()
     if self.orig_cwd:
         os.chdir(self.orig_cwd)
         shutil.rmtree(self.tempdir)
         sys.path.remove(self.tempdir)
     else:
         # all tests took place in the project directory, so we
         # have to clean some crap out.
         for crap in ("ferenda.ini", "example.py", "index.html",
                      "index.xhtml", "other.css", "rsrc", "data",
                      "dummyfile.txt", "test.css", "test.js", "test.png"):
             util.robust_remove(crap)
Example #8
 def test_drawboxes(self):
     pypdfmock = MagicMock()
     canvasmock = MagicMock()
     mocks = {'PyPDF2': pypdfmock,
              'reportlab': MagicMock(),
              'reportlab.pdfgen': MagicMock(),
              'reportlab.pdfgen.canvas': canvasmock}
     with patch.dict('sys.modules', mocks):
         metrics = self.analyzer.metrics()
         pdfpath = "test/files/pdfanalyze/lipsum.debug.pdf"
         self.analyzer.drawboxes(pdfpath, metrics=metrics)
     self.assertTrue(canvasmock.Canvas.called)
     self.assertTrue(pypdfmock.PdfFileReader.called)
     self.assertTrue(pypdfmock.PdfFileWriter.called)
     util.robust_remove(pdfpath)
Example #9
    def download(self, basefile=None):
        if basefile:
            return self.download_single(basefile)

        if self.config.mediawikidump:
            resp = requests.get(self.config.mediawikidump)
            xmldumppath = self.store.path('dump', 'downloaded', '.xml')
            with self.store._open(xmldumppath, mode="wb") as fp:
                fp.write(resp.content)
            # xml = etree.parse(resp.content)
            xml = etree.parse(xmldumppath)
        else:
            raise ConfigurationError("config.mediawikidump not set")

        MW_NS = "{%s}" % xml.getroot().nsmap[None]
        wikinamespaces = []
        for ns_el in xml.findall("//" + MW_NS + "namespace"):
            wikinamespaces.append(ns_el.text)

        # Get list of existing basefiles - if any of those
        # does not appear in the XML dump, remove them afterwards
        basefiles = list(self.store.list_basefiles_for("parse"))

        for page_el in xml.findall(MW_NS + "page"):
            basefile = page_el.find(MW_NS + "title").text
            if basefile == "Huvudsida":
                continue
            if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
                (namespace, localtitle) = basefile.split(":", 1)
                if namespace not in self.config.mediawikinamespaces:
                    continue
            self.log.info("%s: extracting from XML dump" % basefile)
            with self.store.open_downloaded(basefile, "w") as fp:
                fp.write(etree.tostring(page_el, encoding="utf-8"))

            if basefile in basefiles:
                del basefiles[basefiles.index(basefile)]

        for b in basefiles:
            self.log.debug("%s: removing stale document" % b)
            util.robust_remove(self.store.downloaded_path(b))
Example #10
    def archive(self, basefile, version, overwrite=False, copy=False):
        """Moves the current version of a document to an archive. All
        files related to the document are moved (downloaded, parsed,
        generated files and any existing attachment files).

        :param basefile: The basefile of the document to archive
        :type basefile: str
        :param version: The version id to archive under
        :type version: str
        :param overwrite: If True, replace any existing archived version
        :type overwrite: bool
        :param copy: If True, copy the files into the archive instead of
                     moving them
        :type copy: bool
        """

        for meth in (self.downloaded_path, self.documententry_path,
                     self.parsed_path, self.serialized_path,
                     self.distilled_path, self.annotation_path,
                     self.generated_path):
            # FIXME: what about intermediate? Ignore them as they
            # should be able to be regenerated at any time?
            src = meth(basefile)
            dest = meth(basefile, version)
            if self.storage_policy == "dir" and meth in (self.downloaded_path,
                                                         self.parsed_path,
                                                         self.generated_path):
                src = os.path.dirname(src)
                dest = os.path.dirname(dest)
            if not os.path.exists(src):
                continue
            if os.path.exists(dest):
                if overwrite:
                    util.robust_remove(dest)
                else:
                    raise errors.ArchivingError(
                        "Archive destination %s for basefile %s version %s already exists!"
                        % (dest, basefile, version))
            # self.log.debug("Archiving %s to %s" % (src,dest))
            # print("Archiving %s to %s" % (src,dest))
            util.ensure_dir(dest)
            if copy:
                shutil.copy2(src, dest)
            else:
                shutil.move(src, dest)
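
A hypothetical usage sketch (the store object and argument values are made up for illustration; only the archive signature comes from the example above):

    # move every file belonging to basefile "123" into the archive
    # under version id "1.2"; an existing archived version raises
    # errors.ArchivingError because overwrite is left at False
    store.archive("123", "1.2", overwrite=False)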
Example #11
 def test_margins(self):
     jsonpath = "test/files/pdfanalyze/lipsum.metrics.json"
     try:
         self.assertFalse(os.path.exists(jsonpath))
         metrics = self.analyzer.metrics(jsonpath, startpage=1)
         self.assertEqual({'default': {'family': 'Comic Sans MS', 'size': 14},
                            'bottommargin': 1149,
                            'h1': {'family': 'Cambria,Bold', 'size': 19},
                            'h2': {'family': 'Cambria,Bold', 'size': 17},
                            'h3': {'family': 'Cambria,Bold', 'size': 14},
                            'topmargin': 53,
                            'leftmargin': 135,
                            'leftmargin_even': 108,
                            'pageheight': 1262,
                            'pagewidth': 892,
                            'rightmargin': 780,
                            'rightmargin_even': 760,
                            'scanned_source': False},
                           metrics)
         self.assertTrue(os.path.exists(jsonpath))
     finally:
         util.robust_remove(jsonpath)
Example #12
 def wrapper(self, doc):
     try:
         return f(self, doc)
     except DocumentRemovedError as e:
         self.log.info(
             "%s: Document has been removed (%s)", doc.basefile, e)
         util.robust_remove(self.parsed_path(doc.basefile))
         return False
     except KeyboardInterrupt:
         raise
     except ParseError as e:
         self.log.error("%s: ParseError %s", doc.basefile, e)
         if (hasattr(self.config, 'fatalexceptions') and
                 self.config.fatalexceptions):
             raise
         else:
             return False
     except:
         self.log.exception("parse of %s failed", doc.basefile)
         if (hasattr(self.config, 'fatalexceptions') and
                 self.config.fatalexceptions):
             raise
         else:
             return False
Example #13
 def test_robust_remove(self):
     util.writefile(self.fname, "Hello")
     util.robust_remove(self.fname)
     util.robust_remove(self.fname)
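
The test above pins down the key property of robust_remove: calling it on a path that no longer exists must not raise. A minimal sketch of such a helper, assuming it only needs to tolerate a missing file (the real implementation may guard against more failure modes):

    import os

    def robust_remove(filename):
        # remove the file if it is there; a missing file is not an error
        if os.path.exists(filename):
            os.unlink(filename)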
Example #14
    def query_webservice(self, query, page):
        # this is the only soap template we'll need, so we include it
        # verbatim to avoid having a dependency on a soap module like
        # zeep.
        endpoint = 'https://eur-lex.europa.eu/EURLexWebService'
        envelope = """<soap-env:Envelope xmlns:soap-env="http://www.w3.org/2003/05/soap-envelope">
  <soap-env:Header>
    <wsse:Security xmlns:wsse="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-wssecurity-secext-1.0.xsd">
      <wsse:UsernameToken>
        <wsse:Username>%s</wsse:Username>
        <wsse:Password Type="http://docs.oasis-open.org/wss/2004/01/oasis-200401-wss-username-token-profile-1.0#PasswordText">%s</wsse:Password>
      </wsse:UsernameToken>
    </wsse:Security>
  </soap-env:Header>
  <soap-env:Body>
    <sear:searchRequest xmlns:sear="http://eur-lex.europa.eu/search">
      <sear:expertQuery>%s</sear:expertQuery>
      <sear:page>%s</sear:page>
      <sear:pageSize>%s</sear:pageSize>
      <sear:searchLanguage>%s</sear:searchLanguage>
    </sear:searchRequest>
  </soap-env:Body>
</soap-env:Envelope>
""" % (self.config.username, self.config.password, escape(query, quote=False), page, self.pagesize, self.lang)
        headers = {'Content-Type': 'application/soap+xml; charset=utf-8; action="https://eur-lex.europa.eu/EURLexWebService/doQuery"',
                   'SOAPAction': 'https://eur-lex.europa.eu/EURLexWebService/doQuery'}
        if self.config.curl:
            # dump the envelope to a tempfile
            headerstr = ""
            for k, v in headers.items():
                assert "'" not in v  # if it is, we need to work on escaping it
                headerstr += " --header '%s: %s'" % (k, v)
            with tempfile.NamedTemporaryFile() as fp:
                fp.write(envelope.encode("utf-8"))
                fp.flush()
                envelopename = fp.name
                headerfiledesc, headerfilename = tempfile.mkstemp()
                cmd = 'curl -L -X POST -D %(headerfilename)s --data-binary "@%(envelopename)s" %(headerstr)s %(endpoint)s' % locals()
                (ret, stdout, stderr) = util.runcmd(cmd)
            headerfp = os.fdopen(headerfiledesc)
            header = headerfp.read()
            headerfp.close()
            util.robust_remove(headerfilename)
            status, headers = header.split('\n', 1)
            prot, code, msg = status.split(" ", 2)
            headers = dict(email.message_from_string(headers).items())
            res = FakeResponse(int(code), stdout, headers)
        else:
            res = util.robust_fetch(self.session.post, endpoint, self.log,
                                    raise_for_status=False,
                                    data=envelope, headers=headers,
                                    timeout=10)
            
        if res.status_code == 500:
            tree = etree.parse(BytesIO(res.content))
            statuscode = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Subcode")[0].text
            statusmsg = tree.find(".//{http://www.w3.org/2003/05/soap-envelope}Text").text
            raise errors.DownloadError("%s: %s" % (statuscode, statusmsg))
        elif res.status_code == 301:
            # the call to robust_fetch or curl should have followed
            # the redirect, but at this point we'll just have to
            # report the error
            raise errors.DownloadError("%s: was redirected to %s" % (endpoint, res.headers['Location']))
        return res
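
The curl branch above wraps its result in a FakeResponse so that both branches can be handled uniformly afterwards. A minimal sketch of such a shim, assuming it only needs the three attributes the surrounding code reads (status_code, content and headers):

    class FakeResponse(object):
        # just enough of requests.Response's surface for query_webservice
        def __init__(self, status_code, content, headers):
            self.status_code = status_code
            self.content = content
            self.headers = headers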
Example #15
                    removed.append((inst, basefile))
        fp.write('}\n')

for (inst, basefile) in removed:
    downloaded_path = inst.store.downloaded_path(basefile)
    storage_policy = inst.store.storage_policy
    if not os.path.exists(downloaded_path):
        # maybe the reason is that this is a compositerepo?
        # FIXME: maybe CompositeStore.downloaded_path and
        # friends should do this transparently?
        if hasattr(inst, 'get_preferred_instances'):
            subinsts = list(inst.get_preferred_instances(basefile))
            if not subinsts:
                print("%s %s: WARNING: no subinst handles this basefile" % (inst.alias, basefile))
                continue
            subinst = subinsts[0]
            downloaded_path = subinst.store.downloaded_path(basefile)
            storage_policy = subinst.store.storage_policy
    assert(os.path.exists(downloaded_path))
    print("%s %s: removing %s" % (inst.alias, basefile, downloaded_path))
    count["removed"] += 1
    if storage_policy == "dir":
        shutil.rmtree(os.path.dirname(downloaded_path))
        # print("shutil.rmtree(%s)" % os.path.dirname(downloaded_path))
    else:
        util.robust_remove(downloaded_path)
        # print("util.robust_remove(%s)" % downloaded_path)

print("%(unreferenced)s unreferenced docs, %(metadataonly)s set to 'metadataonly', %(removed)s removed" % count)

Example #16
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"
        
        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)
        
        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning("%s: Could not find this prop at %s, might be a bug" % (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: download OK from %s" % (basefile, url))
            else:
                self.log.info(
                    "%s: download OK (new version) from %s" % (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)
            
        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc") 
        if a:
            extraurls.append(a.get("href"))
        

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" %
                                 url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(
                        basefile,
                        attachment="index" +
                        doctype)
                self.log.debug("%s: downloading attachment %s" % (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated
Example #17
 def tearDown(self):
     util.robust_remove("test/files/pdfanalyze/lipsum.metrics.json")
     util.robust_remove("test/files/pdfanalyze/lipsum.plot.png")
     util.robust_remove("test/files/pdfanalyze/lipsum.debug.pdf")
Example #18
    def download_single(self, basefile, url=None):
        if url is None:
            url = self.remote_url(basefile)
            if not url:  # remote_url failed
                return

        updated = created = False
        checked = True
        mainattachment = None

        if url in self.urlmap:
            attachment = self.urlmap[url]
        else:
            attachment = self.sniff_attachment(url)
        if attachment:
            self.urlmap[url] = attachment
            attachment += ".html"
        else:
            self.urlmap[url] = ''
            attachment = "index.html"

        downloaded_path = self.store.downloaded_path(basefile,
                                                     attachment=attachment)

        created = not os.path.exists(downloaded_path)
        if self.download_if_needed(url, basefile, filename=downloaded_path):
            text = util.readfile(downloaded_path)
            if "<div>Inga tr\xe4ffar</div>" in text:
                self.log.warning(
                    "%s: Could not find this prop at %s, might be a bug" %
                    (basefile, url))
                util.robust_remove(downloaded_path)
                return False
            if created:
                self.log.info("%s: downloaded from %s" % (basefile, url))
            else:
                self.log.info("%s: downloaded new version from %s" %
                              (basefile, url))
            updated = True
        else:
            self.log.debug("%s: exists and is unchanged" % basefile)
            text = util.readfile(downloaded_path)

        soup = BeautifulSoup(text, "lxml")
        del text
        attachment = self.find_attachment(soup)

        extraurls = []
        results = soup.find("div", "search-results-content")
        a = results.find("a", string="Hämta Pdf")
        if a:
            extraurls.append(a.get("href"))
        a = results.find("a", string="Hämta Doc")
        if a:
            extraurls.append(a.get("href"))

        # parse downloaded html/text page and find out extraurls
        for url in extraurls:
            if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
                r = requests.get(url, stream=True)
                sig = r.raw.read(4)
                # r.raw.close()
                #bodyidx = head.index("\n\n")
                #sig = head[bodyidx:bodyidx+4]
                if sig == b'\xffWPC':
                    doctype = ".wpd"
                elif sig == b'\xd0\xcf\x11\xe0':
                    doctype = ".doc"
                elif sig == b'PK\x03\x04':
                    doctype = ".docx"
                elif sig == b'{\\rt':
                    doctype = ".rtf"
                else:
                    self.log.error(
                        "%s: Attached file has signature %r -- don't know what type this is"
                        % (basefile, sig))
                    continue
            elif url.endswith('get=pdf'):
                doctype = ".pdf"
            else:
                self.log.warning("Unknown doc type %s" % url.split("get=")[-1])
                doctype = None
            if doctype:
                if attachment:
                    filename = self.store.downloaded_path(
                        basefile, attachment=attachment + doctype)
                else:
                    filename = self.store.downloaded_path(basefile,
                                                          attachment="index" +
                                                          doctype)
                self.log.debug("%s: downloading attachment %s" %
                               (basefile, filename))
                self.download_if_needed(url, basefile, filename=filename)

        entry = DocumentEntry(self.store.documententry_path(basefile))
        now = datetime.now()
        entry.orig_url = url
        if created:
            entry.orig_created = now
        if updated:
            entry.orig_updated = now
        if checked:
            entry.orig_checked = now
        entry.save()

        return updated
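
The four-byte signature dispatch above could also be written as a table lookup. A sketch of a standalone helper (the helper name is made up for illustration; the signatures are the ones the example checks):

    # hypothetical helper; signatures copied from the example above
    SIGNATURES = {
        b'\xffWPC': '.wpd',           # WordPerfect
        b'\xd0\xcf\x11\xe0': '.doc',  # OLE2 compound file (legacy Word)
        b'PK\x03\x04': '.docx',       # ZIP container (OOXML)
        b'{\\rt': '.rtf',             # RTF
    }

    def sniff_doctype(sig):
        # map a four-byte signature to a file extension, or None if unknown
        return SIGNATURES.get(sig)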
Example #19
    def download(self, basefile=None):

        def write_doc(basefile, page_el):
            writefile = False
            p = self.store.downloaded_path(basefile)
            newcontent = etree.tostring(page_el, encoding="utf-8")
            if not os.path.exists(p):
                writefile = True
            else:
                oldcontent = util.readfile(p, "rb")
                if newcontent != oldcontent:
                    writefile = True
            if writefile:
                util.ensure_dir(p)
                with open(p, "wb") as fp:
                    fp.write(newcontent)
                    self.log.info("%s: extracting from XML dump" % basefile)
            if basefile in basefiles:
                del basefiles[basefiles.index(basefile)]

        if basefile:
            return self.download_single(basefile)
        if self.config.mediawikidump:
            xmldumppath = self.store.path('dump', 'downloaded', '.xml')
            resp = requests.get(self.config.mediawikidump)
            self.log.info("Loaded XML dump from %s" % self.config.mediawikidump)
            from ferenda.documentstore import _open
            with _open(xmldumppath, mode="wb") as fp:
                fp.write(resp.content)
            xml = etree.parse(xmldumppath)
        else:
            raise ConfigurationError("config.mediawikidump not set")

        MW_NS = "{%s}" % xml.getroot().nsmap[None]
        wikinamespaces = []
        for ns_el in xml.findall("//" + MW_NS + "namespace"):
            wikinamespaces.append(ns_el.text)

        # Get list of existing basefiles - if any of those
        # does not appear in the XML dump, remove them afterwards
        basefiles = list(self.store.list_basefiles_for("parse"))
        total = written = 0
        deferred = {}
        for page_el in xml.findall(MW_NS + "page"):
            basefile = page_el.find(MW_NS + "title").text
            if basefile == "Huvudsida":  # FIXME: generalize/make configurable
                continue
            if ":" in basefile and basefile.split(":")[0] in wikinamespaces:
                (namespace, localtitle) = basefile.split(":", 1)
                if namespace not in self.config.mediawikinamespaces:
                    continue
                # defer writing of this one, so that it overwrites any
                # similarly named pages from the main namespace. This
                # is so that Category pages about $TOPIC take
                # precedence over ordinary pages about $TOPIC
                deferred[localtitle] = page_el
            else:
                write_doc(basefile, page_el)
        for basefile, page_el in deferred.items():
            write_doc(basefile, page_el)

        if 'dump' in basefiles:  # never remove
            del basefiles[basefiles.index('dump')]
        for b in basefiles:
            self.log.info("%s: removing stale document" % b)
            util.robust_remove(self.store.downloaded_path(b))
Example #20
    def test_fsmparse(self):
        try:
            # 1. write a new python module containing a class with a staticmethod
            with open("testparser.py", "w") as fp:
                fp.write("""
from six import text_type as str
from ferenda.elements import Body, Paragraph

class Testobject(object):
    @staticmethod
    def get_parser():
        return Parser()


class Parser(object):

    def parse(self, source):
        res = Body()
        for chunk in source:
            res.append(Paragraph([str(len(chunk.strip()))]))
        return res
            """)
            import imp
            fp, pathname, desc = imp.find_module("testparser")
            imp.load_module("testparser", fp, pathname, desc)
            # 2. write a textfile with two paragraphs
            with open("testparseinput.txt", "w") as fp:
                fp.write("""This is one paragraph.

And another.
    """)
            # 3. patch print and call fsmparse
            d = Devel()
            printmock = MagicMock()
            with patch('builtins.print', printmock):
                # 3.1 fsmparse dynamically imports the module and calls
                #     the method with every chunk from the text file
                # 3.2 fsmparse asserts that the method returned a callable
                # 3.3 fsmparse calls it with an iterable of text chunks
                #     from the textfile
                # 3.4 fsmparse receives an Element structure and prints a
                #     serialized version
                d.fsmparse("testparser.Testobject.get_parser",
                           "testparseinput.txt")
            self.assertTrue(printmock.called)
            # 4. check that the expected thing was printed
            want = """
<Body>
  <Paragraph>
    <str>22</str>
  </Paragraph>
  <Paragraph>
    <str>12</str>
  </Paragraph>
</Body>
            """.strip() + "\n"
            printmock.assert_has_calls([call(want)])
        finally:
            util.robust_remove("testparser.py")
            util.robust_remove("testparser.pyc")
            util.robust_remove("testparseinput.txt")
            if os.path.exists("__pycache__") and os.path.isdir("__pycache__"):
                shutil.rmtree("__pycache__")
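
A side note on the dynamic import in the test above: the imp module has been deprecated since Python 3.4 and was removed in Python 3.12. A sketch of a roughly equivalent load using importlib:

    import importlib.util
    import sys

    # load testparser.py under the module name "testparser", roughly
    # matching the imp.find_module/imp.load_module pair in the test
    spec = importlib.util.spec_from_file_location("testparser", "testparser.py")
    module = importlib.util.module_from_spec(spec)
    sys.modules["testparser"] = module  # imp.load_module registered it too
    spec.loader.exec_module(module)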