Example #1
    def parse(self, doc):
        head, body = util.readfile(self.store.downloaded_path(
            doc.basefile)).split("\n\n", 1)
        datestr, timestr, title = head.split(" ", 2)
        published = datetime.strptime("%s %s" % (datestr, timestr),
                                      "%Y-%m-%d %H:%M:%S")

        doc.meta.add((URIRef(doc.uri), RDF.type, self.rdf_type))
        doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(published)))
        doc.meta.add(
            (URIRef(doc.uri), DCTERMS.title, Literal(title, lang=doc.lang)))
        soup = bs4.BeautifulSoup(
            "<div class='sitenews-item'>" + body + "</div>", "lxml")
        doc.body = elements_from_soup(soup.body)
        # move timestamp into dcterms:issued, title into dcterms:title
        # parse body with elements_from_soup
        # set first real para as dcterms:abstract (XMLLiteral)
        doc.body[0][0] = Div([doc.body[0][0]],
                             datatype="rdf:XMLLiteral",
                             property="dcterms:abstract")

        # but we need to add it to doc.meta RIGHT AWAY because of reasons...
        doc.meta.add((URIRef(doc.uri), DCTERMS.abstract,
                      Literal(body.split("\n\n")[0], datatype=RDF.XMLLiteral)))
        self.parse_entry_update(
            doc)  # need to set published and possibly updated
        entry = DocumentEntry(self.store.documententry_path(doc.basefile))
        entry.published = published
        entry.save()
        return True
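As a sketch of the input this parse() method assumes (the sample content is hypothetical): the first line carries an ISO timestamp and the title, and a blank line separates the head from the HTML body.

# hypothetical downloaded file content, matching the split/strptime logic above
raw = "2014-06-02 14:30:00 New feature released\n\n<p>First paragraph.</p>\n\n<p>More.</p>"
head, body = raw.split("\n\n", 1)
datestr, timestr, title = head.split(" ", 2)
# datestr == "2014-06-02", timestr == "14:30:00", title == "New feature released"
# body.split("\n\n")[0] == "<p>First paragraph.</p>"  (becomes dcterms:abstract)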
Example #2
    def parse(self, doc):
        source = util.readfile(self.store.downloaded_path(doc.basefile))
        html = publish_string(source, writer_name="html")
        soup = BeautifulSoup(html, "lxml")
        docinfo = soup.find("table", "docinfo")
        docuri = URIRef(doc.uri)
        if docinfo:
            # this is where our custom metadata goes
            for row in docinfo.find_all("tr", "field"):
                key, val = row.th.text.strip(), row.td.text.strip()
                if key == 'footer-order:':
                    doc.meta.add((docuri, OLO['index'], Literal(int(val))))
                else:
                    self.log.warning("%s: Unknown metadata directive %s (%s)" %
                                     (doc.basefile, key, val))

            # we don't need these in the final result
            docinfo.decompose()
        soup.find("h1", "title").decompose()

        doc.body = elements_from_soup(soup.body)
        doc.meta.add((docuri, DCTERMS.title, Literal(soup.title.text,
                                                     doc.lang)))
        doc.meta.add((docuri, PROV.wasGeneratedBy,
                      Literal(self.qualified_class_name())))
        doc.meta.add((docuri, RDF.type, self.rdf_type))
        self.parse_entry_update(doc)
        return True
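For context, the docinfo table this parse() method looks for comes from docutils (publish_string above): a field list placed right after a document title is promoted to bibliographic docinfo, which the HTML writer renders as a table with class "docinfo" and one row per field. A hypothetical reST source that should exercise the footer-order branch above:

# hypothetical reST input: docutils should render ":footer-order: 2" as
# a docinfo row whose <th> text is "footer-order:" and <td> text is "2"
source = """Example page
============

:footer-order: 2

Body text goes here."""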
Example #3
    def test_elements_from_soup(self):
        from ferenda.elements import html
        soup = BeautifulSoup("""<body>
<h1>Sample</h1>
<div class="main">
<img src="xyz.png"/>
<p>Some <b>text</b></p>
<dl>
<dt>Term 1</dt>
<dd>Definition 1</dd>
</dl>
</div>
<div id="foot">
<hr/>
<a href="/">home</a> - <a href="/about">about</a>
</div>
</body>""", "lxml")
        body = html.elements_from_soup(soup.body)
        # print("Body: \n%s" % serialize(body))
        result = html.Body([html.H1(["Sample"]),
                            html.Div([html.Img(src="xyz.png"),
                                      html.P(["Some ",
                                              html.B(["text"])]),
                                      html.DL([html.DT(["Term 1"]),
                                               html.DD(["Definition 1"])])
                                  ], **{"class": "main"}),
                            html.Div([html.HR(),
                                      html.A(["home"], href="/"),
                                      " - ",
                                      html.A(["about"], href="/about")
                                  ], id="foot")])
        self.maxDiff = 4096
        self.assertEqual(serialize(body), serialize(result))
Example #6
    def test_elements_from_soup(self):
        soup = BeautifulSoup("""<html>
<head>
  <title>Example doc</title>
</head>
<body>
  <marquee>Hello world</marquee>
  <!-- Hello world -->
  <center>Hello world</center>
  <p>That's enough of this nonsense</p>
</body>""", "lxml")
        got = html.elements_from_soup(soup.html)
        self.assertEqual(html.HTML([html.Head([html.Title(["Example doc"])]),
                                    html.Body([html.P(["That's enough of this nonsense"])])]),
                         got)
Example #7
    def _decode_query_result(self, response, pagenum, pagelen):
        json = response.json()
        res = []
        for hit in json['hits']['hits']:
            h = hit['_source']
            # wrap highlighted field in P, convert to elements
            hltext = " ... ".join([x.strip() for x in hit['highlight']['text']])
            soup = BeautifulSoup("<p>%s</p>" % re.sub(r"\s+", " ", hltext), "lxml")
            h['text'] = html.elements_from_soup(soup.html.body.p)
            res.append(h)
        pager = {'pagenum': pagenum,
                 'pagecount': int(math.ceil(json['hits']['total'] / float(pagelen))),
                 'firstresult': (pagenum - 1) * pagelen + 1,
                 'lastresult': (pagenum - 1) * pagelen + len(json['hits']['hits']),
                 'totalresults': json['hits']['total']}
        return res, pager
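A quick sanity check of the pager arithmetic above, with hypothetical numbers: 25 total hits, 10 per page, requesting page 2 with 10 hits returned gives 3 pages covering results 11-20.

import math
total, pagelen, pagenum, nhits = 25, 10, 2, 10
assert int(math.ceil(total / float(pagelen))) == 3   # pagecount
assert (pagenum - 1) * pagelen + 1 == 11             # firstresult
assert (pagenum - 1) * pagelen + nhits == 20         # lastresult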
Example #9
    def _decode_query_result_hit(self, hit):
        h = hit['_source']
        # h['repo'] = hit['_type']
        if "join" in h:
            del h["join"]

        if 'highlight' in hit:
            for hlfield in ('text', 'label'):
                if hlfield in hit['highlight']:
                    # wrap highlighted field in P, convert to elements
                    hltext = re.sub(r"\s+", " ", " ... ".join(
                        [x.strip() for x in hit['highlight'][hlfield]]))
                    hltext = hltext.replace(
                        "<em>", "<strong class='match'>").replace(
                            "</em>", " </strong>")
                    # FIXME: BeautifulSoup/lxml returns an empty soup if the
                    # first char is '§' or some other non-ascii char (like a
                    # smart quote). Padding with a space makes the problem
                    # disappear, but we need to find the root cause.
                    soup = BeautifulSoup("<p> %s</p>" % hltext, "lxml")
                    h[hlfield] = html.elements_from_soup(soup.html.body.p)
        return h
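To make the highlight rewrite above concrete, here is what it does to a hypothetical Elasticsearch highlight fragment (note the space the code deliberately adds before the closing tag):

hl = "matched <em>term</em> in context"
hl = hl.replace("<em>", "<strong class='match'>").replace("</em>", " </strong>")
# hl == "matched <strong class='match'>term </strong> in context"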
Example #11
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import os

from ferenda.compat import Mock
from ferenda.elements.html import elements_from_soup
from bs4 import BeautifulSoup

doc = Mock()
filedir = os.path.dirname(__file__)
with open(filedir + "/../doc/examples/citationparsing-before.xhtml") as fp:
    doc.body = elements_from_soup(BeautifulSoup(fp.read(), "lxml").body)

# begin
from pyparsing import Word, nums

from ferenda import CitationParser
from ferenda import URIFormatter
import ferenda.citationpatterns
import ferenda.uriformats

# Create two ParserElements for IETF document references and internal
# references
rfc_citation = "RFC" + Word(nums).setResultsName("RFCRef")
bcp_citation = "BCP" + Word(nums).setResultsName("BCPRef")
std_citation = "STD" + Word(nums).setResultsName("STDRef")
ietf_doc_citation = (rfc_citation | bcp_citation
                     | std_citation).setResultsName("IETFRef")

endnote_citation = ("[" + Word(nums).setResultsName("EndnoteID") +
                    "]").setResultsName("EndnoteRef")
Example #12
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

from ferenda.compat import Mock
from ferenda.elements.html import elements_from_soup
from bs4 import BeautifulSoup
doc = Mock()
doc.body = elements_from_soup(
    BeautifulSoup(
        """<html>
<body>
URLs often appear like http://example.org/foo, in running text
</body>
</html>""", "lxml").body)
# begin
from ferenda import CitationParser
from ferenda import URIFormatter
import ferenda.citationpatterns
import ferenda.uriformats

# CitationParser is initialized with a list of pyparsing
# ParserElements (or any other object that has a scanString method
# that returns a generator of (tokens, start, end) tuples, where start
# and end are integer string indices and tokens are dict-like
# objects)
citparser = CitationParser(ferenda.citationpatterns.url)

# URIFormatter is initialized with a list of tuples, where each
# tuple is a string (identifying a named ParseResult) and a function
# (that takes as a single argument a dict-like object and returns a
# URI string (possibly relative))
citparser.set_formatter(URIFormatter(("URLRef", ferenda.uriformats.url)))
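
The comment above notes that anything exposing a compatible scanString method can stand in for a pyparsing element. A minimal sketch of such a duck-typed matcher (hypothetical, for illustration only):

import re

class RegexScanner(object):
    """Duck-types pyparsing's scanString: yields (tokens, start, end)
    tuples with a plain dict as the dict-like tokens object."""
    def __init__(self, pattern, name):
        self.pattern = re.compile(pattern)
        self.name = name

    def scanString(self, text):
        for m in self.pattern.finditer(text):
            yield {self.name: m.group(0)}, m.start(), m.end()

# an instance could then be passed to CitationParser in place of a
# pyparsing ParserElement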