def testparser(testcase, parser, filename):
    """Run a :py:class:`~ferenda.FSMParser` based parser and check its output.

    The text file *filename* is read paragraph by paragraph and fed to
    *parser*. The serialized result is compared (as XML) with the
    contents of the sibling file whose name is *filename* with ``.txt``
    replaced by ``.xml``.

    :param testcase: Test case object; must provide ``assertEqualXML``
                     (its ``maxDiff`` attribute is set to 4096).
    :param parser: Parser object with a ``parse`` method and a ``debug``
                   attribute.
    :param filename: Path of the text file to parse.
    :raises AssertionError: If the ``.xml`` file does not exist (the
                            message contains the serialized parse
                            result) or if the comparison fails.
    """
    expected_path = filename.replace(".txt", ".xml")
    fsm_debug = 'FERENDA_FSMDEBUG' in os.environ

    # Turn on parser debugging when asked to through the environment,
    # or when there is nothing to compare the result against.
    if fsm_debug or not os.path.exists(expected_path):
        parser.debug = True

    source = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    paragraphs = source.getiterator(source.readparagraph)
    parsed = parser.parse(paragraphs)
    if fsm_debug:
        print(elements.serialize(parsed))

    testcase.maxDiff = 4096
    if not os.path.exists(expected_path):
        raise AssertionError("Want file not found. Result of parse:\n" +
                             elements.serialize(parsed))

    with codecs.open(expected_path, encoding="utf-8") as fp:
        want = fp.read().strip()
    got = elements.serialize(parsed).strip()
    testcase.assertEqualXML(want, got)
def parametric_test(self, filename):
    """Parse *filename* with the repo in ``self.p`` and compare with the stored result.

    The expected serialization is read from the sibling file whose name
    is *filename* with ``.txt`` replaced by ``.xml``; if that file does
    not exist the serialized result is expected to be empty.

    :param filename: Path of the text file to parse (read as
                     iso-8859-1 with DOS line separators).
    """
    self.maxDiff = None
    reader = TextReader(filename=filename, encoding='iso-8859-1',
                        linesep=TextReader.DOS)
    reader.autostrip = True
    # p.lagrum_parser = FakeParser()
    parser = self.p.get_parser("9999:998", reader)
    try:
        b = parser(reader)
        counts = self.p._count_elements(b)

        # FIXME: How was this used? Where should we plug
        # skipfragments?
        if 'K' in counts and counts['K'] > 1 and counts['P1'] < 2:
            self.p.skipfragments = [
                ('rinfoex:avdelningnummer', 'rpubl:kapitelnummer'),
                ('rpubl:kapitelnummer', 'rpubl:paragrafnummer')]
        else:
            self.p.skipfragments = [('rinfoex:avdelningnummer',
                                     'rpubl:kapitelnummer')]

        # NB: _construct_ids won't look for references
        self.p.visit_node(b, self.p.construct_id,
                          {'basefile': '9999:998', 'uris': set()})
        self.p.visit_node(b, self.p.find_definitions, False, debug=False)
        self.p.lagrum_parser.parse_recursive(b)
        self._remove_uri_for_testcases(b)

        resultfilename = filename.replace(".txt", ".xml")
        if os.path.exists(resultfilename):
            with codecs.open(resultfilename, encoding="utf-8") as fp:
                result = fp.read().strip()
        else:
            result = ""
        self.assertEqual(result, serialize(b).strip())
    finally:
        # reset the state of the repo... This is done in a finally
        # clause so that a failed assertion (or a parse error) in this
        # test does not leave stale state behind in self.p.
        self.p.current_section = '0'
        self.p.current_headline_level = 0
def importarchive(self, archivedir):
    """Imports downloaded data from an archive from legacy lagen.nu data.

    In particular, creates proper archive storage for older versions
    of each text.

    :param archivedir: Directory that is searched for ``.html`` files.
                       Only paths starting with ``downloaded/sfs`` are
                       considered, and each is matched against the
                       regexes in ``self.templ``.
    """
    current = archived = 0
    for f in util.list_dirs(archivedir, ".html"):
        if not f.startswith("downloaded/sfs"):  # sfst or sfsr
            continue
        for regex in self.templ:
            m = re.match(regex, f)
            if not m:
                continue
            if "vcheck" in m.groupdict():  # silently ignore
                break
            basefile = "%s:%s" % (m.group("byear"), m.group("bnum"))

            # need to look at the file to find out its version
            # text = t.extractfile(f).read(4000).decode("latin-1")
            #
            # Read raw bytes (so that .decode works on both Python 2
            # and 3) and make sure the file handle is closed again.
            with open(f, "rb") as fp:
                text = fp.read(4000).decode("latin-1")
            reader = TextReader(string=text)
            updated_to = self._find_uppdaterad_tom(basefile, reader=reader)

            if "vyear" in m.groupdict():
                # this file is marked as an archival version
                archived += 1
                version = updated_to
                if m.group("vyear") != "first":
                    exp = "%s:%s" % (m.group("vyear"), m.group("vnum"))
                    if version != exp:
                        self.log.warning("%s: Expected %s, found %s" %
                                         (f, exp, version))
            else:
                version = None
                current += 1
                de = DocumentEntry()
                de.basefile = basefile
                de.id = self.canonical_uri(basefile, updated_to)
                # fudge timestamps best as we can
                de.orig_created = datetime.fromtimestamp(
                    os.path.getctime(f))
                # NOTE(review): the file's mtime used to be assigned
                # here and was then immediately overwritten with the
                # current time; only the effective value is kept --
                # confirm which of the two is actually wanted.
                de.orig_updated = datetime.now()
                # The template is filled in from the local variables
                # of this method (presumably ``basefile``), so local
                # names must not be renamed carelessly.
                de.orig_url = self.document_url_template % locals()
                de.published = datetime.now()
                de.url = self.generated_url(basefile)
                de.title = "SFS %s" % basefile
                # de.set_content()
                # de.set_link()
                de.save(self.store.documententry_path(basefile))

            # this yields more reasonable basefiles, but they are not
            # backwards compatible -- skip them for now
            # basefile = basefile.replace("_", "").replace(".", "")
            if "type" in m.groupdict() and m.group("type") == "sfsr":
                dest = self.store.register_path(basefile)
                current -= 1  # to offset the previous increment
            else:
                dest = self.store.downloaded_path(basefile, version)
            self.log.debug("%s: extracting %s to %s" % (basefile, f, dest))
            util.ensure_dir(dest)
            shutil.copy2(f, dest)
            break
        else:
            # none of the regexes in self.templ matched this file
            self.log.warning("Couldn't process %s" % f)
    self.log.info(
        "Extracted %s current versions and %s archived versions" %
        (current, archived))
def parse(self, doc):
    """Parse downloaded documents into structured XML and RDF.

    Reads the downloaded file for ``doc.basefile``, runs it through the
    parser returned by ``self.get_parser`` and stores the result in
    ``doc.body``. Also sets ``doc.uri`` and ``doc.lang`` and records
    metadata in ``doc.meta`` through a ``Describer``.

    :param doc: The document object to fill in (modified in place).
    :returns: Always ``True``.
    """
    source = TextReader(self.store.downloaded_path(doc.basefile),
                        linesep=TextReader.UNIX)

    # Some more preprocessing: Remove the faux-bold formatting used in
    # some RFCs (using repetitions of characters interleaved with
    # backspace control sequences). Note: that is '\b' as in
    # backspace, not r'\b' as in word boundary
    strip_overstrike = re.compile('.\b').sub
    paragraphs = (strip_overstrike('', chunk)
                  for chunk in source.getiterator(source.readparagraph))

    fsm = self.get_parser(doc.basefile)
    # An explicit config setting wins; otherwise the environment
    # decides whether the parser should run in debug mode.
    if not self.config.fsmdebug:
        self.config.fsmdebug = 'FERENDA_FSMDEBUG' in os.environ
    fsm.debug = self.config.fsmdebug
    doc.body = fsm.parse(paragraphs)

    # The first two elements of the parsed body are lifted out of it.
    header = doc.body.pop(0)  # body.findByClass(RFCHeader)
    raw_title = doc.body.pop(0)  # body.findByClass(DocHeader)
    title = " ".join(raw_title.split())

    # Drop the first "Table of Contents" preamble section, if any.
    toc = next((part for part in doc.body
                if isinstance(part, PreambleSection)
                and part.title == "Table of Contents"), None)
    if toc is not None:
        doc.body.remove(toc)

    # create (RDF) metadata for document Note: The provided basefile
    # may be incorrect -- let whatever is in the header override
    # (eg RFC 100 has none -- fallback to basefile in that case)
    realid = self.get_rfc_num(header) or doc.basefile
    doc.uri = self.canonical_uri(realid)

    ns = self.ns
    desc = Describer(doc.meta, doc.uri)
    desc.value(ns['prov'].wasGeneratedBy, self.qualified_class_name())
    desc.value(ns['dcterms'].title, title, lang="en")
    self.parse_header(header, desc)

    # parse_header might have set .rdftype, but if not:
    try:
        desc.getrdftype()
    except KeyError:
        desc.rdftype(ns['rfc'].RFC)

    if not desc.getvalues(ns['dcterms'].identifier):
        desc.value(ns['dcterms'].identifier, "RFC %s" % doc.basefile)
    doc.lang = "en"

    # process body - remove the temporary Pagebreak objects, after
    # having extracted the shortTitle found in them
    shorttitle = self.cleanup_body(doc.body)
    if shorttitle and desc.getvalue(ns['dcterms'].title) != shorttitle:
        desc.value(ns['bibo'].shortTitle, shorttitle, lang="en")

    # process body - add good metadata
    doc.body = self.make_citation_parser().parse_recursive(doc.body)
    PreambleSection.counter = 0
    # self.decorate_bodyparts(doc.body,doc.uri)
    if self.config.fsmdebug:
        print(serialize(doc.body))
    return True
def parse(self, doc):
    """Parse a downloaded RFC text file into ``doc.body`` and ``doc.meta``.

    The body is built twice: first as a flat sequence of headings and
    preformatted blocks, then (replacing the first result) as nested
    sections, subsections and subsubsections. In between, metadata
    found in the header block (title, identifier, publication date,
    category, updated/obsoleted RFCs) is recorded through a
    ``Describer``. Finally, citations in the body are processed by a
    ``CitationParser``.

    :param doc: The document object; ``doc.basefile`` and ``doc.uri``
                are read, ``doc.body`` and ``doc.meta`` are modified.
    """
    # some very simple heuristic rules for determining
    # what an individual paragraph is
    def is_heading(p):
        # If it's on a single line and it isn't indented with spaces
        # it's probably a heading.
        return p.count("\n") == 0 and not p.startswith(" ")

    def is_pagebreak(p):
        # if it contains a form feed character, it represents a page break
        return "\f" in p

    # Parsing a document consists mainly of two parts:
    # 1: First we parse the body of text and store it in doc.body
    from ferenda.elements import Body, Preformatted, Title, Heading
    from ferenda import Describer
    reader = TextReader(self.store.downloaded_path(doc.basefile))

    # First paragraph of an RFC is always a header block
    header = reader.readparagraph()
    # Preformatted is a ferenda.elements class representing a
    # block of preformatted text. It is derived from the built-in
    # list type, and must thus be initialized with an iterable, in
    # this case a single-element list of strings. (Note: if you
    # try to initialize it with a string, because strings are
    # iterables as well, you'll end up with a list where each
    # character in the string is an element, which is not what you
    # want).
    preheader = Preformatted([header])
    # Doc.body is a ferenda.elements.Body class, which is also
    # is derived from list, so it has (amongst others) the append
    # method. We build our document by adding to this root
    # element.
    doc.body.append(preheader)

    # Second paragraph is always the title, and we don't include
    # this in the body of the document, since we'll add it to the
    # medata -- once is enough
    title = reader.readparagraph()

    # After that, just iterate over the document and guess what
    # everything is. TextReader.getiterator is useful for
    # iterating through a text in other chunks than single lines
    for para in reader.getiterator(reader.readparagraph):
        if is_heading(para):
            # Heading is yet another of these ferenda.elements
            # classes.
            doc.body.append(Heading([para]))
        elif is_pagebreak(para):
            # Just drop these remnants of a page-and-paper-based past
            pass
        else:
            # If we don't know that it's something else, it's a
            # preformatted section (the safest bet for RFC text).
            doc.body.append(Preformatted([para]))

    # 2: Then we create metadata for the document and store it in
    # doc.meta (in this case using the convenience
    # ferenda.Describer class).
    desc = Describer(doc.meta, doc.uri)

    # Set the rdf:type of the document
    desc.rdftype(self.rdf_type)

    # Set the title we've captured as the dcterms:title of the document and
    # specify that it is in English
    desc.value(self.ns['dcterms'].title, util.normalize_space(title),
               lang="en")

    # Construct the dcterms:identifier (eg "RFC 6991") for this document from the basefile
    desc.value(self.ns['dcterms'].identifier, "RFC " + doc.basefile)

    # find and convert the publication date in the header to a datetime
    # object, and set it as the dcterms:issued date for the document
    re_date = re.compile(
        r"(January|February|March|April|May|June|July|August|September|October|November|December) (\d{4})"
    ).search
    dt_match = re_date(header)
    if dt_match:
        # This is a context manager that temporarily sets the system
        # locale to the "C" locale in order to be able to use strptime
        # with a string on the form "August 2013", even though the
        # system may use another locale.
        with util.c_locale():
            dt = datetime.strptime(dt_match.group(0), "%B %Y")
        pubdate = date(dt.year, dt.month, dt.day)
        # Note that using some python types (cf. datetime.date)
        # results in a datatyped RDF literal, ie in this case
        #   <http://localhost:8000/res/rfc/6994> dcterms:issued "2013-08-01"^^xsd:date
        desc.value(self.ns['dcterms'].issued, pubdate)

    # find any older RFCs that this document updates or obsoletes
    obsoletes = re.search(r"^Obsoletes: ([\d+, ]+)", header, re.MULTILINE)
    updates = re.search(r"^Updates: ([\d+, ]+)", header, re.MULTILINE)

    # Find the category of this RFC, store it as dcterms:subject
    cat_match = re.search(r"^Category: ([\w ]+?)(  |$)", header,
                          re.MULTILINE)
    if cat_match:
        desc.value(self.ns['dcterms'].subject, cat_match.group(1))

    for predicate, matches in ((self.ns['rfc'].updates, updates),
                               (self.ns['rfc'].obsoletes, obsoletes)):
        if matches is None:
            continue
        # add references between this document and these older rfcs,
        # using either rfc:updates or rfc:obsoletes
        for match in matches.group(1).strip().split(", "):
            uri = self.canonical_uri(match)
            # Note that this uses our own unofficial
            # namespace/vocabulary
            # http://example.org/ontology/rfc/
            desc.rel(predicate, uri)

    # And now we're done. We don't need to return anything as
    # we've modified the Document object that was passed to
    # us. The calling code will serialize this modified object to
    # XHTML and RDF and store it on disk

    # end parse1
    # Now do it again
    reader.seek(0)
    reader.readparagraph()
    reader.readparagraph()
    doc.body = Body()
    doc.body.append(preheader)
    # doc.body.append(Title([util.normalize_space(title)]))
    # begin parse2
    from ferenda.elements import Section, Subsection, Subsubsection

    # More heuristic rules: Section headers start at the beginning
    # of a line and are numbered. Subsections and subsubsections
    # have dotted numbers, optionally with a trailing period, ie
    # '9.2.' or '11.3.1'
    def is_section(p):
        return re.match(r"\d+\.? +[A-Z]", p)

    def is_subsection(p):
        return re.match(r"\d+\.\d+\.? +[A-Z]", p)

    def is_subsubsection(p):
        return re.match(r"\d+\.\d+\.\d+\.? +[A-Z]", p)

    def split_sectionheader(p):
        # returns a tuple of title, ordinal, identifier
        ordinal, title = p.split(" ", 1)
        ordinal = ordinal.strip(".")
        return title.strip(), ordinal, "RFC %s, section %s" % (
            doc.basefile, ordinal)

    # Use a list as a simple stack to keep track of the nesting
    # depth of a document. Every time we create a Section,
    # Subsection or Subsubsection object, we push it onto the
    # stack (and clear the stack down to the appropriate nesting
    # depth). Every time we create some other object, we append it
    # to whatever object is at the top of the stack. As your rules
    # for representing the nesting of structure become more
    # complicated, you might want to use the
    # :class:`~ferenda.FSMParser` class, which lets you define
    # heuristic rules (recognizers), states and transitions, and
    # takes care of putting your structure together.
    stack = [doc.body]

    for para in reader.getiterator(reader.readparagraph):
        if is_section(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Section(title=title, ordinal=ordinal, identifier=identifier)
            stack[1:] = []  # clear all but bottom element
            stack[0].append(s)  # add new section to body
            stack.append(s)  # push new section on top of stack
        elif is_subsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsection(title=title, ordinal=ordinal,
                           identifier=identifier)
            stack[2:] = []  # clear all but bottom two elements
            # add new subsection to current section (or directly to
            # the body, should no section have been seen yet --
            # indexing stack[1] would raise IndexError in that case)
            stack[-1].append(s)
            stack.append(s)
        elif is_subsubsection(para):
            title, ordinal, identifier = split_sectionheader(para)
            s = Subsubsection(title=title, ordinal=ordinal,
                              identifier=identifier)
            stack[3:] = []  # clear all but bottom three
            stack[-1].append(s)  # add new subsubsection to current subsection
            stack.append(s)
        elif is_heading(para):
            stack[-1].append(Heading([para]))
        elif is_pagebreak(para):
            pass
        else:
            pre = Preformatted([para])
            stack[-1].append(pre)
    # end parse2

    # begin citation1
    from pyparsing import Word, CaselessLiteral, nums
    section_citation = (
        CaselessLiteral("section") +
        Word(nums + ".").setResultsName("Sec")).setResultsName("SecRef")
    rfc_citation = ("[RFC" + Word(nums).setResultsName("RFC") +
                    "]").setResultsName("RFCRef")
    section_rfc_citation = (section_citation + "of" +
                            rfc_citation).setResultsName("SecRFCRef")
    # end citation1

    # begin citation2
    def rfc_uriformatter(parts):
        uri = ""
        if 'RFC' in parts:
            uri += self.canonical_uri(parts['RFC'].lstrip("0"))
        if 'Sec' in parts:
            uri += "#S" + parts['Sec']
        return uri
    # end citation2

    # begin citation3
    from ferenda import CitationParser, URIFormatter
    citparser = CitationParser(section_rfc_citation, section_citation,
                               rfc_citation)
    citparser.set_formatter(
        URIFormatter(("SecRFCRef", rfc_uriformatter),
                     ("SecRef", rfc_uriformatter),
                     ("RFCRef", rfc_uriformatter)))
    citparser.parse_recursive(doc.body)
def run_test_file(self, filename, debug=False):
    """Parse *filename* with a small FSMParser grammar built on the fly.

    Sets up recognizers, constructors and transitions for a simple
    structured plaintext format (numbered sections, ordered lists and
    a few artificial "State X:" paragraphs) and parses the file
    paragraph by paragraph.

    :param filename: Path of the UTF-8 text file to parse.
    :param debug: Value assigned to the parser's ``debug`` attribute.
    :returns: A ``(parser, body)`` tuple with the configured FSMParser
              and the result of the parse.
    """
    # some basic recognizers and constructors to parse a simple
    # structured plaintext format.
    #
    # RECOGNIZERS
    def is_header(parser):
        suspect = parser.reader.peek()
        return (len(suspect) > 100 and not suspect.endswith("."))

    def is_section(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 1

    def is_subsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 2

    def is_subsubsection(parser):
        (ordinal, title) = analyze_sectionstart(parser.reader.peek())
        return section_segments_count(ordinal) == 3

    def is_preformatted(parser):
        return " " in parser.reader.peek()

    def is_definition(parser):
        return False

    def is_description(parser):
        return False

    def is_li_decimal(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('decimal', 'decimal-leading-zero')

    def is_li_alpha(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-alpha', 'upper-alpha')

    def is_li_roman(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('lower-roman', 'upper-roman')

    def is_unordereditem(parser):
        listtype = analyze_listitem(parser.reader.peek())[0]
        return listtype in ('disc', 'circle', 'square', 'dash')

    def is_state_a(parser):
        return parser.reader.peek().startswith("State A:")

    def is_state_b(parser):
        return parser.reader.peek().startswith("State B:")

    def is_state_c(parser):
        return parser.reader.peek().startswith("State C:")

    def is_paragraph(parser):
        # c.f. test/files/fsmparser/invalid.txt
        return len(parser.reader.peek()) > 6

    # MAGIC
    def sublist_or_parent(symbol, state_stack):
        # Decide whether a list item of another kind than the current
        # list starts a new sublist (if no list of that kind is open
        # anywhere on the stack) or belongs to a parent list (in which
        # case (False, None) is returned).
        constructor = False
        newstate = None
        if symbol == is_li_alpha and "ol-alpha" not in state_stack:
            # maybe only check state_stack[-2]
            constructor = make_ol_alpha
            newstate = "ol-alpha"
        elif symbol == is_li_roman and "ol-roman" not in state_stack:
            constructor = make_ol_roman
            newstate = "ol-roman"
        elif symbol == is_li_decimal and "ol-decimal" not in state_stack:
            # a decimal item must open a decimal list, not a roman one
            constructor = make_ol_decimal
            newstate = "ol-decimal"
        return (constructor, newstate)

    # CONSTRUCTORS
    @newstate('body')
    def make_body(parser):
        parser._debug("Hello")
        b = elements.Body()
        return parser.make_children(b)

    @newstate('section')
    def make_section(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Section(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsection')
    def make_subsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    @newstate('subsubsection')
    def make_subsubsection(parser):
        (secnumber, title) = analyze_sectionstart(parser.reader.next())
        s = elements.Subsubsection(ordinal=secnumber, title=title)
        return parser.make_children(s)

    def make_paragraph(parser):
        return elements.Paragraph([parser.reader.next().strip()])

    def make_preformatted(parser):
        return elements.Preformatted([parser.reader.next()])

    # def make_unorderedlist(parser):
    #     listtype = analyze_listitem(parser.reader.peek())[0]
    #     assert ordinal is None
    #     ul = elements.UnorderedList(type=listtype)
    #     ul.append(parser.make_child(IN_UNORDEREDLIST))  # 1st element of list
    #     return parser.make_children(ul)
    # setattr(make_unorderedlist,'newstate','unorderedlist')

    @newstate('ol-decimal')
    def make_ol_decimal(parser):
        return make_orderedlist(parser, "decimal", "ol-decimal")

    @newstate('ol-alpha')
    def make_ol_alpha(parser):
        return make_orderedlist(parser, "lower-alpha", "ol-alpha")

    @newstate('ol-roman')
    def make_ol_roman(parser):
        return make_orderedlist(parser, "lower-roman", "ol-roman")

    @newstate('listitem')
    def make_listitem(parser):
        chunk = parser.reader.next()
        (listtype, ordinal, separator, rest) = analyze_listitem(chunk)
        li = elements.ListItem(ordinal=ordinal)
        li.append(rest)
        return parser.make_children(li)

    # NOTE: no @newstate decorator for these -- we transition from
    # one state to the next, not push a new state onto the stack
    def make_state_a(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-a")

    def make_state_b(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-b")

    def make_state_c(parser):
        return elements.Paragraph([parser.reader.next().strip()],
                                  id="state-c")

    # HELPERS
    def section_segments_count(s):
        # False for None, otherwise the number of non-empty
        # dot-separated segments in s
        return ((s is not None) and
                len(list(filter(None, s.split(".")))))

    def make_orderedlist(parser, listtype, childstate):
        # NB: the listtype argument is overridden by whatever the
        # upcoming chunk looks like, and childstate is not used.
        listtype = analyze_listitem(parser.reader.peek())[0]
        ol = elements.OrderedList(type=listtype)
        ol.append(parser.make_child(make_listitem, "listitem"))
        return parser.make_children(ol)

    # matches
    # "1 Blahonga"
    # "1.2.3. This is a subsubsection"
    re_sectionstart = re.compile(r"^(\d[\.\d]*) +(.*[^\.])$").match

    def analyze_sectionstart(chunk):
        m = re_sectionstart(chunk)
        if m:
            return (m.group(1).rstrip("."), m.group(2).strip())
        else:
            return (None, chunk)

    def analyze_listitem(chunk):
        # returns: same as list-style-type in CSS2.1, sans
        # 'georgian', 'armenian' and 'greek', plus 'dashed'
        listtype = ordinal = separator = rest = None
        # match "1. Foo…" or "14) bar…" but not "4 This is a heading"
        m = re.match(r'^(\d+)([\.\)]) +', chunk)
        if m:
            if chunk.startswith("0"):
                listtype = "decimal-leading-zero"
            else:
                listtype = "decimal"
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "IX. Foo… or "vii) bar…" but not "vi is a sucky
        # editor" or "MMXIII is the current year"
        m = re.match(r'^([IVXivx]+)([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-roman'
            else:
                listtype = 'upper-roman'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        # match "a. Foo… or "z) bar…" but not "to. Next sentence…"
        m = re.match(r'^([A-Za-z])([\.\)]) +', chunk)
        if m:
            if chunk[0].islower():
                listtype = 'lower-alpha'
            else:
                listtype = 'upper-alpha'
            (ordinal, separator) = m.groups()
            rest = chunk[m.end():]
            return (listtype, ordinal, separator, rest)

        if chunk.startswith("* "):
            return ("disc", None, None, chunk)
        if chunk.startswith("- "):
            return ("dash", None, None, chunk)
        return (listtype, ordinal, separator, chunk)  # None * 3

    # MAIN CODE
    p = FSMParser()
    p.set_recognizers(is_li_decimal,
                      is_li_roman,
                      is_li_alpha,
                      is_header,
                      is_section,
                      is_subsection,
                      is_subsubsection,
                      is_preformatted,
                      is_definition,
                      is_description,
                      is_state_a,
                      is_state_b,
                      is_state_c,
                      is_paragraph)
    p.set_transitions({
        ("body", is_paragraph): (make_paragraph, None),
        ("body", is_section): (make_section, "section"),
        ("body", is_state_a): (make_state_a, "state-a"),
        ("state-a", is_state_b): (make_state_b, "state-b"),
        ("state-b", is_state_c): (make_state_c, "state-c"),
        ("state-c", is_section): (False, None),
        ("section", is_paragraph): (make_paragraph, None),
        ("section", is_subsection): (make_subsection, "subsection"),
        ("subsection", is_paragraph): (make_paragraph, None),
        ("subsection", is_subsection): (False, None),
        ("subsection", is_state_a): (False, "body"),
        ("subsection", is_subsubsection): (make_subsubsection,
                                           "subsubsection"),
        ("subsubsection", is_paragraph): (make_paragraph, None),
        ("subsubsection", is_section): (False, None),
        ("subsection", is_section): (False, None),
        ("section", is_section): (False, None),
        ("body", is_li_decimal): (make_ol_decimal, "ol-decimal"),
        ("ol-decimal", is_li_decimal): (make_listitem, "listitem"),
        ("ol-decimal", is_li_alpha): (make_ol_alpha, "ol-alpha"),
        ("ol-alpha", is_li_alpha): (make_listitem, "listitem"),
        ("ol-alpha", is_li_roman): (make_ol_roman, "ol-roman"),
        ("ol-roman", is_li_roman): (make_listitem, "listitem"),
        ("ol-roman", is_li_alpha): (False, None),
        ("ol-alpha", is_li_decimal): (False, None),
        ("listitem", is_li_alpha): sublist_or_parent,
        ("listitem", is_li_roman): sublist_or_parent,
        ("listitem", is_li_decimal): sublist_or_parent,
    })
    p.debug = debug
    tr = TextReader(filename, encoding="utf-8", linesep=TextReader.UNIX)
    p.initial_state = "body"
    p.initial_constructor = make_body
    b = p.parse(tr.getiterator(tr.readparagraph))
    return p, b