def testCitationSetters(self):
    """ Assigning a new citation to a parsed text either keeps the given object
    or rebuilds an equivalent one from its refsDecl. """
    d = Citation()
    c = Citation(
        name="ahah",
        refsDecl="/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
        child=None)
    # b deliberately targets a different element (tei:z) than c (tei:div)
    b = Citation(
        name="ahah",
        refsDecl="/tei:TEI/tei:text/tei:body/tei:div/tei:z[@n='$1']",
        child=None)
    with open("tests/testing_data/texts/sample.xml", "rb") as sample:
        a = CapitainsCtsText(resource=sample, citation=b)

    """ Test original setting """
    self.assertIs(a.citation, b)
    """ Test simple replacement """
    a.citation = d
    self.assertIs(a.citation, d)
    """ Test conversion """
    a.citation = c
    self.assertEqual(a.citation.name, "ahah")
    self.assertEqual(a.citation.child, None)
    self.assertEqual(
        a.citation.refsDecl,
        "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']")
    # scope/xpath are derived by splitting the refsDecl at the last step
    self.assertEqual(a.citation.scope, "/tei:TEI/tei:text/tei:body/tei:div")
    self.assertEqual(a.citation.xpath, "/tei:div[@n='?']")
def test_node_collision(self):
    """ Test unique_passage """
    unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
    # First scheme uses descendant axes (//), so the two citation levels can
    # select the same node twice: unique_passage must report a failure.
    unit.xml = etree.ElementTree(
        etree.fromstring(
            self.frame.format(
                "/tei:TEI/tei:text/tei:body//tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div[@n='$1']//tei:div[@n='$2']",
                1, 1, 2, 3, 1, 1, 2))).getroot()
    unit.Text = CapitainsCtsText(resource=unit.xml)
    unit.flush()
    results = [result for result in unit.unique_passage()]
    self.assertEqual(results, [False], "Wrong citation with node collision should fail")

    # Second scheme uses strict child axes: every node is matched once.
    unit.xml = etree.ElementTree(
        etree.fromstring(
            self.frame.format(
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
                1, 1, 2, 3, 1, 1, 2))).getroot()
    unit.Text = CapitainsCtsText(resource=unit.xml)
    unit.flush()
    results = [result for result in unit.unique_passage()]
    self.assertEqual(results, [True], "Right citation with node collision should success")
def test_get_passage_hypercontext_complex_xpath(self):
    """ A serialized range passage, re-parsed as a text, must still resolve
    individual references and list valid references correctly. """
    simple = self.text_complex.getTextualNode(Reference("pr.1-1.2"))
    str_simple = simple.tostring(encoding=str)
    # Re-parse the exported slice with the original citation scheme
    text = CapitainsCtsText(resource=str_simple, citation=self.text_complex.citation)
    self.assertIn(
        "Pervincis tandem",
        text.getTextualNode(Reference("pr.1"), simple=True).export(
            output=Mimetypes.PLAINTEXT,
            exclude=["tei:note"]).strip(),
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        text.getTextualNode(
            Reference("1.2"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "lusimus quos in Suebae gratiam virgunculae,",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=2))),
        ["pr.1", "1.1", "1.2"],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Reads every non-``__cts__`` XML file of the First1KGreek corpus under
    ``~/cltk_data`` and writes one ``.txt`` file per source document into
    ``~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/``.

    :raises FileNotFoundError: if the corpus has not been installed.
    """
    xml_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error('1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.')
        raise FileNotFoundError
    # __cts__.xml files are metadata, not texts
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.expanduser('~/cltk_data/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # BUG FIX: the original `xml_name.rstrip('.xml')` strips the trailing
        # *character set* {'.', 'x', 'm', 'l'}, mangling any basename ending in
        # those letters. splitext removes exactly one extension instead.
        xml_name = os.path.splitext(xml_name)[0] + '.txt'

        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            # Walk the deepest citation level and concatenate each passage
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plain_text += text_line

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
def test_Text_text_function(self, simple):
    """ Exporting a re-parsed single passage as plaintext yields the expected line. """
    passage = self.seneca.getTextualNode(Reference("1"), simple=simple)
    serialized = passage.tostring(encoding=str)
    reparsed = CapitainsCtsText(resource=serialized, citation=self.seneca.citation)
    exported = reparsed.export(output=Mimetypes.PLAINTEXT, exclude=["tei:note"])
    self.assertEqual(
        exported.strip(),
        "Di coniugales tuque genialis tori,",
        "Ensure text methods works on CtsTextMetadata object")
def test_warning(self):
    """ getValidReff(_debug=True) must emit a DuplicateReference warning at
    each citation level of a text with duplicated references. """
    with open("tests/testing_data/texts/duplicate_references.xml") as xml:
        text = CapitainsCtsText(resource=xml)
        with warnings.catch_warnings(record=True) as w:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            for i in [1, 2, 3]:
                text.getValidReff(level=i, _debug=True)

    self.assertEqual(len(w), 3, "There should be warning on each level")
    self.assertEqual(
        issubclass(w[-1].category, MyCapytain.errors.DuplicateReference), True,
        "Warning should be DuplicateReference")
    self.assertEqual(str(w[0].message), "1", "Warning message should be list of duplicate")
def testURN(self):
    """ The urn passed at construction time must be exposed on the parsed text. """
    expected = "urn:cts:latinLit:phi1294.phi002.perseus-lat2"
    parsed = CapitainsCtsText(resource=self.TEI.xml, urn=expected)
    self.assertEqual(str(parsed.urn), expected)
def parsable(self):
    """ Check that the text is parsable (as XML) and ingest it through MyCapytain then.

    .. note:: Override super(parsable) and add CapiTainS Ingesting to it
    """
    # Parent class checks plain XML well-formedness first
    status = next(super(CTSText_TestUnit, self).parsable())
    if status is True:
        try:
            self.Text = CapitainsCtsText(resource=self.xml.getroot())
        except MissingRefsDecl as E:
            # XML is valid but has no refsDecl: record the CapiTainS failure
            self.Text = None
            self.log(str(E))
            self.capitains_errors.append(str(E))
            yield False
    else:
        self.Text = None
    yield status
def build_texts(self, text):
    """ Convert one CapiTainS XML file to a plaintext file, one passage per block.

    :param text: Path to the XML file to convert
    """
    interactive_text = CapitainsCtsText(
        resource=etree.parse(text).getroot())
    # Deepest citation level = individual passages
    reffs = interactive_text.getReffs(level=len(interactive_text.citation))
    passages = [
        interactive_text.getTextualNode(passage) for passage in reffs
    ]
    plaintext = [
        r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip()
        for r in passages
    ]
    if self.cites is True:
        # Prefix each passage with its reference marker, e.g. "#1.pr.2#"
        for i, t in enumerate(plaintext):
            plaintext[i] = '#' + reffs[i] + '#\n' + t
    with open('{}text/{}.txt'.format(
            self.dest, text.split('/')[-1].replace('.xml', '')),
            mode='w') as f:
        f.write('\n\n'.join(plaintext))
def test_passage_extraction_fail_when_reffs_are_found(self):
    """ This issues is drawn from https://github.com/PerseusDL/canonical-latinLit/issues/226 """
    with open("tests/testing_data/texts/extraction_issue.xml") as text:
        interactive_text = CapitainsCtsText(resource=etree.parse(text).getroot())
        reffs = interactive_text.getReffs(level=len(interactive_text.citation))
        passages = []
        # The failing passage was 5.1
        for reff in reffs:
            try:
                passages.append(interactive_text.getTextualNode(reff))
            except IndexError:
                # Surface which reference failed instead of a bare IndexError
                raise Exception("Unable to extract %s " % reff)
        plaintext = [
            r.export(Mimetypes.PLAINTEXT, exclude=["tei:note"]).strip()
            for r in passages
        ]
        self.assertIn(
            "NUNC et praedictos et regni sorte sequentes",
            plaintext,
            "The text of 5.1 should be in plaintext"
        )
def test_empty_ref_warning(self):
    """ getValidReff(_debug=True) must emit an EmptyReference warning at each
    citation level of a text with empty references. """
    with open("tests/testing_data/texts/empty_references.xml") as xml:
        text = CapitainsCtsText(resource=xml)
        with warnings.catch_warnings(record=True) as w:
            # Cause all warnings to always be triggered.
            warnings.simplefilter("always")
            for i in [1, 2, 3]:
                text.getValidReff(level=i, _debug=True)

    self.assertEqual(len(w), 3, "There should be warning on each level")
    self.assertEqual(
        issubclass(w[-1].category, MyCapytain.errors.EmptyReference), True,
        "Warning should be EmptyReference")
    self.assertEqual(
        [str(s.message) for s in w],
        [
            "1 empty reference(s) at citation level 1",
            "1 empty reference(s) at citation level 2",
            "1 empty reference(s) at citation level 3"
        ],
        "Warning message should indicate number of references and the level at which they occur"
    )
def onekgreek_tei_xml_to_text_capitains():
    """Use MyCapitains program to convert TEI to plaintext.

    Reads every non-``__cts__`` XML file of the First1KGreek corpus under the
    CLTK data dir and writes one ``.txt`` file per source document into
    ``greek_text_first1kgreek_plaintext/``.

    :raises FileNotFoundError: if the corpus has not been installed.
    """
    xml_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek/data/*/*/*.xml')
    xml_paths = glob.glob(xml_dir)
    if not len(xml_paths):
        logger.error(
            '1K Greek corpus not installed. Use CorpusInstaller to get `First1KGreek`.'
        )
        raise FileNotFoundError
    # __cts__.xml files are metadata, not texts
    xml_paths = [path for path in xml_paths if '__cts__' not in path]

    # new dir
    new_dir = os.path.normpath(
        get_cltk_data_dir() +
        '/greek/text/greek_text_first1kgreek_plaintext/')
    if not os.path.isdir(new_dir):
        os.makedirs(new_dir)

    for xml_path in xml_paths:
        _, xml_name = os.path.split(xml_path)
        # BUG FIX: the original `xml_name.rstrip('.xml')` strips the trailing
        # *character set* {'.', 'x', 'm', 'l'}, mangling any basename ending in
        # those letters. splitext removes exactly one extension instead.
        xml_name = os.path.splitext(xml_name)[0] + '.txt'

        plain_text = ''
        with open(xml_path) as file_open:
            text = CapitainsCtsText(resource=file_open)
            # Walk the deepest citation level and concatenate each passage
            for ref in text.getReffs(level=len(text.citation)):
                psg = text.getTextualNode(subreference=ref, simple=True)
                text_line = psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])
                plain_text += text_line

        new_plaintext_path = os.path.join(new_dir, xml_name)
        with open(new_plaintext_path, 'w') as file_open:
            file_open.write(plain_text)
def test_get_passage_hyper_context_double_slash_xpath(self):
    """ A serialized range (and a serialized single passage) from a text with
    descendant-axis (//) citation XPaths must re-parse into a text whose
    passages and valid references are preserved. """
    simple = self.seneca.getTextualNode(Reference("1-10"))
    str_simple = simple.export(output=Mimetypes.XML.Std)
    text = CapitainsCtsText(resource=str_simple, citation=self.seneca.citation)
    self.assertEqual(
        text.getTextualNode(Reference("1"), simple=True).export(
            output=Mimetypes.PLAINTEXT,
            exclude=["tei:note"]).strip(),
        "Di coniugales tuque genialis tori,",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        text.getTextualNode(
            Reference("10"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "aversa superis regna manesque impios",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=1))),
        ["1", "2", "3", "4", "5", "6", "7", "8", "9", "10"],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )

    # Same round-trip with a single passage instead of a range
    simple = self.seneca.getTextualNode(Reference("1"))
    str_simple = simple.tostring(encoding=str)
    text = CapitainsCtsText(resource=str_simple, citation=self.seneca.citation)
    self.assertEqual(
        text.getTextualNode(Reference("1"), simple=True).export(
            output=Mimetypes.PLAINTEXT,
            exclude=["tei:note"]).strip(),
        "Di coniugales tuque genialis tori,",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=1))),
        ["1"],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
def _get_citable_text(self, fileid):
    """ Open *fileid* relative to the corpus root and parse it as a CTS text.

    Parameters
    ----------
    fileid: str
        The file identifier of the file to read

    Returns
    -------
    CapitainsCtsText object
    """
    with open(self._root.join(fileid)) as handle:
        parsed = CapitainsCtsText(resource=handle)
    return parsed
def test_illegal_characters_pass(self):
    """ Test that forbidden passes when there are no illegal characters"""
    unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
    # Reference values here ("q", "b", "105v", ...) contain no forbidden chars
    unit.xml = etree.ElementTree(
        etree.fromstring(
            self.frame.format(
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
                0, 1, "q", "b", "105v", "1", "2"))).getroot()
    unit.Text = CapitainsCtsText(resource=unit.xml)
    unit.flush()
    results = [result for result in unit.passages()]
    self.assertEqual(results, [True, True], "Passages are found")
    # forbidden() only reports when the passages test already succeeded
    unit.test_status['passages'] = True
    results = list(unit.forbidden())
    self.assertEqual(
        results, [True],
        "Illegal characters should pass if no forbidden characters")
    self.assertEqual(
        unit.forbiddens, [],
        "All passage IDs containing forbidden characters should be stored."
    )
def test_illegal_characters_fail(self):
    """ Test that illegal characters are detected"""
    unit = HookTest.capitains_units.cts.CTSText_TestUnit("/a/b")
    # Reference values contain spaces, dots, dashes and '@' — all forbidden
    unit.xml = etree.ElementTree(
        etree.fromstring(
            self.frame.format(
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']",
                "/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n='$1']/tei:div[@n='$2']",
                "0 1", "a.b", "d-d", "@", "7", "1", "2"))).getroot()
    unit.Text = CapitainsCtsText(resource=unit.xml)
    unit.flush()
    results = [result for result in unit.passages()]
    self.assertEqual(results, [True, True], "Passages are found")
    results = list(unit.forbidden())
    self.assertEqual(results, [False], "Illegal character should fail")
    self.assertIn(
        ">>>>>> Reference with forbidden characters found: '0 1', '0 1.a.b', '0 1.d-d', '0 1.@'",
        unit.logs)
    self.assertCountEqual(
        unit.forbiddens,
        ["'0 1'", "'0 1.a.b'", "'0 1.d-d'", "'0 1.@'"],
        "All passage IDs containing forbidden characters should be stored."
    )
class CTSText_TestUnit(TESTUnit):
    """ CTS testing object

    :param path: Path to the file
    :type path: basestring
    :param countwords: Count the number of words and log it if necessary
    :type countwords: bool

    :cvar tests: Contains the list of methods to be run again the text
    :type tests: [str]
    :cvar readable: Human friendly string associated to object methods
    :type readable: dict

    :ivar inv: List of URN retrieved in metadata. Used to check the availability of metadata for the text
    :type inv: [str]
    :ivar scheme: Scheme to be used to check the
    :type scheme: str
    :ivar Text: Text object according to MyCapytains parsing. Used to find passages
    :type Text: MyCapytain.resources.text.local.Text

    Shared variables with parent class:

    :ivar path: Path for the resource
    :type path: str
    :ivar xml: XML resource, parsed in python. Used to do general checking
    :type xml: lxml._etree.Element

    .. note:: All method in CTSText_TestUnit.tests ( "parsable", "has_urn", "naming_convention", "refsDecl", \
    "passages", "unique_passage", "inventory" ) yield at least one boolean (might be more) which represents \
    the success of it.
    """

    tests = [
        # Parsing the XML
        "parsable",
        # Retrieving the URN (requires parsale
        "has_urn", 'language',
        # Requires has_urn
        "inventory", "naming_convention",
        # Requires parsable
        "refsDecl", "passages", "unique_passage", "duplicate", "forbidden", "empty"
    ]
    # Failure of any of these aborts the remaining tests (see test())
    breaks = ["parsable", "refsDecl", "passages"]
    readable = {
        "parsable": "File parsing",
        "refsDecl": "RefsDecl parsing",
        "passages": "Passage level parsing",
        "duplicate": "Duplicate passages",
        "forbidden": "Forbidden characters",
        "epidoc": "Epidoc DTD validation",
        "tei": "TEI DTD Validation",
        "auto_rng": "Automatic RNG validation",
        "local_file": "Custom local RNG validation",
        "has_urn": "URN informations",
        "naming_convention": "Naming conventions",
        "inventory": "Available in inventory",
        "unique_passage": "Unique nodes found by XPath",
        "count_words": "Word Counting",
        "language": "Correct xml:lang attribute",
        "empty": "Empty References"
    }
    # Token splitter used by count_words
    splitter = re.compile(r'\S+', re.MULTILINE)

    def __init__(self, path, countwords=False, timeout=30, *args, **kwargs):
        self.inv = list()
        self.timeout = timeout
        self.scheme = None
        self.guidelines = None
        self.rng = None
        self.Text = None
        self.xml = None
        self.count = 0
        self.countwords = countwords
        self.citation = list()
        self.duplicates = list()
        self.forbiddens = list()
        self.empties = list()
        self.capitains_errors = list()
        self.test_status = defaultdict(bool)
        self.lang = ''
        self.dtd_errors = list()
        super(CTSText_TestUnit, self).__init__(path, *args, **kwargs)

    def parsable(self):
        """ Check that the text is parsable (as XML) and ingest it through MyCapytain then.

        .. note:: Override super(parsable) and add CapiTainS Ingesting to it
        """
        status = next(
            super(CTSText_TestUnit, self).parsable()
        )
        if status is True:
            try:
                self.Text = CapitainsCtsText(resource=self.xml.getroot())
            except MissingRefsDecl as E:
                # XML is valid but no refsDecl: record CapiTainS-level failure
                self.Text = None
                self.log(str(E))
                self.capitains_errors.append(str(E))
                yield False
        else:
            self.Text = None
        yield status

    def refsDecl(self):
        """ Check that the text contains refsDecl informations
        """
        if self.Text:
            # In 1.0.1, MyCapytain actually create an empty citation by default
            if not self.Text.citation.isEmpty():
                self.log(str(len(self.Text.citation)) + " citation's level found")
                yield True
            else:
                yield False
        else:
            yield False

    def run_rng(self, rng_path):
        """ Run the RNG through JingTrang

        :param rng_path: Path to the RelaxNG file to run against the XML to test
        """
        test = subprocess.Popen(
            ["java", "-Duser.country=US", "-Duser.language=en", "-jar", TESTUnit.JING, rng_path, self.path],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=False
        )
        out = []
        error = []
        # Kill the java process if it exceeds the configured timeout
        timer = Timer(self.timeout, test.kill)
        try:
            timer.start()
            out, error = test.communicate()
        except Exception as E:
            self.error(E)
            yield False
            pass
        finally:
            # NOTE(review): Timer.isAlive() was removed in Python 3.9;
            # is_alive() would be needed on modern interpreters — confirm.
            if not timer.isAlive():
                self.log("Timeout on RelaxNG")
                yield False
            timer.cancel()
            pass
        timer.cancel()

        # This is to deal with Travis printing a message about the _JAVA_OPTIONS when a java command is run
        # Travis printing this command resulted in this test not passing
        out = '\n'.join([x for x in out.decode().split('\n') if '_JAVA_OPTIONS' not in x]).encode()
        error = '\n'.join([x for x in error.decode().split('\n') if '_JAVA_OPTIONS' not in x]).encode()

        if len(out) > 0:
            for issue in TESTUnit.rng_logs(out):
                self.log(issue)
                self.dtd_errors.append(issue)
        yield len(out) == 0 and len(error) == 0

    def auto_rng(self):
        """ Validate the file against every RNG referenced by its own
        xml-model processing instructions (local or remote). """
        xml = parse(self.path)
        xml_dir = os.path.dirname(os.path.abspath(self.path))
        # A file can have multiple schema
        for rng in xml.xpath("/processing-instruction('xml-model')"):
            uri = rng.attrib["href"]
            rng_path = os.path.abspath(os.path.join(xml_dir, uri))
            if validators.url(uri):
                rng_path = self.get_remote_rng(uri)
            elif not os.path.isfile(rng_path):
                self.dtd_errors.append("No RNG was found at " + rng_path)
                yield False
                continue
            for status in self.run_rng(rng_path):
                yield status

    def get_remote_rng(self, url):
        """ Given a valid URL, downloads the RNG from the given URL and returns the filepath and name

        :param url: the URL of the RNG
        :return: filenpath and name where the RNG was saved
        """
        # If the file is remote, have a file-system approved name
        # The md5 hash seems like a good option
        sha = md5(url.encode()).hexdigest()

        # We have a name for the rng file but also for the in-download marker
        # Note : we might want to add a os.makedirs somewhere with exists=True
        makedirs(".rngs", exist_ok=True)
        stable_local = os.path.join(".rngs", sha+".rng")
        stable_local_downloading = os.path.join(".rngs", sha+".rng-indownload")

        # check if the stable_local rng already exists
        # if it does, immediately run the rng test and move to the next rng in the file
        if os.path.exists(stable_local):
            return stable_local
        # We check if the in-download proof file is shown here
        # Until the in-download marker is there, we need to wait
        elif os.path.exists(stable_local_downloading):
            # Wait up to 30 secs ?
            # Have it as a constant that could be changed in environment variables ?
            waited = self.timeout
            while not os.path.exists(stable_local):
                time.sleep(1)
                waited -= 1
                if waited < 0:
                    # Maybe we can wait more ?
                    raise EnvironmentError("The download of the RNG took too long")
        else:
            # Write the marker first so concurrent runs wait instead of re-downloading
            with open(stable_local_downloading, "w") as f:
                f.write("Downloading...")
            data = requests.get(url)
            data.raise_for_status()
            with open(stable_local_downloading, "w") as f:
                f.write(data.text)
            shutil.move(stable_local_downloading, stable_local)
        return stable_local

    def epidoc(self):
        """ Check the original file against Epidoc rng through a java pipe
        """
        for status in self.run_rng(TESTUnit.EPIDOC):
            yield status

    def tei(self):
        """ Check the original file against TEI rng through a java pipe
        """
        for status in self.run_rng(TESTUnit.TEI_ALL):
            yield status

    def local_file(self):
        """ Check the original file against TEI rng through a java pipe
        """
        for status in self.run_rng(self.rng):
            yield status

    def passages(self):
        """ Check that passages are available at each level. On top of that, it checks for forbidden characters \
        and duplicate in references
        """
        if self.Text and self.Text.citation.refsDecl:
            citations = [c.name for c in self.Text.citation]
            for i in range(0, len(self.Text.citation)):
                try:
                    with warnings.catch_warnings(record=True) as warning_record:
                        # Cause all warnings to always be triggered.
                        warnings.simplefilter("always")
                        passages = self.Text.getValidReff(level=i+1, _debug=True)
                    # Keep only the last citation step of each reference
                    ids = [ref.split(".", i)[-1] for ref in passages]
                    space_in_passage = TESTUnit.FORBIDDEN_CHAR.search("".join(ids))
                    len_passage = len(passages)
                    status = len_passage > 0
                    self.log(str(len_passage) + " found")
                    self.citation.append((i, len_passage, citations[i]))
                    # Harvest duplicate / empty reference warnings for the
                    # dedicated duplicate() and empty() tests
                    for record in warning_record:
                        if record.category == DuplicateReference:
                            self.duplicates += sorted(str(record.message).split(", "))
                        if record.category == EmptyReference:
                            self.empties += [str(record.message)]
                    if space_in_passage and space_in_passage is not None:
                        self.forbiddens += ["'{}'".format(n)
                                            for ref, n in zip(ids, passages)
                                            if TESTUnit.FORBIDDEN_CHAR.search(ref)]
                    if status is False:
                        yield status
                        break
                    yield status
                except Exception as E:
                    self.error(E)
                    self.log("Error when searching passages at level {0}".format(i+1))
                    yield False
                    break
        else:
            yield False

    def duplicate(self):
        """ Detects duplicate references
        """
        if len(self.duplicates) > 0:
            self.log("Duplicate references found : {0}".format(", ".join(self.duplicates)))
            yield False
        elif self.test_status['passages'] is False:
            # Inconclusive if passages never ran successfully
            yield False
        else:
            yield True

    def forbidden(self):
        """ Checks for forbidden characters in references
        """
        if len(self.forbiddens) > 0:
            self.log("Reference with forbidden characters found: {0}".format(", ".join(self.forbiddens)))
            yield False
        elif self.test_status['passages'] is False:
            yield False
        else:
            yield True

    def empty(self):
        """ Detects empty references
        """
        if len(self.empties) > 0:
            self.log("Empty references found : {0}".format(", ".join(self.empties)))
            yield False
        elif self.test_status['passages'] is False:
            yield False
        else:
            yield True

    def unique_passage(self):
        """ Check that citation scheme do not collide (eg. Where text:1 would be the same node as text:1.1)
        """
        try:
            # Checking for duplicate
            xpaths = [
                self.Text.xml.xpath(
                    MyCapytain.common.reference.REFERENCE_REPLACER.sub(
                        r"\1",
                        citation.refsDecl
                    ),
                    namespaces=TESTUnit.NS
                )
                for citation in self.Text.citation
            ]
            nodes = [element for xpath in xpaths for element in xpath]
            # If the same lxml node appears at two levels, the set is smaller
            bad_citation = len(nodes) == len(set(nodes))
            if not bad_citation:
                self.log("Some node are found twice")
                yield False
            else:
                yield True
        except Exception:
            yield False

    def has_urn(self):
        """ Test that a file has its urn according to CapiTainS Guidelines in its scheme
        """
        if self.xml is not None:
            if self.guidelines == "2.tei":
                urns = self.xml.xpath(
                    "//tei:text/tei:body[starts-with(@n, 'urn:cts:')]", namespaces=TESTUnit.NS
                ) + self.xml.xpath(
                    "//tei:text[starts-with(@xml:base, 'urn:cts:')]", namespaces=TESTUnit.NS
                )
            else:
                urns = self.xml.xpath(
                    "//tei:body/tei:div[@type='edition' and starts-with(@n, 'urn:cts:')]",
                    namespaces=TESTUnit.NS
                )
                urns += self.xml.xpath(
                    "//tei:body/tei:div[@type='translation' and starts-with(@n, 'urn:cts:')]",
                    namespaces=TESTUnit.NS
                )
                urns += self.xml.xpath(
                    "//tei:body/tei:div[@type='commentary' and starts-with(@n, 'urn:cts:')]",
                    namespaces=TESTUnit.NS
                )
            status = len(urns) > 0
            if status:
                logs = urns[0].get("n")
                if not logs:
                    logs = urns[0].base
                urn = MyCapytain.common.reference.URN(logs)
                missing_members = [
                    key for key in ['namespace', 'work', 'version', 'textgroup']
                    if getattr(urn, key) is None or len(getattr(urn, key)) == 0
                ]
                if len(urn) < 5:
                    status = False
                    self.log("Incomplete URN")
                elif urn.reference:
                    status = False
                    self.log("Reference not accepted in URN")
                elif len(missing_members) > 0:
                    status = False
                    self.log("Elements of URN are empty: {}".format(", ".join(sorted(missing_members))))
                self.urn = logs
        else:
            status = False
        yield status

    def naming_convention(self):
        """ Check the naming convention of the file
        """
        if self.urn:
            yield self.urn.split(":")[-1] in self.path
        else:
            yield False

    def inventory(self):
        """ Check that the text URN is available in the inventory metadata
        """
        if self.urn and self.inv:
            yield self.urn in self.inv
        else:
            yield False

    def count_words(self):
        """ Count words in a file
        """
        status = False
        if self.test_status["passages"]:
            text = self.Text.export(Mimetypes.PLAINTEXT, exclude=["tei:note", "tei:teiHeader"])
            self.count = len(type(self).splitter.findall(text))
            self.log("{} has {} words".format(self.urn, self.count))
            status = self.count > 0
        yield status

    def language(self):
        """ Tests to make sure an xml:lang element is on the correct node
        """
        if self.guidelines == "2.epidoc":
            urns_holding_node = self.xml.xpath(
                "//tei:text/tei:body/tei:div"
                "[@type='edition' or @type='translation' or @type='commentary']"
                "[starts-with(@n, 'urn:cts:')]",
                namespaces=TESTUnit.NS
            )
        elif self.guidelines == "2.tei":
            urns_holding_node = self.xml.xpath(
                "//tei:text/tei:body[starts-with(@n, 'urn:cts:')]", namespaces=TESTUnit.NS
            ) + self.xml.xpath(
                "//tei:text[starts-with(@xml:base, 'urn:cts:')]", namespaces=TESTUnit.NS
            )
        try:
            self.lang = urns_holding_node[0].get('{http://www.w3.org/XML/1998/namespace}lang')
        except:
            self.lang = ''
        if self.lang == '' or self.lang is None:
            # 'UNK' flags the missing language for downstream reporting
            self.lang = 'UNK'
            yield False
        else:
            yield True

    def test(self, scheme, guidelines, rng=None, inventory=None):
        """ Test a file with various checks

        :param scheme: Test with TEI DTD
        :type scheme: str
        :param inventory: URNs to be matched against
        :type inventory: list
        :returns: Iterator containing human readable test name, boolean status and logs
        :rtype: iterator(str, bool, list(str))
        """
        if inventory is not None:
            self.inv = inventory

        tests = [] + CTSText_TestUnit.tests
        if self.countwords:
            tests.append("count_words")
        # Scheme validation (RNG/DTD) runs first when requested
        if scheme in ["tei", "epidoc", "auto_rng", "local_file"]:
            tests = [scheme] + tests
        self.scheme = scheme
        self.guidelines = guidelines
        self.rng = rng
        if environ.get("HOOKTEST_DEBUG", False):
            print("Starting %s " % self.path)

        i = 0
        for test in tests:
            # Show the logs and return the status
            if environ.get("HOOKTEST_DEBUG", False):
                print("\t Testing %s " % test)
            # A test succeeds only if none of its yielded statuses is False
            status = False not in [status for status in getattr(self, test)()]
            self.test_status[test] = status
            yield (CTSText_TestUnit.readable[test], status, self.logs)
            if test in self.breaks and status == False:
                # A breaking test failed: mark all remaining tests as failed
                for t in tests[i+1:]:
                    self.test_status[t] = False
                    yield (CTSText_TestUnit.readable[t], False, [])
                break
            self.flush()
            i += 1
def test_get_Passage_context_no_double_slash(self):
    """ Check that get CapitainsCtsPassage contexts return right information """
    # Single passage round-trip
    simple = self.TEI.getTextualNode(Reference("1.pr.2"))
    str_simple = simple.tostring(encoding=str)
    text = CapitainsCtsText(resource=str_simple, citation=self.TEI.citation)
    self.assertEqual(
        text.getTextualNode(
            Reference("1.pr.2"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "tum, ut de illis queri non possit quisquis de se bene",
        "Ensure passage finding with context is fully TEI / Capitains compliant (One reference CapitainsCtsPassage)"
    )

    # Range within the same parent
    simple = self.TEI.getTextualNode(Reference("1.pr.2-1.pr.7"))
    str_simple = simple.tostring(encoding=str)
    text = CapitainsCtsText(resource=str_simple, citation=self.TEI.citation)
    self.assertEqual(
        text.getTextualNode(
            Reference("1.pr.2"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "tum, ut de illis queri non possit quisquis de se bene",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level same "
        "parent range CapitainsCtsPassage)")
    self.assertEqual(
        text.getTextualNode(
            Reference("1.pr.3"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "senserit, cum salva infimarum quoque personarum re-",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level same "
        "parent range CapitainsCtsPassage)")
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=3))),
        ["1.pr.2", "1.pr.3", "1.pr.4", "1.pr.5", "1.pr.6", "1.pr.7"],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level same "
        "parent range CapitainsCtsPassage)")

    # Range crossing parents at the same level
    simple = self.TEI.getTextualNode(Reference("1.pr.2-1.1.6"))
    str_simple = simple.tostring(encoding=str)
    text = CapitainsCtsText(resource=str_simple, citation=self.TEI.citation)
    self.assertEqual(
        text.getTextualNode(
            Reference("1.pr.2"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "tum, ut de illis queri non possit quisquis de se bene",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        text.getTextualNode(
            Reference("1.1.6"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "Rari post cineres habent poetae.",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=3))),
        [
            "1.pr.2", "1.pr.3", "1.pr.4", "1.pr.5", "1.pr.6", "1.pr.7", "1.pr.8", "1.pr.9",
            "1.pr.10", "1.pr.11", "1.pr.12", "1.pr.13", "1.pr.14", "1.pr.15", "1.pr.16",
            "1.pr.17", "1.pr.18", "1.pr.19", "1.pr.20", "1.pr.21", "1.pr.22",
            "1.1.1", "1.1.2", "1.1.3", "1.1.4", "1.1.5", "1.1.6",
        ],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Same level range CapitainsCtsPassage)"
    )

    # Range across different citation levels
    simple = self.TEI.getTextualNode(Reference("1.pr.2-1.2"))
    str_simple = simple.tostring(encoding=str)
    text = CapitainsCtsText(resource=str_simple, citation=self.TEI.citation)
    self.assertEqual(
        text.getTextualNode(
            Reference("1.pr.2"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "tum, ut de illis queri non possit quisquis de se bene",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        text.getTextualNode(
            Reference("1.1.6"), simple=True).export(output=Mimetypes.PLAINTEXT).strip(),
        "Rari post cineres habent poetae.",
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
    self.assertEqual(
        list(map(lambda x: str(x), text.getValidReff(level=3))),
        [
            "1.pr.2", "1.pr.3", "1.pr.4", "1.pr.5", "1.pr.6", "1.pr.7", "1.pr.8", "1.pr.9",
            "1.pr.10", "1.pr.11", "1.pr.12", "1.pr.13", "1.pr.14", "1.pr.15", "1.pr.16",
            "1.pr.17", "1.pr.18", "1.pr.19", "1.pr.20", "1.pr.21", "1.pr.22",
            "1.1.1", "1.1.2", "1.1.3", "1.1.4", "1.1.5", "1.1.6",
            '1.2.1', '1.2.2', '1.2.3', '1.2.4', '1.2.5', '1.2.6', '1.2.7', '1.2.8'
        ],
        "Ensure passage finding with context is fully TEI / Capitains compliant (Different level range CapitainsCtsPassage)"
    )
def test_wrong_main_scope(self):
    """ Parsing a text whose refsDecl main scope is wrong must raise RefsDeclError. """
    with open("tests/testing_data/texts/sample2.xml", "rb") as f:
        with self.assertRaises(MyCapytain.errors.RefsDeclError):
            parsed = CapitainsCtsText(resource=f)
            parsed.test()
def parse(self, resource):
    """ Parse a list of directories and reades it into a collection

    :param resource: List of folders
    :return: An inventory resource and a list of CtsTextMetadata metadata-objects
    """
    for folder in resource:
        # Textgroup metadata files live at data/*/__cts__.xml
        textgroups = glob("{base_folder}/data/*/__cts__.xml".format(base_folder=folder))
        for __cts__ in textgroups:
            try:
                with io.open(__cts__) as __xml__:
                    textgroup = XmlCtsTextgroupMetadata.parse(
                        resource=__xml__
                    )
                    tg_urn = str(textgroup.urn)
                if tg_urn in self.inventory:
                    # Merge with an already-registered textgroup
                    self.inventory[tg_urn].update(textgroup)
                else:
                    self.dispatcher.dispatch(textgroup, path=__cts__)

                # Work-level metadata files sit one directory deeper
                for __subcts__ in glob("{parent}/*/__cts__.xml".format(parent=os.path.dirname(__cts__))):
                    with io.open(__subcts__) as __xml__:
                        work = XmlCtsWorkMetadata.parse(
                            resource=__xml__,
                            parent=self.inventory[tg_urn]
                        )
                        work_urn = str(work.urn)
                        if work_urn in self.inventory[tg_urn].works:
                            self.inventory[work_urn].update(work)

                    for __textkey__ in work.texts:
                        __text__ = self.inventory[__textkey__]
                        # Conventional filename: textgroup.work.version.xml
                        __text__.path = "{directory}/{textgroup}.{work}.{version}.xml".format(
                            directory=os.path.dirname(__subcts__),
                            textgroup=__text__.urn.textgroup,
                            work=__text__.urn.work,
                            version=__text__.urn.version
                        )
                        if os.path.isfile(__text__.path):
                            try:
                                with io.open(__text__.path) as f:
                                    t = CapitainsCtsText(resource=self.xmlparse(f))
                                    cites = list()
                                    # Rebuild the citation chain from deepest
                                    # to shallowest, linking children as we go
                                    for cite in [c for c in t.citation][::-1]:
                                        if len(cites) >= 1:
                                            cites.append(XmlCtsCitation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name,
                                                child=cites[-1]
                                            ))
                                        else:
                                            cites.append(XmlCtsCitation(
                                                xpath=cite.xpath.replace("'", '"'),
                                                scope=cite.scope.replace("'", '"'),
                                                name=cite.name
                                            ))
                                    del t
                                __text__.citation = cites[-1]
                                self.logger.info("%s has been parsed ", __text__.path)
                                if __text__.citation.isEmpty() is False:
                                    self.texts.append(__text__)
                                else:
                                    self.logger.error("%s has no passages", __text__.path)
                            except Exception:
                                self.logger.error(
                                    "%s does not accept parsing at some level (most probably citation) ",
                                    __text__.path
                                )
                        else:
                            self.logger.error("%s is not present", __text__.path)
            except UndispatchedTextError as E:
                self.logger.error("Error dispatching %s ", __cts__)
                if self.RAISE_ON_UNDISPATCHED is True:
                    raise E
            except Exception as E:
                self.logger.error("Error parsing %s ", __cts__)

    return self.inventory, self.texts
# We import the correct classes from the local module from MyCapytain.resources.texts.local.capitains.cts import CapitainsCtsText from MyCapytain.common.constants import Mimetypes, XPATH_NAMESPACES from lxml.etree import tostring # We open a file with open("./tests/testing_data/examples/text.martial.xml") as f: # We initiate a Text object giving the IO instance to resource argument text = CapitainsCtsText(resource=f) # Text objects have a citation property # len(Citation(...)) gives the depth of the citation scheme # in the case of this sample, this would be 3 (Book, Poem, Line) for ref in text.getReffs(level=len(text.citation)): # We retrieve a Passage object for each reference that we find # We can pass the reference many way, including in the form of a list of strings # We use the _simple parameter to get a fairly simple object # Simple makes a straight object that has only the targeted node inside of it psg = text.getTextualNode(subreference=ref, simple=True) # We print the passage from which we retrieve <note> nodes print("\t".join([ref, psg.export(Mimetypes.PLAINTEXT, exclude=["tei:note"])])) """ You'll print something like the following : 1.pr.1 Spero me secutum in libellis meis tale temperamen- 1.pr.2 tum, ut de illis queri non possit quisquis de se bene 1.pr.3 senserit, cum salva infimarum quoque personarum re- 1.pr.4 verentia ludant; quae adeo antiquis auctoribus defuit, ut 1.pr.5 nominibus non tantum veris abusi sint, sed et magnis. 1.pr.6 Mihi fama vilius constet et probetur in me novissimum