def t_INITIAL_parallel_labeled_HASHID(self, t):
    r'\#[a-zA-Z][a-zA-Z0-9\[\]]+\:'
    # The string above is the token pattern, not prose (t_* rule functions
    # carry their regex in the docstring).  It is a raw string so that the
    # backslash escapes reach the regex engine unchanged and do not trigger
    # "invalid escape sequence" warnings.
    #
    # Classifies a '#keyword:' protocol token and pushes the lexer state that
    # must read the rest of the line.  Returns the token, or None when the
    # keyword is unknown and self.skipinvalid is set (a UserWarning is issued
    # instead).  Raises SyntaxError for an unknown keyword otherwise.

    # The pattern requires both the leading '#' and the trailing ':'; strip
    # them from the value and move lexpos past the '#'.
    t.value = t.value[1:-1]
    t.lexpos += 1

    # Use lower here since there are some ATF files with
    # the protocol incorrectly written as #NOTE:
    t.type = self.resolve_keyword(t.value.lower(),
                                  AtfLexer.protocols,
                                  extra={'CHECK': 'CHECK'})

    if t.type is None:
        formatstring = u"Illegal #STRING '{}'".format(t.value)
        valuestring = t.value
        if _pyversion() == 2:
            # Python 2 wants byte strings in warnings and exceptions.
            formatstring = formatstring.encode('UTF-8')
            valuestring = valuestring.encode('UTF-8')
        if self.skipinvalid:
            warnings.warn(formatstring, UserWarning)
            return None
        raise SyntaxError(formatstring,
                          (None, t.lineno, t.lexpos, valuestring))

    # Each recognised protocol selects at most one follow-up lexer state.
    state_after = {
        "KEY": 'nonequals',
        "LEM": 'lemmatize',
        "TR": 'interlinear',
        "PROJECT": 'flagged',
        "BIB": 'flagged',
        "CHECK": 'absorb',
        "NOTE": 'para',
    }
    next_state = state_after.get(t.type)
    if next_state is not None:
        t.lexer.push_state(next_state)
    return t
def p_error(self, p):
    # Parser error hook: reports the offending token by printing a
    # SyntaxError object (it is printed, not raised, so this hook itself
    # does not abort parsing).
    #
    # p is the token that could not be parsed, or None when the parser hit
    # the end of the input unexpectedly; None has no value/lineno/lexpos, so
    # it gets its own message instead of failing with AttributeError.
    if p is None:
        print(SyntaxError(
            "PyOracc could not parse the input: unexpected end of input."))
        return

    formatstring = u"PyOracc could not parse token '{}'.".format(p)
    valuestring = p.value
    if _pyversion() == 2:
        # Python 2 wants byte strings inside the exception.
        formatstring = formatstring.encode('UTF-8')
        valuestring = valuestring.encode('UTF-8')
    print(SyntaxError(formatstring,
                      (None, p.lineno, p.lexpos, valuestring)))
def p_error(self, p):
    # Parser error hook: raises SyntaxError describing the token that could
    # not be parsed, with (filename=None, lineno, lexpos, text) details.
    #
    # p is the offending token, or None when the parser ran out of input.
    # None has no value/lineno/lexpos, so it is reported with a dedicated
    # SyntaxError instead of crashing with AttributeError.
    if p is None:
        raise SyntaxError(
            "PyOracc could not parse the input: unexpected end of input.")

    formatstring = u"PyOracc could not parse token '{}'.".format(p)
    valuestring = p.value
    if _pyversion() == 2:
        # Python 2 wants byte strings inside the exception.
        formatstring = formatstring.encode('UTF-8')
        valuestring = valuestring.encode('UTF-8')
    raise SyntaxError(formatstring,
                      (None, p.lineno, p.lexpos, valuestring))
def t_ANY_error(self, t):
    # Lexer error hook.  Builds a message naming the first character of
    # t.value, then either raises SyntaxError (strict mode) or, when
    # self.skipinvalid is set, skips one character and issues a UserWarning.
    message = u"Illegal character '{}'".format(t.value[0])
    if _pyversion() == 2:
        # Python 2 wants a byte string here.
        message = message.encode('UTF-8')

    if not self.skipinvalid:
        raise SyntaxError(message)

    # Lenient mode: step over the offending character and carry on.
    t.lexer.skip(1)
    warnings.warn(message, UserWarning)
def p_error(self, p):
    # Parser error hook: raises SyntaxError naming the type, line, offset
    # and value of the token that could not be parsed.
    #
    # p is the offending token, or None when the parser ran out of input.
    # None has no type/lineno/lexpos/value, so it is reported with a
    # dedicated SyntaxError instead of crashing with AttributeError.
    if p is None:
        raise SyntaxError(
            "PyOracc could not parse the input: unexpected end of input.")

    formatstring = (u"PyOracc could not parse token {} at line {} at "
                    u"offset {} with value '{}'.").format(
                        p.type, p.lineno, p.lexpos, p.value)
    valuestring = p.value
    if _pyversion() == 2:
        # Python 2 wants byte strings inside the exception.
        valuestring = valuestring.encode('UTF-8')
        formatstring = formatstring.encode('UTF-8')
    raise SyntaxError(formatstring,
                      (None, p.lineno, p.lexpos, valuestring))
def t_ANY_error(self, t):
    # Lexer error hook.  The message names the first character of t.value;
    # the SyntaxError details carry (None, t.lineno, t.lexpos, t.value).
    # With self.skipinvalid set, one character is skipped and a UserWarning
    # is issued instead of raising; nothing is returned in that case.
    offending_text = t.value
    message = u"PyOracc got an illegal character '{}'".format(
        offending_text[0])
    if _pyversion() == 2:
        # Python 2 wants byte strings here.
        message = message.encode('UTF-8')
        offending_text = offending_text.encode('UTF-8')

    if not self.skipinvalid:
        raise SyntaxError(message,
                          (None, t.lineno, t.lexpos, offending_text))

    # Lenient mode: step over the offending character and carry on.
    t.lexer.skip(1)
    warnings.warn(message, UserWarning)
    return None
def t_INITIAL_parallel_labeled_ATID(self, t):
    r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
    # The string above is the token pattern, not prose (t_* rule functions
    # carry their regex in the docstring).  It is a raw string so that the
    # backslash escapes reach the regex engine unchanged and do not trigger
    # "invalid escape sequence" warnings.
    #
    # Classifies an '@keyword' structure token and pushes the lexer state(s)
    # needed to read the rest of the line.  Returns the token, or None when
    # the keyword is unknown: with self.skipinvalid a UserWarning is issued,
    # otherwise a SyntaxError object is printed (not raised).

    # Drop the leading '@' and move lexpos past it.
    t.value = t.value[1:]
    t.lexpos += 1
    t.type = self.resolve_keyword(
        t.value,
        AtfLexer.structures + AtfLexer.long_argument_structures,
        extra={
            "h1": "HEADING",
            "h2": "HEADING",
            "h3": "HEADING",
            "label+": "LABEL",
            "end": "END"
        },
    )

    if t.type == "INCLUDE":
        t.lexer.push_state('nonequals')
    if t.type == "END":
        # Leave the current state, except in lenient mode when the lexer is
        # already in INITIAL.
        # NOTE(review): the nesting of the push below was not recoverable
        # from the collapsed source; it is assumed to run for every END
        # token -- confirm against the original file.
        if not (self.skipinvalid) or t.lexer.current_state() != 'INITIAL':
            t.lexer.pop_state()
        t.lexer.push_state('transctrl')
    if t.type == "LABEL":
        # Pushed in this order, so 'transctrl' is the active state first.
        t.lexer.push_state("para")
        t.lexer.push_state("transctrl")
    if t.type == "TRANSLATION":
        t.lexer.push_state("transctrl")
    if t.type == "SCORE":
        t.lexer.push_state('score')
    if t.type in AtfLexer.long_argument_structures + ["NOTE"]:
        t.lexer.push_state('flagged')

    if t.type is None:
        formatstring = u"Illegal @STRING '{}'".format(t.value)
        valuestring = t.value
        if _pyversion() == 2:
            # Python 2 wants byte strings in warnings and exceptions.
            formatstring = formatstring.encode('UTF-8')
            valuestring = valuestring.encode('UTF-8')
        if self.skipinvalid:
            warnings.warn(formatstring, UserWarning)
            return
        else:
            # Reported on stdout only; the token is dropped either way.
            print(
                SyntaxError(formatstring,
                            (None, t.lineno, t.lexpos, valuestring)))
            return
    return t
def t_INITIAL_parallel_labeled_ATID(self, t):
    r'^\@[a-zA-Z][a-zA-Z0-9\[\]]*\+?'
    # The string above is the token pattern, not prose (t_* rule functions
    # carry their regex in the docstring).  It is a raw string so that the
    # backslash escapes reach the regex engine unchanged and do not trigger
    # "invalid escape sequence" warnings.
    #
    # Classifies an '@keyword' structure token and pushes the lexer state(s)
    # needed to read the rest of the line.  Returns the token, or None when
    # the keyword is unknown and self.skipinvalid is set (a UserWarning is
    # issued instead).  Raises SyntaxError for an unknown keyword otherwise.

    # Drop the leading '@' and move lexpos past it.
    t.value = t.value[1:]
    t.lexpos += 1
    t.type = self.resolve_keyword(
        t.value,
        AtfLexer.structures + AtfLexer.long_argument_structures,
        extra={
            "h1": "HEADING",
            "h2": "HEADING",
            "h3": "HEADING",
            "label+": "LABEL",
            "end": "END"
        },
    )

    if t.type == "INCLUDE":
        t.lexer.push_state('nonequals')
    if t.type == "END":
        # Leave the current state, except in lenient mode when the lexer is
        # already in INITIAL.
        # NOTE(review): the nesting of the push below was not recoverable
        # from the collapsed source; it is assumed to run for every END
        # token -- confirm against the original file.
        if not (self.skipinvalid) or t.lexer.current_state() != 'INITIAL':
            t.lexer.pop_state()
        t.lexer.push_state('transctrl')
    if t.type == "LABEL":
        # Pushed in this order, so 'transctrl' is the active state first.
        t.lexer.push_state("para")
        t.lexer.push_state("transctrl")
    if t.type == "TRANSLATION":
        t.lexer.push_state("transctrl")
    if t.type == "SCORE":
        t.lexer.push_state('score')
    if t.type in AtfLexer.long_argument_structures + ["NOTE"]:
        t.lexer.push_state('flagged')

    if t.type is None:
        formatstring = u"Illegal @STRING '{}'".format(t.value)
        valuestring = t.value
        if _pyversion() == 2:
            # Python 2 wants byte strings in warnings and exceptions.
            formatstring = formatstring.encode('UTF-8')
            valuestring = valuestring.encode('UTF-8')
        if self.skipinvalid:
            warnings.warn(formatstring, UserWarning)
            return
        else:
            raise SyntaxError(formatstring,
                              (None, t.lineno, t.lexpos, valuestring))
    return t
def test_note_ended_by_line(line_label):
    'A note is free text that stops at the next line label.'
    # Two sample transliteration lines and their whitespace-separated words.
    first_text = u"a-šar _saḫar.ḫi.a_ bu-bu-su-nu"
    second_text = u"a-kal-ši-na ṭi-id-di"
    first_words = first_text.split()
    second_words = second_text.split()

    # The second label keeps the style of the first: its leading character
    # is read as a number and bumped by one, the rest is copied verbatim.
    first_label = line_label
    bumped = int(first_label[:1]) + 1
    if _pyversion() == 2:
        second_label = unicode(bumped) + first_label[1:]
    else:
        second_label = str(bumped) + first_label[1:]

    note_text = "Does this combine with the next line?"
    content = (first_label + ". " + first_text + "\n" +
               "#note: " + note_text + "\n" +
               second_label + ". " + second_text + "\n")

    expected_types = ["LINELABEL"]
    expected_types += ["ID"] * len(first_words)
    expected_types += ["NEWLINE", "NOTE", "ID", "NEWLINE", "LINELABEL"]
    expected_types += ["ID"] * len(second_words)
    expected_types += ["NEWLINE"]

    # None marks tokens whose value is not checked.
    expected_values = [first_label] + first_words
    expected_values += [None, None, note_text, None]
    expected_values += [second_label] + second_words
    expected_values += [None]

    compare_tokens(content, expected_types, expected_values)
PyORACC is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with PyORACC. If not, see <http://www.gnu.org/licenses/>. ''' from __future__ import print_function from itertools import repeat import pytest from pyoracc.atf.common.atflex import AtfLexer from pyoracc import _pyversion if _pyversion() == 2: from itertools import izip_longest as zip_longest else: from itertools import zip_longest def compare_tokens(content, expected_types, expected_values=None, expected_lineno=None, expected_lexpos=None): lexer = AtfLexer().lexer lexer.input(content) if expected_values is None: expected_values = repeat(None) if expected_lineno is None:
def yacc_default(self, value, line, pos, etype):
    """Fill ``self.yacc_tmp_default`` with value, line, pos and etype.

    The arguments are passed to ``str.format`` positionally, in that order.
    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.yacc_tmp_default.format(value, line, pos, etype)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
def lex_default(self, value, line, pos):
    """Fill ``self.lex_tmp_default`` with value, line and pos.

    The arguments are passed to ``str.format`` positionally, in that order.
    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.lex_tmp_default.format(value, line, pos)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
def wrong_path(self, log_path):
    """Fill ``self.wrong_logpath_tmp`` with ``log_path``.

    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.wrong_logpath_tmp.format(log_path)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
def summary_num(self, lex_num, yacc_num, pathname):
    """Fill ``self.summary_num_tmp`` with lex_num, yacc_num and pathname.

    The arguments are passed to ``str.format`` positionally, in that order.
    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.summary_num_tmp.format(lex_num, yacc_num, pathname)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with PyORACC. If not, see <http://www.gnu.org/licenses/>. ''' from __future__ import print_function from itertools import repeat from unittest import TestCase import pytest from pyoracc.atf.common.atflex import AtfLexer from pyoracc import _pyversion if _pyversion() == 2: from itertools import izip_longest as zip_longest else: from itertools import zip_longest class TestLexer(TestCase): """A class that contains all tests of the ATFLexer""" def setUp(self): self.lexer = AtfLexer().lexer def compare_tokens(self, content, expected_types, expected_values=None, expected_lineno=None, expected_lexpos=None): self.lexer.input(content) if expected_values is None: expected_values = repeat(None)
def head_default(self, idx, ID, path):
    """Fill ``self.head_tmp_default`` with idx, ID and path.

    The arguments are passed to ``str.format`` positionally, in that order.
    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.head_tmp_default.format(idx, ID, path)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
def summary_end(self, pathname):
    """Fill ``self.summary_end_tmp`` with ``pathname``.

    Returns the formatted text, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.summary_end_tmp.format(pathname)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text
def raise_error(self, e, pathname):
    """Fill ``self.raise_tmp`` with ``e`` and ``pathname``.

    Despite the name, nothing is raised here: the formatted text is
    returned, UTF-8 encoded when ``_pyversion()`` is 2.
    """
    text = self.raise_tmp.format(e, pathname)
    if _pyversion() == 2:
        return text.encode('UTF-8')
    return text