Example #1
0
def test_json_serialization(name):
    """
    Parse an ATF sample and check both JSON serialization variants load.
    """
    parsed = AtfFile(sample_file(name))
    compact = parsed.to_json()
    assert json.loads(compact)
    full = parsed.to_json(skip_empty=False, sort_keys=True)
    assert json.loads(full)
    # Keeping empty fields can only make the serialization longer.
    assert len(full) >= len(compact)
Example #2
0
def consider_file(name, code, description):
    """
    Parse an ATF sample and verify its CDLI ID and description match.
    """
    parsed = AtfFile(sample_file(name))
    text = parsed.text
    assert text.code == code
    assert text.description == description
Example #3
0
def test_create():
    """
    Parse belsunu.atf and verify the &-line metadata was read correctly.
    """
    parsed = AtfFile(belsunu())
    text = parsed.text
    assert text.code == "X001001"
    assert text.description == "JCS 48, 089"
Example #4
0
def test_text_designation(name, code, description):
    """
    Parse an ATF sample and verify its CDLI ID and description match.
    """
    parsed = AtfFile(sample_file(name))
    text = parsed.text
    assert text.code == code
    assert text.description == description
Example #5
0
def test_composite():
    """
    Parse anzu.atf (a composite sample) and verify that its separate
    text elements were parsed correctly.
    """
    parsed = AtfFile(anzu())
    texts = parsed.text.texts
    first, second = texts[0], texts[1]
    assert first.code == "X002001"
    assert first.description == "SB Anzu 1"
    assert second.code == "Q002770"
    assert second.description == "SB Anzu 2"
Example #6
0
 def test_line_words():
     """
     Check serialization of a sample line of words with unicode chars.
     1. [MU] 1.03-KAM {iti}AB GE₆ U₄ 2-KAM
     """
     parsed = AtfFile(belsunu())
     # First line of the first surface of the first object.
     first_line = parsed.text.children[0].children[0].children[0]
     expected = [
         u'[MU]', u'1.03-KAM', u'{iti}AB', u'GE\u2086', u'U\u2084', u'2-KAM'
     ]
     assert first_line.words == expected
Example #7
0
 def test_line_lemmas():
     """
     Check serialization of a sample line of lemmas with unicode chars.
     šatti[year]N; n; Ṭebetu[1]MN; mūša[at night]AV; ūm[day]N; n
     """
     parsed = AtfFile(belsunu())
     # First line of the first surface of the first object.
     first_line = parsed.text.children[0].children[0].children[0]
     expected = [
         u' \u0161atti[year]N', u'n', u'\u1e6cebetu[1]MN',
         u'm\u016b\u0161a[at night]AV', u'\u016bm[day]N', u'n'
     ]
     assert first_line.lemmas == expected
Example #8
0
    def __init__(self, **kwargs):
        """
        Walk ``kwargs['source']`` recursively and parse every ``.atf`` file.

        Required keyword arguments:
            atftype: ATF dialect name, passed through to ``AtfFile``.
            source: root directory to scan for ``.atf`` files.

        Successful parses are appended to ``self.texts``; failed ones
        append ``None`` instead, so the list stays aligned with the files
        visited.  ``self.successes`` and ``self.failures`` count outcomes.

        Raises:
            KeyError: if ``atftype`` or ``source`` is missing.
        """
        self.texts = []
        self.failures = 0
        self.successes = 0
        self.atftype = kwargs['atftype']
        # NOTE: the original guarded the walk with ``if 'source' in kwargs``,
        # but the lookup below already raises KeyError when it is absent,
        # so that guard was dead code and has been dropped.
        self.source = kwargs['source']
        for dirpath, _, files in os.walk(self.source):
            for file in files:
                if not file.endswith('.atf'):
                    continue
                path = os.path.join(dirpath, file)
                print("Parsing file", path, "... ", end="")
                try:
                    # ``with`` closes the handle even if parsing fails;
                    # the original leaked the open file object.
                    with codecs.open(path,
                                     encoding='utf-8-sig') as atf_stream:
                        content = atf_stream.read()
                    self.texts.append(AtfFile(content, self.atftype))
                    self.successes += 1
                    print("OK")
                except (SyntaxError, IndexError, AttributeError,
                        UnicodeDecodeError) as e:
                    self.texts.append(None)
                    self.failures += 1
                    print("Failed with message: '{}'".format(e))
Example #9
0
def convert(atf_text):
    """
    Create a TEI representation of a file-like object containing ATF.

    Returns:
        tei.Document: the converted document, with an edition part plus
        any interlinear and parallel translations found in the input.
    """
    # Parse the ATF input string.
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))

    # Construct a TEI Document to hold the converted text.
    doc = tei.Document()
    doc.language = atf.text.language
    doc.header = tei.Header()
    doc.header.title = atf.text.description
    doc.header.cdli_code = atf.text.code

    # Traverse the parse tree, recording lines under labels.
    translations = {}
    objects = [
        item for item in atf.text.children if isinstance(item, OraccObject)
    ]
    edition = tei.Edition()
    doc.parts.append(edition)
    for item in objects:
        part = tei.TextPart(item.objecttype)
        edition.append(part)
        for section in item.children:
            if isinstance(section, OraccObject):
                try:
                    name = section.name
                except AttributeError:
                    name = section.objecttype
                div = tei.TextPart(name)
                part.append(div)
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                print('Skipping unknown section type', type(section).__name__)
                continue
            for obj in section.children:
                if isinstance(obj, Line):
                    text = normalize_transliteration(obj.words)
                    line = tei.Line(obj.label, text)
                    div.append(line)
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in obj.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            # BUG FIX: was ``lang == ...``, a no-op
                            # comparison; assignment was intended.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(obj.label)
                            tr_line.words = text.strip().split()
                            if lang not in translations:
                                translations[lang] = []
                            translations[lang].append(tr_line)
                elif isinstance(obj, (State, Ruling)):
                    text = str(obj).strip()
                    # Strip the initial '$' off the ATF representation.
                    text = text[1:].strip()
                    div.append(tei.Note(text))
                else:
                    print('Skipping unknown section child type',
                          type(obj).__name__)
                    continue

    # Add accumulated interlinear translations to the document.
    for lang, tr_lines in translations.items():
        translation = tei.Translation()
        translation.language = lang
        doc.parts.append(translation)
        for tr_line in tr_lines:
            text = ' '.join(tr_line.words)
            line = tei.Line(tr_line.label, text)
            translation.append(line)

    # Traverse the tree again, recording any parallel translation sections.
    # pyoracc only supports these for English.
    translation = tei.Translation()
    translation.language = 'eng'
    translation_empty = True
    for item in objects:
        part = tei.TextPart(item.objecttype)
        translation.append(part)
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                if isinstance(surface, OraccObject):
                    div = tei.TextPart(surface.objecttype)
                    part.append(div)
                    for obj in surface.children:
                        if isinstance(obj, Line):
                            text = ' '.join(obj.words)
                            line = tei.Line(obj.label, text)
                            div.append(line)
                            translation_empty = False
                        else:
                            # BUG FIX: the original printed a set literal
                            # ``{type(obj).__name__}``; print the name
                            # itself, matching the edition pass above.
                            print('Skipping unknown section child type',
                                  type(obj).__name__)
                            continue
    if not translation_empty:
        doc.parts.append(translation)

    return doc
Example #10
0
def convert(atf_text):
    """
    Create a TEI representation of a file-like object containing ATF.

    Returns:
        str: a complete TEI XML document, with an edition division plus
        any interlinear and parallel translations found in the input.
    """
    atf = AtfFile(atf_text, 'cdli', False)
    if verbose:
        print("Parsed {} -- {}".format(atf.text.code, atf.text.description))
    result = '''<?xml version="1.0" encoding="UTF-8"?>
<TEI xmlns="http://www.tei-c.org/ns/1.0">

<teiHeader>
<fileDesc>
  <titleStmt>
    <title>{description}</title>
  </titleStmt>
  <publicationStmt>
    <p>Converted from ATF by atf2tei.</p>
  </publicationStmt>
  <sourceDesc>
    <idno type="CDLI">{code}</idno>
  </sourceDesc>
</fileDesc>
<encodingDesc>
  <refsDecl n="CTS">
    <cRefPattern n="line"
                 matchPattern="(\\w+)\\.(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\']/tei:l[@n=\'$3\'])">
      <p>This pointer pattern extracts a specific line.</p>
    </cRefPattern>
    <cRefPattern n="surface"
                 matchPattern="(\\w+)\\.(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\']/tei:div[@n=\'$2\'])">
      <p>This pointer pattern extracts an inscribed surface.</p>
    </cRefPattern>
    <cRefPattern n="object"
                 matchPattern="(\\w+)"
                 replacementPattern="#xpath(/tei:TEI/tei:text/tei:body/tei:div/tei:div[@n=\'$1\'])">
      <p>This pointer pattern extracts a specific artefact,
         usually a tablet.</p>
    </cRefPattern>
  </refsDecl>
</encodingDesc>
</teiHeader>
'''.format(description=escape(atf.text.description),
           code=escape(atf.text.code))
    urn = f'urn:cts:cdli:test.{atf.text.code}'
    result += f'<text n="{urn}"'
    if atf.text.language:
        result += f' xml:lang="{atf.text.language}"'
    result += '>\n'
    result += '<body>\n'
    translations = {}
    objects = [item for item in atf.text.children
               if isinstance(item, OraccObject)]
    result += '''  <div type="edition">\n'''
    for item in objects:
        result += f'  <div type="textpart" n="{item.objecttype}">\n'
        for section in item.children:
            if isinstance(section, OraccObject):
                result += '    <div type="textpart"' \
                          f' n="{section.objecttype}">\n'
            elif isinstance(section, Translation):
                # Handle in another pass.
                continue
            else:
                result += '    <div>\n' \
                         f'<!-- {type(section).__name__}: {section} -->\n'
            for line in section.children:
                if isinstance(line, Line):
                    text = normalize_transliteration(line.words)
                    result += f'      <l n="{line.label}">{text}</l>\n'
                    # Older pyoracc parses interlinear translations
                    # as notes. Remember them for serialization below.
                    for note in line.notes:
                        if note.content.startswith('tr.'):
                            lang, text = note.content.split(':', maxsplit=1)
                            _, lang = lang.split('.')
                            # tr.ts is used for normalization, so mark
                            # this with the primary object's language.
                            # BUG FIX: was ``lang == ...``, a no-op
                            # comparison; assignment was intended.
                            if lang == 'ts':
                                lang = atf.text.language
                            tr_line = Line(line.label)
                            tr_line.words = text.strip().split()
                            if lang not in translations:
                                translations[lang] = []
                            translations[lang].append(tr_line)
                else:
                    result += f'      <!-- {type(line).__name__}: {line} -->\n'
            result += '    </div>\n'
        result += '  </div>\n'
    result += '  </div>\n'
    # ``objects`` is reused unchanged for the parallel-translation pass;
    # the original recomputed the identical list comprehension here.
    result += '  <div type="translation">\n'
    for item in objects:
        result += f'    <div type="textpart" n="{item.objecttype}">\n'
        for section in item.children:
            # Skip anything which is not a translation for this pass.
            if not isinstance(section, Translation):
                continue
            for surface in section.children:
                # BUG FIX: only emit the surface div for OraccObjects.
                # The original opened a tag for every child but closed
                # it only for OraccObjects, producing unbalanced XML
                # (and non-objects lack ``objecttype`` anyway).
                if isinstance(surface, OraccObject):
                    result += f'      <div type="textpart" ' \
                              f'n="{surface.objecttype}">\n'
                    for line in surface.children:
                        if isinstance(line, Line):
                            text = ' '.join(line.words)
                            result += '        ' \
                                      f'<l n="{line.label}">{text}</l>\n'
                        else:
                            result += '        <!-- ' \
                                      f'{type(line).__name__}: {line} -->\n'
                    result += '      </div>\n'
        result += '    </div>\n'
    result += '  </div>\n'
    for lang, translation in translations.items():
        result += f'  <div type="translation" xml:lang="{lang}">\n'
        for line in translation:
            text = ' '.join(line.words)
            result += f'    <l n="{line.label}">{escape(text)}</l>\n'
        result += '  </div>\n'
    result += '''
</body>
</text>
</TEI>'''
    return result
Example #11
0
def consider_composite(name, code):
    """
    Parse a composite ATF sample and verify the first text's CDLI ID.
    """
    parsed = AtfFile(sample_file(name))
    first_text = parsed.text.texts[0]
    assert first_text.code == code
Example #12
0
 def parse(any_str):
     """
     Parse an ATF string, whether a single line or a whole file's content.
     """
     return AtfFile(any_str)