Example 1
def parseMorph():
    print('Parsing morphological data')

    global unknowns

    for (sura, suraData) in morphDb.items():
        for (aya, ayaData) in suraData.items():
            for (group, groupData) in ayaData.items():
                for (word, (form, tag, featureStr)) in groupData.items():
                    # one feature dict per word occurrence
                    features = wordFeatures.setdefault(
                        (sura, aya, group, word), {})
                    features['ascii'] = form
                    features['unicode'] = tr.to_arabic(form)
                    (theseFeatures,
                     theseUnknowns) = parseMorphItem(tag, featureStr)
                    features.update(theseFeatures)
                    unknowns |= theseUnknowns

    if unknownFeatures:
        feats = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {feats}')
    else:
        print('\tAll features known')
    if unknownPerFeat:
        for feat in sorted(unknownPerFeat):
            vals = ' '.join(sorted(unknownPerFeat[feat]))
            print(f'\tUnknown: {feat}: {vals}')
    if unknowns:
        vals = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {vals}')
    if not unknownPerFeat and not unknowns:
        print('\tAll feature values known')
    print('Done')
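For context, a rough sketch of the module-level state this function relies on; the names come from the code above, but the shapes are inferred and illustrative only:

# Shapes inferred from the loops above; illustrative, not from the source.
morphDb = {}             # sura -> aya -> group -> word -> (form, tag, featureStr)
wordFeatures = {}        # (sura, aya, group, word) -> {feature: value}
unknowns = set()         # unknown morphology labels, accumulated here
unknownFeatures = set()  # assumed to be filled elsewhere (e.g. by parseMorphItem)
unknownPerFeat = {}      # feature -> set of unknown values, filled elsewhere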
Example 2
def readData():
    print('Reading sura metadata')

    # one regex per self-closing metadata element in the source XML
    suraRe = re.compile(r'<sura(.*?)/>')
    juzRe = re.compile(r'<juz(.*?)/>')
    hizbRe = re.compile(r'<quarter(.*?)/>')
    manzilRe = re.compile(r'<manzil(.*?)/>')
    rukuRe = re.compile(r'<ruku(.*?)/>')
    pageRe = re.compile(r'<page(.*?)/>')
    sajdaRe = re.compile(r'<sajda(.*?)/>')

    with open(DATA_PATH) as fh:
        data = fh.read()

    suras = suraRe.findall(data)
    for sura in suras:
        atts = dict(attRe.findall(sura))
        sI = int(atts.get('index', 0))
        suraFeatures[sI] = {
            'name': atts.get('name', ''),
            'nameAscii': tr.from_arabic(atts.get('name', '')),
            'nameTrans': atts.get('tname', ''),
            'name@en': atts.get('ename', ''),
            'type': atts.get('type', ''),
        }
        if 'order' in atts:
            suraFeatures[sI]['order'] = int(atts['order'])
    print(f'Read features for {len(suras)} suras')

    for (sectionName, sectionRe, info) in (
        ('juz', juzRe, ()),
        ('hizb', hizbRe, ()),
        ('manzil', manzilRe, ()),
        ('ruku', rukuRe, ()),
        ('page', pageRe, ()),
        ('sajda', sajdaRe, ('type', )),
    ):
        sections = sectionRe.findall(data)
        for section in sections:
            atts = dict(attRe.findall(section))
            sI = int(atts.get('index', 0))
            sura = int(atts.get('sura', 0))
            aya = int(atts.get('aya', 0))
            features = {k: atts[k] for k in info}
            if sI > 1:
                sectionEnd.setdefault((sura, aya), []).append(
                    (sectionName, sI - 1))
            sectionStart.setdefault((sura, aya), []).append(
                (sectionName, sI, features))
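This snippet assumes a module-level attRe that extracts key="value" pairs from the matched tag contents, so that dict(attRe.findall(...)) yields an attribute dict. A plausible definition (an assumption, not taken from the source):

import re

# Assumed helper: matches key="value" attribute pairs inside a tag.
attRe = re.compile(r'(\w+)="([^"]*)"')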
Example 3
def director(cv):
    """Read tsv data fields.

    This is a function that does the work as indicated in the
    [walker converion engine of Text-Fabric](https://annotation.github.io/text-fabric/tf/convert/walker.html)
    See `fusus.convert` for a description of the fields in the TSV files.
    """

    stops = U.stops

    errors = collections.defaultdict(set)

    cur = [None, None, None, None]
    prev = [None, None, None, None]
    nSec = len(prev)

    data = []

    with open(SRC_FILE) as fh:
        next(fh)
        for line in fh:
            row = tuple(line.rstrip("\n").split("\t"))
            page = int(row[0])
            if pageNums is not None and page not in pageNums:
                continue

            if OCRED:
                row = (
                    page,
                    int(row[1]),
                    row[2],
                    int(row[3]),
                    *(None if c == "?" else int(c) for c in row[4:8]),
                    int(row[8]),
                    *row[9:11],
                )
            else:
                row = (
                    page,
                    *(int(c) for c in row[1:4]),
                    row[4],
                    *(None if c == "?" else int(c) for c in row[5:9]),
                    *row[9:11],
                )

            data.append(row)

    boxL = nSec if OCRED else nSec + 1

    if HAS_TOC:
        toc = getToc(data)
        curPiece = cv.node("piece")
        cv.feature(curPiece, n=1, title="front")

    curSentence = cv.node("sentence")
    nSentence = 1
    cv.feature(curSentence, n=nSentence)

    for (r, fields) in enumerate(data):
        if HAS_TOC:
            page = fields[0]
            if page in toc and page != prev[0]:
                for i in reversed(range(nSec)):
                    cv.terminate(cur[i])

                cv.terminate(curSentence)
                cv.terminate(curPiece)
                nSentence = 1
                curSentence = cv.node("sentence")
                cv.feature(curSentence, n=nSentence)

                (n, np, title) = toc[page]
                curPiece = cv.node("piece")
                cv.feature(curPiece, n=n, title=title)
                if np is not None:
                    cv.feature(curPiece, np=np)

        for i in range(nSec):
            if fields[i] != prev[i]:
                for j in reversed(range(i, nSec)):
                    cv.terminate(cur[j])
                for j in range(i, nSec):
                    cn = cv.node(TYPE_MAP[j])
                    cur[j] = cn
                    if OCRED and j == 2:
                        cv.feature(cn, b=fields[j])
                    elif (OCRED and j == 3) or (not OCRED and j == 1):
                        cv.feature(cn, ln=fields[j])
                    else:
                        cv.feature(cn, n=fields[j])
                    if not OCRED and j == nSec - 1:
                        cv.feature(cn, dir=fields[nSec])
                break
        for i in range(nSec):
            prev[i] = fields[i]

        letters = fields[-2]
        punc = fields[-1]
        lettersp = Tr.asciiFromArabic(letters) if letters else ""
        lettersn = Tr.latinFromArabic(letters) if letters else ""
        letterst = Tr.standardFromArabic(letters) if letters else ""
        punca = Tr.asciiFromArabic(punc) if punc else ""

        s = cv.slot()
        cv.feature(
            s,
            boxl=fields[boxL],
            boxt=fields[boxL + 1],
            boxr=fields[boxL + 2],
            boxb=fields[boxL + 3],
            letters=letters,
            lettersp=lettersp,
            lettersn=lettersn,
            letterst=letterst,
        )
        cv.feature(s, punc=punc, punca=punca)
        if any(c in stops for c in punc):
            cv.terminate(curSentence)
            curSentence = cv.node("sentence")
            nSentence += 1
            cv.feature(curSentence, n=nSentence)
        if OCRED:
            cv.feature(s, confidence=fields[-3])

    cv.terminate(curSentence)

    for i in reversed(range(nSec)):
        if cur[i]:
            cv.terminate(cur[i])

    if HAS_TOC:
        cv.terminate(curPiece)

    for feat in featureMeta:
        if not cv.occurs(feat):
            cv.meta(feat)

    if errors:
        for kind in sorted(errors):
            instances = sorted(errors[kind])
            nInstances = len(instances)
            showInstances = instances[0:20]
            print(f"ERROR {kind}: {nInstances} x")
            print(", ".join(showInstances))
Example 4
import os

from tf.fabric import Fabric
from tf.writing.transcription import Transcription


VERSION = "0.2"

GH_BASE = os.path.expanduser("~/github")
ORG = "etcbc"
REPO = "peshitta"
SOURCE_DIR = f"source/{VERSION}"
PLAIN_DIR = f"plain/{VERSION}"
TF_DIR = f"tf/{VERSION}"
SOURCE_PATH = f"{GH_BASE}/{ORG}/{REPO}/{SOURCE_DIR}"
PLAIN_PATH = f"{GH_BASE}/{ORG}/{REPO}/{PLAIN_DIR}"
TF_PATH = f"{GH_BASE}/{ORG}/{REPO}/{TF_DIR}"

TR = Transcription()


allAcrosSeq = """
    Gn
    Ex
    Lv
    Nm
    Dt
    Jb
    Jos
    Jd
    Sm1
    Sm2
    Ps
    Rg1
Example 5
def toHeb(translit):
    return Transcription.toHebrew(
        Transcription.suffix_and_finales(translit)[0])
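A hypothetical call, assuming Transcription is Text-Fabric's ETCBC transcription helper and the argument is ETCBC consonantal transliteration; the sample word is illustrative only:

# Illustrative input: ETCBC transliteration of a Hebrew word.
print(toHeb('CLWM'))  # expected to print the word in Hebrew characters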
Example 6
def director(cv):
    print('Parsing morphological data')

    global unknowns

    lemmaIndex = {}
    sectionIndex = {}

    for (sura, suraData) in morphDb.items():
        curSura = cv.node('sura')
        cv.feature(curSura, number=sura)
        theseSuraFeatures = suraFeatures.get(sura, None)
        if theseSuraFeatures:
            cv.feature(curSura, **theseSuraFeatures)
        for (aya, ayaData) in suraData.items():
            curAya = cv.node('aya')
            cv.feature(curAya, number=aya)
            transFeatures = {
                f'translation@{lang}': trans[(sura, aya)]
                for (lang, trans) in translations.items()
            }
            cv.feature(curAya, **transFeatures)
            for s in sectionEnd.get((sura, aya), []):
                curSection = sectionIndex[s]
                cv.terminate(curSection)
                del sectionIndex[s]
            for (sName, sI, sFeatures) in sectionStart.get((sura, aya), []):
                curSection = cv.node(sName)
                cv.feature(curSection, number=sI, **sFeatures)
                sectionIndex[(sName, sI)] = curSection
            nAya = len(ayaData)
            for (ig, (group, groupData)) in enumerate(ayaData.items()):
                curGroup = cv.node('group')
                cv.feature(curGroup, number=group)
                nGroup = len(groupData)
                for (iw, (word, (form, tag,
                                 featureStr))) in enumerate(groupData.items()):
                    (theseFeatures,
                     theseUnknowns) = parseMorphItem(tag, featureStr)
                    lemma = theseFeatures.get('lemma', None)
                    if lemma:
                        thisLemma = lemmaIndex.get(lemma, None)
                        if thisLemma:
                            cv.resume(thisLemma)
                        else:
                            thisLemma = cv.node('lex')
                            lemmaIndex[lemma] = thisLemma
                        cv.feature(thisLemma, lemma=lemma)
                    curWord = cv.slot()
                    if lemma:
                        cv.terminate(thisLemma)
                    cv.feature(
                        curWord,
                        ascii=form,
                        unicode=tr.to_arabic(form),
                        # space after the last word of a group, except at the
                        # end of the aya
                        space=' ' if iw == nGroup - 1 and ig != nAya - 1 else '',
                        number=word,
                    )
                    cv.feature(curWord, **theseFeatures)
                    unknowns |= theseUnknowns
                cv.terminate(curGroup)
            cv.terminate(curAya)
        cv.terminate(curSura)
    for curSection in sectionIndex.values():
        cv.terminate(curSection)

    if unknownFeatures:
        feats = ' '.join(unknownFeatures)
        print(f'\tUnknown features: {feats}')
    else:
        print('\tAll features known')
    if unknownPerFeat:
        for feat in sorted(unknownPerFeat):
            vals = ' '.join(sorted(unknownPerFeat[feat]))
            print(f'\tUnknown: {feat}: {vals}')
    if unknowns:
        vals = ' '.join(sorted(unknowns))
        print(f'\tUnknown labels: {vals}')
    if not unknownPerFeat and not unknowns:
        print('\tAll feature values known')
    print('Done')
Example 7
error_limit = 10

kqFile = '{}/ketivqere.txt'.format(thisSource)

ln = 0
can = 0
cur_label = None
with open(kqFile) as kqHandle:
    for line in kqHandle:
        ln += 1
        can += 1
        vlab = line[0:10]  # fixed-width verse label
        fields = line.rstrip('\n')[10:].split()
        (ketiv, qere) = fields[0:2]
        (qtrim, qtrailer) = Transcription.suffix_and_finales(qere)
        vnode = nodeFromLabel.get(vlab, None)
        if vnode is None:
            notFound.add(vlab)
            continue
        verseInfo[vnode].append((ketiv, qtrim, qtrailer))
utils.caption(0, '\tRead {} ketiv-qere annotations'.format(ln))

data = []

for vnode in verseInfo:
    wlookup = collections.defaultdict(list)
    wvisited = collections.defaultdict(lambda: -1)