Python SFMTools.SFMRecordReader 예제들, SFMUtils.SFMTools.SFMRecordReader Python 예제들

예제 #1

0

파일 보기

파일: SFMToolsUgly.py 프로젝트: davidbaines/PySfm

def check_duri(
        path='D:/files/aa-synced/jon/otherproj/kari-valkama/Duri key terms-UTF8.sfm'
):
    '''Report how safe it is to merge consecutive split-sense entries.

    The file had multiple senses split into consecutive entries. For every
    record whose sn value is something other than '1', the record counts as
    "safe" when the value of its first field equals that of the previous
    record, and "unsafe" otherwise. Totals are printed at the end.

    path -- SFM file to check; defaults to the original Duri key-terms file,
            so existing no-argument calls behave as before.
    '''
    print('Checking the Duri file for merge safety...')
    prev_lx, safe, unsafe = '', 0, 0
    with open(path, encoding='utf-8') as infile:
        for rec in S.SFMRecordReader(infile, S.RECORD_MARKER):
            # Value of the record's first field (presumably the lx headword).
            lx = rec.as_lists()[0][1]
            # NOTE(review): assumes find_first() always returns something
            # indexable, even when the record has no sn field -- confirm.
            sn = rec.find_first('sn')[1]
            if not sn:
                print('WARNING: no sn field under lx {}'.format(lx))
            elif sn != '1':
                # an explicitly numbered sense (2 or higher); safe to merge?
                if lx == prev_lx:
                    safe += 1
                else:
                    unsafe += 1
            prev_lx = lx  # move the sliding window
        print("Done. {} are safe to merge; {} are unsafe.".format(
            safe, unsafe))

예제 #2

0

파일 보기

파일: SFMToolsUgly.py 프로젝트: davidbaines/PySfm

def check_homographs(markers=WORDS):
    '''Index the lexicon, number any homographs found, and write the result to OUTFILE.

    Author's stated intent: add homograph numbers as needed but leave alone any explicit
    numbers that are already there; print a message if explicit numbers conflict (e.g. two
    explicitly identical lx's; explicitly identical se's are probably fine). Homographs may
    be given as a number suffix on lx and se, or as hm under lx.

    What this function itself does: read every record from INFILE, build the word index with
    identify_homographs(), hand each word that has more than one index entry to
    add_hom_to_word(), then write the header and all records to OUTFILE.
    '''
    # Load the whole file up front; the records are revisited after indexing.
    with open(INFILE, encoding='utf-8') as source:
        reader = S.SFMRecordReader(source, REC_MKR)
        header = reader.header
        sfm_records = list(reader)

    # First pass: index everything.
    index = identify_homographs(sfm_records, markers)

    for word in index:
        if len(index[word]) <= 1:
            continue  # a single occurrence is not a homograph
        print("HOMOGRAPHS of {}:\n{}".format(word, index[word]))
        add_hom_to_word(word, index[word], sfm_records)

    with open(OUTFILE, mode='w', encoding='utf-8') as sink:
        sink.write(header)
        sink.writelines(rec.as_string() for rec in sfm_records)

예제 #3

0

파일 보기

파일: SFMToolsUgly.py 프로젝트: davidbaines/PySfm

def run_sample():
    '''Convert lexicon-sample.txt into lexicon-sample-conv.txt and print statistics.

    For each record, a blank ps field and a blank sn field are inserted via
    insert_field_between() with the marker groups given below, and records that
    contain a dn or de field are counted. Afterwards ApplyRE.py is run on the
    converted file through the shell.
    '''
    print('RUNNING THE SAMPLE...')
    with open('lexicon-sample.txt', encoding='utf-8') as infile, \
            open('lexicon-sample-conv.txt', mode='w',
                 encoding='utf-8') as outfile:
        reader = S.SFMRecordReader(infile, S.RECORD_MARKER)
        total, with_def, ps_added, sn_added = 0, 0, 0, 0
        outfile.write(reader.header)
        for total, entry in enumerate(reader, start=1):
            entry.as_lists()
            if entry.find_first(['dn', 'de']) > -1:
                with_def += 1  # empty definitions are counted as well
            # Blank ps between the headword group and the sense/gloss/def group.
            ps_added += entry.insert_field_between(
                ('lx', 'se', 'hm', 'lc'),
                ('sn', 'ge', 'de', 'gn', 'dn'),
                ('ps', '\n'))
            # Blank sn between the part-of-speech group and the gloss/def group.
            sn_added += entry.insert_field_between(
                ('ps', 'pn'),
                ('ge', 'de', 'gn', 'dn'),
                ('sn', '\n'))
            outfile.write(entry.as_string())

    summary = '{} out of {} total records contained a definition field.'
    print(summary.format(with_def, total))
    ps_note = 'Inserted {} blank \\ps fields directly between headword and sn/gloss/def.'
    print(ps_note.format(ps_added))
    sn_note = 'Inserted {} blank \\sn fields directly between POS and gloss/def.'
    print(sn_note.format(sn_added))
    print('Success.')

    # CHAINING: the converted file can be post-processed in place with one or
    # more regex files.
    import os
    os.system(
        'python ApplyRE.py lexicon-sample-conv.txt lexicon-sample-conv.txt regex-sample.txt -o'
    )

예제 #4

0

파일 보기

파일: SFMPS.py 프로젝트: davidbaines/PySfm

def execute(args):
    '''Split entries at subentries, process each chunk with the selected function, save the output.

    args is indexed by 'infile', 'outfile', 'pushpsdown', 'undopush' and 'copyps'.
    When 'outfile' is empty the output name is the input name plus OUTFILE_EXT.
    If several mode flags are set, the last one checked wins
    (pushpsdown, then undopush, then copyps); with none set, selective_copy is used.

    NOTE(review): the progress message formats EDGES while the split itself uses SES --
    confirm these are meant to describe the same markers.
    '''
    def split_out_subentries(sfm_records):
        '''Return a flat list holding every piece produced by record.split(SES).

        Fields at the end of a record simply stay with the last subentry, if one exists.
        (That's safe to do in this case.)
        '''
        chunks = []
        for entry in sfm_records:
            chunks.extend(entry.split(SES))
        return chunks

    banner = (
        'Enter SFMPS.py -h to learn the command line options.',
        '===== This script is intended to help bring ps and sn into a consistent relationship. It targets a one-to-one relationship, to work around an FLEx import problem (https://jira.sil.org/browse/LT-9353).',
        "It's best to follow standard MDF (ps as parent of sn), but you can also do sn above ps.",
        "SUGGESTION: give ALL your empty ps an 'unknown' value, to work around a FLEx import issue: https://jira.sil.org/browse/LT-10739.",
        "\nWARNING: use at your own risk, and check the output with a diff tool such as WinMerge or KDiff3.=====\n",
    )
    for line in banner:
        print(line)

    in_fname = args['infile']
    out_fname = args['outfile'] or in_fname + OUTFILE_EXT

    # Pick the processing function from the flags; later rows override earlier ones.
    func, msg = selective_copy, HELP_COPY
    modes = (
        ('pushpsdown', selective_push, HELP_PUSH),
        ('undopush', undo_push, HELP_UNDO_PUSH),
        ('copyps', selective_copy, HELP_COPY),
    )
    for flag, candidate, description in modes:
        if args[flag]:
            func, msg = candidate, description

    with open(in_fname, encoding='utf-8') as infile:
        reader = sfm.SFMRecordReader(infile, REC_MKR)
        progress = "Splitting each entry into 'word' chunks wherever subentries ({}) are found..."
        print(progress.format(EDGES))
        recs = split_out_subentries(reader)
        print("Running the selected function (described below)...\n" + msg)

        with open(out_fname, mode='w', encoding='utf-8') as outfile:
            outfile.write(reader.header)
            outfile.writelines(func(chunk) for chunk in recs)

    print('Done. Output saved to this file: {}'.format(out_fname))

예제 #5

0

파일 보기

from SFMUtils import SFMTools as S
from SFMUtils import SFMToolsUgly as sfm

# One-off repair script: walks the records of a damaged file (fnamein) in
# parallel with an older source file (fnamesrc); the result is meant to go to
# fnameout. NOTE(review): this excerpt stops partway through the loop body, so
# the actual repair logic is not visible here.
if __name__ == '__main__':
    # The first fnamesrc assignment is immediately overridden by the second one.
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\what janet sent erik 1\2009-03 kamustado given to Erik March 2009 - no dt - sort.db'
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\kamustado-from-erik\2009-06-30-kamustado-from-erik-jv-no-dt.db'
    fnamein = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed7.db'
    fnameout = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed8tmp.db'
    print("Replacing corrupted se back from dn to se ...")

    with open(fnamein, encoding='utf-8') as infile:
        with open(fnamesrc, encoding='utf-8') as srcfile:
            with open(fnameout, mode='w', encoding='utf-8') as outfile:

                # Load the reference (source) file completely; records start at "lx".
                srcrecords = list(S.SFMRecordReader(srcfile, "lx"))
                srclexemes = sfm.get_lexemes(srcrecords)

                # Load the file to be repaired and copy its header to the output.
                rec = S.SFMRecordReader(infile, "lx")
                outfile.write(rec.header)
                records = list(rec)
                lexemes = sfm.get_lexemes(records)  # just for convenience

                # r indexes records, s indexes srcrecords; both advance together
                # and the loop stops as soon as either list is exhausted.
                r, s = -1, -1
                while r < len(records):
                    r += 1
                    s += 1
                    if r >= len(records) or s >= len(srcrecords):
                        break

                    # get the record in broken-down form
                    record = sfm.break_record(records[r])

예제 #6

0

파일 보기

파일: SFMMinor.py 프로젝트: davidbaines/PySfm

def execute():
    '''Tag minor-entry links with back-reference markers and split selected entries out.

    Reads INFILE and writes INFILE + '.out.txt'. For each record that has a
    MINOR_MKRS field, the linked main entries are looked up in the index from
    sfm.build_indexes(); when a main entry mentions the minor entry under one
    of the BACKREF_MKRS, that marker is appended to the minor entry's link
    marker (unless it already ends with it). Records containing a SPLIT_OUT
    field are written to INFILE.<marker>.txt instead of the main output.

    All split-out files are managed by an ExitStack so they are closed even if
    processing raises part-way through.
    '''
    import contextlib  # function-scope import keeps this block self-contained

    in_fname = INFILE
    out_fname = INFILE + '.out.txt'
    # One side file per SPLIT_OUT marker (SPLIT_OUT can be a list of any
    # length; a falsy value simply means nothing is split out).
    out_fnames = {
        mkr: INFILE + '.' + mkr + '.txt'
        for mkr in (SPLIT_OUT or ())
    }
    out_files = {}

    with contextlib.ExitStack() as stack:
        for mkr, fn in out_fnames.items():
            f = stack.enter_context(open(fn, mode='w', encoding='utf-8'))
            out_files[mkr] = f
            f.write(
                "File created by SFMMinor.py while splitting out minor entries marked by {}\n\n"
                .format(mkr))

        with open(in_fname, encoding='utf-8') as infile:
            sfm_records = sfm.SFMRecordReader(infile)
            recs = list(sfm_records)  # load entire file into memory
            entries, _entries_stripped = sfm.build_indexes(
                recs, exclude_fields=DONT_INDEX_IF)

            with open(out_fname, mode='w', encoding='utf-8') as outfile:
                outfile.write(sfm_records.header)

                for rec in recs:
                    r = rec.as_lists()
                    is_minor = rec.find_first(MINOR_MKRS)
                    if is_minor:
                        minlx = r[0][1].strip()
                        report = "Probable minor entry {} identified due to link {}".format(
                            minlx, str(is_minor))
                        # Strip it down to ASCII so the console can handle it,
                        # but keep it a str so print() does not show b'...'.
                        report = report.encode('ascii',
                                               'replace').decode('ascii')
                        mn = is_minor[1].strip()
                        # NOTE(review): entries[mn] presumably raises if the
                        # link target is not indexed -- confirm against
                        # build_indexes().
                        matches = [main[2] for main in entries[mn]]
                        if len(matches) > 1:
                            print("ERROR: ambiguous link.")
                            print(report)
                        for main in matches:
                            for bref in BACKREF_MKRS:
                                found = main.find_values(bref)
                                if minlx in found:
                                    # The main entry mentions this minor entry
                                    # under bref; record that on the link marker.
                                    if not is_minor[0].endswith(bref):
                                        is_minor[0] += bref
                                    break  # stop at the first matching marker

                    s = rec.as_string()
                    if SPLIT_OUT:
                        found = rec.find_first(SPLIT_OUT)
                        if found:
                            # write the entry to the appropriate separate file
                            # and leave it out of the main output
                            out_files[found[0]].write(s)
                            s = ''
                    outfile.write(s)

    print("Done writing to file {}".format(out_fname))
    for mkr in out_fnames:
        print("Done writing to file {}".format(out_fnames[mkr]))

예제 #7

0

파일 보기

파일: SFMToolsUgly.py 프로젝트: davidbaines/PySfm

def variants_as_minor():
    '''Check each variant (va) field of the main entries against the existing minor entries.

    Supports variants of lx and se, but not of sn.

    Reads all records from INFILE. Records containing an mn field are copied to OUTFILE;
    records containing an mnse field are written to OUTFILE_MINOR only; every other record
    is treated as a main entry, has its VA fields examined, and is then copied to OUTFILE
    (the visible code does not modify it). For a va with no entry in mnD or mnseD, a
    suggested "\\lx <va> / \\mn <se>" minor entry is collected when an se field is found
    above the va. The collected suggestions are printed at the end, not written to a file.
    '''
    with open(INFILE, encoding='utf-8') as infile:
        sfm_rec = S.SFMRecordReader(infile, REC_MKR)
        header = sfm_rec.header
        sfm_records = list(sfm_rec)  # load entire file into memory

    # Need to be able to follow links. Do one quick pass to index everything.
    # NOTE(review): the six indexes are presumably dicts keyed by field value
    # (they are only queried with .get(va) here) -- confirm against build_indexes().
    lxD, seD, mnD, mnseD, vaD, vaDrev = build_indexes(sfm_records)
    to_add = ''  # accumulated text of suggested minor entries

    with open(OUTFILE, mode='w', encoding='utf-8') as outfile:
        outfile.write(header)
        with open(OUTFILE_MINOR, mode='w', encoding='utf-8') as outfile_minor:

            for rec in sfm_records:
                if rec.find(['mn']):  # minor entry (variant)
                    outfile.write(rec.as_string())
                elif rec.find(['mnse']):
                    # minor entry: complex form
                    outfile_minor.write(rec.as_string())  # omit from outfile
                else:
                    # main entry
                    r = rec.as_lists()
                    lx = r[0][1].strip()  # not used by the visible code below
                    vas = rec.find([VA])
                    ses = rec.find('se')  # not used by the visible code below
                    lxse = rec.find_values(['lx', 'se'])
                    while vas:
                        # look at each va, in reverse order
                        # (each item unpacks to a marker and an index into r)
                        _mkr, i = vas.pop()

                        va = r[i][1].strip()
                        print('lx or se {}, with va {}: '.format(lxse, va))
                        mn = mnD.get(va)
                        mnse = mnseD.get(va)
                        if mn:
                            if mn in lxse:
                                # The minor entry points back at this lx/se: nothing to do.
                                pass
                                # print('- match: Minor entry pointing to {}. Matched! va {} '.format(mn, va))
                                # r[i][0] = 'cfva'  # Disabling va to cfva for
                            else:
                                print(
                                    '- min diff: Minor entry {} found, but it points to a different target: {} .'
                                    .format(va, mn))
                        elif mnse:
                            print(
                                'Error?? found an lx matching va {} that contains an mnse field: mnse {}'
                                .format(va, mnse))
                        else:
                            print(
                                "- no min: No matching minor entry for va {} !"
                                .format(va))
                            #TODO: No!! Don't use lx. In ses, find the first se above this va_se field; use that.
                            # find_above() result is used as an index into r when truthy.
                            se = find_above('se', r, i)
                            if se:
                                se = r[se][1].strip()
                                tmp = '\\lx {}\n\\mn {}\n\n'.format(va, se)
                                print("Will add this minor entry: {}".format(
                                    tmp))
                                to_add += tmp
                            # Diagnostics: show whatever the other indexes hold for this va.
                            tmp = lxD.get(va)
                            if tmp: print("- - lxD({}): {}".format(va, tmp))
                            tmp = seD.get(va)
                            if tmp: print("- - seD({}): {}".format(va, tmp))
                            tmp = vaD.get(va)
                            if tmp: print("- - vaD({}): {}".format(va, tmp))
                            tmp = vaDrev.get(va)
                            if tmp: print("- - vaDrev({}): {}".format(va, tmp))


                            # if (res2 != lx): pass  #TODO: ??
                    outfile.write(rec.as_string())

    print('PLEASE INSERT THESE minor entries into the file: ')
    print(to_add)