def check_duri():
    # This file had multiple senses split into consecutive entries.
    # The script checks how safe it is to merge these.
    print('Checking the Duri file for merge safety...')
    prev_lx, safe, unsafe = '', 0, 0
    with open('D:/files/aa-synced/jon/otherproj/kari-valkama/Duri key terms-UTF8.sfm',
              encoding='utf-8') as infile:
        sfm_records = S.SFMRecordReader(infile, S.RECORD_MARKER)
        for rec in sfm_records:
            r = rec.as_lists()
            lx = r[0][1]
            sn = rec.find_first('sn')[1]
            if not sn:
                print('WARNING: no sn field under lx {}'.format(lx))
            if sn and sn != '1':
                # An explicitly numbered sense (2 or higher); safe to merge?
                if lx == prev_lx:
                    safe += 1
                else:
                    unsafe += 1
                    # print('WARNING: {} != {}'.format(lx, prev_lx))
            prev_lx = lx  # move the sliding window
    print("Done. {} are safe to merge; {} are unsafe.".format(safe, unsafe))

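# A sketch of the pattern check_duri counts as safe (hypothetical data): a
# split entry whose second record repeats the headword and carries an explicit
# sense number of 2 or higher can be merged back into a single entry.
#
#   \lx gaja            \lx gaja
#   \sn 1               \sn 1
#   \ge elephant   -->  \ge elephant
#                       \sn 2
#   \lx gaja            \ge chess piece
#   \sn 2
#   \ge chess piece
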
def safe_add(key, val, d, label=''):
    ''' Adds a value to the dictionary d, or prints a message to the screen
    and leaves the existing value alone if the key is already present.
    '''
    if key in d:
        print("{} Already has an item for key {}.".format(label, S.ascii(key)))
    else:
        d[key] = val

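# A minimal usage sketch for safe_add; the key/value strings below are made up.
def _demo_safe_add():
    index = {}
    safe_add('balo', 'entry 1', index, label='lx index:')
    # A duplicate key prints a message and leaves the first value in place.
    safe_add('balo', 'entry 2', index, label='lx index:')
    assert index['balo'] == 'entry 1'
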
def __self_test():
    ''' A very minimal test harness. Not very good, and really only tests
    break_field() so far.
    '''
    print('Begin self-test...')
    lines = ['\\lx test\n', '\\sn\n\n1 \n']
    r = S.SFMRecord(lines).as_lists()
    __assert(r[0][0] == 'lx', '1 lx')
    __assert(r[0][1] == 'test\n', '1 lx val')
    __assert(r[1][0] == 'sn', '2 sn (known limitation)')
    __assert(r[1][1] == '\n\n1 \n', '2 sn val (known limitation)')

    lines = ['\\l test\n', '\\sn\n\n', '\\de blah\n']
    r = S.SFMRecord(lines).as_lists()
    __assert(r[0][0] == 'l', '1 l')
    __assert(r[0][1] == 'test\n', '1 l val')
    __assert(r[1][0] == 'sn', '2 sn again')
    __assert(r[1][1] == '\n\n', '2 sn val again')
    __assert(r[2][0] == 'de', '3 de')
    __assert(r[2][1] == ' blah\n', '3 de val')
    print('Done with self-test.')

def get_lexemes(records):
    '''Given a list of lexical records, returns a list of lexemes. Assumes the
    first field contains the lexeme form. Does not retrieve homograph numbers,
    nor subentries, variants, etc.
    '''
    lex = []
    for r in records:
        # Get the data portion of the first field.
        f = S.break_field(r[0])
        lex.append(f[1].strip())
    return lex

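# A quick sketch of get_lexemes, assuming each record is a list of raw field
# lines whose first line holds the lexeme (the sample entries are invented).
def _demo_get_lexemes():
    records = [['\\lx aba\n', '\\ge father\n'],
               ['\\lx balo\n', '\\ge dog\n']]
    print(get_lexemes(records))  # expected output: ['aba', 'balo']
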
def push_ps_down(record):
    ''' If non-empty ps occurs above one or more sn fields within scope, copy
    it down and then delete it. If no sn lines are found below the ps line,
    insert an sn immediately above it.
    (Formerly, when finished, it would delete ps if it's empty. That would
    help simplify messy data, but it's not a good idea yet due to this FLEx
    issue: LT-10739.)
    '''
    rec = record.as_lists()
    offset = 0
    lxval = rec[0][1].strip()
    lxval = sfm.ascii(lxval)  # so non-Unicode consoles won't crash
    # Find each occurrence of ps and deal with it.
    pss = record.find(PS, 1)
    for _m, ps in pss:
        i = ps + offset
        if not rec[i][1].strip():
            rec[i][1] = EMPTY_PS
        mrk, val = rec[i]
        val = val.strip()
        if mrk != PS:
            raise Exception(
                'Programming error! Lost track of where the {} fields are.'.format(PS))
        sns = record.find(SN, i + 1, EDGES)
        if sns and val:
            debug('Push: For word {} near line {}, pushing {} (at {}) down to {} sense(s)'
                  .format(lxval, record.location, PS, i, len(sns)))
            offset2 = 0
            for _m, j in sns:
                rec.insert(j + offset2 + 1, rec[i])
                offset2 += 1
            offset += offset2
            val = ''  # mark the original ps for deletion
        elif val:
            debug('Insert: For word {} near line {}. No {} fields found. '
                  'Inserting a new {} above {} instead.'
                  .format(lxval, record.location, SN, SN, PS))
            rec.insert(i, [SN, EMPTY_SN])
            offset += 1
        if not val:  # delete the original ps
            # print('Delete: for word {} near line {}, deleting ps (at {})'.format(lxval, record.location, i))
            rec.pop(i)
            offset -= 1
    return record.as_string()

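# Illustration of what push_ps_down aims to do (hypothetical data; the actual
# markers depend on the PS, SN, and EDGES constants configured above): the ps
# is copied down below each sense and the original is removed.
#
#   Before:            After:
#   \lx kita           \lx kita
#   \ps v              \sn 1
#   \sn 1              \ps v
#   \ge see            \ge see
#   \sn 2              \sn 2
#   \ge look           \ps v
#                      \ge look
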
def check_homographs(markers=WORDS):
    ''' Add homograph numbers as needed, but leave alone any explicit numbers
    that are already there. Print a message if any of those explicit numbers
    conflict (e.g. two explicitly identical lx's; explicitly identical se's
    are probably fine).
    Supports homograph-as-number-suffix for lx and se; also supports hm under lx.
    '''
    with open(INFILE, encoding='utf-8') as infile:
        # Load the entire file.
        sfm_rec = S.SFMRecordReader(infile, REC_MKR)
        header = sfm_rec.header
        sfm_records = list(sfm_rec)

    # Do one pass to index everything.
    words = identify_homographs(sfm_records, markers)
    for key in words:
        if len(words[key]) > 1:  # there are homographs
            print("HOMOGRAPHS of {}:\n{}".format(key, words[key]))
            add_hom_to_word(key, words[key], sfm_records)

    with open(OUTFILE, mode='w', encoding='utf-8') as outfile:
        outfile.write(header)
        for rec in sfm_records:
            outfile.write(rec.as_string())

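# A sketch of the intended numbering (hypothetical data, inferred from the
# docstring's homograph-as-number-suffix convention): two entries sharing the
# form 'tandu' get number suffixes, while an entry that already carries an
# explicit number is left alone.
#
#   \lx tandu        \lx tandu1
#   ...         -->  ...
#   \lx tandu        \lx tandu2
#   ...              ...
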
def split_record(record, mkrs):
    ''' Given a record of type SFMRecord and a list of root markers, split it
    into multiple records. mkrs would typically be ['lx', 'se'], or just ['lx'].
    '''
    words = record.find(mkrs)
    if len(words) <= 1:  # only one word; nothing to split
        return [record]
    r = record.as_lists()
    recs = []
    for i, (_mkr, begin) in enumerate(words):
        # Each chunk runs up to the next root marker (or to the end of the record).
        end = words[i + 1][1] if i + 1 < len(words) else len(r)
        recs.append(sfm.SFMRecord(r[begin:end]))
    return recs

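# A usage sketch for split_record, assuming sfm.SFMRecord can be constructed
# from a list of raw lines (the sample entry is invented).
def _demo_split_record():
    lines = ['\\lx bola\n', '\\ge ball\n',
             '\\se bola lampu\n', '\\ge light bulb\n']
    for piece in split_record(sfm.SFMRecord(lines), ['lx', 'se']):
        print(piece.as_string())  # expected: one record per headword
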
def run_sample():
    print('RUNNING THE SAMPLE...')
    with open('lexicon-sample.txt', encoding='utf-8') as infile:
        with open('lexicon-sample-conv.txt', mode='w', encoding='utf-8') as outfile:
            sfm_records = S.SFMRecordReader(infile, S.RECORD_MARKER)
            count, have_def_field, ps_inserted, sn_inserted = 0, 0, 0, 0
            outfile.write(sfm_records.header)
            for rec in sfm_records:
                count += 1
                rec.as_lists()
                if rec.find_first(['dn', 'de']) > -1:
                    have_def_field += 1  # empty defs will be counted too
                ps_inserted += rec.insert_field_between(
                    ('lx', 'se', 'hm', 'lc'),
                    ('sn', 'ge', 'de', 'gn', 'dn'),
                    ('ps', '\n'))
                sn_inserted += rec.insert_field_between(
                    ('ps', 'pn'),
                    ('ge', 'de', 'gn', 'dn'),
                    ('sn', '\n'))
                outfile.write(rec.as_string())
    print('{} out of {} total records contained a definition field.'.format(
        have_def_field, count))
    print('Inserted {} blank \\ps fields directly between headword and sn/gloss/def.'
          .format(ps_inserted))
    print('Inserted {} blank \\sn fields directly between POS and gloss/def.'
          .format(sn_inserted))
    print('Success.')

    # CHAINING: you can take that output (lexicon-sample-conv.txt) and apply
    # one or more regex files to it.
    import os
    os.system('python ApplyRE.py lexicon-sample-conv.txt lexicon-sample-conv.txt regex-sample.txt -o')

def execute(args):
    ''' Split out any subentries, then pass each (sub)entry into the chosen
    function for processing.
    '''

    def split_out_subentries(sfm_records):
        ''' Chop entries more finely, so that each subentry is its own record
        for now. Fields at the end of a record will simply be treated as part
        of the last subentry, if one exists. (That's safe to do in this case.)
        '''
        recs = []
        for record in sfm_records:
            recs.extend(record.split(SES))
        return recs

    print('Enter SFMPS.py -h to learn the command line options.')
    print('===== This script is intended to help bring ps and sn into a '
          'consistent relationship. It targets a one-to-one relationship, to '
          'work around a FLEx import problem (https://jira.sil.org/browse/LT-9353).')
    print("It's best to follow standard MDF (ps as parent of sn), but you can "
          'also do sn above ps.')
    print("SUGGESTION: give ALL your empty ps an 'unknown' value, to work around "
          'a FLEx import issue: https://jira.sil.org/browse/LT-10739.')
    print('\nWARNING: use at your own risk, and check the output with a diff '
          'tool such as WinMerge or KDiff3.=====\n')

    in_fname = args['infile']
    out_fname = args['outfile']
    if not out_fname:
        out_fname = in_fname + OUTFILE_EXT

    # Decide what to do based on the passed parameters.
    func = selective_copy
    msg = HELP_COPY
    if args['pushpsdown']:
        func = selective_push
        msg = HELP_PUSH
    if args['undopush']:
        func = undo_push
        msg = HELP_UNDO_PUSH
    if args['copyps']:
        func = selective_copy
        msg = HELP_COPY

    with open(in_fname, encoding='utf-8') as infile:
        sfm_records = sfm.SFMRecordReader(infile, REC_MKR)
        print("Splitting each entry into 'word' chunks wherever subentries ({}) "
              'are found...'.format(EDGES))
        recs = split_out_subentries(sfm_records)
        print('Running the selected function (described below)...\n' + msg)
        with open(out_fname, mode='w', encoding='utf-8') as outfile:
            outfile.write(sfm_records.header)
            for record in recs:
                outfile.write(func(record))
                # outfile.write('\n\n=====\n')
                # outfile.write(record.as_string())  # to see the subentry breaks
    print('Done. Output saved to this file: {}'.format(out_fname))

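# Illustration of split_out_subentries (hypothetical entry, assuming SES names
# the subentry markers, typically se): each subentry starts a new chunk, and
# fields after the last subentry stay with that chunk.
#
#   \lx bola            chunk 1:  \lx bola, \ge ball
#   \ge ball       -->  chunk 2:  \se bola lampu, \ge light bulb, \dt 2010
#   \se bola lampu
#   \ge light bulb
#   \dt 2010
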
def execute():
    in_fname = INFILE
    out_fname = INFILE + '.out.txt'

    # Generate output filenames and open them (SPLIT_OUT can be a list of any length).
    out_fnames = {x: INFILE + '.' + x + '.txt' for x in SPLIT_OUT}
    out_files = {}
    for o, fn in out_fnames.items():
        f = open(fn, mode='w', encoding='utf-8')
        out_files[o] = f
        f.write('File created by SFMMinor.py while splitting out minor entries '
                'marked by {}\n\n'.format(o))

    with open(in_fname, encoding='utf-8') as infile:
        sfm_records = sfm.SFMRecordReader(infile)
        recs = list(sfm_records)  # load the entire file into memory
        entries, _entries_stripped = sfm.build_indexes(recs, exclude_fields=DONT_INDEX_IF)
        with open(out_fname, mode='w', encoding='utf-8') as outfile:
            outfile.write(sfm_records.header)
            for rec in recs:
                r = rec.as_lists()
                is_minor = rec.find_first(MINOR_MKRS)
                if is_minor:
                    minlx = r[0][1].strip()
                    report = 'Probable minor entry {} identified due to link {}'.format(
                        minlx, str(is_minor))
                    # Strip it down to ASCII so the console can handle it.
                    report = report.encode('ascii', 'replace')
                    # print(report)
                    mn = is_minor[1].strip()
                    matches = [main[2] for main in entries[mn]]
                    if len(matches) > 1:
                        print('ERROR: ambiguous link.')
                        print(report)
                    for main in matches:
                        for bref in BACKREF_MKRS:
                            found = main.find_values(bref)
                            if minlx in found:
                                # The following would need to be stripped down to ASCII:
                                # print("  Main entry found ({}); it mentions the minor entry here: {} {}".format(main.as_lists()[0], bref, str(found)))
                                if not is_minor[0].endswith(bref):
                                    # print("  Updating the minor entry by appending {} to marker {}.".format(bref, is_minor[0]))
                                    is_minor[0] += bref
                                    # print(rec.as_string())
                                break
                s = rec.as_string()
                if SPLIT_OUT:
                    found = rec.find_first(SPLIT_OUT)
                    if found:
                        f = out_files[found[0]]
                        # print("  Removing entry {} from the main file since it contains field {}; saving it in {} instead".format(rec.as_lists()[0], found, f))
                        # Write the minor entry out to the appropriate separate file.
                        f.write(s)
                        s = ''
                outfile.write(s)
    print('Done writing to file {}'.format(out_fname))
    for o in out_files:
        out_files[o].close()
        print('Done writing to file {}'.format(out_fnames[o]))

from SFMUtils import SFMTools as S
from SFMUtils import SFMToolsUgly as sfm

if __name__ == '__main__':
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\what janet sent erik 1\2009-03 kamustado given to Erik March 2009 - no dt - sort.db'
    # The assignment below overrides the path above.
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\kamustado-from-erik\2009-06-30-kamustado-from-erik-jv-no-dt.db'
    fnamein = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed7.db'
    fnameout = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed8tmp.db'
    print('Replacing corrupted se back from dn to se ...')
    with open(fnamein, encoding='utf-8') as infile:
        with open(fnamesrc, encoding='utf-8') as srcfile:
            with open(fnameout, mode='w', encoding='utf-8') as outfile:
                srcrecords = list(S.SFMRecordReader(srcfile, 'lx'))
                srclexemes = sfm.get_lexemes(srcrecords)
                rec = S.SFMRecordReader(infile, 'lx')
                outfile.write(rec.header)
                records = list(rec)
                lexemes = sfm.get_lexemes(records)  # just for convenience
                r, s = -1, -1
                while r < len(records):
                    r += 1
                    s += 1
                    if r >= len(records) or s >= len(srcrecords):
                        break
                    # Get the record in broken-down form.
                    record = sfm.break_record(records[r])

def variants_as_minor():
    '''Supports variants of lx and se, but not of sn. '''
    with open(INFILE, encoding='utf-8') as infile:
        sfm_rec = S.SFMRecordReader(infile, REC_MKR)
        header = sfm_rec.header
        sfm_records = list(sfm_rec)  # load the entire file into memory

    # Need to be able to follow links. Do one quick pass to index everything.
    lxD, seD, mnD, mnseD, vaD, vaDrev = build_indexes(sfm_records)
    to_add = ''
    with open(OUTFILE, mode='w', encoding='utf-8') as outfile:
        outfile.write(header)
        with open(OUTFILE_MINOR, mode='w', encoding='utf-8') as outfile_minor:
            for rec in sfm_records:
                if rec.find(['mn']):  # minor entry (variant)
                    outfile.write(rec.as_string())
                elif rec.find(['mnse']):  # minor entry: complex form
                    outfile_minor.write(rec.as_string())  # omit from outfile
                else:  # main entry
                    r = rec.as_lists()
                    lx = r[0][1].strip()
                    vas = rec.find([VA])
                    ses = rec.find('se')
                    lxse = rec.find_values(['lx', 'se'])
                    while vas:  # look at each va, in reverse order
                        _mkr, i = vas.pop()
                        va = r[i][1].strip()
                        print('lx or se {}, with va {}: '.format(lxse, va))
                        mn = mnD.get(va)
                        mnse = mnseD.get(va)
                        if mn:
                            if mn in lxse:
                                pass
                                # print('- match: Minor entry pointing to {}. Matched! va {} '.format(mn, va))
                                # r[i][0] = 'cfva'  # disabling va-to-cfva for now
                            else:
                                print('- min diff: Minor entry {} found, but it points to a '
                                      'different target: {} .'.format(va, mn))
                        elif mnse:
                            print('Error?? found an lx matching va {} that contains an mnse '
                                  'field: mnse {}'.format(va, mnse))
                        else:
                            print('- no min: No matching minor entry for va {} !'.format(va))
                            # TODO: No!! Don't use lx. In ses, find the first se above this
                            # va field; use that.
                            se = find_above('se', r, i)
                            if se:
                                se = r[se][1].strip()
                                tmp = '\\lx {}\n\\mn {}\n\n'.format(va, se)
                                print('Will add this minor entry: {}'.format(tmp))
                                to_add += tmp
                            tmp = lxD.get(va)
                            if tmp:
                                print('- - lxD({}): {}'.format(va, tmp))
                            tmp = seD.get(va)
                            if tmp:
                                print('- - seD({}): {}'.format(va, tmp))
                            tmp = vaD.get(va)
                            if tmp:
                                print('- - vaD({}): {}'.format(va, tmp))
                            tmp = vaDrev.get(va)
                            if tmp:
                                print('- - vaDrev({}): {}'.format(va, tmp))
                            # if res2 != lx: pass  # TODO: ??
                    outfile.write(rec.as_string())
    print('PLEASE INSERT THESE minor entries into the file: ')
    print(to_add)

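# The suggested minor entries printed at the end follow this shape (the forms
# here are placeholders): the variant becomes a headword that points back at
# the lx or se it was found under.
#
#   \lx <variant form>
#   \mn <headword or subentry it varies from>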