def check_duri(
        path='D:/files/aa-synced/jon/otherproj/kari-valkama/Duri key terms-UTF8.sfm'):
    '''Report how safe it is to merge the split senses in the Duri file.

    The file had multiple senses split into consecutive entries. Each record
    whose sn is explicitly numbered something other than '1' is counted as
    "safe" when its lx equals the lx of the record just before it, and as
    "unsafe" otherwise. Records without an sn value only trigger a warning.

    path -- the SFM file to check; defaults to the original hard-coded
            location, so existing no-argument calls behave as before.
    '''
    print('Checking the Duri file for merge safety...')
    prev_lx, safe, unsafe = '', 0, 0
    with open(path, encoding='utf-8') as infile:
        sfm_records = S.SFMRecordReader(infile, S.RECORD_MARKER)
        for rec in sfm_records:
            r = rec.as_lists()
            # Field values elsewhere in these scripts carry surrounding
            # whitespace (a blank field is written as '\n'), so strip before
            # comparing; otherwise '1\n' would never equal '1'.
            lx = r[0][1].strip()
            # Guard the lookup: subscripting a "not found" result would raise
            # before the warning below could ever be printed.
            sn_field = rec.find_first('sn')
            sn = sn_field[1].strip() if sn_field else ''
            if not sn:
                print('WARNING: no sn field under lx {})'.format(lx))
            elif sn != '1':
                # an explicitly numbered sense (2 or higher); safe to merge?
                if lx == prev_lx:
                    safe += 1
                else:
                    unsafe += 1
            prev_lx = lx  # move the sliding window
    print("Done. {} are safe to merge; {} are unsafe.".format(safe, unsafe))
def check_homographs(markers=WORDS):
    '''
    Number homographs where needed, without touching explicit numbers.

    Existing explicit homograph numbers are to be left alone; a message is
    printed if any of them conflict (e.g. two explicitly identical lx's;
    explicitly identical se's are probably fine). Homograph-as-number-suffix
    is supported for lx and se, and hm is supported under lx.

    Reads INFILE, indexes the words found under the given markers, hands
    every group of more than one to add_hom_to_word, and writes the header
    plus all records to OUTFILE.
    '''
    # Load the entire file up front so records can be revised in memory.
    with open(INFILE, encoding='utf-8') as source:
        reader = S.SFMRecordReader(source, REC_MKR)
        header = reader.header
        sfm_records = list(reader)

    # One indexing pass over everything.
    words = identify_homographs(sfm_records, markers)
    for key in words:
        group = words[key]
        if len(group) <= 1:
            continue  # a single occurrence is not a homograph
        print("HOMOGRAPHS of {}:\n{}".format(key, group))
        add_hom_to_word(key, group, sfm_records)

    with open(OUTFILE, mode='w', encoding='utf-8') as sink:
        sink.write(header)
        sink.writelines(rec.as_string() for rec in sfm_records)
def run_sample():
    '''Convert lexicon-sample.txt into lexicon-sample-conv.txt and report counts.

    For each record, a blank ps field is requested between the headword
    markers and sn/gloss/def, and a blank sn field between POS and gloss/def.
    The values returned by insert_field_between are summed for the report.
    Afterwards ApplyRE.py is run over the converted file with regex-sample.txt.
    '''
    import os

    headword_mkrs = ('lx', 'se', 'hm', 'lc')
    pos_mkrs = ('ps', 'pn')
    gloss_def_mkrs = ('ge', 'de', 'gn', 'dn')

    print('RUNNING THE SAMPLE...')
    total = with_def = blank_ps = blank_sn = 0
    with open('lexicon-sample.txt', encoding='utf-8') as infile, \
            open('lexicon-sample-conv.txt', mode='w',
                 encoding='utf-8') as outfile:
        reader = S.SFMRecordReader(infile, S.RECORD_MARKER)
        outfile.write(reader.header)
        for total, rec in enumerate(reader, start=1):
            rec.as_lists()
            # Empty defs will be counted too
            if rec.find_first(['dn', 'de']) > -1:
                with_def += 1
            blank_ps += rec.insert_field_between(
                headword_mkrs, ('sn',) + gloss_def_mkrs, ('ps', '\n'))
            blank_sn += rec.insert_field_between(
                pos_mkrs, gloss_def_mkrs, ('sn', '\n'))
            outfile.write(rec.as_string())

    def_msg = '{} out of {} total records contained a definition field.'
    ps_msg = ('Inserted {} blank \\ps fields directly between headword and '
              'sn/gloss/def.')
    sn_msg = 'Inserted {} blank \\sn fields directly between POS and gloss/def.'
    print(def_msg.format(with_def, total))
    print(ps_msg.format(blank_ps))
    print(sn_msg.format(blank_sn))
    print('Success.')

    # CHAINING. The output (lexicon-sample-conv.txt) can be fed through one or
    # more regex files.
    os.system(
        'python ApplyRE.py lexicon-sample-conv.txt lexicon-sample-conv.txt '
        'regex-sample.txt -o')
def execute(args):
    '''
    Split out any subentries, then run the chosen function on each (sub)entry.

    args is indexed by 'infile', 'outfile', 'pushpsdown', 'undopush' and
    'copyps'. When several mode flags are set, copyps wins over undopush,
    which wins over pushpsdown; with none set the copy mode is used. If no
    output name is given, OUTFILE_EXT is appended to the input name.
    Note that the progress message interpolates EDGES while the actual split
    is done on SES.
    '''
    banner = (
        'Enter SFMPS.py -h to learn the command line options.',
        '===== This script is intended to help bring ps and sn into a '
        'consistent relationship. It targets a one-to-one relationship, to '
        'work around an FLEx import problem '
        '(https://jira.sil.org/browse/LT-9353).',
        "It's best to follow standard MDF (ps as parent of sn), but you can "
        "also do sn above ps.",
        "SUGGESTION: give ALL your empty ps an 'unknown' value, to work "
        "around a FLEx import issue: https://jira.sil.org/browse/LT-10739.",
        "\nWARNING: use at your own risk, and check the output with a diff "
        "tool such as WinMerge or KDiff3.=====\n",
    )
    for text in banner:
        print(text)

    in_fname = args['infile']
    out_fname = args['outfile'] or in_fname + OUTFILE_EXT

    # Decide what to do based on the passed parameters.
    push_down = args['pushpsdown']
    undo = args['undopush']
    copy_ps = args['copyps']
    if copy_ps or not (push_down or undo):
        func, msg = selective_copy, HELP_COPY
    elif undo:
        func, msg = undo_push, HELP_UNDO_PUSH
    else:
        func, msg = selective_push, HELP_PUSH

    with open(in_fname, encoding='utf-8') as infile:
        sfm_records = sfm.SFMRecordReader(infile, REC_MKR)
        print(
            "Splitting each entry into 'word' chunks wherever subentries ({}) are found..."
            .format(EDGES))
        # Chop entries more finely, so that each subentry is its own record
        # for now. Fields at the end of a record simply stay with the last
        # subentry, if one exists (the original note calls this safe here).
        recs = [
            piece
            for record in sfm_records
            for piece in record.split(SES)
        ]

    print("Running the selected function (described below)...\n" + msg)
    with open(out_fname, mode='w', encoding='utf-8') as outfile:
        outfile.write(sfm_records.header)
        for chunk in recs:
            outfile.write(func(chunk))
    print('Done. Output saved to this file: {}'.format(out_fname))
from SFMUtils import SFMTools as S
from SFMUtils import SFMToolsUgly as sfm

if __name__ == '__main__':
    # One-off repair script: walks a damaged lexicon (fnamein) in step with an
    # older source copy (fnamesrc) and writes the result to fnameout.
    # NOTE: the first fnamesrc value is dead; the next line overwrites it.
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\what janet sent erik 1\2009-03 kamustado given to Erik March 2009 - no dt - sort.db'
    fnamesrc = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\erik-from-kingston-sd-card\kamustado-from-erik\2009-06-30-kamustado-from-erik-jv-no-dt.db'
    fnamein = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed7.db'
    fnameout = r'D:\files\aa-synced\jon\aa-do-medium\tado\tado-saku\2009-05\erik\2010-03-10-kamustado-from-erik-no-dt-fixed8tmp.db'
    print("Replacing corrupted se back from dn to se ...")
    with open(fnamein, encoding='utf-8') as infile:
        with open(fnamesrc, encoding='utf-8') as srcfile:
            with open(fnameout, mode='w', encoding='utf-8') as outfile:
                # Both files are split into records on the "lx" marker and
                # loaded fully into memory.
                srcrecords = list(S.SFMRecordReader(srcfile, "lx"))
                srclexemes = sfm.get_lexemes(srcrecords)
                rec = S.SFMRecordReader(infile, "lx")
                # The output starts with the header of the file being repaired.
                outfile.write(rec.header)
                records = list(rec)
                lexemes = sfm.get_lexemes(records)  # just for convenience
                # r indexes records, s indexes srcrecords; both start at -1
                # because they are incremented at the top of each iteration.
                r, s = -1, -1
                while r < len(records):
                    r += 1
                    s += 1
                    # Stop as soon as either list is exhausted.
                    if r >= len(records) or s >= len(srcrecords):
                        break
                    # get the record in broken down form
                    # NOTE(review): the loop body appears to continue beyond
                    # what is visible here (nothing but the header is written
                    # yet) -- confirm against the full script.
                    record = sfm.break_record(records[r])
def execute():
    '''
    Tag probable minor entries and optionally split them into separate files.

    Reads INFILE and indexes its records. For every record holding one of
    MINOR_MKRS, the linked main entries are looked up; if a main entry lists
    the minor headword under one of BACKREF_MKRS, that back-reference marker
    is appended to the minor entry's link marker. Records containing a
    SPLIT_OUT field go to INFILE.<marker>.txt instead of the main output,
    INFILE.out.txt.

    The split-out files are closed even if processing fails part-way.
    '''
    in_fname = INFILE
    out_fname = INFILE + '.out.txt'

    # generate output filenames (SPLIT_OUT can be a list of any length)
    out_fnames = {x: INFILE + '.' + x + '.txt' for x in SPLIT_OUT}
    out_files = dict()
    try:
        for o in out_fnames:
            f = open(out_fnames[o], mode='w', encoding='utf-8')
            out_files[o] = f
            f.write(
                "File created by SFMMinor.py while splitting out minor entries marked by {}\n\n"
                .format(o))

        with open(in_fname, encoding='utf-8') as infile:
            sfm_records = sfm.SFMRecordReader(infile)
            recs = list(sfm_records)  # load entire file into memory
            entries, _entries_stripped = sfm.build_indexes(
                recs, exclude_fields=DONT_INDEX_IF)

        with open(out_fname, mode='w', encoding='utf-8') as outfile:
            outfile.write(sfm_records.header)
            for rec in recs:
                r = rec.as_lists()
                is_minor = rec.find_first(MINOR_MKRS)
                if is_minor:
                    minlx = r[0][1].strip()
                    report = "Probable minor entry {} identified due to link {}".format(
                        minlx, str(is_minor))
                    # Strip it down to ASCII so the console can handle it.
                    # Decode again so print() shows text, not a b'...' repr.
                    report = report.encode('ascii', 'replace').decode('ascii')
                    mn = is_minor[1].strip()
                    # NOTE(review): entries[mn] presumably raises if the link
                    # target is not indexed -- confirm build_indexes' type.
                    matches = [main[2] for main in entries[mn]]
                    if len(matches) > 1:
                        print("ERROR: ambiguous link.")
                        print(report)
                    for main in matches:
                        for bref in BACKREF_MKRS:
                            found = main.find_values(bref)
                            if minlx in found:
                                # The main entry mentions this minor entry
                                # under bref; tag the link marker once.
                                if not is_minor[0].endswith(bref):
                                    is_minor[0] += bref
                                # NOTE(review): break placement reconstructed
                                # from flattened source; confirm nesting.
                                break
                # Serialize after the marker update above.
                s = rec.as_string()
                if SPLIT_OUT:
                    found = rec.find_first(SPLIT_OUT)
                    if found:
                        # write the minor entry out to the appropriate
                        # separate file and drop it from the main output
                        out_files[found[0]].write(s)
                        s = ''
                outfile.write(s)
        print("Done writing to file {}".format(out_fname))
    finally:
        # Always release the split-out files, including on error.
        for o in out_files:
            out_files[o].close()
    for o in out_files:
        print("Done writing to file {}".format(out_fnames[o]))
def variants_as_minor():
    '''Supports variants of lx and se, but not of sn.

    Loads INFILE, indexes it with build_indexes, then writes each record:
    records containing mn go to OUTFILE, records containing mnse go to
    OUTFILE_MINOR only, and all other (main) records go to OUTFILE after
    each VA field has been checked against the minor-entry indexes.
    Findings are printed; suggested new minor entries are accumulated as
    text and printed at the end for manual insertion (nothing is inserted
    automatically).

    NOTE(review): the nesting below was reconstructed from a flattened
    source; confirm the extent of the "no min" branch in particular.
    '''
    with open(INFILE, encoding='utf-8') as infile:
        sfm_rec = S.SFMRecordReader(infile, REC_MKR)
        header = sfm_rec.header
        sfm_records = list(sfm_rec)  # load entire file into memory

    # Need to be able to follow links. Do one quick pass to index everything.
    lxD, seD, mnD, mnseD, vaD, vaDrev = build_indexes(sfm_records)

    to_add = ''  # text of the minor entries the user is asked to insert
    with open(OUTFILE, mode='w', encoding='utf-8') as outfile:
        outfile.write(header)
        with open(OUTFILE_MINOR, mode='w', encoding='utf-8') as outfile_minor:
            for rec in sfm_records:
                if rec.find(['mn']):  # minor entry (variant)
                    outfile.write(rec.as_string())
                elif rec.find(['mnse']):  # minor entry: complex form
                    outfile_minor.write(rec.as_string())  # omit from outfile
                else:  # main entry
                    r = rec.as_lists()
                    lx = r[0][1].strip()  # currently unused below
                    # presumably a list of (marker, field index) pairs,
                    # given how it is unpacked below
                    vas = rec.find([VA])
                    ses = rec.find('se')  # currently unused below
                    lxse = rec.find_values(['lx', 'se'])
                    while vas:  # look at each va, in reverse order
                        _mkr, i = vas.pop()
                        va = r[i][1].strip()
                        print('lx or se {}, with va {}: '.format(lxse, va))
                        mn = mnD.get(va)
                        mnse = mnseD.get(va)
                        if mn:
                            if mn in lxse:
                                # Matching minor entry already points back
                                # here; nothing to do. (Retagging the field
                                # from va to cfva is disabled.)
                                pass
                                # print('- match: Minor entry pointing to {}. Matched! va {} '.format(mn, va))
                                # r[i][0] = 'cfva'
                            else:
                                print(
                                    '- min diff: Minor entry {} found, but it points to a different target: {} .'
                                    .format(va, mn))
                        elif mnse:
                            print(
                                'Error?? found an lx matching va {} that contains an mnse field: mnse {}'
                                .format(va, mnse))
                        else:
                            print(
                                "- no min: No matching minor entry for va {} !"
                                .format(va))
                            #TODO: No!! Don't use lx. In ses, find the first se above this va_se field; use that.
                            # find_above's result is used as an index into r
                            se = find_above('se', r, i)
                            if se:
                                se = r[se][1].strip()
                                tmp = '\\lx {}\n\\mn {}\n\n'.format(va, se)
                                print("Will add this minor entry: {}".format(
                                    tmp))
                                to_add += tmp
                            # Diagnostics: show where else this va value is
                            # indexed.
                            tmp = lxD.get(va)
                            if tmp:
                                print("- - lxD({}): {}".format(va, tmp))
                            tmp = seD.get(va)
                            if tmp:
                                print("- - seD({}): {}".format(va, tmp))
                            tmp = vaD.get(va)
                            if tmp:
                                print("- - vaD({}): {}".format(va, tmp))
                            tmp = vaDrev.get(va)
                            if tmp:
                                print("- - vaDrev({}): {}".format(va, tmp))
                        # if (res2 != lx): pass #TODO: ??
                    outfile.write(rec.as_string())
    print('PLEASE INSERT THESE minor entries into the file: ')
    print(to_add)