def init_extra(filein):
    """Read extra keydoc records from a tab-delimited file.

    Each non-comment line of *filein* must have the form
    ``headword<TAB>ptr1,ptr2,...``.  The headword and every pointer are
    passed through ``normalize_key``; the normalized line is then wrapped
    in an ``HWDoc`` record.  Lines starting with ``;`` are skipped.

    Returns:
        The list of ``HWDoc`` records.  If *filein* does not exist, a
        notice is printed and an empty list is returned.

    Raises:
        SystemExit: with status 1, after printing the offending line, when
            a line does not consist of exactly two tab-separated fields.
    """
    import os

    recs = []
    if not os.path.isfile(filein):
        print('keydoc_finalptrs: file not found', filein)
        return recs
    with codecs.open(filein, "r", "utf-8") as f:
        # Number the raw lines *before* dropping comments, so that the line
        # number reported in an error message matches the file on disk.
        for lnum, raw in enumerate(f, 1):
            if raw.startswith(';'):
                continue
            line = raw.rstrip('\r\n')
            fields = line.split('\t')
            if len(fields) != 2:
                print('format error in line', lnum, 'of', filein)
                print(line)
                # SystemExit directly: does not depend on the `exit` helper
                # that the `site` module may or may not have installed.
                raise SystemExit(1)
            hw, ptrstr = fields
            normptrs = [normalize_key(ptr) for ptr in ptrstr.split(',')]
            docline = '%s\t%s' % (normalize_key(hw), ','.join(normptrs))
            recs.append(HWDoc(docline))
    print(len(recs), "extra keydoc_input records from", filein)
    return recs
def normalize_recs(recs):
    """Extend each record's ``normptrs`` with normalized keys, in place.

    For every record, each entry of ``dochws`` followed by each entry of
    ``docptrs`` is passed through ``normalize_key``; the result is appended
    to the record's ``normptrs`` unless it is already present there.
    Nothing is returned.
    """
    for entry in recs:
        pointers = entry.docptrs
        candidates = entry.dochws + pointers
        for raw_key in candidates:
            normalized = normalize_key(raw_key)
            if normalized in entry.normptrs:
                continue  # already recorded for this entry
            entry.normptrs.append(normalized)
def normalize_eqs(recs):
    """Return a normalized copy of a list of headword equivalences.

    ``recs`` is an iterable of headword lists.  The result is a new list
    holding, for each input list, the list of its headwords after
    ``normalize_key`` has been applied to each one.
    """
    return [
        [normalize_key(headword) for headword in group]
        for group in recs
    ]
def normalize_recs(recs):
    """Collect the keys whose spelling is changed by normalization.

    For every record, each entry of ``dochws`` followed by each entry of
    ``docptrs`` is passed through ``normalize_key``.  Returns a list of
    ``(key, normalized_key)`` tuples, in encounter order, for the keys
    whose normalized form differs from the original.
    """
    changed = []
    for entry in recs:
        pointers = entry.docptrs
        candidates = entry.dochws + pointers
        pairs = ((original, normalize_key(original)) for original in candidates)
        changed.extend(pair for pair in pairs if pair[1] != pair[0])
    return changed
# NOTE(review): the next line is the tail of a definition that begins outside
# this excerpt; it is left exactly as found rather than reconstructed.
# first occurence of dup wfreqdups = [r.wf for r in dups] wf = max(wfreqdups) out = '%s %s' % (word, wf) fout.write(out + '\n') nout = nout + 1 print nout, "records written to", fileout
if __name__ == "__main__":
    # Command line: <filein> <fileout> <filediff>
    #   filein   - input read by init_wf; the records it returns are used
    #              here through their .word and .wf attributes
    #   fileout  - one "<normalized word> <wf>" line per input record
    #   filediff - one line per record whose spelling is changed
    filein = sys.argv[1]
    fileout = sys.argv[2]
    filediff = sys.argv[3]  # differences in spelling
    recs = init_wf(filein)
    print(len(recs), "read from", filein)
    # count of number of words whose spelling differs from normalized spelling
    ndiff = 0
    # Both outputs are managed by the same `with`, so the diff file is closed
    # even if normalization or a write raises part-way through.
    with codecs.open(filediff, "w", "utf-8") as fdiff, \
            codecs.open(fileout, "w", "utf-8") as f:
        for rec in recs:
            norm = normalize_key(rec.word)
            f.write('%s %s\n' % (norm, rec.wf))
            if norm != rec.word:
                ndiff += 1
                fdiff.write('%s %s -> %s\n' % (rec.wf, rec.word, norm))
    print(len(recs), "normalized records written to", fileout)
    print(ndiff, "records whose spelling is changed by normalization")
    print(ndiff, "records written to", filediff)