def init_extra(filein):
    """Read extra keydoc records from a tab-separated file.

    Each non-comment line (comment lines start with ';') has the form
    ``headword<TAB>ptr1,ptr2,...``.  Both the headword and every pointer
    are normalized with normalize_key, then re-joined into a docline and
    wrapped in an HWDoc record.

    Returns a list of HWDoc records; an empty list if filein is missing.
    Exits the process (status 1) on a malformed line.
    """
    import os
    import sys
    recs = []
    if not os.path.isfile(filein):
        print('keydoc_finalptrs: file not found', filein)
        return recs

    with codecs.open(filein, "r", "utf-8") as f:
        # Drop ';'-prefixed comment lines; strip trailing newlines only.
        lines = [x.rstrip('\r\n') for x in f if not x.startswith(';')]
    for iline, line in enumerate(lines):
        try:
            # Exactly one tab expected; more or fewer fields is a format error.
            hw, ptrstr = line.split('\t')
        except ValueError:
            print('format error in line', iline + 1, 'of', filein)
            print(line)
            sys.exit(1)
        ptrs = ptrstr.split(',')
        normptrs = [normalize_key(ptr) for ptr in ptrs]
        normptrstr = ','.join(normptrs)
        normhw = normalize_key(hw)
        docline = '%s\t%s' % (normhw, normptrstr)
        rec = HWDoc(docline)
        recs.append(rec)
    print(len(recs), "extra keydoc_input records from", filein)
    return recs
# --- Example 2 ---
def normalize_recs(recs):
    """Extend each record's normptrs with the normalized form of every
    headword and doc pointer, skipping forms already present."""
    for record in recs:
        for pointer in record.dochws + record.docptrs:
            normalized = normalize_key(pointer)
            if normalized not in record.normptrs:
                record.normptrs.append(normalized)
# --- Example 3 ---
def normalize_eqs(recs):
    """recs is a list of headword equivalences.
    Normalize each of the headwords.
    """
    return [[normalize_key(headword) for headword in equivalence]
            for equivalence in recs]
# --- Example 4 ---
def normalize_recs(recs):
    """Return (original, normalized) pairs for every headword or doc
    pointer whose spelling is changed by normalize_key."""
    changed = []
    for record in recs:
        keys = record.dochws + record.docptrs
        for original in keys:
            normalized = normalize_key(original)
            if normalized != original:
                changed.append((original, normalized))
    return changed
# --- Example 5 ---
            # first occurence of dup
            wfreqdups = [r.wf for r in dups]
            wf = max(wfreqdups)
            out = '%s %s' % (word, wf)
            fout.write(out + '\n')
            nout = nout + 1
    print nout, "records written to", fileout


if __name__ == "__main__":
    # Usage: <script> filein fileout filediff
    # NOTE(review): this snippet is Python 2 (print statements); the
    # earlier snippets in this file are Python 3 — confirm which applies.
    filein = sys.argv[1]
    fileout = sys.argv[2]
    filediff = sys.argv[3]  # differences in spelling
    # init_wf is defined elsewhere; presumably returns records with
    # .word and .wf (word-frequency) attributes — verify against caller.
    recs = init_wf(filein)
    print len(recs), "read from", filein
    # count of number of words whose spelling differs from normalized spelling
    ndiff = 0
    fdiff = codecs.open(filediff, "w", "utf-8")
    with codecs.open(fileout, "w", "utf-8") as f:
        for rec in recs:
            # Write every record in normalized spelling; log only the
            # ones whose spelling actually changed to the diff file.
            norm = normalize_key(rec.word)
            out = '%s %s' % (norm, rec.wf)
            f.write(out + '\n')
            if norm != rec.word:
                ndiff = ndiff + 1
                fdiff.write('%s %s -> %s\n' % (rec.wf, rec.word, norm))
    print len(recs), "normalized records written to", fileout
    print ndiff, "records whose spelling is changed by normalization"
    fdiff.close()
    print ndiff, "records written to", filediff