#-*- coding:utf-8 -*- """preverb1.py """ from __future__ import print_function import sys, re,codecs from parseheadline import parseheadline import transcoder transcoder.transcoder_set_dir('transcoder') class Entry(object): Ldict = {} def __init__(self,lines,linenum1,linenum2): # linenum1,2 are int self.metaline = lines[0] self.lend = lines[-1] # the <LEND> line self.datalines = lines[1:-1] # the non-meta lines # parse the meta line into a dictionary #self.meta = Hwmeta(self.metaline) self.metad = parseheadline(self.metaline) self.linenum1 = linenum1 self.linenum2 = linenum2 #L = self.meta.L L = self.metad['L'] if L in self.Ldict: print("Entry init error: duplicate L",L,linenum1) exit(1) self.Ldict[L] = self # extra attributes self.marked = False # from a filter of markup associated with verbs
# -*- coding: utf-8 -*-
""" analyze_an.py
 Write analysis of lexnorm-all2 records that end with 'an' (vocalic 'r'
 python3 analyze_an.py <tranout> <filein> <fileout> <tranout>
 python3 analyze_an.py slp1 temp_lexnorm_todo.txt analyze_an.txt
"""
import sys,re,codecs
from slp_cmp import slp_cmp_key
sys.path.append('../../transcode')  # MWinflect
import transcoder
transcoder.transcoder_set_dir('../../transcode/transcode')


class Lexnorm(object):
    """One record of lexnorm.txt.

    The format of a line of lexnorm.txt is now 4 tab-delimited fields:
      lnum, key1, key2, lexinfo
    And, the lexinfo field has form of 1 or more colon-delimited fields,
    each of which has one of two forms:
      gender   OR   gender#ending
    """

    def __init__(self, line):
        """Parse *line* into L, key1, key2 and lexnorm; ``keep`` starts False.

        Raises ValueError when the line does not have exactly 4 fields.
        """
        line = line.rstrip('\r\n')
        fields = line.split('\t')
        if len(fields) != 4:
            # Same exception type as a failed tuple unpacking, but the
            # message names the offending record.
            raise ValueError(
                'Lexnorm: expected 4 tab-delimited fields, got %d: %r'
                % (len(fields), line))
        (self.L, self.key1, self.key2, self.lexnorm) = fields
        self.keep = False

    def toString(self):
        """Return the record as a tab-delimited line (no line terminator)."""
        return '\t'.join([self.L, self.key1, self.key2, self.lexnorm])


# NOTE(review): the visible part of this file is cut off inside the next
# definition.  Only its header and first statement are visible; they are kept
# here as comments (a header without a body would not parse) and the missing
# body is not reconstructed:
#
# def init_lexnorm(filein):
#  with codecs.open(filein,"r","utf-8") as f:
""" word_frequency.py 06-01-2017 """ import sys, re, codecs import transcoder transcoder.transcoder_set_dir("../../utilities/transcoder/") if __name__ == "__main__": filein = sys.argv[1] fileout = sys.argv[2] with codecs.open(filein, "r", "utf-8") as f: with codecs.open(fileout, "w", "utf-8") as fout: for line in f: line = line.rstrip('\r\n') m = re.search(r'localStorage.setItem\("(.*?)", *"(.*?)"\);', line) if not m: print "COULD NOT PARSE:", line.encode('utf-8') continue keyas = m.group(1) freq = m.group(2) keyslp1 = transcoder.transcoder_processString( keyas, "roman", "slp1") fout.write("%s %s\n" % (keyslp1, freq))
""" slp1_to_hk.py Jan 30, 2014 Inverse transformation to hk_slp1.py. Use specialized vcp version of slp1_hk.xml """ import sys, re import codecs, unicodedata import transcoder transcoder.transcoder_set_dir(""); def adjust_slp1(x): # modfied to return both outarr = [] # slp1 parts = re.split(r'(<[^>]+>)|(\[Page.*?\])',x) # xml tags for part in parts: if not part: #why needed? pass elif part.startswith('<') and part.endswith('>'): outarr.append(part) elif part.startswith('[Page') and part.endswith(']'): outarr.append(part) else: # assume text in slp. Convert to slp1. Use specialized slp1_hk.xml y = transcoder.transcoder_processString(part,'slp1','hk') outarr.append(y) ans = ''.join(outarr) return ans def make_txt(filein,fileout): f = codecs.open(filein,encoding='utf-8',mode='r') fout = codecs.open(fileout,'w','utf-8') nout = 0 # count of lines written
# coding=utf-8 """ tooltip.py 04-04-2018. """ import sys, re, codecs sys.path.append('../') import transcoder transcoder.transcoder_set_dir("transcoder/") class Unused_Link(object): def __init__(self, line): line = line.rstrip() self.line = line (self.linkkey, self.numinstance, self.authkey) = line.split('\t') self.authrec = None self.entry = None def make_entry(self): authrec = self.authrec authtype = authrec.authtype authdata = authrec.authdata self.entry = '<entry type="%s">%s</entry>' % (authtype, authdata) def print_type(self): known_types = { 'ti': 'Title', 'au': 'Author', 'litcat': 'Literary category', 'subti': 'subtitle', }
""" simpleslp1.py """ import codecs, re, sys import transcoder #transcoder_dir = transcoder.transcoder_set_dir('../../utilities/transcoder') import os dir_path = os.path.dirname(os.path.realpath(__file__)) transcoder_dir_path = os.path.join(dir_path, 'transcoder') transcoder_dir = transcoder.transcoder_set_dir(transcoder_dir_path) #print('transcoder_dir_path=',transcoder_dir_path) #exit(1) def simple_lower(word): """ lower case all letters in word, EXCEPT Y (palatal nasal) and R (cerebral nasal) -- Y and R are changed to 'n' in transcoder. """ def sub(m): a = m.group(1) return a.lower() regex = '([AIUFXEOMHKGNCJWQTDPBLVSZ])' word1 = re.sub(regex, sub, word) return word1 def remove_double(word): """ replace xx with x in word """ regex = r'(.)\1'
""" as_roman.py Usage: python as_roman.py pwgbib1_utf8.txt pwgbib1_roman.txt """ import codecs,sys,re import transcoder transcoder.transcoder_set_dir('.') def unused_convertrecs(recs,tranin,tranout): "Modifies recs" n=0 for rec in recs: n=n+1 try: rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout) rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout) m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode ) if m: print "TRANSCODER WARNING: ",m.group(0).encode('utf-8') except: print "convertrecs problem",n,rec.line.encode('utf-8') #exit(1) def unused_writerecs(recs,fileout): fout = codecs.open(fileout,"w","utf-8") n=0 for rec in recs: n = n + 1 outarr=[] # array of fields to write. outarr.append(rec.abbrv) outarr.append('%03d' % n) # sequence number in pwbib0 if rec.checked:
# From ../
import sys
import codecs
import re
sys.path.append('./ext/sanskrit-transcoding/')
import transcoder  # noqa
transcoder.transcoder_set_dir('./ext/sanskrit-transcoding/transcoder')


def convert(filein, fileout, tranin, tranout):
    """Transcode the body of every non-blank line of filein into fileout.

    Both files are UTF-8.  Each non-blank line must have the form
    "<head> <body>", where <head> contains no space.  <head> is copied
    unchanged and <body> is transcoded from tranin to tranout.  Blank lines
    are skipped and not written.

    Exits with status 1 and a message on stderr at the first line that does
    not have that form.  The number in the message counts non-blank lines.
    """
    n = 0  # number of non-blank lines read so far
    # The with-statement closes both files on every path, including the
    # early exit below.
    with codecs.open(filein, 'r', 'utf-8') as fp, \
            codecs.open(fileout, 'w', 'utf-8') as fpout:
        for x in fp:
            x = x.rstrip('\r\n')
            if x == '':
                continue
            n = n + 1
            m = re.search(r'^([^ ]+) (.+)$', x)
            if not m:
                # sys.exit with a string prints it to stderr and exits with
                # status 1, so the reason for stopping is visible.
                sys.exit('line %s is unknown: %s' % (n, x))
            head = m.group(1)
            body = m.group(2)
            body1 = transcoder.transcoder_processString(body, tranin, tranout)
            fpout.write('%s %s\n' % (head, body1))
# coding=utf-8 """ adjtxt2.py transcoding of mw72 """ import sys, re,codecs sys.path.append('../') # for transcoder import transcoder transcoder.transcoder_set_dir("") # use local versions of transcoder files def convertline(line,tranfrom,tranto): """ do transcoder, but don't convert [Page...] """ parts=line.split('[Page') parts[0] = transcoder.transcoder_processString(parts[0],tranfrom,tranto) if re.search(r'[a-zA-Z][0-9]',parts[0]): unconverted=True else: unconverted=False return (unconverted,'[Page'.join(parts)) def make(tranfrom,tranto,filein,fileout): f = codecs.open(filein,encoding='utf-8',mode='r') fout = codecs.open(fileout,'w','utf-8') n = 0 nchg = 0 nprob = 0 nunc = 0 # number of lines with unconverted AS codes for line in f:
""" simpleslp1.py """ import codecs, re, sys import transcoder transcoder_dir = transcoder.transcoder_set_dir('../../utilities/transcoder') def simpleslp1(word): """ Apply slp1_simpleslp1 transcoder. lower case all letters in word, EXCEPT Y (palatal nasal) and R (cerebral nasal) -- Y and R are changed to 'n' in transcoder. Also, replace a doubled letter by the single letter. """ def sub1(m): a = m.group(1) return a.lower() regex1 = '([AIUFXEOMHKGNCJWQTDPBLVSZ])' word1 = re.sub(regex1, sub1, word) regex2 = r'(.)\1' def sub2(m): a = m.group(0) # xx return a[0] # x word2 = re.sub(regex2, sub2, word1) var = transcoder.transcoder_processString(word2, 'slp1', 'simpleslp1lo') #if word != word2: # if word.startswith('kar'): # print('dbg:',word,word1,word2,var)
# coding=utf-8 """ transcode.py for PW Reads/Writes utf-8 For missing Oct 18, 2014: Take care of 'junk' in #{..} to preserve invertibility Use local copy of hk_slp1.xml and slp1_hk.xml to also transcode accents """ import sys, re,codecs sys.path.insert(0,"../") # where transcoder resides import transcoder #transcoder.transcoder_set_dir("../"); transcoder.transcoder_set_dir(""); def adjust_hk_slp1(m): x1 = m.group(1) x2 = m.group(2) x3 = m.group(3) partsin = re.split(u'(ƒPage.*?ƒ)|([.])',x2) partsout = [x1] for part in partsin: #if re.search(u'^(ƒPage.*?ƒ)$',part): if not part: continue elif part.startswith(u'ƒ'): #re.search(u'^(ƒPage.*?ƒ)$',part): partsout.append('}%s#{' % part) elif re.search(r'^([.])$',part): partsout.append('}%s#{' % part) else: partout = transcoder.transcoder_processString(part,'hk','slp1') partsout.append(partout) partsout.append(x3)
"""convert.py Python example of transcoding """ import sys,codecs,re sys.path.append('../') import transcoder transcoder.transcoder_set_dir('../transcoder'); def convert(filein,fileout,tranin,tranout): fp = codecs.open(filein,"r",'utf-8') fpout = codecs.open(fileout,"w",'utf-8') n=0; for x in fp: x = x.rstrip('\r\n') if (x == ''): continue n=n+1 m = re.search(r'^([^ ]+) (.+)$',x) if not m: out = "line %s is unknown: %s" %(n,x) exit(1) head = m.group(1) body = m.group(2) #body = re.sub('/\|/',' # ',body); #body = preg_replace('/ +/',' ',body); body1 = transcoder.transcoder_processString(body,tranin,tranout) y = "%s %s" % (head,body1) fpout.write("%s\n" % y) fp.close() fpout.close() print n,"lines converted\n"
"""demok.py Python example of transcoding """ import sys, codecs, re import unicodedata #sys.path.append('../') import transcoder #transcoder.transcoder_set_dir('../transcoder'); transcoder.transcoder_set_dir('') def convert4(datain, fileout, tranin, tranout): body = datain body1 = transcoder.transcoder_processString(body, tranin, tranout) with codecs.open(fileout, "w", 'utf-8') as f: f.write('%s\n' % body1) #y = "%s %s" % (head,body1) #fpout.write("%s\n" % y) #fp.close() #fpout.close() print "fileout=", fileout def output(f, tranin, tranout, body): body1 = transcoder.transcoder_processString(body, tranin, tranout) f.write('%4s: %s\n' % (tranin, body)) f.write('%s %s\n' % (tranout, body1)) outarr = [repr(c) for c in body1] out = ' '.join(outarr) f.write('unic: %s\n' % out)
"""convert.py Python example of transcoding """ import sys, codecs, re sys.path.append('../') import transcoder transcoder.transcoder_set_dir('../transcoder') def convert(filein, fileout, tranin, tranout): fp = codecs.open(filein, "r", 'utf-8') fpout = codecs.open(fileout, "w", 'utf-8') n = 0 for x in fp: x = x.rstrip('\r\n') if (x == ''): continue n = n + 1 m = re.search(r'^([^ ]+) (.+)$', x) if not m: out = "line %s is unknown: %s" % (n, x) exit(1) head = m.group(1) body = m.group(2) #body = re.sub('/\|/',' # ',body); #body = preg_replace('/ +/',' ',body); body1 = transcoder.transcoder_processString(body, tranin, tranout) y = "%s %s" % (head, body1) fpout.write("%s\n" % y) fp.close() fpout.close()
# coding=utf-8 """ tooltip.py 04-04-2018. """ import sys,re,codecs sys.path.append('../') import transcoder transcoder.transcoder_set_dir("transcoder/"); class Unused_Link(object): def __init__(self,line): line = line.rstrip() self.line = line (self.linkkey,self.numinstance,self.authkey) = line.split('\t') self.authrec = None self.entry = None def make_entry(self): authrec = self.authrec authtype = authrec.authtype authdata = authrec.authdata self.entry = '<entry type="%s">%s</entry>' %(authtype,authdata) def print_type(self): known_types = { 'ti':'Title', 'au':'Author', 'litcat':'Literary category', 'subti':'subtitle', } authtype = self.authrec.authtype if authtype in known_types: type1 = known_types[authtype]
""" pwbib1.py Usage: python pwbib1.py pwbib0.txt pwbib1.txt Reads pwbib0 (using pwbib_parse0) into list of Pwbib records. converts 'abbrv' and 'title' fields to 'Roman' (using transcoder with as_roman.xml). Saves Unicode abbrv field as a separate field 'abbrvunicode'. Replaces title field with unicode. Writes all fields to output, as tab-delimited text file. (See writerecs for details of fields written) Starts with as_roman.xml as per pw_dhaval """ import codecs,sys,re import pwbib_parse0 import transcoder transcoder.transcoder_set_dir('.'); def convertrecs(recs,tranin,tranout): "Modifies recs" n=0 for rec in recs: n=n+1 try: rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout) rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout) m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode ) if m: print "TRANSCODER WARNING: ",m.group(0).encode('utf-8') # Undo some transcodings rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode) # JOLLY is an author except: print "convertrecs problem",n,rec.line.encode('utf-8')