Ejemplo n.º 1
0
#-*- coding:utf-8 -*-
"""preverb1.py
 
 
"""
from __future__ import print_function
import sys, re,codecs
from parseheadline import parseheadline
import transcoder
transcoder.transcoder_set_dir('transcoder')

class Entry(object):
 """One record built from a list of lines: meta line, data lines, end line.

 Each new instance is registered in the class-level ``Ldict`` under the
 'L' value parsed from its meta line.  Creating a second instance with an
 'L' value that is already registered prints an error and terminates the
 program.
 """
 # Registry shared by all instances: maps metad['L'] -> Entry instance.
 Ldict = {}
 def __init__(self,lines,linenum1,linenum2):
  """Split *lines* into meta/data/end parts and register the entry.

  lines    -- sequence of lines; lines[0] is the meta line and lines[-1]
              is the closing <LEND> line.
  linenum1 -- int, stored as given (presumably the line number of the
              meta line in the input -- confirm with the caller).
  linenum2 -- int, stored as given (presumably the line number of the
              closing line -- confirm with the caller).

  Exits with status 1 when the 'L' value is already in ``Ldict``.
  """
  self.metaline = lines[0]
  self.lend = lines[-1]  # the <LEND> line
  self.datalines = lines[1:-1]  # everything between meta line and <LEND>
  # parse the meta line into a dictionary
  self.metad = parseheadline(self.metaline)
  self.linenum1 = linenum1
  self.linenum2 = linenum2
  L = self.metad['L']
  if L in self.Ldict:
   print("Entry init error: duplicate L",L,linenum1)
   # sys.exit rather than the site-provided exit(), which is meant for
   # interactive sessions and is missing when Python runs without site.
   sys.exit(1)
  self.Ldict[L] = self
  #  extra attributes
  self.marked = False # from a filter of markup associated with verbs
Ejemplo n.º 2
0
# -*- coding: utf-8 -*-
""" analyze_an.py
   Write analysis of lexnorm-all2 records that end with 'an' (vocalic 'r')
   python3 analyze_an.py <tranout> <filein> <fileout>
   python3 analyze_an.py slp1 temp_lexnorm_todo.txt analyze_an.txt
"""
import sys,re,codecs
from slp_cmp import slp_cmp_key
sys.path.append('../../transcode')   # MWinflect
import transcoder
transcoder.transcoder_set_dir('../../transcode/transcode');

class Lexnorm(object):
 """One record of lexnorm.txt.

  A line of lexnorm.txt consists of 4 tab-delimited fields:
    lnum, key1, key2, lexinfo
  The lexinfo field is made of 1 or more colon-delimited items, each of
  which is either
    gender   OR   gender#ending
  The four fields are kept as strings in the attributes
  L, key1, key2 and lexnorm; ``keep`` starts out False.
 """
 def __init__(self,line):
  """Parse one line; raises ValueError unless it has exactly 4 fields."""
  fields = line.rstrip('\r\n').split('\t')
  self.L, self.key1, self.key2, self.lexnorm = fields
  self.keep = False

 def toString(self):
  """Return the record as a tab-delimited line (no line ending)."""
  return '\t'.join((self.L, self.key1, self.key2, self.lexnorm))

def init_lexnorm(filein):
 with codecs.open(filein,"r","utf-8") as f:
Ejemplo n.º 3
0
""" word_frequency.py
    06-01-2017
"""
import sys, re, codecs
import transcoder
transcoder.transcoder_set_dir("../../utilities/transcoder/")
if __name__ == "__main__":
    # Usage: python word_frequency.py <filein> <fileout>
    # Each input line is expected to look like
    #   localStorage.setItem("<key>", "<freq>");
    # For every such line one output line "<key in slp1> <freq>" is written.
    filein = sys.argv[1]
    fileout = sys.argv[2]
    # Compiled once, outside the per-line loop.
    pattern = re.compile(r'localStorage.setItem\("(.*?)", *"(.*?)"\);')
    with codecs.open(filein, "r", "utf-8") as f:
        with codecs.open(fileout, "w", "utf-8") as fout:
            for line in f:
                line = line.rstrip('\r\n')
                m = pattern.search(line)
                if not m:
                    # Single-argument print(): valid syntax on Python 2 and
                    # 3 alike (the old print statement fails on Python 3).
                    # The text is printed as-is; encoding it to bytes first
                    # would show up as b'...' on Python 3.
                    print("COULD NOT PARSE: %s" % line)
                    continue
                keyas = m.group(1)
                freq = m.group(2)
                # key is transcoded from "roman" to "slp1"
                keyslp1 = transcoder.transcoder_processString(
                    keyas, "roman", "slp1")
                fout.write("%s %s\n" % (keyslp1, freq))
Ejemplo n.º 4
0
""" slp1_to_hk.py  Jan 30, 2014
 Inverse transformation to hk_slp1.py.  Use specialized vcp version of
  slp1_hk.xml
"""
import sys, re
import codecs, unicodedata
import transcoder
transcoder.transcoder_set_dir("");

def adjust_slp1(x):
 """Return x with its text transcoded from slp1 to hk, markup untouched.

 x is cut into pieces at xml tags ('<...>') and at page markers
 ('[Page...]').  Tags and page markers are copied unchanged; every other
 non-empty piece is passed through the transcoder ('slp1' -> 'hk').
 The pieces are then glued back together in their original order.
 """
 def convert(piece):
  # markup is passed through verbatim
  if piece.startswith('<') and piece.endswith('>'):
   return piece
  if piece.startswith('[Page') and piece.endswith(']'):
   return piece
  # anything else is taken to be slp1 text
  return transcoder.transcoder_processString(piece,'slp1','hk')
 # The pattern has two capture groups, so re.split also returns None (for
 # the group that did not take part in a match) and '' (between adjacent
 # matches); both are dropped by the 'if piece' filter.
 pieces = re.split(r'(<[^>]+>)|(\[Page.*?\])',x)
 return ''.join([convert(piece) for piece in pieces if piece])

def make_txt(filein,fileout):
 f = codecs.open(filein,encoding='utf-8',mode='r')
 fout = codecs.open(fileout,'w','utf-8')
 nout = 0  # count of lines written
Ejemplo n.º 5
0
# coding=utf-8
""" tooltip.py
    04-04-2018.  
"""
import sys, re, codecs
sys.path.append('../')
import transcoder
transcoder.transcoder_set_dir("transcoder/")


class Unused_Link(object):
    def __init__(self, line):
        line = line.rstrip()
        self.line = line
        (self.linkkey, self.numinstance, self.authkey) = line.split('\t')
        self.authrec = None
        self.entry = None

    def make_entry(self):
        authrec = self.authrec
        authtype = authrec.authtype
        authdata = authrec.authdata
        self.entry = '<entry type="%s">%s</entry>' % (authtype, authdata)

    def print_type(self):
        known_types = {
            'ti': 'Title',
            'au': 'Author',
            'litcat': 'Literary category',
            'subti': 'subtitle',
        }
Ejemplo n.º 6
0
""" simpleslp1.py
"""

import codecs, re, sys
import transcoder

#transcoder_dir = transcoder.transcoder_set_dir('../../utilities/transcoder')
import os
dir_path = os.path.dirname(os.path.realpath(__file__))
transcoder_dir_path = os.path.join(dir_path, 'transcoder')
transcoder_dir = transcoder.transcoder_set_dir(transcoder_dir_path)
#print('transcoder_dir_path=',transcoder_dir_path)
#exit(1)


def simple_lower(word):
    """Return word with its capital letters lower-cased, except Y and R.

    Only the capitals listed in the pattern below are changed; Y (palatal
    nasal) and R (cerebral nasal) are not in the list and stay as they are
    (per the original note, the transcoder later maps them to 'n').
    """
    capitals = '([AIUFXEOMHKGNCJWQTDPBLVSZ])'
    return re.sub(capitals, lambda hit: hit.group(1).lower(), word)


def remove_double(word):
    """ replace xx with x in word """
    regex = r'(.)\1'
Ejemplo n.º 7
0
""" as_roman.py
    Usage: python as_roman.py pwgbib1_utf8.txt pwgbib1_roman.txt
"""
import codecs,sys,re
import transcoder
transcoder.transcoder_set_dir('.')

def unused_convertrecs(recs,tranin,tranout):
 """Add transcoded ``abbrvunicode`` and ``titleunicode`` to every record.

 Modifies recs in place: for each rec, ``rec.abbrv`` and ``rec.title`` are
 transcoded from tranin to tranout and stored in ``rec.abbrvunicode`` and
 ``rec.titleunicode``.  A warning is printed when the transcoded text
 still contains a letter immediately followed by a digit 1-9.
 A record that raises an error is reported (with its 1-based position and
 its ``line`` attribute) and processing continues with the next record.
 """
 for n,rec in enumerate(recs,1):
  try:
   rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
   rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
   m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
   if m:
    # Single-argument print(): same output layout as the old print
    # statement, and valid syntax on Python 3.
    print("TRANSCODER WARNING:  %s" % m.group(0))
  except Exception:
   # Exception (not a bare except) so that KeyboardInterrupt and
   # SystemExit are no longer swallowed; the loop stays best-effort.
   print("convertrecs problem %s %s" % (n,rec.line))

def unused_writerecs(recs,fileout):
 fout = codecs.open(fileout,"w","utf-8")
 n=0
 for rec in recs:
  n = n + 1
  outarr=[]  # array of fields to write.
  outarr.append(rec.abbrv)
  outarr.append('%03d' % n) # sequence number in pwbib0
  if rec.checked:
Ejemplo n.º 8
0
# From ../

import sys
import codecs
import re
sys.path.append('./ext/sanskrit-transcoding/')

import transcoder  # noqa
transcoder.transcoder_set_dir('./ext/sanskrit-transcoding/transcoder')


def convert(filein, fileout, tranin, tranout):
    """Transcode the body of every non-empty line of filein into fileout.

    Both files are utf-8.  Empty lines are skipped.  Every other line must
    have the form '<head> <body>', where head is the text before the first
    space and body is the (non-empty) rest of the line.  head is copied
    unchanged, body is transcoded from tranin to tranout, and
    '<head> <transcoded body>' is written to fileout.

    A non-empty line that does not have this form is reported on stderr
    and the program exits with status 1.
    """
    # 'with' closes both files on every path, including the error exit.
    with codecs.open(filein, 'r', 'utf-8') as fp:
        with codecs.open(fileout, 'w', 'utf-8') as fpout:
            n = 0  # number of non-empty lines read so far
            for x in fp:
                x = x.rstrip('\r\n')
                if x == '':
                    continue
                n = n + 1
                m = re.search(r'^([^ ]+) (.+)$', x)
                if not m:
                    # The message used to be built and then discarded, so
                    # the program exited without any explanation.
                    sys.stderr.write('line %s is unknown: %s\n' % (n, x))
                    sys.exit(1)
                head = m.group(1)
                body = m.group(2)
                body1 = transcoder.transcoder_processString(
                    body, tranin, tranout)
                fpout.write('%s %s\n' % (head, body1))
Ejemplo n.º 9
0
# coding=utf-8
""" adjtxt2.py
 transcoding of mw72

"""
import sys, re,codecs
sys.path.append('../')  # for transcoder
import transcoder
transcoder.transcoder_set_dir("") # use local versions of transcoder files



def convertline(line,tranfrom,tranto):
 """Transcode the part of line that precedes the first '[Page'.

 Only the text before the first '[Page' is passed through the transcoder;
 the '[Page' marker and everything after it are kept unchanged.  When the
 line has no '[Page', the whole line is transcoded.

 Returns (unconverted, newline) where unconverted is True when the
 transcoded part still contains a letter immediately followed by a digit.
 """
 head,marker,rest = line.partition('[Page')
 head = transcoder.transcoder_processString(head,tranfrom,tranto)
 unconverted = bool(re.search(r'[a-zA-Z][0-9]',head))
 return (unconverted,head + marker + rest)

def make(tranfrom,tranto,filein,fileout):
 f = codecs.open(filein,encoding='utf-8',mode='r')
 fout = codecs.open(fileout,'w','utf-8')
 n = 0
 nchg = 0
 nprob = 0
 nunc = 0 # number of lines with unconverted AS codes
 for line in f:
Ejemplo n.º 10
0
""" simpleslp1.py
"""

import codecs, re, sys
import transcoder
transcoder_dir = transcoder.transcoder_set_dir('../../utilities/transcoder')


def simpleslp1(word):
    """ Apply slp1_simpleslp1 transcoder. 
  lower case all letters in word, EXCEPT Y (palatal nasal) and
  R (cerebral nasal) -- Y and R are changed to 'n' in transcoder.
  Also, replace a doubled letter by the single letter.
 """
    def sub1(m):
        a = m.group(1)
        return a.lower()

    regex1 = '([AIUFXEOMHKGNCJWQTDPBLVSZ])'
    word1 = re.sub(regex1, sub1, word)
    regex2 = r'(.)\1'

    def sub2(m):
        a = m.group(0)  # xx
        return a[0]  # x

    word2 = re.sub(regex2, sub2, word1)
    var = transcoder.transcoder_processString(word2, 'slp1', 'simpleslp1lo')
    #if word != word2:
    # if word.startswith('kar'):
    #  print('dbg:',word,word1,word2,var)
Ejemplo n.º 11
0
# coding=utf-8
""" transcode.py for PW
 Reads/Writes utf-8
 For missing
 Oct 18, 2014:  Take care of 'junk' in #{..} to preserve invertibility
  Use local copy of hk_slp1.xml and slp1_hk.xml to also transcode accents
"""
import sys, re,codecs
sys.path.insert(0,"../")  # where transcoder resides

import transcoder
#transcoder.transcoder_set_dir("../");
transcoder.transcoder_set_dir("");
def adjust_hk_slp1(m):
 x1 = m.group(1)
 x2 = m.group(2)
 x3 = m.group(3)
 partsin = re.split(u'(ƒPage.*?ƒ)|([.])',x2)
 partsout = [x1]
 for part in partsin:
  #if re.search(u'^(ƒPage.*?ƒ)$',part):
  if not part:
   continue
  elif part.startswith(u'ƒ'): #re.search(u'^(ƒPage.*?ƒ)$',part):
   partsout.append('}%s#{' % part)
  elif re.search(r'^([.])$',part):
   partsout.append('}%s#{' % part)
  else:
   partout = transcoder.transcoder_processString(part,'hk','slp1')
   partsout.append(partout)
 partsout.append(x3)
Ejemplo n.º 12
0
"""convert.py
  Python example of transcoding
"""
import sys,codecs,re
sys.path.append('../')
import transcoder
transcoder.transcoder_set_dir('../transcoder');

def convert(filein,fileout,tranin,tranout):
 """Transcode the body of every non-empty line of filein into fileout.

 Both files are utf-8.  Empty lines are skipped.  Every other line must
 have the form '<head> <body>', where head is the text before the first
 space and body is the (non-empty) rest of the line.  head is copied
 unchanged, body is transcoded from tranin to tranout, and
 '<head> <transcoded body>' is written to fileout.  At the end the number
 of converted lines is printed.

 A non-empty line that does not have this form is reported on stderr and
 the program exits with status 1.
 """
 # 'with' closes both files on every path, including the error exit.
 with codecs.open(filein,"r",'utf-8') as fp:
  with codecs.open(fileout,"w",'utf-8') as fpout:
   n = 0  # number of non-empty lines read so far
   for x in fp:
    x = x.rstrip('\r\n')
    if x == '':
     continue
    n = n + 1
    m = re.search(r'^([^ ]+) (.+)$',x)
    if not m:
     # The message used to be built and then discarded, so the
     # program exited without any explanation.
     sys.stderr.write("line %s is unknown: %s\n" % (n,x))
     sys.exit(1)
    head = m.group(1)
    body = m.group(2)
    body1 = transcoder.transcoder_processString(body,tranin,tranout)
    fpout.write("%s %s\n" % (head,body1))
 # Single-argument print(): same output as the old print statement and
 # valid syntax on Python 3.
 print("%s lines converted\n" % n)
Ejemplo n.º 13
0
"""demok.py
  Python example of transcoding
  
"""
import sys, codecs, re
import unicodedata
#sys.path.append('../')
import transcoder
#transcoder.transcoder_set_dir('../transcoder');
transcoder.transcoder_set_dir('')


def convert4(datain, fileout, tranin, tranout):
    """Transcode the string datain and write the result to fileout.

    datain is transcoded from tranin to tranout; the result, followed by a
    newline, becomes the whole content of fileout (utf-8).  The name of
    the output file is then printed.
    """
    body1 = transcoder.transcoder_processString(datain, tranin, tranout)
    with codecs.open(fileout, "w", 'utf-8') as f:
        f.write('%s\n' % body1)
    # Single-argument print(): same output as the old print statement and
    # valid syntax on Python 3.
    print("fileout= %s" % fileout)


def output(f, tranin, tranout, body):
    """Write body, its transcoding, and the transcoding's characters to f.

    Three lines are written to the file-like object f:
      '<tranin>: <body>'            (tranin right-aligned in 4 columns)
      '<tranout> <transcoded body>'
      'unic: ' followed by the repr of each transcoded character,
               separated by spaces
    """
    converted = transcoder.transcoder_processString(body, tranin, tranout)
    f.write('%4s: %s\n' % (tranin, body))
    f.write('%s %s\n' % (tranout, converted))
    f.write('unic: %s\n' % ' '.join(map(repr, converted)))
Ejemplo n.º 14
0
"""convert.py
  Python example of transcoding
"""
import sys, codecs, re
sys.path.append('../')
import transcoder
transcoder.transcoder_set_dir('../transcoder')


def convert(filein, fileout, tranin, tranout):
    """Transcode the body of every non-empty line of filein into fileout.

    Both files are utf-8.  Empty lines are skipped.  Every other line must
    have the form '<head> <body>', where head is the text before the first
    space and body is the (non-empty) rest of the line.  head is copied
    unchanged, body is transcoded from tranin to tranout, and
    '<head> <transcoded body>' is written to fileout.

    A non-empty line that does not have this form is reported on stderr
    and the program exits with status 1.
    """
    # 'with' closes both files on every path, including the error exit.
    with codecs.open(filein, "r", 'utf-8') as fp:
        with codecs.open(fileout, "w", 'utf-8') as fpout:
            n = 0  # number of non-empty lines read so far
            for x in fp:
                x = x.rstrip('\r\n')
                if x == '':
                    continue
                n = n + 1
                m = re.search(r'^([^ ]+) (.+)$', x)
                if not m:
                    # The message used to be built and then discarded, so
                    # the program exited without any explanation.
                    sys.stderr.write("line %s is unknown: %s\n" % (n, x))
                    sys.exit(1)
                head = m.group(1)
                body = m.group(2)
                body1 = transcoder.transcoder_processString(
                    body, tranin, tranout)
                fpout.write("%s %s\n" % (head, body1))
Ejemplo n.º 15
0
# coding=utf-8
""" tooltip.py
    04-04-2018.  
"""
import sys,re,codecs
sys.path.append('../')
import transcoder
transcoder.transcoder_set_dir("transcoder/");

class Unused_Link(object):
 def __init__(self,line):
  line = line.rstrip()
  self.line = line
  (self.linkkey,self.numinstance,self.authkey) = line.split('\t')
  self.authrec = None
  self.entry = None
 def make_entry(self):
  authrec = self.authrec
  authtype = authrec.authtype
  authdata = authrec.authdata
  self.entry = '<entry type="%s">%s</entry>' %(authtype,authdata)
 def print_type(self):
  known_types = {
   'ti':'Title',
   'au':'Author',
   'litcat':'Literary category',
   'subti':'subtitle',
  }
  authtype = self.authrec.authtype
  if authtype in known_types:
   type1 = known_types[authtype]
Ejemplo n.º 16
0
""" pwbib1.py
    Usage: python pwbib1.py pwbib0.txt pwbib1.txt
    Reads pwbib0 (using pwbib_parse0) into list of Pwbib records.
    converts 'abbrv' and 'title' fields to 'Roman' (using transcoder with
    as_roman.xml).  Saves Unicode abbrv field as a separate 
      field 'abbrvunicode'.
    Replaces title field with unicode.
    Writes all fields to output, as tab-delimited text file. (See writerecs for
    details of fields written)
    Starts with as_roman.xml as per pw_dhaval
"""
import codecs,sys,re
import pwbib_parse0
import transcoder
transcoder.transcoder_set_dir('.');

def convertrecs(recs,tranin,tranout):
 """Add transcoded ``abbrvunicode`` and ``titleunicode`` to every record.

 Modifies recs in place: for each rec, ``rec.abbrv`` and ``rec.title`` are
 transcoded from tranin to tranout and stored in ``rec.abbrvunicode`` and
 ``rec.titleunicode``.  A warning is printed when the transcoded text
 still contains a letter immediately followed by a digit 1-9.
 A record that raises an error is reported (with its 1-based position and
 its ``line`` attribute) and processing continues with the next record.
 """
 for n,rec in enumerate(recs,1):
  try:
   rec.abbrvunicode = transcoder.transcoder_processString(rec.abbrv,tranin,tranout)
   rec.titleunicode = transcoder.transcoder_processString(rec.title,tranin,tranout)
   m = re.search(r'[a-zA-Z][1-9]',rec.abbrvunicode + " " + rec.titleunicode )
   if m:
    # Single-argument print(): same output layout as the old print
    # statement, and valid syntax on Python 3.
    print("TRANSCODER WARNING:  %s" % m.group(0))
   # Undo some transcodings
   rec.titleunicode = re.sub(r'YOLLY','JOLLY',rec.titleunicode)  # JOLLY is an author
  except Exception:
   # Exception (not a bare except) so that KeyboardInterrupt and
   # SystemExit are no longer swallowed; the loop stays best-effort.
   print("convertrecs problem %s %s" % (n,rec.line))