def pagenumberfixer(uri): #,segmenter): # read file inp = open(uri).read() (_,path) = os.path.split(uri) out = 'fixedTaggedTest/'+encode(path) bak = 'sentenceSplit/'+encode(path) # group paragraphs together if they are separated by a newpage new = xmlparse.use(inp,uri) # this should be done later, by nice kark-tools ## add here: sentence segmentation #mode = chunker.findmode(uri) #chunker.putStops(new,mode,segmenter) string = etree.tostring(new,encoding='utf-8') open(bak,'w').write(string) # fix page numbers newer = xmlparse.tagPageN(string) # remove bad characters (TODO remove this? might be useful, although dangerous for kark) ok = re.sub(r'	','',newer) #ok = re.sub(u'ΒΆ','',ok1) # write file print "write file",out open(out,'w').write(ok)
def concater(fil):
    """Concatenate tags that a page break split apart.

    Reads the file at *fil*, runs xmlparse.use over its contents to merge
    tags separated by a newpage, and writes the result to fixedTaggedTest/
    under a name derived from the input basename via encode().

    Bug fix: the original body referenced an undefined name ``uri``
    instead of the ``fil`` parameter, so every call raised NameError.
    """
    # with-blocks close the handles (the original leaked both)
    with open(fil) as f:
        inp = f.read()
    (_, path) = os.path.split(fil)
    out = 'fixedTaggedTest/' + encode(path)
    # concat tags if they are separated by a newpage
    new = xmlparse.use(inp)
    with open(out, 'w') as f:
        f.write(new)