Esempio n. 1
0
def pagenumberfixer(uri): #,segmenter):
    """Rewrite the XML file at ``uri`` with fixed page numbers.

    Steps, in order:
      1. read ``uri`` and pass its content through ``xmlparse.use``
         (per the original comment, this groups paragraphs that are
         separated by a new page);
      2. serialize the result as UTF-8 and save it as a backup under
         ``sentenceSplit/``;
      3. run ``xmlparse.tagPageN`` on the serialized text, strip the
         unwanted character, and write the result under
         ``fixedTaggedTest/``.

    Both output file names are ``encode(<basename of uri>)``; the two
    target directories are relative to the current working directory and
    are not created here.

    :param uri: path of the input file.
    :returns: None; the function only writes files and prints a message.
    """
    # read file (closed deterministically instead of relying on GC)
    with open(uri) as source:
        inp = source.read()
    (_, path) = os.path.split(uri)
    encoded_name = encode(path)
    out = 'fixedTaggedTest/' + encoded_name
    bak = 'sentenceSplit/' + encoded_name
    # group paragraphs together if they are separated by a newpage
    new = xmlparse.use(inp, uri)

    # Sentence segmentation is intentionally not done here; the original
    # note says it should be done later by the kark tools.

    string = etree.tostring(new, encoding='utf-8')
    with open(bak, 'w') as backup:
        backup.write(string)

    # fix page numbers
    newer = xmlparse.tagPageN(string)
    # remove bad characters: the pattern is the literal whitespace character
    # between the quotes (appears to be a tab)
    # (TODO remove this? might be useful, although dangerous for kark)
    ok = re.sub(r'	', '', newer)
    # write file
    # Single-argument form prints the same text under both the print
    # statement and the print function.
    print("write file %s" % out)
    with open(out, 'w') as target:
        target.write(ok)
Esempio n. 2
0
 def concater(fil):
   """Run ``xmlparse.use`` on the file ``fil`` and save the result.

   The output is written to ``fixedTaggedTest/`` under the name
   ``encode(<basename of fil>)``; the directory is relative to the current
   working directory and is not created here.

   :param fil: path of the input file.
   :returns: None; the function only writes a file.
   """
   # The original body read an undefined name ``uri`` instead of the
   # ``fil`` parameter, which raised NameError on every call.
   with open(fil) as source:
     inp = source.read()
   (_, path) = os.path.split(fil)
   out = 'fixedTaggedTest/' + encode(path)
   # concat tags if they are separated by a newpage
   # NOTE(review): the result is written as-is, so xmlparse.use is
   # presumably expected to return a string here -- confirm.
   new = xmlparse.use(inp)
   with open(out, 'w') as target:
     target.write(new)