Example #1
0
        f = os.path.join(dir,f)
        if os.path.isfile(f):
            txt = open(f,'rb').read()
            m = re.search('<title>(.*?)</title>',txt)
            if m:
                title = m.group(1)
                if ':' not in title:
                    yield f

if __name__=="__main__":
    # Command-line entry point: run a SAX parser, whose content handler is
    # subTree("text", process), over a single file or over every file that
    # articles() yields for a directory.
    if len(sys.argv) < 2:
        print("Usage: $formatter.py File")
        print("Usage: $formatter.py Directory")
        sys.exit()
    # The path to process is the first command-line argument (see the usage
    # message above); it may name either a file or a directory.
    Input = sys.argv[1]
    p = xml.sax.make_parser()
    contentHandler = subTree("text", process)
    p.setContentHandler(contentHandler)
    if os.path.isfile(Input):
        # currentFile is a module-level name; presumably read by the
        # process() callback while parsing -- confirm against its definition.
        currentFile = Input
        # "with" guarantees the handle is closed even if parsing raises.
        with open(Input, 'r') as fh:
            p.parse(fh)
    else:
        Input = os.path.abspath(Input)
        for f in articles(Input):
            print(f)
            currentFile = f
            with open(f, 'r') as fh:
                p.parse(fh)
Example #2
0
def generateNewXML(text):
    """Write one XML fragment to its own file below ``outputPath``.

    An XML declaration is prepended to ``text``, the result is encoded as
    UTF-8, and the content of the first ``<id>...</id>`` element found in
    it is used as the file name.  The chosen path is printed before the
    file is written.  Fragments without an ``<id>`` element are skipped
    silently.

    text -- the fragment to store (presumably the serialized ``<page>``
            subtree delivered by the subTree handler -- confirm).

    Reads the module-level name ``outputPath`` for the target directory.
    """
    text = '<?xml version="1.0" encoding="utf-8"?>'+ text
    text = text.encode("utf-8")
    # re.search returns the first match only, so any later <id> elements in
    # the fragment do not influence the file name.
    m = re.search('<id>(.*?)</id>', text)
    if m:
        fName = outputPath+"/"+m.group(1)
        print(fName)
        # Close the handle deterministically: this function is called once
        # per fragment, so relying on garbage collection to close files can
        # exhaust file descriptors on long inputs.
        with open(fName,'w') as out:
            out.write(text)
#    xml.sax.parseString(text, contentHandler)
#    tmpFile ="/tmp/_987654321.xml"
#    tmpHandler = open(tmpFile,'w')
#    tmpHandler.write(text2.encode("utf-8"))
#    tmpHandler.close()
#    tmpHandler2 = open(tmpFile, 'r')
#    parser2.parse(tmpHandler2)
#    tmpHandler2.close()
    
def puts(text):
    """Print ``text`` to standard output, followed by a newline."""
    # Single-argument parenthesized form: prints the same thing under the
    # Python 2 print statement and the Python 3 print function.
    print(text)


if __name__=="__main__":
    # Command-line entry point: stream the dump given as the first argument
    # through a SAX parser whose handler, subTree("page", generateNewXML),
    # passes fragments to generateNewXML.
    if len(sys.argv) < 3:
        print("Usage: $fragmenter.py wikipediaDump outputDirectory")
        sys.exit()
    # Module-level name read by generateNewXML() to build output paths.
    outputPath = sys.argv[2]
    parser = xml.sax.make_parser()
    parser.setContentHandler(subTree("page", generateNewXML))
    # "with" closes the dump file even if parsing raises part-way through.
    with open(sys.argv[1],"r") as dump:
        parser.parse(dump)