Beispiel #1
0
def analyzeTask( inputFilePath ):
    """ Analyze one input file and store the result in a .txt, .json and/or .bulk file.

        The module-level `args` selects the outputs: each of `args.txt`,
        `args.json` and `args.bulk` that is set enables that output and is
        handed to `mkOutputPath` as its third argument.

        If the analysis fails, a matching .txt.failed, .json.failed and/or
        .bulk.failed file holding the captured traceback is created for every
        requested output that had not been finished yet.

        Return 0 on success and -1 on failure.
    """
    # Single-argument print( ... ) behaves the same as the print statement.
    print( 'working on %s' % repr( inputFilePath ) )
    txtDone = False
    jsonDone = False
    bulkDone = False
    try:
        frame = cleanedFrame( pagesToFrame( iteratePageText( inputFilePath ) ) )
        if args.txt:
            cleanedText = '\n'.join( frame['text'] ).encode( 'utf-8' )
            saveText( cleanedText, mkOutputPath( inputFilePath, '.txt', args.txt ) )
            txtDone = True

        if args.json or args.bulk:
            # Both structured outputs share one split/analysis pass.
            split( frame )
            content = analyzeContent( frame )
            if args.json:
                saveJson( content, mkOutputPath( inputFilePath, '.json', args.json ) )
                jsonDone = True
            if args.bulk:
                saveBulk( content, mkOutputPath( inputFilePath, '.bulk', args.bulk ), index_key='id' )
                bulkDone = True
    except Exception:
        from traceback import print_exc
        # Each failure marker is built from the option of the output it stands
        # for, so it ends up where the successful output would have been
        # written (previously the .json/.bulk markers used args.txt).
        failedOutputs = (
            ( '.txt.failed', args.txt, txtDone ),
            ( '.json.failed', args.json, jsonDone ),
            ( '.bulk.failed', args.bulk, bulkDone ),
        )
        for suffix, target, done in failedOutputs:
            if target and not done:
                with open( mkOutputPath( inputFilePath, suffix, target ), 'wt' ) as outputFile:
                    print_exc( None, outputFile )
        print( 'error working on %s' % repr( inputFilePath ) )
        return -1

    print( 'done working on %s' % repr( inputFilePath ) )
    return 0
    # NOTE(review): the statements below are indented as part of the preceding
    # function but follow its final `return 0`, so they cannot run as written.
    # They look like the body of a separate command-line script whose enclosing
    # header was lost -- confirm against the original files.
    import argparse
    from saver import mkOutputPath, autoSave
    from text_extractor import iteratePageText
    from pagetext_cleaner import cleanedFrame, pagesToFrame
    
    # Command line: one or more positional input files and an optional
    # -o/--output value that is forwarded to mkOutputPath.
    argparser = argparse.ArgumentParser(
            description='Convert a pdf file to a txt file.',
            usage='%(prog)s [options] file [file ...]',
            )

    argparser.add_argument( 'files', action='store', nargs='+',
            help='the json files to be parsed' )
    argparser.add_argument( '-o', '--output', action='store',
            help='save the parsed files to directory OUTPUT' )
    args = argparser.parse_args()


    for ifpath in args.files:
        # Build the cleaned frame for this input, then let split() process it.
        # NOTE(review): `split` is not imported in this fragment; presumably it
        # adds the 'section' column used by groupby below -- confirm.
        frame = cleanedFrame( pagesToFrame( iteratePageText(ifpath) ) )
        split( frame )
        
        # One text file per distinct 'section' value, named '.<section>.txt'.
        for sectionType, sectionFrame in frame.groupby( 'section' ):
            with open( mkOutputPath( ifpath, '.'+sectionType+'.txt', args.output ), 'wt' ) as ofile:
                for row in sectionFrame.itertuples():
                    # row[0] is the index, so row[5] is the fifth column;
                    # presumably the text column -- TODO confirm.
                    ofile.write( "%s\n" % (row[5].encode('utf-8')) )
                    
        # Keep the whole frame as a pickle next to the per-section text files.
        frame.to_pickle( mkOutputPath( ifpath, '.phase0.pkl', args.output ) )
            
        

    # NOTE(review): this looks like the body of a generator that yields the
    # pages of a UTF-8 text file; its own `def` line is not visible here. As
    # indented it belongs to the preceding function, and the `yield` would turn
    # that function into a generator -- confirm the intended structure.
    with open( inputFilePath, 'rt' ) as inputFile:
        # Read the whole file at once and decode it to unicode.
        wholeText = inputFile.read().decode('utf-8')
        # NOTE(review): as shown the separator is an empty string, for which
        # str.split raises ValueError; a page separator such as a form feed
        # ('\x0c') may have been lost from this literal -- confirm.
        pages = wholeText.split('')
        for page in pages:
            yield page

# Script entry point.
# NOTE(review): this block appears to contain two command-line scripts run back
# to back (a raw text dump, then a json/bulk analysis); each builds its own
# parser and re-parses the same command line -- confirm this is intended.
if __name__ == '__main__':
    import argparse
    from saver import mkOutputPath, autoSave
    # First parser: positional input files plus optional -o/--output.
    argparser = argparse.ArgumentParser(
            description='Convert a pdf file to a txt file.',
            usage='%(prog)s [options] file [file ...]',
            )

    argparser.add_argument( 'files', action='store', nargs='+',
            help='the files to be parsed' )
    argparser.add_argument( '-o', '--output', action='store',
            help='save the parsed json files to directory OUTPUT' )
    args = argparser.parse_args()


    # Write every page of each input to '<input>.raw.txt', following each page
    # with a newline, a line of 80 '#' characters and another newline.
    # NOTE(review): the only `def iteratePageText` visible in this file comes
    # after this block, so the name is unbound here unless it is imported
    # elsewhere -- confirm.
    for ifpath in args.files:
        
        with open( mkOutputPath( ifpath, '.raw.txt', args.output ), 'wt' ) as ofile:
            for pagetxt in iteratePageText( ifpath ):
                ofile.write( pagetxt.encode('utf-8') )
                ofile.write('\n'+'#'*80+'\n')
            


    from article_splitter import split
    
    # Second parser: replaces `argparser` and `args` from above.
    argparser = argparse.ArgumentParser(
            description='Convert a pdf file to a json bulk file.',
            usage='%(prog)s [options] file [file ...]',
            )

    argparser.add_argument( 'files', action='store', nargs='+',
            help='the json files to be parsed' )
    argparser.add_argument( '-o', '--output', action='store',
            help='save the analyzed files to directory OUTPUT' )
    args = argparser.parse_args()


    # For each input: build the cleaned frame, split it, analyze it, and save
    # the result as '.phase1.bulk' and as '.phase1.json'.
    # NOTE(review): cleanedFrame, pagesToFrame, analyzeContent, saveBulk and
    # saveJson are not imported inside this block -- confirm they are in scope.
    for ifpath in args.files:
        frame = cleanedFrame( pagesToFrame( iteratePageText(ifpath) ) )
        split( frame )

        content = analyzeContent( frame )
        
        saveBulk( content, mkOutputPath( ifpath, '.phase1.bulk', args.output ), index_key='id' )
        saveJson( content, mkOutputPath( ifpath, '.phase1.json', args.output ) )


    # Disabled code kept from the original; it is not executed.
    #frame = pd.read_pickle("/home/alice/Documents/test/memorial--2007--0001.phase0.pkl")
    #analyzePre( frame['text'][ frame['section'] == 'pre' ] )
    #for i in xrange(98):
    #    print analyzeArticle( frame['text'][ frame['section'] == 'article%04d'%i ] )
        
    
def iteratePageText(inputFilePath):
    """ Open the PDF file at location `inputFilePath`
        and iterate over it page by page, yielding the text of each page.

        The document is loaded with `poppler.Document.new_from_file`; pages are
        visited in index order, from 0 up to (but excluding) the page count.
    """
    doc = poppler.Document.new_from_file(inputFilePath)
    # get_n_pages has to be called to obtain the page count; handing the bound
    # method itself to xrange raises TypeError.
    for j in xrange(doc.get_n_pages()):
        page = doc.get_page(j)
        yield page.get_text()


# test
if __name__ == "__main__":
    import argparse
    from saver import mkOutputPath, autoSave

    # Command line: one or more input files plus an optional -o/--output value
    # that is forwarded to mkOutputPath.
    argparser = argparse.ArgumentParser(
        description="Convert a pdf file to a txt file.", usage="%(prog)s [options] file [file ...]"
    )

    # The inputs are handed to iteratePageText and the result is written as
    # '.raw.txt', so the help text speaks of pdf and txt files rather than json.
    argparser.add_argument("files", action="store", nargs="+", help="the pdf files to be parsed")
    argparser.add_argument("-o", "--output", action="store", help="save the extracted txt files to directory OUTPUT")
    args = argparser.parse_args()

    # Written after every page: a newline, 80 '#' characters, a newline.
    pageSeparator = "\n" + "#" * 80 + "\n"

    for ifpath in args.files:
        with open(mkOutputPath(ifpath, ".raw.txt", args.output), "wt") as ofile:
            for pagetxt in iteratePageText(ifpath):
                ofile.write(pagetxt.encode("utf-8"))
                ofile.write(pageSeparator)