def analyzeTask(inputFilePath):
    """
    Read a .txt file, analyze it and store the result into a .txt, .json
    and/or .bulk file, depending on which of `args.txt`, `args.json` and
    `args.bulk` are set (each one is passed to `mkOutputPath` as the
    output location for its format).

    If the analysis fails, a `.txt.failed`, `.json.failed` and/or
    `.bulk.failed` file containing the captured traceback is created for
    every requested output that had not been written yet. Each marker is
    placed in the output location of its own format.

    Return 0 on success and -1 on failure.
    """
    from traceback import print_exc

    # A single parenthesised argument prints identically under the
    # Python 2 print statement and the Python 3 print function.
    print('working on %s' % repr(inputFilePath))

    # Extensions of the outputs that have been written successfully so far.
    finished = set()
    try:
        frame = cleanedFrame(pagesToFrame(iteratePageText(inputFilePath)))
        if args.txt:
            cleanedText = '\n'.join(frame['text']).encode('utf-8')
            saveText(cleanedText, mkOutputPath(inputFilePath, '.txt', args.txt))
            finished.add('.txt')
        if args.json or args.bulk:
            # The splitting and content analysis are shared by both formats.
            split(frame)
            content = analyzeContent(frame)
            if args.json:
                saveJson(content, mkOutputPath(inputFilePath, '.json', args.json))
                finished.add('.json')
            if args.bulk:
                saveBulk(content, mkOutputPath(inputFilePath, '.bulk', args.bulk), index_key='id')
                finished.add('.bulk')
    except Exception:
        # Leave one marker per requested-but-unfinished output. The marker
        # goes next to where that output would have been written (the
        # .json/.bulk markers used to be sent to the .txt location).
        requestedOutputs = (
            ('.txt', args.txt),
            ('.json', args.json),
            ('.bulk', args.bulk),
        )
        for extension, outputLocation in requestedOutputs:
            if outputLocation and extension not in finished:
                failedPath = mkOutputPath(inputFilePath, extension + '.failed', outputLocation)
                with open(failedPath, 'wt') as outputFile:
                    print_exc(None, outputFile)
        print('error working on %s' % repr(inputFilePath))
        return -1

    print('done working on %s' % repr(inputFilePath))
    return 0
import argparse
from saver import mkOutputPath, autoSave
from text_extractor import iteratePageText
from pagetext_cleaner import cleanedFrame, pagesToFrame

# Command line: one or more input files and an optional output directory.
argparser = argparse.ArgumentParser(
    description='Convert a pdf file to a txt file.',
    usage='%(prog)s [options] file [file ...]',
)
argparser.add_argument(
    'files',
    action='store',
    nargs='+',
    help='the json files to be parsed',
)
argparser.add_argument(
    '-o', '--output',
    action='store',
    help='save the parsed files to directory OUTPUT',
)
args = argparser.parse_args()

for ifpath in args.files:
    # Extract the page texts, build the cleaned frame and split it.
    # `split` is called for its effect on `frame`; the grouping below
    # relies on a 'section' column being present afterwards.
    pages = iteratePageText(ifpath)
    frame = cleanedFrame(pagesToFrame(pages))
    split(frame)

    # Write one text file per section, one row per line.
    for sectionType, sectionFrame in frame.groupby('section'):
        sectionPath = mkOutputPath(ifpath, '.' + sectionType + '.txt', args.output)
        with open(sectionPath, 'wt') as ofile:
            # row[5] is the field written out for each row
            # (presumably the text column -- TODO confirm the frame layout).
            ofile.writelines(
                "%s\n" % row[5].encode('utf-8')
                for row in sectionFrame.itertuples()
            )

    # Keep the whole frame as well, pickled next to the section files.
    frame.to_pickle(mkOutputPath(ifpath, '.phase0.pkl', args.output))
with open( inputFilePath, 'rt' ) as inputFile: wholeText = inputFile.read().decode('utf-8') pages = wholeText.split('') for page in pages: yield page if __name__ == '__main__': import argparse from saver import mkOutputPath, autoSave argparser = argparse.ArgumentParser( description='Convert a pdf file to a txt file.', usage='%(prog)s [options] file [file ...]', ) argparser.add_argument( 'files', action='store', nargs='+', help='the files to be parsed' ) argparser.add_argument( '-o', '--output', action='store', help='save the parsed json files to directory OUTPUT' ) args = argparser.parse_args() for ifpath in args.files: with open( mkOutputPath( ifpath, '.raw.txt', args.output ), 'wt' ) as ofile: for pagetxt in iteratePageText( ifpath ): ofile.write( pagetxt.encode('utf-8') ) ofile.write('\n'+'#'*80+'\n')
from article_splitter import split

# Command line: one or more input files and an optional output directory.
argparser = argparse.ArgumentParser(
    description='Convert a pdf file to a json bulk file.',
    usage='%(prog)s [options] file [file ...]',
)
argparser.add_argument(
    'files',
    action='store',
    nargs='+',
    help='the json files to be parsed',
)
argparser.add_argument(
    '-o', '--output',
    action='store',
    help='save the analyzed files to directory OUTPUT',
)
args = argparser.parse_args()

for ifpath in args.files:
    # Extract the page texts, build the cleaned frame, split it and
    # analyze the content.
    pages = iteratePageText(ifpath)
    frame = cleanedFrame(pagesToFrame(pages))
    split(frame)
    content = analyzeContent(frame)

    # Store the same analysis result twice: as a bulk file and as json.
    bulkPath = mkOutputPath(ifpath, '.phase1.bulk', args.output)
    saveBulk(content, bulkPath, index_key='id')
    jsonPath = mkOutputPath(ifpath, '.phase1.json', args.output)
    saveJson(content, jsonPath)

# Disabled debugging snippets, kept for reference:
#frame = pd.read_pickle("/home/alice/Documents/test/memorial--2007--0001.phase0.pkl")
#analyzePre( frame['text'][ frame['section'] == 'pre' ] )
#for i in xrange(98):
#    print analyzeArticle( frame['text'][ frame['section'] == 'article%04d'%i ] )
def iteratePageText(inputFilePath):
    """
    Open the PDF file at location `inputFilePath` and iterate over it page
    by page, yielding the text of each page.

    NOTE(review): `inputFilePath` is handed to
    `poppler.Document.new_from_file` unchanged; Poppler bindings commonly
    expect a `file://` URI there -- confirm what callers pass in.
    """
    doc = poppler.Document.new_from_file(inputFilePath)
    # get_n_pages is a method: it has to be called to obtain the page count.
    # Passing the bound method itself to xrange raises a TypeError.
    for pageIndex in xrange(doc.get_n_pages()):
        page = doc.get_page(pageIndex)
        yield page.get_text()


# test
if __name__ == "__main__":
    import argparse
    from saver import mkOutputPath, autoSave

    argparser = argparse.ArgumentParser(
        description="Convert a pdf file to a txt file.",
        usage="%(prog)s [options] file [file ...]",
    )
    argparser.add_argument("files", action="store", nargs="+", help="the json files to be parsed")
    argparser.add_argument("-o", "--output", action="store", help="save the parsed json files to directory OUTPUT")
    args = argparser.parse_args()

    # Dump every page of every input file into a single `.raw.txt` file,
    # each page followed by a ruler line of 80 '#' characters.
    for ifpath in args.files:
        with open(mkOutputPath(ifpath, ".raw.txt", args.output), "wt") as ofile:
            for pagetxt in iteratePageText(ifpath):
                ofile.write(pagetxt.encode("utf-8"))
                ofile.write("\n" + "#" * 80 + "\n")