# Debug tracing is switched on by passing the literal word "debug" anywhere
# on the command line.
debug_arg = "debug" in sys.argv

title = False
creator = False
metadata = {}

# Earlier ways of opening the input stream, kept for reference:
# f = GzipFile(fileobj=urlopen(sys.argv[1]))
# f = open(sys.argv[1])

# Run the whole document through the parser with the HTML callbacks and keep
# every paragraph it yields, so that output can start only after the full
# pass has finished.
# NOTE(review): classhist is presumably filled in by the callbacks during
# this pass -- confirm against their definitions.
pars = []
for par in abbyystreams.abbyytext(
        f,
        header=htmlhead,
        footer=htmlfoot,
        format=xmlformat,
        blockfn=htmlblock,
        pagefn=htmlpage,
        picture=htmlimg,
        table=htmltable,
        escapefn=cgi.escape,
        linefn=linehandler,
        floatfn=floatblock,
        debug=debug_arg):
    pars.append(par)

# Pick the key of classhist with the highest score.  While no truthy leader
# has been chosen yet the candidate is accepted outright; afterwards it
# replaces the leader whenever its score is not lower, so ties go to the key
# visited later.
topclass = False
topscore = -1
for x in classhist:
    if not topclass or not (classhist[x] < topscore):
        topclass = x
        topscore = classhist[x]

# Opening tag built from the winning class, plus a counter starting at 1.
paraprefix = "<span class='%s'>" % topclass
paracount = 1

# Single-argument parenthesised print is identical to the print statement.
print("<?xml version='1.0' encoding='utf-8' ?>")
print("<!DOCTYPE html>")
# Output and parser options; every one of them is off in this script.
content_label = False
format_arg = False
layout_arg = False
page_arg = False
block_arg = False

# Resolve the first command-line argument to an open stream `f`:
#   * a path that exists locally is opened directly (via gzip for *.gz);
#   * anything starting with "http" is fetched as a URL;
#   * anything else is used as an archive.org item identifier and its
#     <id>_abbyy.gz file is downloaded.
input_arg = sys.argv[1]
if os.path.exists(input_arg):
    if input_arg.endswith('.gz'):
        f = gzip.open(input_arg)
    else:
        f = open(input_arg)
elif input_arg.startswith('http'):
    urlstream = urlopen(input_arg)
    if input_arg.endswith('.gz'):
        # GzipFile in Python 2 needs tell()/seek() on its file object, which
        # a urlopen response does not provide, so the compressed payload is
        # buffered in memory first (the same approach the archive.org branch
        # below uses).
        zipdata = urlstream.read()
        urlstream.close()
        f = GzipFile(fileobj=StringIO.StringIO(zipdata))
    else:
        f = urlstream
else:
    urlstream = urlopen("http://www.archive.org/download/%s/%s_abbyy.gz" %
                        (input_arg, input_arg))
    zipdata = urlstream.read()
    # Everything needed is now in memory; release the network handle.
    urlstream.close()
    f = GzipFile(fileobj=StringIO.StringIO(zipdata))

# Stream each paragraph from the parser to stdout as UTF-8, optionally
# prefixed with "Content: ".
for par in abbyystreams.abbyytext(
        f,
        header=header_arg,
        footer=footer_arg,
        format=format_arg,
        layout=layout_arg,
        blockfn=block_arg,
        pagefn=page_arg,
        debug=debug_arg):
    text = par.encode('utf-8')
    # Single-argument parenthesised print produces the same output as the
    # print statement (`print "Content:", text` also inserts one space).
    if content_label:
        print("Content: " + text)
    else:
        print(text)