Exemple #1
0
# Debug mode is on when the literal word "debug" appears anywhere on the
# command line.
debug_arg = "debug" in sys.argv

# Document-level values start out unset (False) / empty.
title = False
creator = False
metadata = {}

# f = GzipFile(fileobj=urlopen(sys.argv[1]))
# f = open(sys.argv[1])

# Run the ABBYY stream parser over `f` with the HTML handlers and gather
# everything it yields into `pars`, in order.
pars = []
html_stream = abbyystreams.abbyytext(
    f,
    header=htmlhead,
    footer=htmlfoot,
    format=xmlformat,
    blockfn=htmlblock,
    pagefn=htmlpage,
    picture=htmlimg,
    table=htmltable,
    escapefn=cgi.escape,
    linefn=linehandler,
    floatfn=floatblock,
    debug=debug_arg,
)
for par in html_stream:
    pars.append(par)

# Select the key of `classhist` with the highest value.  A candidate is
# skipped only when there is already a truthy winner and the candidate's
# value is strictly lower, so ties go to the key visited last and a falsy
# winner is always replaced.
topclass = False
topscore = -1
for x in classhist:
    if topclass and classhist[x] < topscore:
        continue
    topclass = x
    topscore = classhist[x]

# Opening <span> tag carrying the winning class name.
paraprefix = "<span class='%s'>" % topclass
# Counter initialised to 1.
paracount = 1

# Emit the XML declaration followed by the HTML doctype, one per line.
print("<?xml version='1.0' encoding='utf-8' ?>")
print("<!DOCTYPE html>")
Exemple #2
0
    content_label=False
    format_arg=False
    layout_arg=False
    page_arg=False
    block_arg=False

# Resolve the first command-line argument to a readable stream `f`.
# Three cases, tried in order:
#   1. an existing local path (gzip-decompressed when it ends in ".gz"),
#   2. a string starting with "http", fetched with urlopen (again
#      gzip-decompressed when it ends in ".gz"),
#   3. anything else, which is substituted into an archive.org download
#      URL for the "<arg>_abbyy.gz" file.
input_arg = sys.argv[1]
if os.path.exists(input_arg):
    if input_arg.endswith('.gz'):
        f = gzip.open(input_arg)
    else:
        f = open(input_arg)
elif input_arg.startswith('http'):
    if input_arg.endswith('.gz'):
        f = GzipFile(fileobj=urlopen(input_arg))
    else:
        f = urlopen(input_arg)
else:
    urlstream = urlopen("http://www.archive.org/download/%s/%s_abbyy.gz" %
                        (input_arg, input_arg))
    # The whole payload is buffered in memory, so the HTTP response can be
    # released as soon as it has been read; close it even if read() fails.
    try:
        zipdata = urlstream.read()
    finally:
        urlstream.close()
    f = GzipFile(fileobj=StringIO.StringIO(zipdata))
    
# f = GzipFile(fileobj=urlopen(sys.argv[1]))
# f = open(sys.argv[1])

for par in abbyystreams.abbyytext(f, header=header_arg, footer=footer_arg,
    	   			     format=format_arg,layout=layout_arg,
				     blockfn=block_arg,pagefn=page_arg,
                                     debug=debug_arg):
    if content_label:
        print "Content:",par.encode('utf-8')
    else: print par.encode('utf-8')