Example #1
0
def number_roots(m):
    """Build a numbered, namespaced ``<TEI>`` start tag.

    Intended as a replacement callback for ``re.sub``: ``m`` is the match
    object handed over by ``re.sub`` and is not inspected.

    Each call advances the module-level ``TEIcount`` counter by 3 and embeds
    the new value in the tag's ``xml:id`` as ``ch<count>``.
    NOTE(review): the step is 3 rather than 1 -- confirm this is intended.
    """
    global TEIcount
    TEIcount += 3
    root_tag = '<TEI xml:id="ch%s" xmlns="http://www.tei-c.org/ns/1.0">'
    return root_tag % TEIcount


## Main
# Number the <bibl>, <div>, <p> and <TEI> start tags of every input file and
# write the results, under the same file names, into output_dir.
files = handle_args(sys.argv[1:])

if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

# The loop variable is called `path` rather than `file` so that it does not
# shadow the Python 2 builtin of that name.
for path in files:
    # set counters to 0 (or a value) to make them reset to that value for each file
    # comment counters out to increment across files
    # (maybe add a command-line switch for this)
    # bibcount = 70
    # divcount = 110
    # pcount = 480
    s = read_file(path)
    # Each pass replaces a bare start tag with whatever the callback returns.
    # The number_* callbacks other than number_roots are defined elsewhere;
    # presumably they mirror it, one global counter each -- confirm.
    s = re.sub(r"""<bibl>""", number_bibs, s)  ## <-- edit search regex! ## previous
    s = re.sub(r"""<div>""", number_divs, s)
    s = re.sub(r"""<p>""", number_paras, s)
    s = re.sub(r"""<TEI>""", number_roots, s)
    write_file(os.path.join(output_dir, path), s)

# Report where each counter ended.  print(...) with one parenthesised
# argument prints the same text on Python 2 and is also valid on Python 3.
print("\n  TEIcount ended at %s." % TEIcount)
print("\n  Bibcount ended at %s." % bibcount)
print("\n  Divcount ended at %s." % divcount)
print("\n  Pcount ended at %s." % pcount)
from implib.io import read_file, write_file
import re, sys
# Remove duplicate lines from the file named by argv[1] and write the result
# to the file named by argv[2].  When a line occurs more than once only its
# LAST occurrence is kept; surviving lines keep their relative order.
f = read_file(sys.argv[1])
# One match per line, empty lines included (so blank lines are de-duplicated
# like any other line).
split = re.compile(r'^.*?$', re.MULTILINE)
l = split.findall(f)

# Walk the lines backwards: the first time a line is met is then its final
# occurrence in the file.  The earlier version removed items from the list
# while iterating over it, which skips elements and could leave duplicates
# behind (lines a, c, b, c, b, a came out as c, c, b, a).
seen = set()
kept = []
for item in reversed(l):
    if item not in seen:
        seen.add(item)
        kept.append(item)
kept.reverse()
l = kept

# Every kept line is written back terminated by a newline.
s = ''.join([item + '\n' for item in l])

write_file(sys.argv[2], s)
Example #3
0
    ('\[hm(.+?)\]', r''),
    ('\[cf..?\]', r''),
    ('\[..', r''),
    ('<page.*?>', r''),
    ('<(/?)I>', r'<\1i>'),
    ('<(/?)B>', r'<\1b>'),
    ('<(/?)INF>', r'<\1sub>'),
    ('<(/?)SUB>', r'<\1sub>'),
    ('<(/?)SUP>', r'<\1sup>'),
    ('0\]-9\]', r'')
]

# Process every file in the current directory whose name ends in "toc".
files = glob.glob('*toc')

# Matches from a <pag> tag through the "]" that immediately precedes the next
# <1> tag (non-greedy, spanning newlines).  Compiled once, outside the loop,
# because the pattern is the same for every file; written as a raw string so
# that "\]" is not treated as a string escape.
regex = re.compile(r'<pag>.*?\]<1>', re.DOTALL)

for file in files:
    f = read_file(file)
    # shortID/longID are defined elsewhere; their results are inserted into
    # the page <title> and the page heading respectively.
    s_id = shortID(f)
    l_id = longID(f)
    # XHTML preamble for the generated contents page, up to the "Contents"
    # heading.
    toc = ('<?xml version="1.0" encoding="iso-8859-1"?>\n'
           '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" '
           '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n'
           '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>'
           'BSSA, Volume ' + s_id + '</title>\n</head>\n'
           '<body bgcolor="#FFFFFF" link="#0000FF" vlink="#FF0000" alink="#E6E6E6">'
           '\n<table cellpadding="20" cellspacing="0" width="500">\n<tr><td>\n'
           '<h2>Bulletin of the<br />Seismological Society of America</h2>\n'
           '<p><b><span style="font-size: larger">' + l_id + '</span></b></p>\n'
           '<h2>Contents</h2>\n')
    # Drop each <pag>...] run, keeping only the <1> tag that follows it.
    f = regex.sub('<1>', f)
    # regexr is defined elsewhere; presumably it applies each
    # (pattern, replacement) pair of cleanupL to f in turn -- confirm.
    f = regexr(cleanupL, f)
Example #4
0
from implib.io import read_file, write_file
import re, sys
# De-duplicate the lines of the input file (argv[1]) and save the outcome to
# the output file (argv[2]).  For a repeated line only the final occurrence
# survives, and the survivors stay in their original relative order.
f = read_file(sys.argv[1])
# Yields one entry per line; empty lines produce empty-string entries and
# are therefore de-duplicated as well.
split = re.compile(r'^.*?$', re.MULTILINE)
l = split.findall(f)

# Scan from the end so that the first sighting of a line is its last
# occurrence.  The previous loop called l.remove() while iterating over l;
# that skips elements and could leave duplicates in place (for example the
# lines a, c, b, c, b, a were reduced only to c, c, b, a).
seen = set()
kept = []
for item in reversed(l):
    if item in seen:
        continue
    seen.add(item)
    kept.append(item)
kept.reverse()
l = kept

# Re-assemble the text, terminating every kept line with a newline.
s = ''.join([item + '\n' for item in l])

write_file(sys.argv[2], s)