def number_roots(m):
    """Replacement callback for ``re.sub`` that numbers ``<TEI>`` root tags.

    Advances the module-level ``TEIcount`` by 3 and returns an opening
    ``<TEI>`` tag carrying the TEI namespace and an ``xml:id`` made of
    ``ch`` followed by the new count.  The match object ``m`` is required
    by the ``re.sub`` callback protocol but is not inspected.
    """
    # NOTE(review): the step is 3, not 1 -- confirm this is the intended
    # numbering scheme for root elements.
    global TEIcount
    TEIcount += 3
    return """<TEI xml:id="ch%s" xmlns="http://www.tei-c.org/ns/1.0">""" % TEIcount


## Main

files = handle_args(sys.argv[1:])

if not os.path.isdir(output_dir):
    os.mkdir(output_dir)

# (search pattern, numbering callback) pairs, applied to each file in this
# order.  Edit the search patterns here to match the tags that need ids.
numbering_passes = (
    (r"""<bibl>""", number_bibs),
    (r"""<div>""", number_divs),
    (r"""<p>""", number_paras),
    (r"""<TEI>""", number_roots),
)

for file in files:
    # By default the counters keep incrementing across files.  To restart
    # them at a fixed value for every file, enable the assignments below
    # (a command-line switch for this may be worth adding):
    # bibcount = 70
    # divcount = 110
    # pcount = 480
    s = read_file(file)
    for pattern, numberer in numbering_passes:
        s = re.sub(pattern, numberer, s)
    write_file(os.path.join(output_dir, file), s)

# Report where each counter finished.  A single parenthesised argument
# prints the same text under the Python 2 print statement.
print("\n TEIcount ended at %s." % TEIcount)
print("\n Bibcount ended at %s." % bibcount)
print("\n Divcount ended at %s." % divcount)
print("\n Pcount ended at %s." % pcount)
from implib.io import read_file, write_file
import re
import sys


def _drop_repeated_lines(lines):
    """Return a new list where each distinct line appears exactly once.

    For a line that occurs several times, the LAST occurrence is the one
    kept, and the survivors stay in their original relative order.

    The earlier implementation called ``list.remove`` on the list it was
    iterating over; the shifting indices made the loop skip entries, so
    some duplicates were never examined (for example the lines
    ``x z x w z w z z y y`` came out as ``x w z y y``).  Whenever that
    loop did succeed it left the last occurrence in place, so this
    version keeps the same choice while handling every input correctly
    and in a single pass.
    """
    seen = set()
    survivors = []
    # Walk backwards so the first time a line is met is its last occurrence.
    for line in reversed(lines):
        if line not in seen:
            seen.add(line)
            survivors.append(line)
    survivors.reverse()
    return survivors


# Usage: <script> INPUT_FILE OUTPUT_FILE
f = read_file(sys.argv[1])

# One match per line of the input.  When the text ends with a newline the
# pattern also yields a final empty match, which is treated like any other
# (empty) line.
split = re.compile(r'^.*?$', re.MULTILINE)
l = _drop_repeated_lines(split.findall(f))

# Every surviving line is written back followed by a newline.
s = ''.join(item + '\n' for item in l)
write_file(sys.argv[2], s)
('\[hm(.+?)\]', r''), ('\[cf..?\]', r''), ('\[..', r''), ('<page.*?>', r''), ('<(/?)I>', r'<\1i>'), ('<(/?)B>', r'<\1b>'), ('<(/?)INF>', r'<\1sub>'), ('<(/?)SUB>', r'<\1sub>'), ('<(/?)SUP>', r'<\1sup>'), ('0\]-9\]', r'') ] files = glob.glob('*toc') for file in files: f = read_file(file) s_id = shortID(f) l_id = longID(f) toc = '<?xml version="1.0" encoding="iso-8859-1"?>\n' \ '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" ' \ '"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n' \ '<html xmlns="http://www.w3.org/1999/xhtml">\n<head>\n<title>' \ 'BSSA, Volume ' + s_id + '</title>\n</head>\n' \ '<body bgcolor="#FFFFFF" link="#0000FF" vlink="#FF0000" alink="#E6E6E6">' \ '\n<table cellpadding="20" cellspacing="0" width="500">\n<tr><td>\n' \ '<h2>Bulletin of the<br />Seismological Society of America</h2>\n' \ '<p><b><span style="font-size: larger">' + l_id + '</span></b></p>\n' \ '<h2>Contents</h2>\n' regex = re.compile('<pag>.*?\]<1>', re.DOTALL) f = regex.sub('<1>', f) f = regexr(cleanupL, f)