from collator3 import collate
import filekeeping
from zipfile import ZipFile
from glob import glob

# Root directory handed to filekeeping.pairtreepath when locating each volume.
collectiondir = '/Volumes/obelisk/zipped/serials/'

# Volume identifiers to process (presumably HathiTrust ids -- TODO confirm).
HTids_to_process = ['mdp.39015065345954']

# Module-level counter; it is reset to 0 again for every volume further down.
count = 0

for HTid in HTids_to_process:
    print(HTid)
    
    ## Change to match scheme for non-simple HTids...
    # pairtreepath yields a (path, postfix) pair; the archive is then looked
    # up at <path><postfix>/<postfix>.zip.
    path, postfix = filekeeping.pairtreepath(HTid, collectiondir)
    pagepath = path + postfix + "/"
    filename = postfix + ".zip"

    # For each HTid, we get a path in the pairtree structure.
    # Then we read page files, and concatenate them in a list of pages
    # where each page is a list of lines.
    
    pagelist = []
    
    with ZipFile(pagepath + filename,mode='r') as zipvol:
        # Sort the member names so they are visited in name order.
        zippages = zipvol.namelist()
        zippages.sort()
        # Drop the first name after sorting (presumably the archive's
        # directory entry rather than a page -- TODO confirm).
        del zippages[0]
        count = 0
        # NOTE(review): the body of this loop is outside the visible excerpt.
        for f in zippages:
Example #2
0
            if header in remove:
                del pagelist[idx][0]
        if idx in divplace:
            page.insert(0,"<div id=\"" + divplace[idx][1] + "\" code=\"" + str(divplace[idx][3]) + "\" wordcount=\"" + str(divplace[idx][2]) + "\">\n")
            pagelist[divplace[idx][0]].append("</div>\n")
    
    return pagelist
    
# Build, for each volume id, a list of pages read from its page directory.
# HTids_toprocess and pairtree_rootpath are defined outside this excerpt.
for HTid in HTids_toprocess:

    # For each HTid, we get a path in the pairtree structure.
    # Then we read page files, and concatenate them in a list of pages
    # where each page is a list of lines.
    
    # pairtreepath yields a (path, postfix) pair; the page files are looked
    # up in <path><postfix>/<postfix>/.
    path, postfix = filekeeping.pairtreepath(HTid,pairtree_rootpath)
    pagepath = path + postfix + "/" + postfix + "/"
    # NOTE(review): os.listdir returns entries in arbitrary order, and pages
    # are appended in that order below -- confirm whether the page sequence
    # matters here and, if so, whether the names should be sorted first.
    pagefiles = os.listdir(pagepath)
    pagelist = []

    
    for f in pagefiles:
        # Skip entries whose name starts with a dot.
        if f[0] == ".":
            continue
        # Each page is stored as the list of its lines, decoded as UTF-8.
        with open(pagepath + f, encoding='utf-8') as file:
            linelist = file.readlines()
            pagelist.append(linelist)

    # We're going to keep pageheaders rigorously aligned with pagelist,
    # so every page gets a 'header,' even if blank.