def get_metadata(self, tag=None):
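    """Return this document's metadata dictionary, reading it lazily from the
    metadata file on first access.  With tag, return just that value (or None);
    otherwise return the whole dictionary, or {} if no metadata file exists."""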
    if self.__metadata is None:
        mpath = self.metadata_path()
        if os.path.exists(mpath):
            self.__metadata = read_metadata(mpath)
    if tag:
        return self.__metadata and self.__metadata.get(tag)
    else:
        return self.__metadata or {}
    def rip (self, location, doc_id):

        rootpath = os.path.join(location, "originals", "original.html")
        pagecontentspath = os.path.join(location, "webpagecontents.txt")
        md = read_metadata(os.path.join(location, "metadata.txt"))
        mimetype = md.get("apparent-mime-type")
        #note("location is %s, rootpath is %s, mimetype = %s", location, rootpath, mimetype)
        if (mimetype == "text/html") and os.path.exists(rootpath):
            # clean it
            pc = PageCleaner2(rootpath)
            text = pc.textify().strip()
            if text:
                fp = codecs.open(pagecontentspath, "w", "UTF-8")
                fp.write(text)
                fp.write("\n")
                fp.close()
def main (argv):

    if len(argv) < 1 or (not os.path.isdir(argv[0])):
        sys.stderr.write("Invalid directory specified.\n")
        sys.exit(1)

    set_verbosity(4)
    files = os.listdir(argv[0])
    if ("docs" in files) and ("overhead" in files):
        from uplib.repository import Repository
        from uplib.plibUtil import configurator

        uplib_version = configurator().get("UPLIB_VERSION")
        r = Repository(uplib_version, argv[0], read_metadata(os.path.join(argv[0], "overhead", "metadata.txt")))

        build_index_1_0(r)
def update_document_metadata_from_acm_diglib (location):

    def charref_replace(matchobj):
        return unichr(int(matchobj.group('charcode')))

    def parse_endnote(newdict, md, endnote):
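        # Sketch of the EndNote fields handled below (inferred from the tag
        # prefixes this parser looks for, not from an authoritative spec):
        #   %T A Sample Paper Title
        #   %A First Author
        #   %A Second Author
        #   %D June 2004
        #   %P 123-134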
        parts = endnote.strip().split("\n")
        authors = ""
        for part in parts:
            p = ENDNOTE_CHARREF.sub(charref_replace, part.strip())
            if p.startswith("%T "):
                newdict['title'] = p[3:].strip()
                newdict['title-is-original-filepath'] = ''
            elif p.startswith("%P "):
                newdict['page-numbers'] = p[3:].strip()
            elif p.startswith("%D "):
                # we override any existing date, because often the PDF file had
                # a bad date in it -- the date it was scanned to add to the library
                year, month, day = parse_date(p[3:].strip())
                newdict['date'] = "%s/%s/%s" % (month, day, year)
            elif p.startswith("%A "):
                # ignore any author metadata in the PDF file
                if authors:
                    authors += " and "
                authors += p[3:].strip()
        if authors:
            newdict['authors'] = authors

    mdpath = os.path.join(location, "metadata.txt")
    md = read_metadata(mdpath)
    if md.has_key("original-url") and "portal.acm.org" in md.get("original-url"):
        bibtex, endnote, abstract = fetch_bibtex_and_endnote_from_acm_diglib(md.get("original-url"))
        if bibtex or endnote:
            d = {}
            if bibtex:
                d['bibtex-citation'] = re.sub("\n", " ", bibtex)
            if endnote:
                parse_endnote(d, md, endnote)
                d['endnote-citation'] = re.sub("\n", " / ", endnote)
            if bibtex and not md.has_key("citation"):
                d["citation"] = re.sub("\n", " ", bibtex)
            if abstract and not md.has_key("abstract"):
                d["abstract"] = re.sub("\n|<par>|</par>", " ", abstract)
            update_metadata(mdpath, d)
        else:
            note("Couldn't fetch citation info for URL \"%s\".", md.get("original-url"))
    def rip (self, folder, docid):

        def encodestring(s):
            # nytimes strings have xml char refs, and we want Unicode
            if not s:
                return s

            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
            return s
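        # e.g. encodestring("Caf&#233; &amp; more") -> u"Café & more": numeric
        # character references go through unichr(), named entities through
        # htmlentitydefs.name2codepoint.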

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.nytimes.com":
            return

        # OK, it's from the NY Times
        new_metadata = MetadataGatherer.parse(originalspath)
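        # Meta tags of interest (names as used in the lookups below): "hdl" is the
        # headline, "pdate" the publication date as YYYYMMDD, "byl" the byline,
        # plus "keywords" and "description".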

        if "source" not in md:
            md["source"] = "New York Times"

        # not all articles have metadata...
        if not ((('title' in new_metadata) or ('hdl' in new_metadata)) and ('pdate' in new_metadata)):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))
        if "date" not in md:
            # get the date
            d = new_metadata.get("pdate")
            md["date"] = "%s/%s/%s" % (d[4:6], d[6:], d[:4])
        if "authors" not in md:
            # get the byline
            d = new_metadata.get("byl")
            if d:
                if d.startswith("By "):
                    d = d[3:]
                # capitalize properly
                d = d.title()
                # lowercase "And"
                d = d.replace(" And ", " and ")
                md["authors"] = encodestring(d)
        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d and d0:
            d0 += ("," + d)
        elif d:
            d0 = d
        if d0:
            md["keywords"] = encodestring(d0)
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
        update_metadata(mdpath, md)
def thumbnail_translation_and_scaling (folder, d=None, update=true, recalc=false):
    
    # 'translation' is in units of points
    # 'scaling' is in units of pixels/point

    if d is None:
        d = dict()

    def find_data (key):
        return d.get(key) or doc_metadata.get(key)

    def parse_value (x):
        if x is None:
            return None
        elif type(x) in types.StringTypes:
            return eval('(' + x + ')')
        elif type(x) is types.TupleType:
            return x
        else:
            raise ValueError("argument " + str(x) + " must be string or tuple")

    metadata_file = os.path.join(folder, "metadata.txt")
    doc_metadata = read_metadata(metadata_file)

    if recalc:
        translation = None
        scaling = None
    else:
        translation = parse_value(doc_metadata.get("big-thumbnail-translation-points"))
        scaling = parse_value(doc_metadata.get("big-thumbnail-scaling-factor"))

    if scaling is None or translation is None:

        cropbox_data = find_data("cropping-bounding-box")
        images_size = eval('(%s)' % find_data("images-size"))
        if cropbox_data:
            cropbox = [eval('(%s)' % x) for x in cropbox_data.split(';')]
        else:
            cropbox = [(0,0), images_size]
        big_thumbnail_size = find_data("big-thumbnail-size")
        if big_thumbnail_size:
            big_tn_size = eval('(%s)' % big_thumbnail_size)
        else:
            from PIL import Image
            big_tn_size = Image.open(os.path.join(folder, "thumbnails", "big1.png")).size

        ppi = int(find_data("tiff-dpi") or find_data("images-dpi") or 300)

        # Remember that cropped page images have a 20 pixel border added back after scaling.
        #

        left_crop_border = 0
        right_crop_border = 0
        top_crop_border = 0
        bottom_crop_border = 0

        if cropbox_data:        
            if cropbox[0][0] != 0:
                left_crop_border = 20
            if cropbox[0][1] != 0:
                top_crop_border = 20
            if cropbox[1][0] != images_size[0]:
                right_crop_border = 20
            if cropbox[1][1] != images_size[1]:
                bottom_crop_border = 20

        # calculate a translation quantity in "points"
        translation = (0 - float((cropbox[0][0] - left_crop_border) * 72)/ppi,
                       0 - float((cropbox[0][1] - top_crop_border) * 72)/ppi)

        # calculate a scaling factor that goes from bounding box edges in "points" to
        # scaled thumbnail coordinates in "pixels"
        #
        scaling = (float(ppi * big_tn_size[0])/float(72 * (cropbox[1][0] - cropbox[0][0] + (left_crop_border + right_crop_border))),
                   float(ppi * big_tn_size[1])/float(72 * (cropbox[1][1] - cropbox[0][1] + (top_crop_border + bottom_crop_border))))
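        # Worked example with illustrative numbers: ppi=300, cropbox ((150,150),
        # (2400,3150)), 20-pixel borders on all sides, big thumbnail 680 pixels
        # wide.  Then translation[0] = -(150-20)*72/300 = -31.2 points, and
        # scaling[0] = (300*680)/(72*(2400-150+40)) ~= 1.24 pixels/point.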

        # now read the wordboxes and calculate the thumbnail bounding boxes for them
        note(4, "    for %s:  translation is %f, %f, scaling is %f, %f",
             folder, translation[0], translation[1], scaling[0], scaling[1])

        if update:
            update_metadata(metadata_file,
                            {'big-thumbnail-scaling-factor' : "%f,%f" % scaling,
                             'big-thumbnail-translation-points' : "%f,%f" % translation})

    return translation, scaling
def do_thumbnails (dirpath, output_dir, **params):
    note(2, "  thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mktemp()
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    try:
        os.mkdir(tmpdir)
        os.chmod(tmpdir, 0700)
        try:

            md = read_metadata(doc_metadata_path)
            is_temporary_doc = md.get("temporary-contents")
            if is_temporary_doc and (is_temporary_doc == "true"):
                # temporary -- don't spend much time on this
                create_temporary_icons (md, dirpath, output_dir, params)
                retval = true
                return

            if os.path.exists(os.path.join(dirpath, "document.tiff")):
                # contains one single-page TIFF file
                tiffmaster = os.path.join(tmpdir, "master.tiff")
                split_command = (TIFF_SPLIT_CMD
                                 % (TIFFCP, os.path.join(dirpath, "document.tiff"), tiffmaster,
                                    TIFFSPLIT, tiffmaster, os.path.join(tmpdir, "x")))
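                # Roughly equivalent to (assuming the usual TIFF_SPLIT_CMD template):
                #   tiffcp <dir>/document.tiff <tmp>/master.tiff && \
                #   tiffsplit <tmp>/master.tiff <tmp>/x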
                status, output, tsignal = subproc(split_command)
                if status != 0:
                    raise Error ("'%s' signals non-zero exit status %d in %s => %s"
                                 % (split_command, status, dirpath, tmpdir))
                parts_dir = tmpdir
                filecheck_fn = lambda fn: fn[0] == "x"
            elif (os.path.exists(os.path.join(dirpath, "page-images")) and
                  os.path.isdir(os.path.join(dirpath, "page-images"))):
                # contains directory full of PNG page images
                parts_dir = os.path.join(dirpath, "page-images")
                filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
            else:
                raise Error("No page images for document in %s" % dirpath)

            tiff_parts = os.listdir(parts_dir)
            if len(tiff_parts) < 1:
                raise Error("No pages in split tiff file directory after split!")
            # either a PNG-images or a TIFF split will sort properly in lexicographic order
            tiff_parts.sort()

            # see if there's a document icon legend and info about the DPI of the tiff file
            legend = md.get('document-icon-legend')
            tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or params.get('images-dpi') or 0)
            page_numbers_v = md.get('page-numbers')
            page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
            first_page = int(md.get('first-page-number', 1))
            skips = md.get('document-bbox-pages-to-skip', '')
            if skips:
                parts = string.split(skips, ':')
                bbox_skips = []
                for part in parts:
                    bbox_skips = bbox_skips + map(int, string.split(part, ','))
            else:
                bbox_skips = None

            # figure bounding box for imaged page
            page_count = 0
            bbox = None
            note(2, "    calculating bounding box for large pages...")
            dont_crop = md.get('dont-crop-big-thumbnails', false)
            if AUTO_CROP_BIG_THUMBNAILS and not dont_crop:
                do_bbox = true
            else:
                do_bbox = false
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    continue
                if page_count == 0:
                    # find the width and height of the document
                    docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                    if not do_bbox:
                        bbox = (0, 0, docwidth, docheight)
                if do_bbox:
                    bbox = figure_bbox (os.path.join(parts_dir, tiff_part), page_count, bbox, bbox_skips)
                if bbox and (bbox[0] == 0) and (bbox[1] == 0) and (bbox[2] >= docwidth) and (bbox[3] >= docheight):
                    # don't bother, there's no area to crop already
                    do_bbox = false
                page_count = page_count + 1
            if page_count == 0:
                raise Error("No pages in split tiff file directory after split!")
            note(2, "      final bbox is %s, page_count is %d", bbox, page_count)

            if USE_VIRTUAL_INK:
                note(2, "      alpha channels will be added to large thumbnails...")

            # now make the thumbnails
            big_thumbnail_size = []
            small_thumbnail_size = []
            icon_size = []
            page_index = 0
            for tiff_part in tiff_parts:
                if not filecheck_fn(tiff_part):
                    note(3, "    skipping %s", tiff_part)
                    continue
                tiff_path = os.path.join(parts_dir, tiff_part)
                if page_numbers:
                    page_no_string = page_numbers.get(page_index)
                else:
                    page_no_string = None
                note (2, "    page %d%s", page_index, (page_no_string and "   (%s)" % page_no_string) or "")
                try:
                    if not create_thumbnail(tiff_path, tiff_dpi, output_dir,
                                            page_index, first_page, page_count, bbox, bbox_skips,
                                            big_thumbnail_size, small_thumbnail_size, icon_size,
                                            params.get('maxwidth'), params.get('maxheight'), params.get('maxscaling'),
                                            params.get('thumbnail_strategy'), legend, page_no_string):
                        raise Error ("Can't create thumbnail for page %d in %s (of %s)" % (page_index, tiff_path, dirpath))
                except Exception, x:
                    doc_id = os.path.split(dirpath)[1]
                    note("exception creating thumbnails for page %d of document %s:\n%s", page_index, doc_id,
                         string.join(traceback.format_exception(*sys.exc_info()), ""))
                    raise AbortDocumentIncorporation(doc_id, str(x))

                if page_index == 0:
                    bt_width = big_thumbnail_size[0]
                    bt_height = big_thumbnail_size[1]
                    st_width = small_thumbnail_size[0]
                    st_height = small_thumbnail_size[1]
                else:
                    bt_width = max(bt_width, big_thumbnail_size[0])
                    bt_height = max(bt_height, big_thumbnail_size[1])
                    st_width = max(st_width, small_thumbnail_size[0])
                    st_height = max(st_height, small_thumbnail_size[1])
                st_scaling = (float(st_width)/float(docwidth) + float(st_height)/float(docheight)) / 2.0
                page_index = page_index + 1

            d = {"page-count" : str(page_count),
                 "tiff-width" : str(docwidth),
                 "images-width" : str(docwidth),
                 "images-size" : "%d,%d" % (docwidth, docheight),
                 "cropping-bounding-box" : "%d,%d;%d,%d" % (bbox),
                 "big-thumbnail-size" : "%s,%s" % (bt_width, bt_height),
                 "small-thumbnail-size" : "%s,%s" % (st_width, st_height),
                 "small-thumbnail-scaling" : "%f" % st_scaling,
                 "icon-size" : "%d,%d" % icon_size[0],
                 "images-height" : str(docheight),
                 "tiff-height" : str(docheight) }

            translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
            d["big-thumbnail-translation-points"] = "%f,%f" % translation
            d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
            update_metadata(os.path.join(dirpath, "metadata.txt"), d)

        finally:
            shutil.rmtree(tmpdir)

        # indicate successful completion
        note(2, "  finished.")
        retval = true
    def rip(self, folder, docid):
        def encodestring(s):
            # WashPost strings have xml char refs, and we want Unicode
            if not s:
                return s

            s = re.sub(r"&#([0-9]+);", lambda x: unichr(int(x.group(1))), s)
            s = re.sub(r"&([a-z]+);", lambda x: unichr(htmlentitydefs.name2codepoint[x.group(1)]), s)
            return s

        def dequote(s):
            return re.sub(r"\\'", "'", s)

        def catclean(s):
            return re.sub(r"[/,]", "_", s)
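        # e.g. dequote(r"Biden\'s plan") -> "Biden's plan", and
        # catclean("Style/Arts, Movies") -> "Style_Arts_ Movies" (slashes and
        # commas are replaced so the section can be embedded in a category path).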

        mdpath = os.path.join(folder, "metadata.txt")
        originalspath = os.path.join(folder, "originals", "original.html")
        if not (os.path.exists(mdpath) and os.path.exists(originalspath)):
            return
        md = read_metadata(mdpath)
        url = md.get("original-url")
        if not url:
            return
        host, port, path = parse_URL(url)
        if host != "www.washingtonpost.com":
            return

        # OK, it's from the Post
        new_metadata = MetadataGatherer.parse(originalspath)
        for line in open(originalspath):
            if line.startswith(_HEADLINE):
                line = line[len(_HEADLINE) :].strip("\n")
                t = _TITLEPATTERN.match(line)
                if t:
                    new_metadata["hdl"] = dequote(t.group("title"))
            m = _AUTHORSPATTERN.search(line)
            if m:
                new_metadata["authors"] = dequote(line[len(m.group(0)) :].strip(" ';\n"))
            if line.startswith(_CONTENTID):
                new_metadata["content-id"] = line[len(_CONTENTID) :].strip(" ';\n")
            if line.startswith(_SECTION):
                section = line[len(_SECTION) :].strip(" ';\n")
                i = section.index("'")
                new_metadata["section"] = section[:i]

        if "source" not in md:
            md["source"] = "Washington Post"

        # not all articles have metadata...
        if not ("hdl" in new_metadata):
            note(3, "No metadata in article:  %s", new_metadata)
            return

        md["title"] = encodestring(new_metadata.get("hdl") or md.get("title"))

        if "date" not in md:
            # get the date
            d = _URLDATEPATTERN.match(url)
            if d:
                md["date"] = "%s/%s/%s" % (d.group("month"), d.group("day"), d.group("year"))

        if "authors" not in md:
            # get the byline
            d = new_metadata.get("authors")
            if d:
                md["authors"] = encodestring(d)

        d = new_metadata.get("keywords")
        d0 = md.get("keywords")
        if d and d0:
            d0 = [x.strip() for x in d0.split(",")] + [x.strip() for x in d.split(";")]
        elif d:
            d0 = [x.strip() for x in d.split(";")]
        if d0:
            md["keywords"] = encodestring(",".join(d0))
        if new_metadata.get("description"):
            md["summary"] = encodestring(new_metadata.get("description"))
            md["abstract"] = encodestring(new_metadata.get("description"))
        section = new_metadata.get("section")
        if section:
            c = md.get("categories")
            if c:
                c = [x.strip() for x in c.split(",")]
            else:
                c = []
            c = c + ["article", "Washington Post/%s" % catclean(section)]
            md["categories"] = ",".join(c)
        content_id = new_metadata.get("content-id")
        if content_id:
            md["citation"] = "Washington Post article %s" % content_id
        update_metadata(mdpath, md)
def do_HTML (dirpath, html_dir, doc_id, port):

    note(3, "  HTMLing in %s...", dirpath)
    html_index = os.path.join(dirpath, "index.html")
    doc_id = os.path.basename(dirpath)
    retval = false
    try:
        if not os.path.exists(html_dir):
            os.mkdir(html_dir)
            os.chmod(html_dir, 0700)

        metadata = read_metadata(os.path.join(dirpath, "metadata.txt"))
        title = metadata.get('name') or metadata.get('title') or doc_id
        pagewidth = None
        pageheight = None
        bts = metadata.get('big-thumbnail-size')
        if bts:
            pagewidth, pageheight = [int(x) for x in string.split(bts, ',')]
            note(3, "    title is %s, pagesize is %sx%s", title, pagewidth, pageheight)

        # start with summary.html

        note(3, "    summary.html")
        summarypath = os.path.join(dirpath, "summary.txt")
        if os.path.exists(summarypath):
            f = open(summarypath, 'r')
            summary_text = f.read()
            f.close()
            html_summary = htmlescape(summary_text, true)
        else:
            html_summary = ""
        html_summary_path = os.path.join(html_dir, "summary.html")
        f = open(html_summary_path, 'w')
        f.write('<html><body>' + html_summary + '</body></html>')
        f.close()
        os.chmod(html_summary_path, 0600)

        # next thumbs.html

        note(3, "    thumbs.html")
        thumbs_path = os.path.join(html_dir, "thumbs.html")
        f = open(thumbs_path, "w")
        if USE_VIRTUAL_INK:
            bgcolor = "white"
        else:
            bgcolor = STANDARD_TOOLS_COLOR
        f.write('<html><body bgcolor="%s"><center>\n' % bgcolor)
        thumbnail_dir = os.path.join(dirpath, "thumbnails")
        thumbnail_files = os.listdir(thumbnail_dir)
        thumbs = []
        for thumbnail in thumbnail_files:
            m = re.match(r"(\d+).png", thumbnail)
            if m:
                thumbs.append((int(m.group(1)), thumbnail,))
        thumbs.sort()
        for thumbnail in thumbs:
            page_no = int(thumbnail[0])
            f.write('<a href="page%s.html" target=viewarea>' % page_no)
            f.write('<img src="../thumbnails/%s" border=1></a><br>\n' % thumbnail[1])

            # now write the HTML connected to that thumbnail
            page_html = os.path.join(html_dir, "page%s.html" % page_no)
            f2 = open (page_html, 'w')
            # get width of large page
            if not pagewidth or not pageheight:
                im = Image.open(os.path.join(thumbnail_dir, "big%s.png" % page_no))
                pagewidth, pageheight = im.size[0] - 25, im.size[1]
                note(3, "    title is %s, pagesize is %sx%s", title, pagewidth, pageheight)
                del im
            f2.write('<html><body bgcolor="white"><img src="../thumbnails/big%s.png" usemap="#page%smap" border=0>\n' % (page_no, page_no))
            f2.write('<map name="page%smap">\n' % page_no)
            if (page_no < len(thumbs)):
                f2.write('<area href="page%s.html" alt="to Page %s" shape="circle" coords="%s,60,10">\n'
                         % (page_no + 1, page_no + 1, pagewidth + 15))
                f2.write('<area href="page%s.html" alt="to Page %s" shape="rect" coords="%s,0,%s,%s">\n'
                         % (page_no + 1, page_no + 1, pagewidth/2, pagewidth, pageheight))
            if (page_no > 1):
                f2.write('<area href="page%s.html" alt="to Page %s" shape="circle" coords="%s,90,10">\n'
                         % (page_no - 1, page_no - 1, pagewidth + 15))
                f2.write('<area href="page%s.html" alt="to Page %s" shape="rect" coords="0,0,%s,%s">\n'
                         % (page_no - 1, page_no - 1, (pagewidth/2)-1, pageheight))
            f2.write('<area href="/" alt="to repository" target="_top" shape="circle" coords="%s,207,10">\n'
                     % (pagewidth + 15))
            f2.write('</map></body></html>\n')
            f2.close()
            os.chmod(page_html, 0600)
        f.write('</center></body></html>')
        f.close()
        os.chmod (thumbs_path, 0600)

        # next is controls.html

        note(3, "    controls.html")
        controls_path = os.path.join(html_dir, "controls.html")
        f = open(controls_path, "w")
        if CONTROLS_TEMPLATE:
            f.write(CONTROLS_TEMPLATE % { 'doc-id': doc_id })
        else:
            f.write('<html>\n<head>\n')
            f.write('<script type="text/javascript">\n')
            f.write('function newInWindow(did, title, w, h, sidebar, twopage) {\n')
            f.write('  var s = "/action/basic/dv_show?doc_id=" + did + "&no-margin=1";\n')
            f.write('  var c = "width=" + w + ",height=" + h;\n')
            f.write('  if (!sidebar)\n')
            f.write('    s = s + "&no-sidebar=1";\n')
            f.write('  if (twopage)\n')
            f.write('    s = s + "&two-pages=1";\n')
            f.write('  defaultStatus = s;\n')
            f.write('  window.open(s, title, config=c);\n')
            f.write('}\n')
            f.write('</script></head><body bgcolor="%s">\n<center>\n' % STANDARD_TOOLS_COLOR)
            f.write("""<a href="javascript:newInWindow('%s','%s', %d+30, %d+10, false, false); void 0;">Detach</a>""" % (doc_id, htmlescape(title, true), pagewidth, pageheight))
            f.write(""" <a href="javascript:newInWindow('%s','%s', (2 * %d)+30, %d+10, false, true); void 0;">(2)</a>\n""" % (doc_id, htmlescape(title, true), pagewidth, pageheight))
            buttons = get_buttons_sorted(FN_DOCUMENT_SCOPE)
            for button in buttons:
                url = button[1][4]
                target = button[1][3]
                label = button[1][0]
                if url:
                    f.write('<br>\n<a href="%s"' % htmlescape(url % doc_id, true))
                else:
                    f.write('<br>\n<a href="/action/basic/repo_userbutton?uplib_userbutton_key=%s&doc_id=%s"' % (button[0], doc_id))
                if target:
                    f.write(' target="%s"' % target)
                f.write('>%s</a>\n' % label)
            f.write("</center></body></html>")
        f.close()
        os.chmod(controls_path, 0600)

        # then index.html

        note(3, "    index.html")
        f = open(html_index, "w")
        f.write('<head>\n')
        f.write('<title>%s</title>\n</head>\n' % htmlescape(title))
        f.write('<base target="_top">'
                '<frameset cols="%s,*">'
                '<frameset rows="%s,*">'
                '<frame name=controls src="./html/controls.html">'
                '<frame name=thumbs src="./html/thumbs.html">'
                '</frameset>'
                '<frame name="viewarea" src="./html/page1.html">'
                '</frameset>\n' % (THUMBNAIL_COLWIDTH, CONTROLS_HEIGHT))
        f.close()
        os.chmod(html_index, 0600)

        # indicate successful completion
        note(3, "  finished.")
        retval = true

    except:
        info = sys.exc_info()
        note(0, "exception raised in createHTML:\n%s\n", string.join(traceback.format_exception(*info), ""))
        raise

    else:
        if not retval:
            note("bad retval %s", retval)
            if os.path.exists(html_index): os.unlink(html_index)
            if os.path.exists(html_dir): shutil.rmtree(html_dir)
def build_index_1_1 (repo):

    overhead_dir = repo.overhead_folder()
    index_file = os.path.join(overhead_dir, "index.upri")
    repo_mtime = repo.mod_time()
    note(3, "Considering rebuild of repository metadata index file...")
    if os.path.exists(index_file):
        # see if it's newer than the metadata.txt file
        mtime = os.path.getmtime(index_file)
        note(3, "repo mod time is %s, index file mod time is %s", repo_mtime, mtime)
        if mtime >= repo_mtime:
            note(3, "Index up-to-date.")
            return

    note("Re-building repository metadata index...")

    # need to rebuild index

    # some variables to keep track of categories and collections
    categories={}
    collections={}
    documents={}
    authors={}

    # read the repository metadata
    mdata = read_metadata(os.path.join(overhead_dir, "metadata.txt"))
    repo_password_hash = mdata.get('password')
    repo_password_hash = (repo_password_hash and binascii.a2b_hex(repo_password_hash)) or (20 * '\0')
    

    # helper for normalizing author names into "Last, First [honorifics]" form

    def figure_author_name (basename):
        def clean_token(t):
            v = t.strip()
            if v[-1] == ",":
                v = v[:-1]
            return v
        honorifics = ("MD", "M.D.", "PhD", "Ph.D.", "Jr.", "Sr.", "II", "III", "IV", "V", "MPA")
        tokens = [clean_token(x) for x in basename.strip().split(' ') if x.strip()]
        if not tokens:
            note("Author name \"%s\" => %s", basename, tokens)
            return ""
        v = tokens[-1]
        h = ""
        while v in honorifics:
            h = h + ((h and " ") or "") + v
            tokens = tokens[:-1]
            v = tokens[-1]
        if len(tokens) > 2 and (tokens[-2] in ("van", "de", "von")):
            v = tokens[-2] + " " + v
            tokens = tokens[:-1]
        if tokens[:-1]:
            v = v + ", " + string.join(tokens[:-1])
        if h:
            v = v + ", " + h
        return v

    def read_document (doc, categories, collections, authors):

        def figure_date(datestring):
            # pack (year, month, day) into a single integer that sorts chronologically
            d2 = parse_date(datestring)
            if (not d2) or (sum(d2) == 0):
                return 0
            return d2[0] * (13 * 32) + d2[1] * 32 + d2[2]

        docdata = {'id': doc.id, 'rloc': 0}
        mdata = doc.get_metadata()
        docdata['title'] = mdata.get('title', "")
        docdata['page-count'] = int(mdata.get('page-count', 1))
        date = mdata.get('date')
        if date:
            docdata['date'] = figure_date(date);
        else:
            docdata['date'] = 0
        docdata['addtime'] = int(id_to_time(doc.id))
        # we don't really know the reftime (FIXME) but we'll use the document add time as an approximation
        docdata['reftime'] = docdata['addtime']
        docdata['categories'] = []
        cstring = mdata.get('categories', "")
        if cstring:
            for category in split_categories_string(cstring):
                if not category in categories:
                    categories[category] = { 'rloc': 0, 'docs': [ doc.id, ], 'name': category }
                else:
                    categories[category]['docs'].append(doc.id)
                docdata['categories'].append(category)
        docdata['authors'] = []
        auths = mdata.get('authors', "").split(" and ")
        for auth in auths:
            if auth:
                authname = figure_author_name(auth)
                if not authname in authors:
                    authors[authname] = { 'rloc': 0, 'docs': [ doc.id, ], 'name': authname }
                else:
                    authors[authname]['docs'].append(doc.id)
                docdata['authors'].append(authname)
        return docdata

    for doc in repo.generate_docs():
        documents[doc.id] = read_document(doc, categories, collections, authors)

    note(3, "    processed documents...")

    # read the collections files
    for collname, coll in repo.list_collections():
        collections[coll.id] = { 'name': collname,
                                 'docs': [doc.id for doc in coll.docs()],
                                 'query': (isinstance(coll, QueryCollection) and coll.query) or "",
                                 'rloc': 0,
                                 'presto': isinstance(coll, PrestoCollection),
                                 'excludes': (isinstance(coll, PrestoCollection) and coll.excludes) or [],
                                 'includes': (isinstance(coll, PrestoCollection) and coll.includes) or [],
                                 'id': coll.id }

    note(3, "    processed collections...")
    
    # now figure out the layout of the index file

    def sorted_values(d, rname):
        def compare(r1, r2):
            v1 = r1.get(rname)
            v2 = r2.get(rname)
            if (type(v1) in types.StringTypes) and (type(v2) in types.StringTypes):
                return cmp(v1.lower(), v2.lower())
            else:
                return cmp(v1, v2)
        l = d.values()
        l.sort(compare)
        return l

    def document_record_size(r):
        return (2 + # offset to next document record
                2 + # page count
                2 + # number of categories
                2 + # number of authors
                4 + # date published
                4 + # date last used
                4 + # date added to repository
                4 * len(r.get('authors')) +
                4 * len(r.get('categories')) +
                2 + (len(r.get("id").encode("UTF-8")) + 1) +       # document ID
                2 + (len(r.get("title").encode("UTF-8")) + 1))

    def category_record_size(r):
        return (2 + # offset to next record
                2 + # number of documents
                4 * len(r.get("docs")) +    # positions of document records
                2 + (len(r.get("name").encode("UTF-8")) + 1))      # category name

    def author_record_size(r):
        return (2 + # offset to next record
                2 + # number of documents
                4 * len(r.get("docs")) +    # positions of document records
                2 + (len(r.get("name").encode("UTF-8")) + 1))      # author name

    def collection_record_size(r):
        return (2 + # offset to next record
                2 + # number of documents
                4 * len(r.get("docs")) +    # positions of document records
                2 + # number of explicitly included documents
                2 + # number of explicitly excluded documents
                4 * len(r.get("includes")) +    # explicitly included
                4 * len(r.get("excludes")) +    # explicitly excluded
                2 + (len(r.get("name").encode("UTF-8")) + 1) +     # collection name
                2 + (len(r.get("query").encode("UTF-8")) + 1))     # collection query

    def repository_record_size(r):
        return (4 + # number of docs in repository
                4 + # number of authors
                4 + # last-modified time
                2 + # number of categories
                2 + # number of collections
                4 + # first document record
                4 + # first category record
                4 + # first collection record
                4 + # first authors record
                20 + # SHA hash of password
                2 + len(r.get("name", "").encode("UTF-8")) + 1)

    def round8 (v):
        return ((v + 7)/8)*8
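    # e.g. round8(37) == 40 and round8(40) == 40, so every record location
    # computed below stays on an 8-byte boundary.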

    mdata['rsize'] = repository_record_size(mdata)
    mdata['rloc'] = 32
    first_doc_record = round8(mdata['rloc'] + mdata['rsize'])
    loc = first_doc_record
    for document in sorted_values(documents, 'id'):
        document['rsize'] = round8(document_record_size(document))
        document['rloc'] = loc
        loc += document['rsize']
        document['nextoffset'] = document['rsize']
    first_categories_record = loc
    for category in sorted_values(categories, 'name'):
        category['rsize'] = round8(category_record_size(category))
        category['rloc'] = loc
        loc += category['rsize']
        category['nextoffset'] = category['rsize']
    first_collections_record = loc
    for collection in sorted_values(collections, 'name'):
        collection['rsize'] = round8(collection_record_size(collection))
        collection['rloc'] = loc
        loc += collection['rsize']
        collection['nextoffset'] = collection['rsize']
    first_author_record = loc
    for author in sorted_values(authors, 'name'):
        author['rsize'] = round8(author_record_size(author))
        author['rloc'] = loc
        loc += author['rsize']
        author['nextoffset'] = author['rsize']

    note(3, "    figured layout...")

    # output data for debugging

    note(4, "repository name:  %s", mdata.get("name"))
    note(3, "Documents (%d) at %s:", len(documents), first_doc_record)
    for document in documents.values():
        note(4, "  %s\n      %s // %s pages // date %s // %s // %s",
             document['title'], document['authors'], document['page-count'], document['date'], document['id'], document['rloc'])
    note(3, "Categories (%d) at %s:", len(categories), first_categories_record)
    for category in categories.values():
        note(4, "  %s // %d docs // %s", category['name'], len(category['docs']), category['rloc'])
    note(3, "Collections (%d) at %s:", len(collections), first_collections_record)
    for collection in collections.values():
        note(4, "  %s // %s // %d docs // %s", collection['name'], collection['query'], len(collection['docs']), collection['rloc'])
    note(3, "Authors (%d) at %s:", len(authors), first_author_record)
    for author in authors.values():
        note(4, "  %s // %d docs // %s", author['name'], len(author['docs']), author['rloc'])
        for doc in author['docs']:
            r = documents.get(doc)
            note(4, "     %s \"%s\"", r['id'], r['title'])
    note(3, "total size is %s", loc)

    # output the index file

    def out4(fp, v):
        fp.write(struct.pack(">I", v & 0xFFFFFFFF))

    def out2(fp, v):
        fp.write(struct.pack(">H", v & 0xFFFF))

    def outs(fp, v):
        s = (v and v.encode('UTF-8')) or ""
        fp.write(struct.pack(">H", (len(s) + 1) & 0xFFFF) + s + '\0')
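    # e.g. outs(fp, u"Fred") writes the 2-byte big-endian length 5 (the count
    # includes the terminating NUL) followed by the bytes "Fred\0"; out2 and out4
    # write unsigned 16- and 32-bit big-endian integers.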

    fp = open(index_file, "wb")

    try:
        # index version header
        magic = u"UpLib Repository Index 1.1".encode('US-ASCII')
        fp.write(magic + ('\0' * (32-len(magic))))

        # write out repository information
        out4(fp, len(documents))
        out4(fp, len(authors))
        out4(fp, int(repo_mtime))      # seconds since 1/1/1970
        out2(fp, len(categories))
        out2(fp, len(collections))
        out4(fp, first_doc_record)
        out4(fp, first_categories_record)
        out4(fp, first_collections_record)
        out4(fp, first_author_record)
        fp.write(repo_password_hash)
        outs(fp, mdata.get("name", ""))

        # for each document record, write that
        for document in sorted_values(documents, 'rloc'):
            note(4, "document %s at %s [%s]", document['id'], document['rloc'], document['rsize'])
            fp.seek(document['rloc'])
            out2(fp, document['nextoffset'])
            out2(fp, document['page-count'])
            out2(fp, len(document['categories']))
            out2(fp, len(document['authors']))
            out4(fp, document['date'])
            out4(fp, document['reftime'])
            out4(fp, document['addtime'])
            for a in document['authors']:
                r = authors.get(a)
                out4(fp, (r and r.get('rloc')) or 0)
            for c in document['categories']:
                r = categories.get(c)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, document['id'])
            outs(fp, document['title'])
            fp.flush()

        # write out categories
        for category in sorted_values(categories, 'rloc'):
            note(4, "category %s at %s [%s]", category['name'], category['rloc'], category['rsize'])
            fp.seek(category['rloc'])
            out2(fp, category['nextoffset'])
            out2(fp, len(category['docs']))
            for docid in category['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, category['name'])
            fp.flush()

        # write out collections
        for collection in sorted_values(collections, 'rloc'):

            note(4, "collection %s at %s [%s] includes=%s excludes=%s", collection['name'], collection['rloc'], collection['rsize'],
                 ((not collection['presto']) and 0xFFFF) or len(collection['includes']),
                 ((not collection['presto']) and 0xFFFF) or len(collection['excludes']))

            fp.seek(collection['rloc'])
            out2(fp, collection['nextoffset'])
            out2(fp, len(collection['docs']))
            for docid in collection['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            includes = collection['includes']
            excludes = collection['excludes']
            out2(fp, ((not collection['presto']) and 0xFFFF) or len(includes))
            out2(fp, ((not collection['presto']) and 0xFFFF) or len(excludes))
            for docid in includes:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            for docid in excludes:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, collection['name'])
            outs(fp, collection['query'])
            fp.flush()

        # write out authors
        for author in sorted_values(authors, 'rloc'):
            note(4, "author %s at %s [%s]", author['name'], author['rloc'], author['rsize'])
            fp.seek(author['rloc'])
            out2(fp, author['nextoffset'])
            out2(fp, len(author['docs']))
            for docid in author['docs']:
                r = documents.get(docid)
                out4(fp, (r and r.get('rloc')) or 0)
            outs(fp, author['name'])
            fp.flush()

        # finished
        fp.close()
        note(3, "wrote index at %s", os.path.getmtime(index_file))

    except:
        excinfo = sys.exc_info()
        fp.close()
        os.unlink(index_file)
        note(0, "exception %s", string.join(traceback.format_exception(*excinfo), ""))
    def get_folder_metadata (self, location):
        return read_metadata(self.folder_metadata_path(location))
def _add_internal (ostream, percent_done_fn, repo, response, params, content, wait):

    # this can be called in several different ways.
    # In general, you post a multipart/form-data body which
    # contains a "contenttype" for the document, and either a "URL"
    # for the content, or a "content" parameter containing the
    # actual content.  If both "URL" and "content" are present,
    # the URL is added as the "original-url" value for the metadata,
    # and if the content is HTML, it's used as the "original.html"
    # and the URL is used to pull ancillary content referenced in it.
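    # An illustrative params mapping for this entry point (field names taken
    # from the lookups below; values are made up):
    #   { "contenttype": "text/html",
    #     "URL": "http://example.com/story.html",
    #     "content": "<html>...</html>",
    #     "suppress-duplicates": "true",
    #     "md-title": "An Example Story" }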

    content_type = params.get("contenttype")
    url = params.get("URL")
    noredir = params.get("no-redirect")
    noredir = noredir and (noredir.lower() == "true")
    uploadloc = url
    docname = params.get("documentname")
    tempf = None
    suppress_duplicates = params.get("suppress-duplicates")
    suppress_duplicates = suppress_duplicates and (suppress_duplicates.lower() == "true")
    bury = params.get("bury")
    bury = bury and (bury.lower() == "true")
    verbosity = int(params.get("verbosity") or "0")
    if content:
        if wait and ostream:
            _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')
        extension = CONTENT_TYPES.get(content_type)
        if not extension:
            if wait:
                msg = "Don't know what to do with contenttype \"%s\"" % content_type
                if ostream:
                    _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                else:
                    response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
            return
        # special case HTML/XHTML
        if content and (content_type.lower() in ("text/html", "application/xhtml+xml")):
            tempf = tempfile.mkdtemp()
            uploadloc = os.path.join(tempf, "original.html")
            # make sure that the folder for other parts exists, even if empty
            os.mkdir(os.path.join(tempf, "original_files"))
            # remove our bookmarklet, if present
            content = _BOOKMARKLET_PATTERN.sub('', content)
            content = _ADD_FORM_PATTERN.sub('', content)
            c = _OurCacher(url, filename=uploadloc, bits=content, content_type=content_type)
            # make sure that the folder for other parts exists, even if empty
            other_parts = os.path.join(tempf, "original_files")
            if not os.path.exists(other_parts):
                os.mkdir(other_parts)
        # special case 3x5 cards
        elif (docname and (content_type.lower() == "text/plain") and os.path.splitext(docname)[1] == ".3x5"):
            fd, tempf = tempfile.mkstemp(".3x5")
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        else:
            fd, tempf = tempfile.mkstemp("." + extension)
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        if suppress_duplicates:
            hash = calculate_originals_fingerprint(tempf)
            results = repo.do_query("sha-hash:"+hash)
            if results:
                # it's a duplicate
                doc = results[0][1]
                if os.path.isdir(tempf):
                    shutil.rmtree(tempf)
                elif os.path.exists(tempf):
                    os.remove(tempf)
                if ostream:
                    _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                elif noredir:
                    response.reply(doc.id, "text/plain")
                else:
                    response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                return
    try:
        try:
            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:min(100, len(content))])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str
            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            if metadata:
                mdtmpfile = tempfile.mktemp()
                open(mdtmpfile, "w").write(metadata)
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement" % existing_doc_id)
            else:
                mdtmpfile = None
                existing_doc_id = None
            # now form the command
            scheme = ((repo.get_param("use-http", "false").lower() == "true") or _use_http) and "http" or "https"
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (_uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date="%s"' % docdate
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata="%s"' % mdtmpfile
            cmd += ' "%s"' % uploadloc
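            # Illustrative only -- an assembled command might look like:
            #   uplib-add-document --verbosity=0 --repository=https://127.0.0.1:8080 \
            #       --title='A Sample Doc' --metadata="/tmp/tmpXYZ" "/tmp/tmpABC.pdf"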
            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')
            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s", repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id + '", msg: "' + urllib.quote(output) + '"}')
                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)         # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            msg = 'server-side error incorporating document'
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id
                                                + '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id
                                            + '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)        # finished
                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in history list
                        # after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            h = [x.id for x in repo.history()]
                            if doc_id in h:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' + urllib.quote('Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR, "<pre>" + htmlescape(output) + "</pre>")
        except:
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' + urllib.quote("Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)
def process_folder (repo, id, directory, delete_p, replace=None):

    def _protect_files (mode, dirname, files):
        for file in files:
            thepath = os.path.join(dirname, file)
            if os.path.isdir(thepath):
                os.chmod(thepath, 0700)
            else:
                os.chmod(thepath, 0600)

    note(2, "processing folder %s...", directory)

    description = None
    contents = None
    summary = None
    metadata = None
    wordbboxes = os.path.join(directory, "wordbboxes")
    tifffile = os.path.join(directory, "document.tiff")
    pageimagesdir = os.path.join(directory, "page-images")
    images = os.path.join(directory, "images")
    originals = os.path.join(directory, "originals")
    links = os.path.join(directory, "links")

    names = os.listdir(directory)
    for name in names:
        if string.lower(name) == "contents.txt":
            contents = os.path.join(directory, name)
        elif string.lower(name) == "summary.txt":
            summary = os.path.join(directory, name)
        elif string.lower(name) == "metadata.txt":
            metadata = os.path.join(directory, name)

    if replace is None:
        newdir = os.path.join(repo.pending_folder(), id)
    else:
        newdir = replace
    if not os.path.isdir(newdir):
        raise Error("Pending directory %s does not exist!" % newdir)

    try:
        lock_folder(newdir)

        try:
            if os.path.exists(images):
                destpath = os.path.join(newdir, "images")
                if replace and os.path.exists(destpath): shutil.rmtree(destpath)
                shutil.copytree (images, destpath)
                if delete_p: shutil.rmtree (images, true)
            if os.path.exists(originals):
                destpath = os.path.join(newdir, "originals")
                if replace and os.path.exists(destpath): shutil.rmtree(destpath)
                shutil.copytree (originals, destpath)
                if delete_p: shutil.rmtree (originals, true)
            if os.path.exists(links):
                destpath = os.path.join(newdir, "links")
                if replace and os.path.exists(destpath): shutil.rmtree(destpath)
                shutil.copytree (links, destpath)
                if delete_p: shutil.rmtree (links, true)
            if metadata:
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath): os.unlink(destpath)
                shutil.copyfile(metadata, destpath)
                m = read_metadata(metadata)
                if m.has_key("title"):
                    note("Title of uploaded folder is '%s'", m['title'])
                if delete_p: os.unlink(metadata)
            else:
                # create an empty metadata.txt
                destpath = os.path.join(newdir, "metadata.txt")
                if replace and os.path.exists(destpath): os.unlink(destpath)
                mdf = open(destpath, 'w')
                mdf.flush()
                mdf.close()

            newcontents = os.path.join(newdir, "contents.txt")
            if contents:
                if replace and os.path.exists(newcontents): os.unlink(newcontents)
                shutil.copyfile(contents, newcontents)
                if delete_p: os.unlink(contents)

            newsummary = os.path.join(newdir, "summary.txt")
            if summary:
                if replace and os.path.exists(newsummary): os.unlink(newsummary)
                shutil.copyfile(summary, newsummary)
                if delete_p: os.unlink(summary)

            if os.path.exists(wordbboxes):
                destpath = os.path.join(newdir, "wordbboxes")
                if replace and os.path.exists(destpath): os.unlink(destpath)
                shutil.copyfile(wordbboxes, destpath)
                if delete_p: os.unlink(wordbboxes)

            if os.path.exists(tifffile):
                destpath = os.path.join(newdir, "document.tiff")
                if replace and os.path.exists(destpath): os.unlink(destpath)
                shutil.copyfile(tifffile, destpath)
                if delete_p: os.unlink(tifffile)
            elif os.path.isdir(pageimagesdir):
                destpath = os.path.join(newdir, "page-images")
                if replace and os.path.exists(destpath): shutil.rmtree(destpath)
                shutil.copytree(pageimagesdir, destpath)
                if delete_p: shutil.rmtree(pageimagesdir, true)

            os.path.walk(newdir, _protect_files, None)
            os.chmod(newdir, 0700)

            return id

        finally:
            unlock_folder (newdir)

    except:
        type, value, tb = sys.exc_info()
        if os.path.exists(newdir) and not replace:
            shutil.rmtree(newdir)
        # re-raise the exception
        raise value, None, tb
def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
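    # Unpack TMPFILENAME (via UNPACK_FN) into the pending folder for ID, then
    # finish incorporating the document into REPO.  If the unpacked metadata
    # names a 'replacement-contents-for' document, the new contents replace
    # that existing document and this pending folder is discarded.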
    try:
        try:
#             note(3, "CODETIMER_ON is %s", CODETIMER_ON)
#             if CODETIMER_ON:
#                 code_timer.Init()
#                 code_timer.CreateTable("uplib")
#                 code_timer.CodeTimerOn()
#                 code_timer.StartInt("newFolder$unpack", "uplib")
#             else:
#                 code_timer.CodeTimerOff()

            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)

#             if CODETIMER_ON:
#                 code_timer.StopInt("newFolder$unpack", "uplib")
            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)

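                # create an empty UNPACKED marker file to record that the
                # unpack step has completed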
                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()

                # as of this point, we can restart the inclusion of the document

                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
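                        # the replacement is complete; abort incorporation of
                        # this pending folder so no separate document is created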
                        raise AbortDocumentIncorporation(id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(id, "replacement for non-existent document %s" % replacement_id)

                _finish_inclusion (repo, folderpath, id)

#                 if CODETIMER_ON:
#                     noteOut = StringIO.StringIO()
#                     noteOut.write("\nCode Timer statistics (what took time, in milliseconds):\n")
#                     code_timer.PrintTable(noteOut, "uplib")
#                     noteOut.write("\n")
#                     noteOutString = noteOut.getvalue()
#                     note(3, noteOutString)

            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb

        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s:  %s", x.id, x.message)
            if (x.id == id):
                shutil.rmtree(folderpath)
            remove_from_index(repo.index_path(), id)

        except:
            type, value, tb = sys.exc_info()
            note("Exception processing new folder:\n%s", ''.join(traceback.format_exception(type, value, tb)))
def findimages(folder, debug=None):
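    # Run the external FINDIMAGES_PROGRAM over each page image, using the
    # document's wordboxes (if any) as hints, and return a list of
    # (page, type, x, y, width, height) entries for the halftone regions found.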

    images = []

    if not FINDIMAGES_PROGRAM:
        note(3, "FINDIMAGES_PROGRAM not defined")
        return images

    images_dir = os.path.join(folder, "page-images")
    if not os.path.isdir(images_dir):
        note(3, "No page images in %s", images_dir)
        return images

    md = read_metadata(os.path.join(folder, "metadata.txt"))
    dpi = int(md.get("images-dpi") or md.get("dpi") or md.get("tiff-dpi") or 300)
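    # wordbox coordinates are in points (1/72 inch); scaling_factor converts
    # point measurements to pixels at the page-image resolution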
    scaling_factor = float(dpi)/72

    def get_images_for_page (page_index, wordboxes, dpi, images_dir):
        pageimages = []
        filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
        if os.path.exists(filepath):
            wordboxes_file = tempfile.mktemp()
            try:
                boxlist = []
                if wordboxes:
                    # first, write out list of wordboxes, in Leptonica BOXA format
                    for i in range(len(wordboxes)):
                        box = wordboxes[i]
                        x, y, w, h = (int(box.left() * dpi / 72.0), int(box.top() * dpi / 72.0),
                                      int(box.width() * dpi / 72.0), int(box.height() * dpi / 72.0))
                        if (w > 0) and (h > 0):
                            boxlist.append((x, y, w, h))
                    if len(boxlist) > 0:
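                        # the resulting BOXA file looks like (values illustrative):
                        #   Boxa Version 2
                        #   Number of boxes = 2
                        #     Box[0]: x = 416, y = 833, w = 208, h = 50
                        #     Box[1]: x = 650, y = 833, w = 175, h = 50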
                        fp = open(wordboxes_file, "wb")
                        fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                        for i in range(len(boxlist)):
                            fp.write("  Box[%d]: " % i + "x = %d, y = %d, w = %d, h = %d\n" % boxlist[i])
                        fp.close()
                # now, run the finder on the page image plus the list of wordboxes
                debug_arg = (debug and "--debug") or " "
                cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath, (boxlist and wordboxes_file) or "-")
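                # e.g. (illustrative): <FINDIMAGES_PROGRAM> --debug 300 <images_dir>/page00001.png /tmp/tmpXXXXXX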
                note(4, "findimages cmd is <<%s>>", cmd)
                status, output, tsignal = subproc(cmd)
                if status == 0:
                    for line in [x.strip() for x in output.split('\n') if x.strip()]:
                        if not line.startswith("halftone "):
                            continue
                        pageimages.append((str(page_index) + " " + line.strip()).split())
                else:
                    note(3, "findimages command <%s> returns bad status %s:\n%s\n" % (cmd, status, output))
            finally:
                # remove the temp file
                if os.path.exists(wordboxes_file):
                    os.unlink(wordboxes_file)
                    # note("%d:  wordboxes file is %s", page_index, wordboxes_file)
        return pageimages

    if os.path.exists(os.path.join(folder, "wordbboxes")):
        for page_index, boxes in wordboxes_page_iterator(folder):
            images += get_images_for_page (page_index, boxes, dpi, images_dir)
    else:
        # no wordbboxes file: handle the case where the document has no recognized text
        files = os.listdir(images_dir)
        for file in files:
            m = PAGE_IMAGE_FILENAME_PATTERN.match(file)
            if m:
                pageimages = get_images_for_page(int(m.group(1))-1, None, dpi, images_dir)
                images += pageimages

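    # discard any region whose pixel area is no bigger than one square point
    # (scaling_factor**2 pixels at this resolution)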
    point_squared = scaling_factor * scaling_factor
    images = [(pageno, imtype, x, y, width, height)
              for (pageno, imtype, x, y, width, height) in images
              if ((int(height) * int(width)) > point_squared)]
    note(3, "images for %s are %s", folder, images)
    return images