def get_page_images (self):
    tf = mktempfile(".pnm")
    cmd = ('"%s" --input-format jp2 --input "%s" --output-format pnm --output "%s"'
           % (JASPER, self.doc, tf))
    try:
        status, output, tsignal = subproc(cmd)
        if status == 0:
            # success
            img = Image.open(tf)
            imagespath = self.images_path()
            os.mkdir(imagespath)
            if self.uses_png:
                png_file_name = os.path.join(imagespath, "page00001.png")
                img.save(png_file_name, "PNG")
            else:
                if convert_image_to_tiff(tf, imagespath):
                    note(3, "created tiff file in %s", imagespath)
        else:
            note("Can't convert %s.  Output was %s.", self.doc, output)
            note(4, "cmd was %s", cmd)
            note(4, "tfile %s %s", tf,
                 (os.path.exists(tf) and "exists") or "does not exist")
            raise RuntimeError(output)
    finally:
        if os.path.exists(tf):
            os.unlink(tf)
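
# For reference, a sketch (not part of the codebase) of what the jasper command
# above expands to, assuming JASPER points at the "jasper" transcoder binary;
# the paths are illustrative:
#
#   "/usr/bin/jasper" --input-format jp2 --input "/repo/docs/0001/document.jp2" \
#       --output-format pnm --output "/tmp/tmpXYZ.pnm"
#
# jasper only transcodes JPEG-2000 to PNM; the PNM result is then turned into
# either a PNG (via PIL) or a TIFF (via convert_image_to_tiff) in the document's
# page-images directory, and the temp file is removed either way.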
def get_images_for_page (page_index, wordboxes, dpi, images_dir):
    pageimages = []
    filepath = os.path.join(images_dir, "page%05d.png" % (page_index + 1))
    if os.path.exists(filepath):
        wordboxes_file = tempfile.mktemp()
        try:
            boxlist = []
            if wordboxes:
                # first, write out the list of wordboxes, in Leptonica BOXA format
                for i in range(len(wordboxes)):
                    box = wordboxes[i]
                    x, y, w, h = (int(box.left() * dpi / 72.0),
                                  int(box.top() * dpi / 72.0),
                                  int(box.width() * dpi / 72.0),
                                  int(box.height() * dpi / 72.0))
                    if (w > 0) and (h > 0):
                        boxlist.append((x, y, w, h))
                if len(boxlist) > 0:
                    fp = open(wordboxes_file, "wb")
                    fp.write("\nBoxa Version 2\nNumber of boxes = %d\n" % len(boxlist))
                    for i in range(len(boxlist)):
                        fp.write(" Box[%d]: " % i +
                                 "x = %d, y = %d, w = %d, h = %d\n" % boxlist[i])
                    fp.close()
            # now, run the finder on the page image plus the list of wordboxes
            debug_arg = (debug and "--debug") or " "
            cmd = "%s %s %s %s %s" % (FINDIMAGES_PROGRAM, debug_arg, dpi, filepath,
                                      (boxlist and wordboxes_file) or "-")
            note(4, "findimages cmd is <<%s>>", cmd)
            status, output, tsignal = subproc(cmd)
            if status == 0:
                for line in [x.strip() for x in output.split('\n') if x.strip()]:
                    if not line.startswith("halftone "):
                        continue
                    pageimages.append((str(page_index) + " " + line).split())
            else:
                note(3, "findimages command <%s> returns bad status %s:\n%s\n"
                     % (cmd, status, output))
        finally:
            # remove the temp file
            if os.path.exists(wordboxes_file):
                os.unlink(wordboxes_file)
            # note("%d: wordboxes file is %s", page_index, wordboxes_file)
    return pageimages
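
# For reference, the wordboxes temp file written above uses Leptonica's BOXA
# serialization; with two boxes it would look like this (values illustrative):
#
#   Boxa Version 2
#   Number of boxes = 2
#    Box[0]: x = 100, y = 200, w = 50, h = 12
#    Box[1]: x = 160, y = 200, w = 44, h = 12
#
# Coordinates are in image pixels, which is why the box geometry (stored in
# 72-dpi points) is scaled by dpi/72.0 before being written out.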
def process_tarred_folder (repo, id, tarfile, metadata):
    # create a new folder, and populate it
    dirname = tempfile.mktemp()
    try:
        os.mkdir(dirname)
        os.chmod(dirname, 0700)
        cmd = UNTAR_CMD % (dirname, TAR, tarfile)
        note(2, "Untarring folder into temporary directory %s", dirname)
        status, output, signal = subproc(cmd)
        if status == 0:
            note(2, "Successfully untarred folder into %s", dirname)
            if metadata:
                update_metadata(os.path.join(dirname, "metadata.txt"), metadata)
            if (os.path.exists(os.path.join(dirname, "document.tiff")) or
                os.path.isdir(os.path.join(dirname, "page-images"))):
                return process_folder(repo, id, dirname, true)
            else:
                raise Error("invalid folder -- no page images file")
        else:
            raise Error("Problem untarring folder:\n%s" % output)
    finally:
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
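
# A hypothetical tar layout that process_tarred_folder accepts, inferred from
# the checks above (the file names are the ones the code tests for):
#
#   metadata.txt              # optional; merged with caller-supplied metadata
#   document.tiff             # a multi-page TIFF ...
#   page-images/              # ... or a directory of page images
#       page00001.png
#       page00002.png
#
# Anything lacking both document.tiff and page-images/ is rejected before
# process_folder is ever called.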
def index_folder (folder, repo_index_dir):
    update_configuration()
    docs_dir, doc_id = os.path.split(folder)
    # initialized here so the post-lock check below is safe even if
    # the subprocess never ran (e.g. an exception, or the jcc path)
    status, output = 0, ""
    SECTION_LOCK.acquire()
    try:
        try:
            if LUCENE == 'jcc':
                c = get_context(repo_index_dir)
                c.index(folder, doc_id)
            else:
                indexingcmd = INDEXING_ADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES,
                                                  LUCENE_JAR, INDEXING_JAR,
                                                  repo_index_dir, docs_dir, doc_id)
                note(3, "  indexing with %s", indexingcmd)
                status, output, tsignal = subproc(indexingcmd)
        except:
            note(0, "Can't index folder %s:\n%s", folder,
                 ''.join(traceback.format_exception(*sys.exc_info())))
    finally:
        SECTION_LOCK.release()
    if LUCENE != 'jcc':
        note(3, "  indexing output is <%s>", output)
        if status != 0:
            raise Error("%s signals non-zero exit status %d attempting to index %s:\n%s"
                        % (JAVA, status, folder, output))
def remove_from_index (repo_index_dir, doc_id):
    update_configuration()
    if LUCENE == 'jcc':
        SECTION_LOCK.acquire()
        try:
            c = get_context(repo_index_dir)
            c.remove(doc_id)
        finally:
            SECTION_LOCK.release()
    else:
        indexingcmd = INDEXING_REMOVE_CMD % (JAVA, DEBUG_FLAGS, "", LUCENE_JAR,
                                             INDEXING_JAR, repo_index_dir, doc_id)
        note(3, "  de-indexing with %s", indexingcmd)
        SECTION_LOCK.acquire()
        try:
            status, output, tsignal = subproc(indexingcmd)
        finally:
            SECTION_LOCK.release()
        note(3, "  indexing output is <%s>", output)
        if status != 0:
            raise Error("%s signals non-zero exit status %d attempting to remove %s:\n%s"
                        % (JAVA, status, doc_id, output))
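
# Note on the command templates used above: they are defined in the module's
# configuration, and the argument order is fixed by these call sites.
# INDEXING_ADD_CMD is filled, in order, with (JAVA, DEBUG_FLAGS,
# INDEXING_PROPERTIES, LUCENE_JAR, INDEXING_JAR, repo_index_dir, docs_dir,
# doc_id); INDEXING_REMOVE_CMD with (JAVA, DEBUG_FLAGS, "", LUCENE_JAR,
# INDEXING_JAR, repo_index_dir, doc_id).  Both run under SECTION_LOCK, so at
# most one JVM indexing pass touches the Lucene index at a time.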
def do_thumbnails (dirpath, output_dir, **params):
    note(2, "  thumbnailing in %s...", dirpath)
    tmpdir = tempfile.mktemp()
    retval = params.get('returnvalue', false)
    doc_metadata_path = os.path.join(dirpath, "metadata.txt")
    try:
        os.mkdir(tmpdir)
        os.chmod(tmpdir, 0700)
        md = read_metadata(doc_metadata_path)
        is_temporary_doc = md.get("temporary-contents")
        if is_temporary_doc and (is_temporary_doc == "true"):
            # temporary -- don't spend much time on this
            create_temporary_icons(md, dirpath, output_dir, params)
            retval = true
            return retval
        if os.path.exists(os.path.join(dirpath, "document.tiff")):
            # contains one single-page TIFF file
            tiffmaster = os.path.join(tmpdir, "master.tiff")
            split_command = (TIFF_SPLIT_CMD % (TIFFCP, os.path.join(dirpath, "document.tiff"),
                                               tiffmaster, TIFFSPLIT, tiffmaster,
                                               os.path.join(tmpdir, "x")))
            status, output, tsignal = subproc(split_command)
            if status != 0:
                raise Error("'%s' signals non-zero exit status %d in %s => %s"
                            % (split_command, status, dirpath, tmpdir))
            parts_dir = tmpdir
            filecheck_fn = lambda fn: fn[0] == "x"
        elif (os.path.exists(os.path.join(dirpath, "page-images")) and
              os.path.isdir(os.path.join(dirpath, "page-images"))):
            # contains directory full of PNG page images
            parts_dir = os.path.join(dirpath, "page-images")
            filecheck_fn = lambda fn: (fn.startswith('page') and fn.endswith('.png'))
        else:
            raise Error("No page images for document in %s" % dirpath)
        tiff_parts = os.listdir(parts_dir)
        if len(tiff_parts) < 1:
            raise Error("No pages in split tiff file directory after split!")
        # either a PNG-images or a TIFF split will sort properly in lexicographic order
        tiff_parts.sort()
        # see if there's a document icon legend and info about the DPI of the tiff file
        legend = md.get('document-icon-legend')
        tiff_dpi = int(md.get('images-dpi') or md.get('tiff-dpi') or
                       params.get('images-dpi') or 0)
        page_numbers_v = md.get('page-numbers')
        page_numbers = (page_numbers_v and figure_page_numbers(page_numbers_v, dirpath))
        first_page = int(md.get('first-page-number', 1))
        skips = md.get('document-bbox-pages-to-skip', '')
        if skips:
            parts = string.split(skips, ':')
            bbox_skips = []
            for part in parts:
                bbox_skips = bbox_skips + map(int, string.split(part, ','))
        else:
            bbox_skips = None
        # figure bounding box for imaged page
        page_count = 0
        bbox = None
        note(2, "  calculating bounding box for large pages...")
        dont_crop = md.get('dont-crop-big-thumbnails', false)
        if AUTO_CROP_BIG_THUMBNAILS and not dont_crop:
            do_bbox = true
        else:
            do_bbox = false
        for tiff_part in tiff_parts:
            if not filecheck_fn(tiff_part):
                continue
            if page_count == 0:
                # find the width and height of the document
                docwidth, docheight = figure_doc_size(os.path.join(parts_dir, tiff_part))
                if not do_bbox:
                    bbox = (0, 0, docwidth, docheight)
            if do_bbox:
                bbox = figure_bbox(os.path.join(parts_dir, tiff_part),
                                   page_count, bbox, bbox_skips)
                if (bbox and (bbox[0] == 0) and (bbox[1] == 0) and
                    (bbox[2] >= docwidth) and (bbox[3] >= docheight)):
                    # don't bother, there's no area to crop already
                    do_bbox = false
            page_count = page_count + 1
        if page_count == 0:
            raise Error("No pages in split tiff file directory after split!")
        note(2, "  final bbox is %s, page_count is %d", bbox, page_count)
        if USE_VIRTUAL_INK:
            note(2, "  alpha channels will be added to large thumbnails...")
        # now make the thumbnails
        big_thumbnail_size = []
        small_thumbnail_size = []
        icon_size = []
        page_index = 0
        for tiff_part in tiff_parts:
            if not filecheck_fn(tiff_part):
                note(3, "  skipping %s", tiff_part)
                continue
            tiff_path = os.path.join(parts_dir, tiff_part)
            if page_numbers:
                page_no_string = page_numbers.get(page_index)
            else:
                page_no_string = None
            note(2, "  page %d%s", page_index,
                 (page_no_string and " (%s)" % page_no_string) or "")
            try:
                if not create_thumbnail(tiff_path, tiff_dpi, output_dir, page_index,
                                        first_page, page_count, bbox, bbox_skips,
                                        big_thumbnail_size, small_thumbnail_size,
                                        icon_size,
                                        params.get('maxwidth'), params.get('maxheight'),
                                        params.get('maxscaling'),
                                        params.get('thumbnail_strategy'),
                                        legend, page_no_string):
                    raise Error("Can't create thumbnail for page %d in %s (of %s)"
                                % (page_index, tiff_path, dirpath))
            except Exception, x:
                doc_id = os.path.split(dirpath)[1]
                note("exception creating thumbnails for page %d of document %s:\n%s",
                     page_index, doc_id,
                     string.join(traceback.format_exception(*sys.exc_info()), ""))
                raise AbortDocumentIncorporation(doc_id, str(x))
            if page_index == 0:
                bt_width = big_thumbnail_size[0]
                bt_height = big_thumbnail_size[1]
                st_width = small_thumbnail_size[0]
                st_height = small_thumbnail_size[1]
            else:
                bt_width = max(bt_width, big_thumbnail_size[0])
                bt_height = max(bt_height, big_thumbnail_size[1])
                st_width = max(st_width, small_thumbnail_size[0])
                st_height = max(st_height, small_thumbnail_size[1])
            st_scaling = (float(st_width)/float(docwidth) +
                          float(st_height)/float(docheight)) / 2.0
            page_index = page_index + 1
        d = {"page-count": str(page_count),
             "tiff-width": str(docwidth),
             "images-width": str(docwidth),
             "images-size": "%d,%d" % (docwidth, docheight),
             "cropping-bounding-box": "%d,%d;%d,%d" % bbox,
             "big-thumbnail-size": "%s,%s" % (bt_width, bt_height),
             "small-thumbnail-size": "%s,%s" % (st_width, st_height),
             "small-thumbnail-scaling": "%f" % st_scaling,
             "icon-size": "%d,%d" % icon_size[0],
             "images-height": str(docheight),
             "tiff-height": str(docheight),
             }
        translation, scaling = thumbnail_translation_and_scaling(dirpath, d, false, true)
        d["big-thumbnail-translation-points"] = "%f,%f" % translation
        d["big-thumbnail-scaling-factor"] = "%f,%f" % scaling
        update_metadata(os.path.join(dirpath, "metadata.txt"), d)
    finally:
        if os.path.isdir(tmpdir):
            shutil.rmtree(tmpdir)
    # indicate successful completion
    note(2, "  finished.")
    retval = true
    return retval
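
# A standalone sketch of the "small-thumbnail-scaling" value recorded above:
# it is the mean of the width and height shrink ratios.  The function name and
# sample numbers here are illustrative, not part of the repository API.
def _small_thumbnail_scaling_example(doc_w, doc_h, thumb_w, thumb_h):
    # e.g. a 2550x3300 page with a 170x220 thumbnail gives
    # (170/2550 + 220/3300) / 2 ~= 0.066667
    return (float(thumb_w) / float(doc_w) + float(thumb_h) / float(doc_h)) / 2.0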
def _add_internal (ostream, percent_done_fn, repo, response, params, content, wait):
    # This can be called in several different ways.  In general, you post a
    # multipart/form-data body which contains a "contenttype" for the document,
    # and either a "URL" for the content, or a "content" parameter containing
    # the actual content.  If both "URL" and "content" are present, the URL is
    # added as the "original-url" value for the metadata, and if the content is
    # HTML, it's used as the "original.html" and the URL is used to pull
    # ancillary content referenced in it.
    content_type = params.get("contenttype")
    url = params.get("URL")
    noredir = params.get("no-redirect")
    noredir = noredir and (noredir.lower() == "true")
    uploadloc = url
    docname = params.get("documentname")
    tempf = None
    suppress_duplicates = params.get("suppress-duplicates")
    suppress_duplicates = suppress_duplicates and (suppress_duplicates.lower() == "true")
    bury = params.get("bury")
    bury = bury and (bury.lower() == "true")
    verbosity = int(params.get("verbosity") or "0")
    if content:
        if wait and ostream:
            _rewrite_job_output(ostream, '{ state: 0, msg: "Caching page..."}')
        extension = CONTENT_TYPES.get(content_type)
        if not extension:
            if wait:
                msg = "Don't know what to do with contenttype \"%s\"" % content_type
                if ostream:
                    _rewrite_job_output(ostream, '{state: 1, msg: "' + urllib.quote(msg) + '"}')
                else:
                    response.error(HTTPCodes.UNSUPPORTED_MEDIA_TYPE, msg)
            return
        # special case HTML/XHTML
        if content_type.lower() in ("text/html", "application/xhtml+xml"):
            tempf = tempfile.mkdtemp()
            uploadloc = os.path.join(tempf, "original.html")
            # make sure that the folder for other parts exists, even if empty
            os.mkdir(os.path.join(tempf, "original_files"))
            # remove our bookmarklet, if present
            content = _BOOKMARKLET_PATTERN.sub('', content)
            content = _ADD_FORM_PATTERN.sub('', content)
            c = _OurCacher(url, filename=uploadloc, bits=content,
                           content_type=content_type)
        # special case 3x5 cards
        elif (docname and (content_type.lower() == "text/plain") and
              os.path.splitext(docname)[1] == ".3x5"):
            fd, tempf = tempfile.mkstemp(".3x5")
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        else:
            fd, tempf = tempfile.mkstemp("." + extension)
            fp = os.fdopen(fd, "wb")
            fp.write(content)
            fp.close()
            uploadloc = tempf
        if suppress_duplicates:
            hash = calculate_originals_fingerprint(tempf)
            results = repo.do_query("sha-hash:" + hash)
            if results:
                # it's a duplicate
                doc = results[0][1]
                if os.path.isdir(tempf):
                    shutil.rmtree(tempf)
                elif os.path.exists(tempf):
                    os.remove(tempf)
                if ostream:
                    _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc.id + '"}')
                elif noredir:
                    response.reply(doc.id, "text/plain")
                else:
                    response.redirect("/action/basic/dv_show?doc_id=%s" % doc.id)
                return
    try:
        try:
            # get a cookie for authentication
            cookie = repo.new_cookie(url or content[:min(100, len(content))])
            cookie_str = '%s=%s; path=/; Secure' % (cookie.name(), cookie.value())
            os.environ["UPLIB_COOKIE"] = cookie_str
            doctitle = params.get("md-title")
            docauthors = params.get("md-authors")
            docdate = params.get("md-date")
            doccats = params.get("md-categories")
            metadata = params.get("metadata")
            if metadata:
                mdtmpfile = tempfile.mktemp()
                open(mdtmpfile, "w").write(metadata)
                # check to see if we're replacing an existing document
                md2 = read_metadata(StringIO.StringIO(metadata))
                existing_doc_id = md2.get("replacement-contents-for")
                if existing_doc_id and not repo.valid_doc_id(existing_doc_id):
                    raise ValueError("Invalid doc ID %s specified for replacement"
                                     % existing_doc_id)
            else:
                mdtmpfile = None
                existing_doc_id = None
            # now form the command
            scheme = (((repo.get_param("use-http", "false").lower() == "true") or _use_http)
                      and "http") or "https"
            cmd = '%s --verbosity=%s --repository=%s://127.0.0.1:%s ' % (
                _uplib_add_document, verbosity, scheme, repo.port())
            if doctitle:
                cmd += ' --title=%s' % pipes.quote(doctitle)
            if docauthors:
                cmd += ' --authors=%s' % pipes.quote(docauthors)
            if docdate:
                cmd += ' --date="%s"' % docdate
            if doccats:
                cmd += ' --categories=%s' % pipes.quote(doccats)
            if mdtmpfile:
                cmd += ' --metadata="%s"' % mdtmpfile
            cmd += ' "%s"' % uploadloc
            if ostream:
                _rewrite_job_output(ostream, '{state: 0, msg: "' + urllib.quote(cmd) + '"}')
            # and invoke the command
            status, output, tsignal = subproc(cmd)
            note(4, "cmd is %s, status is %s, output is %s",
                 repr(cmd), status, repr(output.strip()))
            if mdtmpfile:
                os.unlink(mdtmpfile)
            if status == 0:
                # success; output should be doc-id
                doc_id = existing_doc_id or output.strip().split()[-1]
                note(4, "output is '%s'; doc_id for new doc is %s", output.strip(), doc_id)
                if wait and ostream:
                    _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                        '", msg: "' + urllib.quote(output) + '"}')
                # wait for it to come on-line
                if percent_done_fn:
                    percent_done_fn(40)     # estimate 40% of work done on client side
                while not repo.valid_doc_id(doc_id):
                    if ostream:
                        pending = repo.list_pending(full=True)
                        s = _first(pending, lambda x: x['id'] == doc_id)
                        if not s:
                            break
                        dstatus = s['status']
                        if dstatus == 'error':
                            msg = 'server-side error incorporating document'
                            _rewrite_job_output(ostream, '{ state: 3, doc_id: "' + doc_id +
                                                '", msg: "' + urllib.quote(s['error']) + '"}')
                            break
                        if dstatus == 'unpacking':
                            msg = 'starting ripper process...'
                        elif dstatus == 'ripping':
                            msg = "ripping with ripper '" + s['ripper'] + "'..."
                        elif dstatus == 'moving':
                            msg = 'adding to registered document set...'
                        _rewrite_job_output(ostream, '{ state: 1, doc_id: "' + doc_id +
                                            '", msg: "' + urllib.quote(msg) + '"}')
                    time.sleep(1.0)
                if percent_done_fn:
                    percent_done_fn(100)    # finished
                if repo.valid_doc_id(doc_id):
                    if bury:
                        # wait up to 100 seconds for it to show up in the history
                        # list; after that, wait another second, then bury it
                        counter = 100
                        while counter > 0:
                            h = [x.id for x in repo.history()]
                            if doc_id in h:
                                break
                            counter -= 1
                            time.sleep(1)
                        time.sleep(1)
                        repo.touch_doc(doc_id, bury=True, notify=False)
                        note(3, "buried %s", doc_id)
                    if wait:
                        if ostream:
                            _rewrite_job_output(ostream, '{ state: 2, doc_id: "' + doc_id + '"}')
                        elif noredir:
                            response.reply(doc_id, "text/plain")
                        else:
                            response.redirect("/action/basic/dv_show?doc_id=%s" % doc_id)
            else:
                note("cmd <<%s>> failed with status %s:\n%s", cmd, status, output)
                if wait:
                    if ostream:
                        _rewrite_job_output(ostream, '{ state: 3, msg: "' + urllib.quote(
                            'Error processing the document:\n' + output) + '"}')
                    else:
                        response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                       "<pre>" + htmlescape(output) + "</pre>")
        except:
            e = ''.join(traceback.format_exception(*sys.exc_info()))
            if wait:
                note(3, "Exception processing uplib-add-document request:\n%s", htmlescape(e))
                if ostream:
                    _rewrite_job_output(ostream, '{state: 3, msg: "' + urllib.quote(
                        "Exception processing uplib-add-document request:\n" + e) + '"}')
                else:
                    response.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                                   "Exception processing uplib-add-document request:\n<pre>" +
                                   htmlescape(e) + "\n</pre>")
            else:
                note("Exception processing uplib-add-document request:\n%s", e)
    finally:
        if tempf and os.path.isfile(tempf):
            os.unlink(tempf)
        elif tempf and os.path.isdir(tempf):
            shutil.rmtree(tempf)
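
# For reference, the job-stream states emitted by _add_internal above, as used
# by its _rewrite_job_output calls: state 0 = in progress (caching, or echoing
# the add command), state 1 = submitted and being incorporated (unpacking /
# ripping / moving), state 2 = done (doc_id valid, or an existing duplicate
# was found), state 3 = error.  A client polling the job output can key off
# these numbers and the percent-done callback (40% at submission, 100% when
# the document comes on-line).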
def flesh_out_folder(id, tmpfilename, metadata, repo, unpack_fn, counter):
    try:
        try:
            # note(3, "CODETIMER_ON is %s", CODETIMER_ON)
            # if CODETIMER_ON:
            #     code_timer.Init()
            #     code_timer.CreateTable("uplib")
            #     code_timer.CodeTimerOn()
            #     code_timer.StartInt("newFolder$unpack", "uplib")
            # else:
            #     code_timer.CodeTimerOff()
            if unpack_fn and tmpfilename and os.path.exists(tmpfilename):
                unpack_fn(repo, id, tmpfilename, metadata)
            # if CODETIMER_ON:
            #     code_timer.StopInt("newFolder$unpack", "uplib")
            folderpath = repo.pending_location(id)
            try:
                note("unpacked new folder in %s", folderpath)
                if not sys.platform.lower().startswith("win"):
                    s, o, t = subproc("ls -Rl %s" % folderpath)
                    note("%s\n" % o)
                fp = open(os.path.join(folderpath, "UNPACKED"), 'w')
                fp.flush()
                fp.close()
                # as of this point, we can restart the inclusion of the document
                md = read_metadata(os.path.join(folderpath, "metadata.txt"))
                replacement_id = md.get("replacement-contents-for")
                if replacement_id:
                    if repo.valid_doc_id(replacement_id):
                        # contents to replace another document
                        md["replacement-contents-for"] = ""
                        update_metadata(os.path.join(folderpath, "metadata.txt"), md)
                        note(2, "replacing contents of %s with this data...", replacement_id)
                        existing_document = repo.get_document(replacement_id)
                        new_folder = existing_document.folder()
                        process_folder(repo, replacement_id, folderpath, false, new_folder)
                        _run_rippers(new_folder, repo, replacement_id)
                        existing_document.recache()
                        repo.touch_doc(existing_document)
                        raise AbortDocumentIncorporation(
                            id, "replacement for existing document %s" % replacement_id)
                    else:
                        raise AbortDocumentIncorporation(
                            id, "replacement for non-existent document %s" % replacement_id)
                _finish_inclusion(repo, folderpath, id)
                # if CODETIMER_ON:
                #     noteOut = StringIO.StringIO()
                #     noteOut.write("\nCode Timer statistics (what took time, in milliseconds):\n")
                #     code_timer.PrintTable(noteOut, "uplib")
                #     noteOut.write("\n")
                #     noteOutString = noteOut.getvalue()
                #     note(3, noteOutString)
            except:
                type, value, tb = sys.exc_info()
                note("%s", ''.join(traceback.format_exception(type, value, tb)))
                note_error(folderpath, (type, value, tb))
                raise value, None, tb
        except AbortDocumentIncorporation, x:
            # ripper signalled to stop adopting this document, for good
            note(2, "AbortDocumentIncorporation exception on %s: %s", x.id, x.message)
            if x.id == id:
                shutil.rmtree(folderpath)
                remove_from_index(repo.index_path(), id)
    except:
        type, value, tb = sys.exc_info()
        note("Exception processing new folder:\n%s",
             ''.join(traceback.format_exception(type, value, tb)))
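
# The replacement flow above, for reference: if the unpacked folder's
# metadata.txt carries a line such as (ID illustrative)
#
#   replacement-contents-for: 01234-56-7890-123
#
# the new contents are processed into the existing document's folder, the
# rippers are re-run there, and the pending folder is then discarded by
# raising AbortDocumentIncorporation -- which is flow control here, not an
# error: it tells the incorporation machinery to stop adopting the pending
# document because its contents have already been consumed.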
def index_folders (docs_dir, doc_ids, repo_index_dir):
    update_configuration()
    if not doc_ids:
        return
    if LUCENE == 'jcc':
        c = get_context(repo_index_dir)
        SECTION_LOCK.acquire()
        try:
            for id in doc_ids:
                folderpath = os.path.join(docs_dir, id)
                if os.path.isdir(folderpath):
                    lock_folder(folderpath)
                    try:
                        try:
                            c.index(folderpath, id, False)
                        except:
                            note(0, "Can't index folder %s:\n%s", folderpath,
                                 ''.join(traceback.format_exception(*sys.exc_info())))
                    finally:
                        unlock_folder(folderpath)
            c.reopen()
        finally:
            SECTION_LOCK.release()
        return
    else:
        # invoke Java to do indexing
        if len(doc_ids) > 6:
            fname = tempfile.mktemp()
            fp = open(fname, "w")
            fp.write(string.join(doc_ids, '\n'))
            fp.close()
            indexingcmd = INDEXING_BATCHADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES,
                                                   LUCENE_JAR, INDEXING_JAR,
                                                   repo_index_dir, docs_dir, fname)
            note(3, "  indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
            os.unlink(fname)
            note(3, "  indexing output is <%s>", output)
            if status != 0:
                raise Error("%s signals non-zero exit status %d attempting to index %s:\n%s"
                            % (JAVA, status, doc_ids, output))
        else:
            folders = string.join(doc_ids, ' ')
            indexingcmd = INDEXING_ADD_CMD % (JAVA, DEBUG_FLAGS, INDEXING_PROPERTIES,
                                              LUCENE_JAR, INDEXING_JAR,
                                              repo_index_dir, docs_dir, folders)
            note(3, "  indexing with %s", indexingcmd)
            SECTION_LOCK.acquire()
            try:
                status, output, tsignal = subproc(indexingcmd)
            finally:
                SECTION_LOCK.release()
            note(3, "  indexing output is <%s>", output)
            if status != 0:
                raise Error("%s signals non-zero exit status %d attempting to index %s:\n%s"
                            % (JAVA, status, doc_ids, output))
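
# With more than six documents, the doc IDs are handed to the Java indexer via
# a temp file rather than on the command line, keeping the command short.  The
# file format is simply one ID per line (IDs illustrative):
#
#   01234-56-7890-123
#   01234-56-7890-124
#   01234-56-7890-125
#
# With six or fewer, the IDs are joined with spaces directly into
# INDEXING_ADD_CMD instead.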
def manipulate_server_internal (repo, params, response=None, ipaddr=None, lgr=None):
    # regular UpLib action
    conf = params.get("configurator")
    if not conf:
        conf = configurator()
    imap_ssl_port = conf.get_int("imap-server-ssl-port", -1)
    imap_localhost_port = conf.get_int("imap-server-localhost-port", 8143)
    stunnel = conf.get("stunnel")
    expunge_deletes_docs = conf.get_bool("imap-expunge-deletes-documents", False)
    global CHECKPOINT_PERIOD
    CHECKPOINT_PERIOD = conf.get_int("imap-server-checkpoint-interval", 600)
    allow_anonymous_readers = ((not repo.has_password) and
                               conf.get_bool("imap-server-allow-anonymous-readers", True))
    use_for_email = conf.get_bool("imap-server-use-for-email", False)
    imap_dir = os.path.join(repo.overhead_folder(), "imap")
    if not os.path.isdir(imap_dir):
        os.mkdir(imap_dir)
    stunnel_pid_filepath = os.path.join(imap_dir, "stunnel.pid")
    if os.path.exists(stunnel_pid_filepath):
        stunnel_pid = int(open(stunnel_pid_filepath, 'r').read().strip())
    else:
        stunnel_pid = None
    # we cache the reference to the existing server in another
    # module so that we can reload this one with impunity
    current_server = emailParser.__dict__.get("IMAP_SERVER")
    note("current server is %s", current_server)
    action = params.get('action')
    newcontext = params.get('newcontext', False)
    if response:
        fp = response.open()
    else:
        fp = StringIO()
    fp.write('<body bgcolor="%s">\n' % STANDARD_BACKGROUND_COLOR)
    if current_server:
        s = current_server.status()
        m = s.more()
        while m:
            fp.write(m)
            m = s.more()
        fp.write('\n<hr>\n')
    else:
        fp.write('<h2>UpLib IMAP Server control panel</h2>\n')
    current_context = None
    if current_server and ((action == 'Stop') or (action == 'Restart')):
        if stunnel_pid:
            try:
                os.kill(stunnel_pid, signal.SIGKILL)
                time.sleep(4)
            except:
                pass
            stunnel_pid = None
        current_context = current_server.mailcontext
        current_server.close()
        current_server = None
        del emailParser.__dict__["IMAP_SERVER"]
        fp.write("<p>Closed current server.\n")
        if os.path.exists(stunnel_pid_filepath):
            os.unlink(stunnel_pid_filepath)
    if (action == 'Start') or (action == 'Restart'):
        cert_filepath = os.path.join(repo.overhead_folder(), repo.certfilename())
        try:
            port = params.get("port")
            if port:
                port = int(port)
            else:
                port = imap_localhost_port
            if stunnel and ((not ssl) or (imap_ssl_port > 0)):
                # start stunnel
                stunnel_conf_filepath = os.path.join(imap_dir, "stunnel.conf")
                f = open(stunnel_conf_filepath, 'w')
                f.write("debug = 7\n\ncert = %s\noutput = %s\npid = %s\n\n"
                        "[imapuplib]\naccept = %s\nconnect = 127.0.0.1:%s\n"
                        % (cert_filepath, os.path.join(imap_dir, "stunnel.log"),
                           stunnel_pid_filepath, str(imap_ssl_port), str(port)))
                f.close()
                # subproc returns (status, output, tsignal), as elsewhere in this module
                status, output, tsignal = subproc("%s %s" % (stunnel, stunnel_conf_filepath))
                note("status from '%s %s' (on %s) is %s, output is <%s>",
                     stunnel, stunnel_conf_filepath, imap_ssl_port, status, output)
                if status != 0:
                    raise RuntimeError("Can't start stunnel with '%s %s'; status is %s, output is %s"
                                       % (stunnel, stunnel_conf_filepath, status, output))
                stunnel_pid = int(open(stunnel_pid_filepath, 'r').read().strip())
                note("stunnel_pid is %s", stunnel_pid)
            else:
                stunnel_pid = None
            if newcontext or (not current_context):
                current_context = uplib_mailcontext(repo,
                                                    expunge_deletes_docs=expunge_deletes_docs,
                                                    allow_readers=allow_anonymous_readers,
                                                    use_for_email=use_for_email,
                                                    ip=get_fqdn(),
                                                    server_certificate_file=cert_filepath)
                if current_context.inbox:
                    current_context.inbox.rescan()
            if stunnel_pid is not None:
                ipaddr = '127.0.0.1'
            else:
                ipaddr = '0.0.0.0'
            if not lgr:
                lgr = logger.rotating_file_logger(os.path.join(imap_dir, "imap.log"),
                                                  "weekly", None, True)
                lgr = logger.unresolving_logger(lgr)
            imaps = imap_server(current_context, ipaddr, port, logger=lgr,
                                stunnel_pid=stunnel_pid)
            emailParser.__dict__["IMAP_SERVER"] = imaps
            current_server = imaps
            hooked = emailParser.__dict__.get("IMAP_SERVER_SHUTDOWN_HOOK")
            if not hooked:
                repo.add_shutdown_hook(lambda x=repo: shutdown_server(x))
                emailParser.__dict__["IMAP_SERVER_SHUTDOWN_HOOK"] = True
            if stunnel_pid:
                fp.write("<p>Started new IMAP4 server for %s on ports %s/%s."
                         % (repr(repo), str(imap_ssl_port), str(port)))
            else:
                fp.write("<p>Started new IMAP4 server for %s on port %s."
                         % (repr(repo), str(port)))
            if current_context.inbox:
                fp.write("<p>Inbox: %d messages, %d recent, %d unseen."
                         % (len(current_context.inbox.msgs),
                            len(current_context.inbox.recent()),
                            current_context.inbox.min_unseen()))
        except:
            type, value, tb = sys.exc_info()
            s = string.join(traceback.format_exception(type, value, tb))
            note("Can't establish IMAP server: exception: " + s)
            fp.write(s)
    fp.write('<form method=GET action="/action/IMAPServer/manipulate_server">\n')
    fp.write('<input type=submit name=action value="Start" %s>\n'
             % ((current_server and "disabled") or ""))
    fp.write('<input type=submit name=action value="Stop" %s>\n'
             % (((current_server == None) and "disabled") or ""))
    fp.write('<input type=submit name=action value="Restart" %s>\n'
             % (((current_server == None) and "disabled") or ""))
    fp.write('<input type=checkbox name="newcontext" %s> Use fresh mail context\n'
             % ((newcontext and "checked") or ""))
    fp.write('</form>\n')
    fp.write('</body>\n')
def rip (self, location, doc_id):
    global CITATION_PARSER, HEADER_PARSER
    omd = self.get_folder_metadata(location)
    # CiteSeer really only works on traditional publications, so let's stay
    # with PDF and Word docs
    mimetype = omd.get("apparent-mime-type")
    if mimetype not in TRADITIONAL_PAPER_FORMATS:
        return
    text, language = self.get_folder_text(location)
    if not text:
        # no text to look at
        return
    m = REFERENCES_PATTERN.search(text)
    if not m:
        # no REFERENCES_PATTERN in text
        return
    # just a note if we're re-ripping something
    if self.repository().valid_doc_id(doc_id):
        note(3, "%s is a technical report", self.repository().get_document(doc_id))
    cp = self.__citation_parser or CITATION_PARSER
    if cp:
        status, output, tsig = subproc('%s "%s"' % (cp, self.folder_text_path(location)))
        if status == 0:
            parsed = BeautifulStoneSoup(output.strip())
            citations = parsed.findAll("citation")
            note(3, "found %d citations", len(citations))
            fp = open(os.path.join(location, "citeseerx-citations.xml"), "w")
            fp.write(output.strip())
            fp.close()
    hp = self.__header_parser or HEADER_PARSER
    if hp:
        tfile = tempfile.mktemp()
        fp = codecs.open(tfile, "w", "UTF-8")
        fp.write(text)
        fp.close()
        try:
            status, output, tsig = subproc('%s "%s"' % (hp, tfile))
            if status == 0:
                md = dict()
                parsed = BeautifulStoneSoup(output.strip())
                title = parsed.find("title")
                if title:
                    if title.string:
                        md['citeseer-title'] = title.string
                    else:
                        note(3, "Non-string title found: %s", title)
                authors = set()
                for author in parsed.findAll("author"):
                    n = author.find("name")
                    if n:
                        authors.add(n.string)
                    else:
                        authors.add(author.string)
                if authors:
                    md['citeseer-authors'] = " and ".join(list(authors))
                abstract = parsed.find("abstract")
                if abstract:
                    if abstract.string:
                        md['citeseer-abstract'] = abstract.string
                    else:
                        note(3, "Non-string abstract found: %s", abstract)
                note(3, "citeseer metadata is %s", pprint.pformat(md))
                if "citeseer-title" in md:
                    # use CiteSeer data to fix up document metadata, if necessary
                    if ((not omd.get("title")) or
                        (omd.get("title-is-original-filepath", "false").lower() == "true")):
                        md['title'] = md.get("citeseer-title")
                        md['title-is-original-filepath'] = None
                        md['title-is-citeseer-extracted'] = "true"
                    if ("citeseer-authors" in md) and (not omd.get("authors")):
                        md['authors'] = md.get("citeseer-authors")
                    if ("citeseer-abstract" in md) and (not omd.get("abstract")):
                        abs = md.get("citeseer-abstract")
                        prefix = ABSTRACT_PREFIX.match(abs)
                        if prefix:
                            realstart = prefix.end("prefix")
                            note(3, "trimming abstract prefix of %s", repr(abs[:realstart]))
                            abs = abs[realstart:]
                        md['abstract'] = abs
                    note(3, "updated missing metadata with CiteSeer versions")
                self.update_folder_metadata(location, md)
        finally:
            if os.path.exists(tfile):
                os.unlink(tfile)
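
# A hypothetical header-parser output that the BeautifulStoneSoup code above
# would accept.  The real citation/header parsers are external CiteSeerX
# tools; this shape is inferred from the find()/findAll() calls here, not
# from those tools' documentation, so treat it as illustrative only:
#
#   <document>
#     <title>An Example Technical Report</title>
#     <author><name>A. N. Author</name></author>
#     <author><name>S. Econd</name></author>
#     <abstract>Abstract -- We describe ...</abstract>
#   </document>
#
# The citation parser's output is stored verbatim in
# citeseerx-citations.xml; the header parser's title/authors/abstract are
# folded into the folder metadata only where the document doesn't already
# have better values.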