def start_incorporation_thread(repo, n_simultaneous_threads):
    incoming_queue = Queue.Queue()
    name = "process_incoming_documents"
    counter = None
    if n_simultaneous_threads > 0:
        # cap the number of concurrent incorporation threads
        name += "_in_at_most_%d_threads" % n_simultaneous_threads
        counter = threading.BoundedSemaphore(n_simultaneous_threads)
    uthread.start_new_thread(_incorporate_document, (repo, incoming_queue, counter),
                             name=name)
    return incoming_queue
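# Usage sketch (hypothetical caller): the returned queue is the producer side of
# _incorporate_document below, so each item put on it must be the 4-tuple that
# consumer expects.
#
#   incoming = start_incorporation_thread(repo, 4)
#   incoming.put((doc_id, tmpfilename, metadata, unpack_fn))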
def __init__(self, repo, expunge_deletes_docs=False, use_for_email=False,
             allow_readers=False, ip=None, server_certificate_file=None):
    self.repo = repo
    self.ip = ip
    if use_for_email:
        email_namespace = (namespace("", "/"),)
    else:
        email_namespace = ()
    doc_namespace = (namespace(repo.name(), "/"),)
    self.__namespaces = (email_namespace, (), doc_namespace)
    self.expunge_deletes_docs = expunge_deletes_docs
    self.expunge_deletes_inbox_docs = use_for_email
    self.allow_readers = allow_readers
    self.__server_certificate_file = server_certificate_file
    self.__dir = os.path.join(repo.overhead_folder(), "imap")
    if not os.path.exists(self.__dir):
        os.mkdir(self.__dir)
    mboxes = []
    inbox = None
    subscribed = []
    subscriptions = []
    if os.path.exists(os.path.join(self.__dir, "subscribed")):
        # read subscriptions
        for line in open(os.path.join(self.__dir, "subscribed"), 'r'):
            subscriptions.append(line.strip())
    categories = repo.categories()
    if use_for_email:
        inbox = uplib_email_mailbox("INBOX", self, category=False, ip=self.ip)
        mboxes.append(inbox)
        for c in categories:
            if c.startswith("email/"):
                name = string.join([x.strip() for x in c.split('/')][1:], '/')
                if name:
                    box = uplib_email_mailbox(name, self, category=c, ip=self.ip)
                    note("new mailbox %s", box)
                    mboxes.append(box)
                    subscribed.append(box)
    # build document context
    for c in categories:
        name = repo.name() + '/categories/' + string.join([x.strip() for x in c.split('/')], '/')
        if name:
            box = uplib_email_mailbox(name, self, category=c, email_folder=False, ip=self.ip)
            note("new mailbox %s", box)
            mboxes.append(box)
            if (("category " + c) in subscriptions) and (box not in subscribed):
                subscribed.append(box)
    for cname, c in repo.list_collections():
        name = repo.name() + '/collections/' + cname
        if name:
            box = uplib_email_mailbox(name, self, category=None, email_folder=False,
                                      ip=self.ip, collection=c)
            note("new mailbox %s", box)
            mboxes.append(box)
            if (("collection " + c.name()) in subscriptions) and (box not in subscribed):
                subscribed.append(box)
    mailcontext.__init__(self, inbox=inbox, mailboxes=mboxes, subscribed=subscribed)
    uthread.start_new_thread(checkpoint_thread_fn, (weakref.ref(self),))
def rerip (self, changed_fields=None, wait=False):
    try:
        import thread

        def rip_it(self):
            reruns = []
            for ripper in self.repo.rippers():
                try:
                    # re-run a ripper if the metadata change affects it, or if
                    # some ripper it depends on has already been re-run
                    if (ripper.rerun_after_metadata_changes(changed_fields=changed_fields) or
                            any([ripper.rerun_after_other_ripper(x.name()) for x in reruns])):
                        ripper.rip(self.folder(), self.id)
                        reruns.append(ripper)
                except:
                    note("Exception running %s on %s:\n%s", ripper, self,
                         ''.join(traceback.format_exception(*sys.exc_info())))
            self.recache()

        newthread = uthread.start_new_thread(rip_it, (self,))
        if wait:
            newthread.join()
        return newthread
    except:
        etype, evalue, etb = sys.exc_info()
        note("%s", ''.join(traceback.format_exception(etype, evalue, etb)))
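# Usage sketch: re-run the rippers after a metadata edit, blocking until they
# finish.  ("title" is just an illustrative field name, not one this module
# defines.)
#
#   doc.rerip(changed_fields=("title",), wait=True)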
def fork_request(self, fn, *args):
    note(3, "forking %s in new thread...", fn)
    id = uthread.start_new_thread(run_fn_in_new_thread, (self, fn, args),
                                  "handling request %s %s at %s" % (
                                      self.request.request.method,
                                      self.request.request.uri,
                                      time.ctime()))
    raise ForkRequestInNewThread(id)
def retry_folders (repo):

    def _retry_folders_thread_fn(repo):
        directory = repo.pending_folder()
        pending_docs = [x for x in os.listdir(directory) if DOC_ID_RE.match(x)]
        note(3, "%d docs in 'pending' folder", len(pending_docs))
        for filename in pending_docs:
            try:
                # retry this document
                folderpath = os.path.join(directory, filename)
                if (os.path.exists(os.path.join(folderpath, "UNPACKED")) or
                        os.path.exists(os.path.join(folderpath, "RIPPED"))):
                    note(2, "Attempting to salvage pending folder %s", filename)
                    retry_folder(repo, folderpath, filename)
                else:
                    note("Files in %s may be salvageable, but not automatically.  Please check.", folderpath)
            except:
                note("retry_folders: %s", ''.join(traceback.format_exception(*sys.exc_info())))

    uthread.start_new_thread(_retry_folders_thread_fn, (repo,), name="retry_pending_folders")
def _incorporate_document(repo, incoming_queue, counter):
    # it's a bit tricky to shut this down cleanly, so we do
    # some extra checking
    import sys, Queue, traceback, threading
    while True:
        try:
            value = incoming_queue.get(True, 60)
            # value contains: (id, tmpfilename, metadata, unpack_fn)
            id, tmpfilename, metadata, unpack_fn = value
            # threading.BoundedSemaphore is a factory function in Python 2,
            # so the isinstance check needs the underlying private class
            if isinstance(counter, threading._BoundedSemaphore):
                # blocks until below max number of threads; presumably
                # flesh_out_folder releases the semaphore when it finishes
                counter.acquire()
            uthread.start_new_thread(flesh_out_folder,
                                     (id, tmpfilename, metadata, repo, unpack_fn, counter),
                                     name="incorporating-%s" % id)
        except Queue.Empty:
            pass
        except:
            if sys and traceback and note:
                note("%s", ''.join(traceback.format_exception(*sys.exc_info())))
def fork_request(self, fn, *args):
    from uplib.plibUtil import uthread, note

    # defined locally; an import of run_fn_in_new_thread from uplib.service
    # would be immediately shadowed by this definition
    def run_fn_in_new_thread(resp, fn, args):
        try:
            fn(*args)
        except:
            excn = sys.exc_info()
            note(0, "Exception calling %s with %s:\n%s", fn, args,
                 ''.join(traceback.format_exception(*excn)))
            resp.error(HTTPCodes.INTERNAL_SERVER_ERROR,
                       ''.join(traceback.format_exception(*excn)),
                       "text/plain")

    self.thread = uthread.start_new_thread(run_fn_in_new_thread, (self, fn, args))
def start(repo):
    from uplib.plibUtil import note, configurator, uthread
    global CRAWLER_THREAD
    try:
        import feedparser
    except ImportError:
        note("RSSReader: Python feedparser module not available -- can't run RSS scanner")
        return
    from uplib.indexing import HeaderField, initialize
    initialize()        # make sure the indexing headers are present
    HeaderField.HEADERS["rss-id"] = HeaderField("rss-id", True, False, False, False, None)
    if CRAWLER_THREAD is None:
        CRAWLER_THREAD = uthread.start_new_thread(_scan_rss_sites, (repo,),
                                                  name="RSS feed scanner")
def update_metadata (self, newdict, reindex=True):
    lock_folder(self.__folder)
    if reindex:
        oldvals = self.get_metadata().copy()
    try:
        self.__metadata = p_update_metadata(self.metadata_path(), newdict)
        self.__date = None
        self.__category_strings = None
        self.__citation = None
    finally:
        unlock_folder(self.__folder)
    if reindex:
        # show_stack(0, "mysterious re-indexing")
        # only reindex the fields whose values actually changed
        d = newdict.copy()
        for k in d.keys():
            if oldvals.get(k) == d.get(k):
                del d[k]
        newthread = uthread.start_new_thread(_reindex_document_folder,
                                             (self.repo, self.__folder, self.id, d.keys()))
        note(3, "reindexing %s in %s", self.id, str(newthread))
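# Usage sketch: only fields whose values actually changed are passed to the
# reindexing thread, so repeating an update with identical values hands the
# indexer an empty field list.
#
#   doc.update_metadata({"title": "New Title"})   # "title" is reindexed
#   doc.update_metadata({"title": "New Title"})   # no changed fields this time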
def add(repo, response, params):
    """
    Add a document to the repository, calling ``uplib-add-document`` in a subprocess.

    :param wait: optional, whether to wait for the incorporation and ripping to \
        happen.  If not specified, ``add`` returns immediately after starting \
        the incorporation process.  If specified as ``true``, ``add`` will wait \
        until the document is available in the repository.  If specified as \
        ``watch``, ``add`` will start a new ``Job`` which can be "watched" with \
        the ``fetch_job_output`` function in ``uplib.externalAPI``.  If specified \
        as ``bounce``, and the ``URL`` parameter is also specified, the \
        incorporation will be started, and ``add`` will immediately return an \
        HTTP redirect to the value of ``URL``.  If specified as \
        ``watchexternal``, will start a new ``Job`` and immediately return the \
        Job ID as a text/plain string.
    :type wait: string containing ``true``, ``watch``, ``bounce``, or ``watchexternal``
    :param content: the actual bits of the document.  One of either ``content`` \
        or ``URL`` must be specified.
    :type content: byte sequence
    :param contenttype: the MIME type for the document content
    :type contenttype: string containing MIME type
    :param URL: the URL for the document.  One of either ``content`` or ``URL`` \
        must be specified.
    :type URL: string
    :param documentname: the name of the document
    :type documentname: string
    :param no-redirect: if specified as ``true``, no redirect to the incorporated \
        document will be returned; instead, a document ID string as "text/plain" \
        will be returned, if ``wait`` is specified as ``true``.  Optional, \
        defaults to "false".
    :type no-redirect: boolean
    :param bury: optional, defaults to "false"; if specified as "true", will cause \
        the newly added document to be "buried" in the history list, so that it \
        won't show up in the most-recently-used listing, as it normally would
    :type bury: boolean
    :param md-title: title to put in the document metadata
    :type md-title: string
    :param md-authors: standard UpLib authors line (" and "-separated) to put in the document metadata
    :type md-authors: string
    :param md-date: standard UpLib date ([MM[/DD]/]YYYY) to put in the document metadata
    :type md-date: string
    :param md-categories: standard UpLib categories string (comma-separated category names) to put in the document metadata
    :type md-categories: string
    :param metadata: contents of a standard UpLib metadata.txt file.  If this file \
        is provided, it is typically just passed unchanged to ``uplib-add-document``. \
        However, it is inspected for the metadata element \
        ``replacement-contents-for``, and if that is found, ``add`` will check to \
        see that the specified document ID is still valid in that repository.
    :type metadata: string containing "text/rfc822-headers" format data
    :returns: depends on what parameters are passed.  If ``wait`` is specified as \
        ``true`` and ``no-redirect`` is specified as ``true``, will simply wait \
        until the document has been incorporated and return the document ID as a \
        plain text string.  If ``no-redirect`` is not specified, and ``wait`` is \
        ``true``, will return an HTTP redirect to the new document in the \
        repository.  If ``wait`` is specified as ``bounce``, will return an \
        immediate redirect to the original URL for the document.  If ``wait`` is \
        not specified, will simply immediately return an HTTP 200 (Success) code \
        and a non-committal message.
:rtype: various """ wait = params.get("wait") content = params.get("content") url = params.get("URL") docname = params.get("documentname") if content and (not params.get("contenttype")): note(3, "add: No contenttype specified."); response.error(HTTPCodes.BAD_REQUEST, "No contenttype specified") return if (not content) and (not url): note(3, "add: Neither content nor URL specified."); response.error(HTTPCodes.BAD_REQUEST, "Nothing to upload!") return if wait and (wait.lower() in ("watch", "watchexternal")): job = Job(_add_internal, repo, None, params, content, True) note(3, "job id is %s", job.id) if url: title = htmlescape(url) elif docname: title = htmlescape(docname) else: title = 'document' if (wait.lower() == "watchexternal"): response.reply(job.id, "text/plain") else: fp = response.open() fp.write('<head><title>Adding %s to repository...</title>\n' % title) fp.write('<script type="text/javascript" language="javascript" src="/html/javascripts/prototype.js"></script>\n') fp.write(JOBS_JAVASCRIPT) fp.write('</head><body bgcolor="%s">\n' % STANDARD_BACKGROUND_COLOR) fp.write('<p style="background-color: %s;"><span id="swirl">%s</span> <span id="titlespan">Adding <b>%s</b>...</span></p>\n' % ( STANDARD_TOOLS_COLOR, SWIRLIMG, title)) fp.write('<p id="progressreport"></p>\n') fp.write('<script type="text/javascript">\n' 'function report_error (req) {\n' ' // alert("Can\'t check status of job");\n' '}\n' 'function update_progress_report(jobid, percent_done, update_text) {\n' ' // alert("update_text is " + update_text);\n' ' var state = eval("(" + update_text + ")");\n' ' // alert("state is " + state);\n' ' if (percent_done >= 100) {\n' ' $("swirl").innerHTML = \'' + SWIRLSPACER + '\';\n' ' $("titlespan").innerHTML = "Finished adding ' + title + '.";\n' ' }\n' ' if (state.state == 2) {\n' ' $("progressreport").innerHTML = \'Finished.\\n<p>Click here <a href="/action/basic/dv_show?doc_id=\' + unescape(state.doc_id) + \'"><img src="/docs/\' + unescape(state.doc_id) + \'/thumbnails/first.png" border=0></a> to open the document in the UpLib browser viewer.\';\n' ' } else if (state.state == 0) {\n' ' $("progressreport").innerHTML = "Extracting page images and text...";\n' ' } else if (state.state == 1) {\n' ' $("progressreport").innerHTML = "Finished client side, ID is " + unescape(state.doc_id) + "<br>" + unescape(state.msg);\n' ' } else {\n' ' $("progressreport").innerHTML = "Error:<br><pre>" + unescape(state.msg) + "</pre>";\n' ' }\n' '}\n' 'Jobs.monitor("' + job.id + '", update_progress_report, 3, report_error);\n' '</script>\n') fp.write('</body>\n') return elif wait and (wait.lower() == "true"): response.fork_request(_add_internal, None, None, repo, response, params, content, True) else: uthread.start_new_thread(_add_internal, (None, None, repo, response, params, content, False), "UploadDocument: adding %s" % (docname or url or time.ctime())) if url and (wait.lower() == "bounce"): response.redirect(url) else: response.reply("Started new thread to add document", "text/plain")