def start_incorporation_thread(repo, n_simultaneous_threads):
    # Start the daemon thread that services the incoming-documents queue.
    # If n_simultaneous_threads is positive, a bounded semaphore limits how
    # many documents are incorporated concurrently.
    incoming_queue = Queue.Queue()
    name = "process_incoming_documents"
    counter = None
    if n_simultaneous_threads > 0:
        name += "_in_at_most_%d_threads" % n_simultaneous_threads
        counter = threading.BoundedSemaphore(n_simultaneous_threads)
    uthread.start_new_thread(_incorporate_document, (repo, incoming_queue, counter), name=name)
    return incoming_queue
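# Hedged usage sketch (hypothetical caller names):  the queue is created once,
# at repository startup, and producers then enqueue the four-tuple that
# _incorporate_document (below) expects.
def _example_submit_document(incoming_queue, doc_id, tmpfilename, metadata, unpack_fn):
    # incoming_queue is the queue returned by start_incorporation_thread(repo, 4)
    incoming_queue.put((doc_id, tmpfilename, metadata, unpack_fn))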
    def __init__(self, repo, expunge_deletes_docs=False, use_for_email=False, allow_readers=False, ip=None,
                 server_certificate_file=None):
        self.repo = repo
        self.ip = ip
        if use_for_email:
            email_namespace = (namespace("", "/"),)
        else:
            email_namespace = ()
        doc_namespace = (namespace(repo.name(), "/"),)
        # presumably the (personal, other-users, shared) IMAP namespace triple
        self.__namespaces = (email_namespace, (), doc_namespace)
        self.expunge_deletes_docs = expunge_deletes_docs
        self.expunge_deletes_inbox_docs = use_for_email
        self.allow_readers = allow_readers
        self.__server_certificate_file = server_certificate_file
        self.__dir = os.path.join(repo.overhead_folder(), "imap")
        if not os.path.exists(self.__dir):
            os.mkdir(self.__dir)
        mboxes = []
        inbox = None
        subscribed = []
        subscriptions = []
        subscribed_path = os.path.join(self.__dir, "subscribed")
        if os.path.exists(subscribed_path):
            # read the saved list of subscribed mailboxes
            fp = open(subscribed_path, 'r')
            try:
                for line in fp:
                    subscriptions.append(line.strip())
            finally:
                fp.close()
        categories = repo.categories()
        if use_for_email:
            inbox = uplib_email_mailbox("INBOX", self, category=False, ip=self.ip)
            mboxes.append(inbox)
            for c in categories:
                if c.startswith("email/"):
                    # strip the leading "email/" segment to form the mailbox name
                    name = string.join([x.strip() for x in c.split('/')][1:], '/')
                    if name:
                        box = uplib_email_mailbox(name, self, category=c, ip=self.ip)
                        note("new mailbox %s", box)
                        mboxes.append(box)
                        subscribed.append(box)
        # build document context:  one mailbox per category...
        for c in categories:
            name = repo.name() + '/categories/' + string.join([x.strip() for x in c.split('/')], '/')
            if name:
                box = uplib_email_mailbox(name, self, category=c, email_folder=False, ip=self.ip)
                note("new mailbox %s", box)
                mboxes.append(box)
                if (("category " + c) in subscriptions) and (box not in subscribed):
                    subscribed.append(box)
        # ...and one per collection
        for cname, c in repo.list_collections():
            name = repo.name() + '/collections/' + cname
            if name:
                box = uplib_email_mailbox(name, self, category=None, email_folder=False, ip=self.ip, collection=c)
                note("new mailbox %s", box)
                mboxes.append(box)
                if (("collection " + c.name()) in subscriptions) and (box not in subscribed):
                    subscribed.append(box)
        mailcontext.__init__(self, inbox=inbox, mailboxes=mboxes, subscribed=subscribed)
        # the checkpoint thread holds only a weak reference, so it won't keep
        # this context alive
        uthread.start_new_thread(checkpoint_thread_fn, (weakref.ref(self),))
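# Hedged sketch (not the actual UpLib implementation):  checkpoint_thread_fn
# is assumed to look roughly like this -- it periodically dereferences the
# weak reference and exits once the mail context has been garbage-collected,
# which is why only weakref.ref(self) is passed above.
#
# def checkpoint_thread_fn(context_ref):
#     while True:
#         time.sleep(CHECKPOINT_INTERVAL)       # hypothetical period
#         context = context_ref()
#         if context is None:
#             return                            # context collected; stop the thread
#         context.checkpoint()                  # hypothetical persistence hook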
    def rerip (self, changed_fields=None, wait=False):

        try:
            def rip_it (self):
                # re-run each ripper that cares about the changed metadata
                # fields, or that must re-run after some earlier ripper ran
                reruns = []
                for ripper in self.repo.rippers():
                    try:
                        if (ripper.rerun_after_metadata_changes(changed_fields=changed_fields) or
                            any([ripper.rerun_after_other_ripper(x.name()) for x in reruns])):
                            ripper.rip(self.folder(), self.id)
                            reruns.append(ripper)
                    except:
                        note("Exception running %s on %s:\n%s", ripper, self,
                             ''.join(traceback.format_exception(*sys.exc_info())))
                self.recache()

            newthread = uthread.start_new_thread(rip_it, (self,))
            if wait:
                newthread.join()
            return newthread

        except:
            excn_type, value, tb = sys.exc_info()
            note("%s", ''.join(traceback.format_exception(excn_type, value, tb)))
    def fork_request(self, fn, *args):
        note(3, "forking %s in new thread...", fn)
        newthread = uthread.start_new_thread(run_fn_in_new_thread, (self, fn, args),
                                             "handling request %s %s at %s" % (
                                                 self.request.request.method,
                                                 self.request.request.uri, time.ctime()))
        # signal the dispatcher that this request is now being handled elsewhere
        raise ForkRequestInNewThread(newthread)
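# Hedged sketch:  ForkRequestInNewThread is assumed to be a simple
# control-flow exception which the request dispatcher catches, keeping the
# connection open while the forked thread produces the response -- roughly:
#
# class ForkRequestInNewThread(Exception):
#     def __init__(self, thread):
#         Exception.__init__(self)
#         self.thread = thread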
def retry_folders (repo):
    def _retry_folders_thread_fn (repo):
        directory = repo.pending_folder()
        pending_docs = [x for x in os.listdir(directory) if DOC_ID_RE.match(x)]
        note(3, "%d docs in 'pending' folder", len(pending_docs))
        for filename in pending_docs:
            try:
                # retry this document
                folderpath = os.path.join(directory, filename)
                if (os.path.exists(os.path.join(folderpath, "UNPACKED")) or os.path.exists(os.path.join(folderpath, "RIPPED"))):
                    note(2, "Attempting to salvage pending folder %s", filename)
                    retry_folder (repo, folderpath, filename)
                else:
                    note("Files in %s may be salvageable, but not automatically.  Please check.", folderpath)
            except:
                note("retry_folders:  %s", ''.join(traceback.format_exception(*sys.exc_info())))
    uthread.start_new_thread(_retry_folders_thread_fn, (repo,), name="retry_pending_folders")
def _incorporate_document(repo, incoming_queue, counter):
    # it's a bit tricky to shut this down cleanly, so we do
    # some extra checking
    import sys, Queue, traceback
    while True:
        try:
            value = incoming_queue.get(True, 60)
            # value contains:  (id, tmpfilename, metadata, unpack_fn)
            id, tmpfilename, metadata, unpack_fn = value
            if counter is not None:
                # acquire blocks until fewer than the configured maximum
                # number of incorporation threads are running
                counter.acquire()
            uthread.start_new_thread(flesh_out_folder,
                                     (id, tmpfilename, metadata, repo, unpack_fn, counter),
                                     name="incorporating-%s" % id)
        except Queue.Empty:
            pass
        except:
            if sys and traceback and note:
                note("%s", ''.join(traceback.format_exception(*sys.exc_info())))
    def fork_request(self, fn, *args):
        import sys, traceback
        from uplib.plibUtil import uthread, note
        # HTTPCodes (used below) is assumed to be imported at module level

        def run_fn_in_new_thread(resp, fn, args):
            try:
                fn(*args)
            except:
                excn = sys.exc_info()
                note(0, "Exception calling %s with %s:\n%s", fn, args, ''.join(traceback.format_exception(*excn)))
                resp.error(HTTPCodes.INTERNAL_SERVER_ERROR, ''.join(traceback.format_exception(*excn)), "text/plain")

        self.thread = uthread.start_new_thread(run_fn_in_new_thread, (self, fn, args))
def start(repo):
    from uplib.plibUtil import note, configurator, uthread

    global CRAWLER_THREAD

    try:
        import feedparser
    except ImportError:
        note("RSSReader:  Python feedparser module not available -- can't run RSS scanner")
        return

    from uplib.indexing import HeaderField, initialize
    initialize()                # make sure the indexing headers are present
    HeaderField.HEADERS["rss-id"] = HeaderField("rss-id", True, False, False, False, None)

    if CRAWLER_THREAD is None:
        CRAWLER_THREAD = uthread.start_new_thread(_scan_rss_sites, (repo,), name="RSS feed scanner")
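# Hedged sketch:  _scan_rss_sites (defined elsewhere in this module) is
# assumed to be a periodic loop along these lines, using feedparser and the
# "rss-id" header registered above to avoid re-adding entries that are
# already in the repository:
#
# def _scan_rss_sites(repo):
#     while True:
#         for feed_url in _configured_feeds():               # hypothetical helper
#             d = feedparser.parse(feed_url)
#             for entry in d.entries:
#                 if not _already_present(repo, entry.id):   # hypothetical helper
#                     _add_entry(repo, entry)                # hypothetical helper
#         time.sleep(SCAN_INTERVAL)                          # hypothetical period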
    def update_metadata (self, newdict, reindex=True):
        lock_folder(self.__folder)
        if reindex:
            oldvals = self.get_metadata().copy()
        try:
            self.__metadata = p_update_metadata(self.metadata_path(), newdict)
            # invalidate cached values derived from the metadata
            self.__date = None
            self.__category_strings = None
            self.__citation = None
        finally:
            unlock_folder(self.__folder)
        if reindex:
            # show_stack(0, "mysterious re-indexing")
            # re-index only the fields whose values actually changed
            d = newdict.copy()
            for k in d.keys():
                if oldvals.get(k) == d.get(k):
                    del d[k]
            newthread = uthread.start_new_thread(_reindex_document_folder, (self.repo, self.__folder, self.id, d.keys()))
            note(3, "reindexing %s in %s", self.id, str(newthread))
def add(repo, response, params):
    """
    Add a document to the repository, calling ``uplib-add-document`` in a subprocess.

    :param wait: optional, whether to wait for the incorporation and ripping to \
           happen.  If not specified, ``add`` returns immediately after starting \
           the incorporation process.  If specified as ``true``, ``add`` will wait \
           until the document is available in the repository.  If specified as ``watch``, \
           ``add`` will start a new ``Job`` which can be "watched" with the ``fetch_job_output`` \
           function in ``uplib.externalAPI``.  If specified as ``bounce``, and the ``URL`` \
           parameter is also specified, the incorporation \
           will be started, and ``add`` will immediately return an HTTP redirect to \
           the value of ``URL``.  If specified as ``watchexternal``, will start a new ``Job`` \
           and immediately return the Job ID as a text/plain string.
    :type wait: string containing one of ``true``, ``watch``, ``watchexternal``, or ``bounce``
    :param content: the actual bits of the document.  One of either ``content`` or ``URL`` must be specified.
    :type content: byte sequence
    :param contenttype: the MIME type for the document content
    :type contenttype: string containing MIME type
    :param URL: the URL for the document.  One of either ``content`` or ``URL`` must be specified.
    :type URL: string
    :param documentname: the name of the document
    :type documentname: string
    :param no-redirect: if specified as ``true``, no redirect to the incorporated document \
           will be returned; instead, a document ID string as "text/plain" will be returned, \
           if ``wait`` is specified as ``true``.  Optional, defaults to "false".
    :type no-redirect: boolean
    :param bury: optional, defaults to "false", if specified as "true" will cause \
           the newly added document to be "buried" in the history list, so that it \
           won't show up in the most-recently-used listing, as it normally would
    :type bury: boolean
    :param md-title: title to put in the document metadata
    :type md-title: string
    :param md-authors: standard UpLib authors line (" and "-separated) to put in the document metadata
    :type md-authors: string
    :param md-date: standard UpLib date ([MM[/DD]/]YYYY) to put in the document metadata
    :type md-date: string
    :param md-categories: standard UpLib categories string (comma-separated category names) to put in the document metadata
    :type md-categories: string
    :param metadata: contents of a standard UpLib metadata.txt file.  If this file is provided, \
           it is typically just passed unchanged to ``uplib-add-document``.  However, it is \
           inspected for the metadata element ``replacement-contents-for``, and if that is found, \
           ``add`` will check to see that the specified document ID is still valid in that repository.
    :type metadata: string containing "text/rfc822-headers" format data
    :returns: depends on what parameters are passed.  If ``wait`` is specified as ``true`` and ``no-redirect`` \
              is specified as ``true``, will simply wait until the document has been incorporated and \
              return the document ID as a plain text string.  If ``no-redirect`` is not specified, \
              and ``wait`` is ``true``, will return an HTTP redirect to the new document in the repository. \
              If ``wait`` is specified as ``bounce``, will return an immediate redirect to the original \
              URL for the document.  If ``wait`` is not specified, will simply immediately return an HTTP \
              200 (Success) code and a non-committal message.
    :rtype: various
    """

    wait = params.get("wait")
    content = params.get("content")
    url = params.get("URL")
    docname = params.get("documentname")
    if content and (not params.get("contenttype")):
        note(3, "add:  No contenttype specified.")
        response.error(HTTPCodes.BAD_REQUEST, "No contenttype specified")
        return
    if (not content) and (not url):
        note(3, "add:  Neither content nor URL specified.")
        response.error(HTTPCodes.BAD_REQUEST, "Nothing to upload!")
        return
    
    if wait and (wait.lower() in ("watch", "watchexternal")):
        job = Job(_add_internal, repo, None, params, content, True)
        note(3, "job id is %s", job.id)
        if url:
            title = htmlescape(url)
        elif docname:
            title = htmlescape(docname)
        else:
            title = 'document'
        if (wait.lower() == "watchexternal"):
            response.reply(job.id, "text/plain")
        else:
            fp = response.open()
            fp.write('<head><title>Adding %s to repository...</title>\n' % title)
            fp.write('<script type="text/javascript" language="javascript" src="/html/javascripts/prototype.js"></script>\n')
            fp.write(JOBS_JAVASCRIPT)
            fp.write('</head><body bgcolor="%s">\n' % STANDARD_BACKGROUND_COLOR)
            fp.write('<p style="background-color: %s;"><span id="swirl">%s</span> <span id="titlespan">Adding <b>%s</b>...</span></p>\n' % (
                STANDARD_TOOLS_COLOR, SWIRLIMG, title))
            fp.write('<p id="progressreport"></p>\n')
            fp.write('<script type="text/javascript">\n'
                     'function report_error (req) {\n'
                     '  // alert("Can\'t check status of job");\n'
                     '}\n'
                     'function update_progress_report(jobid, percent_done, update_text) {\n'
                     '  // alert("update_text is " + update_text);\n'
                     '  var state = eval("(" + update_text + ")");\n'
                     '  // alert("state is " + state);\n'
                     '  if (percent_done >= 100) {\n'
                     '     $("swirl").innerHTML = \'' + SWIRLSPACER + '\';\n'
                     '     $("titlespan").innerHTML = "Finished adding ' + title + '.";\n'
                     '  }\n'
                     '  if (state.state == 2) {\n'
                     '    $("progressreport").innerHTML = \'Finished.\\n<p>Click here <a href="/action/basic/dv_show?doc_id=\' + unescape(state.doc_id) + \'"><img src="/docs/\' + unescape(state.doc_id) + \'/thumbnails/first.png" border=0></a> to open the document in the UpLib browser viewer.\';\n'
                     '  } else if (state.state == 0) {\n'
                     '    $("progressreport").innerHTML = "Extracting page images and text...";\n'
                     '  } else if (state.state == 1) {\n'
                     '    $("progressreport").innerHTML = "Finished client side, ID is " + unescape(state.doc_id) + "<br>" + unescape(state.msg);\n'
                     '  } else {\n'
                     '    $("progressreport").innerHTML = "Error:<br><pre>" + unescape(state.msg) + "</pre>";\n'
                     '  }\n'
                     '}\n'
                     'Jobs.monitor("' + job.id + '", update_progress_report, 3, report_error);\n'
                     '</script>\n')
            fp.write('</body>\n')
        return
        
    elif wait and (wait.lower() == "true"):
        response.fork_request(_add_internal, None, None, repo, response, params, content, True)
    else:
        uthread.start_new_thread(_add_internal, (None, None, repo, response, params, content, False),
                                 "UploadDocument:  adding %s" % (docname or url or time.ctime()))
        if url and wait and (wait.lower() == "bounce"):
            response.redirect(url)
        else:
            response.reply("Started new thread to add document", "text/plain")