def generate_keywords(req, recid, argd):
    """Extracts keywords from the fulltext (if found) for the given
    recid. First checks whether the keywords are already stored in a
    temporary file (possibly left over from a previous run).
    @var req: request object
    @var recid: record id
    @var argd: arguments passed from the web request
    @return: tuple (status, dictionary of keyword objects or {}, marc record)
    """

    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    # check whether the file was already generated (e.g. by a previous run)
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = bibupload.xml_marc_to_records(bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception:
            # The cached file is unreadable or corrupted; fall through
            # and regenerate the keywords below.
            pass

    # check whether this user is allowed to run keyword extraction
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if bibdocfiles:
        # User arrived at a page, but no keywords are available
        inprogress, msg = _doc_already_submitted(recid)
        if argd['generate'] != 'yes':
            # Display a form giving the user the option to generate keywords
            if inprogress:
                req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                req.write(template.tmpl_page_generate_keywords(req=req, **argd))
            return 0, keywords, None
        else:  # after the user clicked the "generate" button
            if inprogress:
                req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
                req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' %
                                                 _('We have registered your request; the automated '
                                                   'keyword extraction will run after some time. '
                                                   'Please check back in a while.')))

    else:
        req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' %
                    _("Unfortunately, we don't have a PDF fulltext for this record in "
                      "storage, so keywords cannot be generated automatically.")))

    return 0, keywords, None
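
A minimal sketch of how a caller might consume generate_keywords(); the keywords_tab() handler and display_keywords() helper are hypothetical, shown only to illustrate the (status, keywords, marcrec) return shape:

def keywords_tab(req, recid, argd):
    # Hypothetical caller: render keywords when extraction produced any;
    # otherwise generate_keywords() has already written a message page.
    status, keywords, marcrec = generate_keywords(req, recid, argd)
    if keywords:
        display_keywords(req, keywords)  # hypothetical rendering helper
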
Example #2
def main_page(req, recid, tabs, ln, webstyle_template, websearch_template):
    """Generates the main page for the keywords tab - http://url/record/[recid]/keywords
    @var req: request object
    @var recid: record id (int)
    @var tabs: list of tab links
    @var ln: language id
    @var webstyle_template: webstyle template object
    @var websearch_template: websearch template object
    @return: nothing, writes output using the req object
    """

    form = req.form
    argd = wash_urlargd(
        form, {
            'generate': (str, 'no'),
            'sorting': (str, 'occurences'),
            'type': (str, 'tagcloud'),
            'numbering': (str, 'off'),
            'showall': (str, 'off'),
        })

    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(webstyle_template.detailed_record_container_top(recid, tabs, ln))
    req.write(websearch_template.tmpl_record_plots(recID=recid, ln=ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # check for the cached file and delete it (we don't need it anymore, data are in the DB)
        tmp_file = bibclassify_engine.get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception, msg:
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(msg)
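
A minimal sketch of the washing step used above, assuming wash_urlargd() casts each form field to its declared type and falls back to the default when the field is missing or malformed. This is an illustration of the expected behaviour, not Invenio's actual implementation:

def wash_urlargd_sketch(form, defaults):
    # Illustrative stand-in for wash_urlargd(): cast each incoming form
    # value to the declared type, or substitute the declared default.
    washed = {}
    for name, (type_, default) in defaults.items():
        try:
            washed[name] = type_(form[name])
        except (KeyError, ValueError, TypeError):
            washed[name] = default
    return washed
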
Example #3

def main_page(req, recid, tabs, ln, template):
    """Generates the main page for the keywords tab - http://url/record/[recid]/keywords
    This variant takes a single consolidated template object instead of
    separate webstyle and websearch templates.
    @var req: request object
    @var recid: record id (int)
    @var tabs: list of tab links
    @var ln: language id
    @var template: template object
    @return: nothing, writes output using the req object
    """

    form = req.form
    argd = wash_urlargd(form, {
        'generate': (str, 'no'),
        'sorting': (str, 'occurences'),
        'type': (str, 'tagcloud'),
        'numbering': (str, 'off'),
        'showall': (str, 'off'),
    })

    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(template.detailed_record_container_top(recid, tabs, ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # check for the cached file and delete it (we don't need it anymore, data are in the DB)
        tmp_file = bibclassify_engine.get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception, msg:
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(msg)
Example #4
def _task_run_core():
    """Runs _analyze_documents for each (ontology, collection, record
    IDs) set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records so we can report progress.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'], ', '.join(
                    [str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr,
                verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                     onto_rec['collection'])
            # Heuristic: treat anything longer than a few characters as
            # real MARC output rather than empty whitespace.
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
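
For reference, a minimal sketch of the structure the loop above consumes; the exact return value of _get_recids_foreach_ontology() is an assumption inferred from the onto_rec['ontology'], onto_rec['collection'] and onto_rec['recIDs'] accesses in _task_run_core():

# Hypothetical example of the onto_recids structure, inferred from how
# _task_run_core() indexes each entry; 'collection' may be None when
# explicit recIDs were requested instead of a collection.
onto_recids_example = [
    {'ontology': 'HEP', 'collection': 'Preprints', 'recIDs': [1, 2, 3]},
    {'ontology': 'HEP', 'collection': None, 'recIDs': [42]},
]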