Example #1
def main_page(req, recid, tabs, ln, template):
    """Generate the main page for the keyword tab

    URL style: http://url/record/[recid]/keywords
    :param req: request object
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object
    :return: nothing, writes using req object
    """
    form = req.form
    argd = wash_urlargd(
        form, {
            'generate': (str, 'no'),
            'sorting': (str, 'occurences'),
            'type': (str, 'tagcloud'),
            'numbering': (str, 'off'),
            'showall': (str, 'off'),
        })

    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(template.detailed_record_container_top(recid, tabs, ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # check for the cached file and delete it (we don't need it anymore, data are in the DB)
        tmp_file = get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception as msg:
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(msg)
    else:
        # Give the user the possibility to generate them ONLY if they are not already available;
        # we may have some keywords, but they are the old ones and we want to generate new ones
        new_found, new_keywords, marcrec = generate_keywords(req, recid, argd)
        if keywords and new_keywords:
            for key in keywords.keys():
                if key in new_keywords:
                    log.warning(
                        'The old "DESY" keyword will be overwritten by the newly extracted one: %s'
                        % key)
        keywords.update(new_keywords)

    if keywords:
        # Output the keywords or the generate button or some message why kw not available
        write_keywords_body(keywords, req, recid, argd, marcrec=marcrec)

    req.write(template.detailed_record_container_bottom(recid, tabs, ln))
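
A minimal sketch (not part of the original listing) of how main_page could be exercised with stubbed request and template objects; the stub classes below are hypothetical and only illustrate the interface the function relies on (req.form, req.write, and the two container methods).

class StubTemplate(object):
    def detailed_record_container_top(self, recid, tabs, ln):
        return '<div class="record-container">'

    def detailed_record_container_bottom(self, recid, tabs, ln):
        return '</div>'

class StubRequest(object):
    def __init__(self, form=None):
        self.form = form or {}     # raw URL arguments, later washed by wash_urlargd()
        self.chunks = []

    def write(self, chunk):
        self.chunks.append(chunk)  # main_page streams its HTML through req.write()

# Hypothetical invocation (recid, tabs and ln values are illustrative only):
# main_page(StubRequest({'generate': 'no'}), 1234, [], 'en', StubTemplate())
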
Example #2
def main_page(req, recid, tabs, ln, template):
    """Generate the main page for the keyword tab

    URL style: http://url/record/[recid]/keywords
    :param req: request object
    :param recid: int docid
    :param tabs: list of tab links
    :param ln: language id
    :param template: template object
    :return: nothing, writes using req object
    """
    form = req.form
    argd = wash_urlargd(
        form,
        {
            "generate": (str, "no"),
            "sorting": (str, "occurences"),
            "type": (str, "tagcloud"),
            "numbering": (str, "off"),
            "showall": (str, "off"),
        },
    )

    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(template.detailed_record_container_top(recid, tabs, ln))

    # Get the keywords from MARC (if any)
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # check for the cached file and delete it (we don't need it anymore, data are in the DB)
        tmp_file = get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception as msg:
                log.error("Error removing the cached file: %s" % tmp_file)
                log.error(msg)
    else:
        # Give the user the possibility to generate them ONLY if they are not already available;
        # we may have some keywords, but they are the old ones and we want to generate new ones
        new_found, new_keywords, marcrec = generate_keywords(req, recid, argd)
        if keywords and new_keywords:
            for key in keywords.keys():
                if key in new_keywords:
                    log.warning('The old "DESY" keyword will be overwritten by the newly extracted one: %s' % key)
        keywords.update(new_keywords)

    if keywords:
        # Output the keywords or the generate button or some message why kw not available
        write_keywords_body(keywords, req, recid, argd, marcrec=marcrec)

    req.write(template.detailed_record_container_bottom(recid, tabs, ln))
Example #3
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])), stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message('INFO: Applying taxonomy %s to recIDs %s. ' %
                                  (onto_rec['ontology'],
                                   ', '.join([str(recid) for recid in
                                              onto_rec['recIDs']])),
                                  stream=sys.stderr, verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message(
            "WARNING: No keywords found, recids: %s" % onto_recids,
            stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
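
A sketch of the data shape the loop above relies on; the keys ('ontology', 'collection', 'recIDs') are implied by the original code, while the concrete values below are invented for illustration only.

# Hypothetical return value of _get_recids_foreach_ontology():
onto_recids_example = [
    {'ontology': 'HEP', 'collection': 'Preprints', 'recIDs': [11, 12, 13]},
    {'ontology': 'HEP', 'collection': None, 'recIDs': [42]},
]

# The temporary file assembled by the loop is a MARCXML collection, roughly:
#
#   <?xml version="1.0" encoding="UTF-8"?>
#   <collection xmlns="http://www.loc.gov/MARC21/slim">
#     ... one or more <record> blocks, as returned by _analyze_documents() ...
#   </collection>
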
Example #4
def generate_keywords(req, recid, argd):
    """Extract keywords from the fulltexts.

    Do the extraction on the record with a recid equal to the parameter.
    It first checks whether keywords are not already
    stored in the temp file (possibly from a previous run).

    :param req: req object.
    :param recid: record id.
    :param argd: arguments passed from web.
    :keyword store_keywords: boolean, whether to save records in the file.
    :return: tuple (found, keywords, marcrec); keywords is a standard dictionary of kw objects or {}.
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    # check the files were not already generated
    abs_path = get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = xml_marc_to_records(open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception:
            # Corrupt or unreadable cache file; fall through and regenerate.
            pass

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if bibdocfiles:
        # User arrived at a page, but no keywords are available
        inprogress, msg = _doc_already_submitted(recid)
        if argd['generate'] != 'yes':
            # Display a form and give them the possibility to generate keywords
            if inprogress:
                req.write(
                    template.tmpl_page_msg(
                        msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                req.write(template.tmpl_page_generate_keywords(req=req,
                                                               **argd))
            return 0, keywords, None
        else:  # after user clicked on "generate" button
            if inprogress:
                req.write(
                    template.tmpl_page_msg(
                        msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                schedule_extraction(recid,
                                    taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
                req.write(
                    template.
                    tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(
                        'We have registered your request; the automated '
                        'keyword extraction will run after some time. Please come back in a while.'
                    )))

    else:
        req.write(
            template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(
                "Unfortunately, we don't have a PDF fulltext for this record in the storage, \
                    keywords cannot be generated using an automated process."))
        )

    return 0, keywords, None
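
A sketch of the argd dictionary this function expects, based on the keys read here ('ln', 'generate') and the keys washed in main_page above; the concrete values are illustrative only, and the commented call shows how the three-element return value is unpacked by the caller.

argd_example = {
    'ln': 'en',               # language id, passed to gettext_set_language()
    'generate': 'yes',        # 'yes' only after the user clicked the generate button
    'sorting': 'occurences',  # remaining keys mirror the wash_urlargd() defaults in main_page
    'type': 'tagcloud',
    'numbering': 'off',
    'showall': 'off',
}

# Hypothetical call (req and recid come from the web layer, as in main_page):
# found, keywords, marcrec = generate_keywords(req, recid, argd_example)
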
Example #5
def _task_run_core():
    """Runs analyse_documents for each ontology, collection, record ids
    set."""

    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run some records/collection only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # We will write to a temporary file as we go, because we might be processing
    # big collections with many docs
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')

    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False

    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s '
                'records)' % (onto_rec['ontology'], onto_rec['collection'],
                              len(onto_rec['recIDs'])),
                stream=sys.stderr,
                verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s. ' %
                (onto_rec['ontology'], ', '.join(
                    [str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr,
                verbose=3)
        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'], onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr,
                verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr,
                              verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in an automated mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
Example #6
def generate_keywords(req, recid, argd):
    """Extract keywords from the fulltexts.

    Do the extraction on the record with a recid equal to the parameter.
    It first checks whether keywords are not already
    stored in the temp file (possibly from a previous run).

    :param req: req object.
    :param recid: record id.
    :param argd: arguments passed from web.
    :keyword store_keywords: boolean, whether to save records in the file.
    :return: tuple (found, keywords, marcrec); keywords is a standard dictionary of kw objects or {}.
    """
    ln = argd["ln"]
    _ = gettext_set_language(ln)
    keywords = {}

    # check the files were not already generated
    abs_path = get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = xml_marc_to_records(open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception:
            # Corrupt or unreadable cache file; fall through and regenerate.
            pass

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, "runbibclassify")
    if exit_stat != 0:
        log.info("Access denied: " + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if bibdocfiles:
        # User arrived at a page, but no keywords are available
        inprogress, msg = _doc_already_submitted(recid)
        if argd["generate"] != "yes":
            # Display a form and give them the possibility to generate keywords
            if inprogress:
                req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                req.write(template.tmpl_page_generate_keywords(req=req, **argd))
            return 0, keywords, None
        else:  # after user clicked on "generate" button
            if inprogress:
                req.write(template.tmpl_page_msg(msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
                req.write(
                    template.tmpl_page_msg(
                        msg='<div class="warningbox">%s</div>'
                        % _(
                            "We have registered your request, the automated"
                            "keyword extraction will run after some time. Please return back in a while."
                        )
                    )
                )

    else:
        req.write(
            template.tmpl_page_msg(
                msg='<div class="warningbox">%s</div>'
                % _(
                    "Unfortunately, we don't have a PDF fulltext for this record in the storage, \
                    keywords cannot be generated using an automated process."
                )
            )
        )

    return 0, keywords, None