def generate_keywords(req, recid, argd):
    """Return cached keywords for a record, or handle an extraction request.

    If the temporary file for the record exists and can be parsed, the
    keywords of its first record are returned immediately. Otherwise the
    'runbibclassify' authorization is checked and exactly one page is
    written to req: an "access denied" message, a "no fulltext" warning,
    an "already in progress" warning, the form offering to generate
    keywords, or a confirmation that the extraction has been scheduled.

    @var req: request object; handed to the access check and written to
    @var recid: record id
    @var argd: arguments passed from web; the keys 'ln' and 'generate' are
        read here and the whole dictionary is forwarded to the form template
    @return: on a cache hit, whatever record_get_keywords() returns for the
        first cached record; on every other path the tuple (0, {}, None).
        NOTE(review): the two shapes differ, callers must cope with both.
    """

    _ = gettext_set_language(argd['ln'])
    keywords = {}
    warning_box = '<div class="warningbox">%s</div>'

    # check the files were not already generated
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = bibupload.xml_marc_to_records(
                bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception:
            # Best effort: an unreadable/empty cache file must not break the
            # page, so fall through to the regular flow. Only Exception is
            # caught so that SystemExit/KeyboardInterrupt still propagate,
            # and the failure is logged instead of vanishing silently.
            log.info('Could not load cached keywords from %s' % abs_path)

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if not bibdocfiles:
        # Nothing to extract from. The backslash continuation keeps the
        # original message text (including its embedded run of spaces)
        # unchanged, so the translation key stays the same.
        req.write(template.tmpl_page_msg(msg=warning_box % _(
            "Unfortunately, we don't have a PDF fulltext for this record in the storage, \
                    keywords cannot be generated using an automated process.")))
        return 0, keywords, None

    # User arrived at a page, but no keywords are available
    inprogress, msg = _doc_already_submitted(recid)
    generate_requested = argd['generate'] == 'yes'

    if inprogress:
        # Same warning whether or not the "generate" button was clicked
        req.write(template.tmpl_page_msg(msg=warning_box % _(msg)))
    elif not generate_requested:
        # Display a form and give them possibility to generate keywords
        req.write(template.tmpl_page_generate_keywords(req=req, **argd))
    else:
        # after user clicked on "generate" button
        schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
        # The first literal ends with a space: the two adjacent literals are
        # concatenated by the parser and previously rendered as
        # "automatedkeyword". This changes the gettext message id.
        req.write(template.tmpl_page_msg(msg=warning_box % _(
            'We have registered your request, the automated '
            'keyword extraction will run after some time. Please return back in a while.')))

    return 0, keywords, None
    if len(args) != 1:
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print usage
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Read and wash incoming data
    file_data = open_marc_file(input_filename)
    washed_data = wash_for_xml(wash_for_utf8(file_data))

    # Transform MARCXML to record structure
    records = create_records(washed_data)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        if record is None:
            sys.stderr.write("Record is None: %s" % (rec[2],))
            sys.exit(1)
Esempio n. 3
0
    if len(args) != 1:
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print usage
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Read and wash incoming data
    file_data = open_marc_file(input_filename)
    washed_data = wash_for_xml(wash_for_utf8(file_data))

    # Transform MARCXML to record structure
    records = create_records(washed_data)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        if record is None:
            sys.stderr.write("Record is None: %s" % (rec[2], ))
            sys.exit(1)
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print usage
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Transform MARCXML to record structure
    try:
        records = create_records(open_marc_file(input_filename))
    except:
        sys.stderr.write("bibupload.xml_marc_to_records failed on file: %s" %
                         (input_filename, ))
        sys.exit(3)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        # Perform various checks to determine an suitable action to be taken for
        # that particular record. Whether it will be inserted, discarded or replacing
        # existing records
Esempio n. 5
0
def generate_keywords(req, recid, argd):
    """Return cached keywords for a record, or handle an extraction request.

    If the temporary file for the record exists and can be parsed, the
    keywords of its first record are returned immediately. Otherwise the
    'runbibclassify' authorization is checked and exactly one page is
    written to req: an "access denied" message, a "no fulltext" warning,
    an "already in progress" warning, the form offering to generate
    keywords, or a confirmation that the extraction has been scheduled.

    @var req: request object; handed to the access check and written to
    @var recid: record id
    @var argd: arguments passed from web; the keys 'ln' and 'generate' are
        read here and the whole dictionary is forwarded to the form template
    @return: on a cache hit, whatever record_get_keywords() returns for the
        first cached record; on every other path the tuple (0, {}, None).
        NOTE(review): the two shapes differ, callers must cope with both.
    """

    _ = gettext_set_language(argd['ln'])
    keywords = {}
    warning_box = '<div class="warningbox">%s</div>'

    # check the files were not already generated
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file
            recs = bibupload.xml_marc_to_records(
                bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except Exception:
            # Best effort: an unreadable/empty cache file must not break the
            # page, so fall through to the regular flow. Only Exception is
            # caught so that SystemExit/KeyboardInterrupt still propagate,
            # and the failure is logged instead of vanishing silently.
            log.info('Could not load cached keywords from %s' % abs_path)

    # check it is allowed (for this user) to generate pages
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # register generation
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if not bibdocfiles:
        # Nothing to extract from. The backslash continuation keeps the
        # original message text (including its embedded run of spaces)
        # unchanged, so the translation key stays the same.
        req.write(template.tmpl_page_msg(msg=warning_box % _(
            "Unfortunately, we don't have a PDF fulltext for this record in the storage, \
                    keywords cannot be generated using an automated process.")))
        return 0, keywords, None

    # User arrived at a page, but no keywords are available
    inprogress, msg = _doc_already_submitted(recid)
    generate_requested = argd['generate'] == 'yes'

    if inprogress:
        # Same warning whether or not the "generate" button was clicked
        req.write(template.tmpl_page_msg(msg=warning_box % _(msg)))
    elif not generate_requested:
        # Display a form and give them possibility to generate keywords
        req.write(template.tmpl_page_generate_keywords(req=req, **argd))
    else:
        # after user clicked on "generate" button
        schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
        # The first literal ends with a space: the two adjacent literals are
        # concatenated by the parser and previously rendered as
        # "automatedkeyword". This changes the gettext message id.
        req.write(template.tmpl_page_msg(msg=warning_box % _(
            'We have registered your request, the automated '
            'keyword extraction will run after some time. Please return back in a while.')))

    return 0, keywords, None
        sys.stderr.write("Error: Missing MARCXML to analyse")
        print usage
        sys.exit(1)

    input_filename = args[0]

    if not os.path.exists(input_filename):
        sys.stderr.write("Please enter a valid filename for input.")
        sys.exit(1)
    if not os.path.exists(config_path):
        sys.stderr.write("Please enter a valid filename for config.")
        sys.exit(1)

    # Transform MARCXML to record structure
    try:
        records = create_records(open_marc_file(input_filename))
    except:
        sys.stderr.write("bibupload.xml_marc_to_records failed on file: %s" % (input_filename,))
        sys.exit(3)
    action_dict = read_actions_configuration_file(config_path)
    insert_records = []
    append_records = []
    correct_records = []
    holdingpen_records = []

    for rec in records:
        record = rec[0]
        # Perform various checks to determine an suitable action to be taken for
        # that particular record. Whether it will be inserted, discarded or replacing
        # existing records
        #