def generate_keywords(req, recid, argd):
    """Return cached keywords for a record, or offer/schedule their extraction.

    The function first looks for a previously generated temporary MARCXML
    file for the record. If that file exists and can be parsed, the keywords
    of its first record are returned directly. Otherwise the user's
    authorization is checked and, depending on the state of the request,
    a message or a "generate keywords" form is written to ``req``.

    @var req: req object; pages/messages are written to it with req.write()
    @var recid: record id
    @var argd: arguments passed from web. The keys 'ln' (language) and
        'generate' ('yes' once the user asked for the extraction) are read
        here; the whole dictionary is forwarded as keyword arguments to
        template.tmpl_page_generate_keywords()
    @return: the result of record_get_keywords() when cached keywords could
        be loaded; in every other case the tuple (0, {}, None).
        NOTE(review): the two return shapes differ - callers have to cope
        with both; confirm against the call sites before unifying them.
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    def write_warning(text):
        # All notices of this page share the same "warningbox" wrapper.
        req.write(template.tmpl_page_msg(
            msg='<div class="warningbox">%s</div>' % text))

    # Check whether the keywords were already generated (e.g. by an earlier
    # run) and stored in the temporary file.
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            recs = bibupload.xml_marc_to_records(
                bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except (Exception, SystemExit) as err:
            # Best effort: an unreadable cache must not break the page, so
            # fall through to the regular flow. SystemExit is caught too
            # because the bibupload helpers presumably bail out through
            # sys.exit() on malformed input (TODO confirm); unlike the former
            # bare "except", KeyboardInterrupt is no longer swallowed.
            log.info('Could not load cached keywords from %s: %r'
                     % (abs_path, err))

    # Check that this user is allowed to trigger the generation.
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # Register the generation.
    # NOTE: the two long messages below were normalised (missing space in
    # "automated keyword", no source indentation inside the literal); their
    # msgids changed, so translation catalogs need to be refreshed.
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if not bibdocfiles:
        write_warning(_(
            "Unfortunately, we don't have a PDF fulltext for this record "
            "in the storage, keywords cannot be generated using an "
            "automated process."))
        return 0, keywords, None

    # The user arrived at the page, but no keywords are available yet.
    inprogress, msg = _doc_already_submitted(recid)
    if inprogress:
        # An extraction is already pending, whatever the user clicked.
        write_warning(_(msg))
    elif argd['generate'] != 'yes':
        # Display a form that gives the user the possibility to generate
        # the keywords.
        req.write(template.tmpl_page_generate_keywords(req=req, **argd))
    else:
        # The user clicked on the "generate" button.
        schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
        write_warning(_(
            'We have registered your request, the automated '
            'keyword extraction will run after some time. '
            'Please return back in a while.'))

    return 0, keywords, None
if len(args) != 1: sys.stderr.write("Error: Missing MARCXML to analyse") print usage sys.exit(1) input_filename = args[0] if not os.path.exists(input_filename): sys.stderr.write("Please enter a valid filename for input.") sys.exit(1) if not os.path.exists(config_path): sys.stderr.write("Please enter a valid filename for config.") sys.exit(1) # Read and wash incoming data file_data = open_marc_file(input_filename) washed_data = wash_for_xml(wash_for_utf8(file_data)) # Transform MARCXML to record structure records = create_records(washed_data) action_dict = read_actions_configuration_file(config_path) insert_records = [] append_records = [] correct_records = [] holdingpen_records = [] for rec in records: record = rec[0] if record is None: sys.stderr.write("Record is None: %s" % (rec[2],)) sys.exit(1)
if len(args) != 1: sys.stderr.write("Error: Missing MARCXML to analyse") print usage sys.exit(1) input_filename = args[0] if not os.path.exists(input_filename): sys.stderr.write("Please enter a valid filename for input.") sys.exit(1) if not os.path.exists(config_path): sys.stderr.write("Please enter a valid filename for config.") sys.exit(1) # Read and wash incoming data file_data = open_marc_file(input_filename) washed_data = wash_for_xml(wash_for_utf8(file_data)) # Transform MARCXML to record structure records = create_records(washed_data) action_dict = read_actions_configuration_file(config_path) insert_records = [] append_records = [] correct_records = [] holdingpen_records = [] for rec in records: record = rec[0] if record is None: sys.stderr.write("Record is None: %s" % (rec[2], )) sys.exit(1)
sys.stderr.write("Error: Missing MARCXML to analyse") print usage sys.exit(1) input_filename = args[0] if not os.path.exists(input_filename): sys.stderr.write("Please enter a valid filename for input.") sys.exit(1) if not os.path.exists(config_path): sys.stderr.write("Please enter a valid filename for config.") sys.exit(1) # Transform MARCXML to record structure try: records = create_records(open_marc_file(input_filename)) except: sys.stderr.write("bibupload.xml_marc_to_records failed on file: %s" % (input_filename, )) sys.exit(3) action_dict = read_actions_configuration_file(config_path) insert_records = [] append_records = [] correct_records = [] holdingpen_records = [] for rec in records: record = rec[0] # Perform various checks to determine an suitable action to be taken for # that particular record. Whether it will be inserted, discarded or replacing # existing records
def generate_keywords(req, recid, argd):
    """Return cached keywords for a record, or offer/schedule their extraction.

    The function first looks for a previously generated temporary MARCXML
    file for the record. If that file exists and can be parsed, the keywords
    of its first record are returned directly. Otherwise the user's
    authorization is checked and, depending on the state of the request,
    a message or a "generate keywords" form is written to ``req``.

    @var req: req object; pages/messages are written to it with req.write()
    @var recid: record id
    @var argd: arguments passed from web. The keys 'ln' (language) and
        'generate' ('yes' once the user asked for the extraction) are read
        here; the whole dictionary is forwarded as keyword arguments to
        template.tmpl_page_generate_keywords()
    @return: the result of record_get_keywords() when cached keywords could
        be loaded; in every other case the tuple (0, {}, None).
        NOTE(review): the two return shapes differ - callers have to cope
        with both; confirm against the call sites before unifying them.
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    def write_warning(text):
        # All notices of this page share the same "warningbox" wrapper.
        req.write(template.tmpl_page_msg(
            msg='<div class="warningbox">%s</div>' % text))

    # Check whether the keywords were already generated (e.g. by an earlier
    # run) and stored in the temporary file.
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            recs = bibupload.xml_marc_to_records(
                bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except (Exception, SystemExit) as err:
            # Best effort: an unreadable cache must not break the page, so
            # fall through to the regular flow. SystemExit is caught too
            # because the bibupload helpers presumably bail out through
            # sys.exit() on malformed input (TODO confirm); unlike the former
            # bare "except", KeyboardInterrupt is no longer swallowed.
            log.info('Could not load cached keywords from %s: %r'
                     % (abs_path, err))

    # Check that this user is allowed to trigger the generation.
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # Register the generation.
    # NOTE: the two long messages below were normalised (missing space in
    # "automated keyword", no source indentation inside the literal); their
    # msgids changed, so translation catalogs need to be refreshed.
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if not bibdocfiles:
        write_warning(_(
            "Unfortunately, we don't have a PDF fulltext for this record "
            "in the storage, keywords cannot be generated using an "
            "automated process."))
        return 0, keywords, None

    # The user arrived at the page, but no keywords are available yet.
    inprogress, msg = _doc_already_submitted(recid)
    if inprogress:
        # An extraction is already pending, whatever the user clicked.
        write_warning(_(msg))
    elif argd['generate'] != 'yes':
        # Display a form that gives the user the possibility to generate
        # the keywords.
        req.write(template.tmpl_page_generate_keywords(req=req, **argd))
    else:
        # The user clicked on the "generate" button.
        schedule_extraction(recid, taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
        write_warning(_(
            'We have registered your request, the automated '
            'keyword extraction will run after some time. '
            'Please return back in a while.'))

    return 0, keywords, None
sys.stderr.write("Error: Missing MARCXML to analyse") print usage sys.exit(1) input_filename = args[0] if not os.path.exists(input_filename): sys.stderr.write("Please enter a valid filename for input.") sys.exit(1) if not os.path.exists(config_path): sys.stderr.write("Please enter a valid filename for config.") sys.exit(1) # Transform MARCXML to record structure try: records = create_records(open_marc_file(input_filename)) except: sys.stderr.write("bibupload.xml_marc_to_records failed on file: %s" % (input_filename,)) sys.exit(3) action_dict = read_actions_configuration_file(config_path) insert_records = [] append_records = [] correct_records = [] holdingpen_records = [] for rec in records: record = rec[0] # Perform various checks to determine an suitable action to be taken for # that particular record. Whether it will be inserted, discarded or replacing # existing records #