def generate_keywords(req, recid, argd):
    """Extracts keywords from the fulltext (if found) for the given recid.
    It first checks whether the keywords are not already stored in the
    temporary file (maybe from a previous run).
    @var req: request object
    @var recid: record id
    @var argd: arguments passed from the web request
    @return: tuple (status, keywords, marcrec) as returned by
        record_get_keywords(), or (0, {}, None) if no keywords are
        available yet
    """
    ln = argd['ln']
    _ = gettext_set_language(ln)
    keywords = {}

    # Check whether the keywords were already generated and cached.
    abs_path = bibclassify_engine.get_tmp_file(recid)
    if os.path.exists(abs_path):
        try:
            # Try to load the data from the tmp file.
            recs = bibupload.xml_marc_to_records(
                bibupload.open_marc_file(abs_path))
            return record_get_keywords(recs[0])
        except:
            # The cached file is unreadable; fall through and regenerate.
            pass

    # Check that this user is allowed to trigger keyword extraction.
    (exit_stat, msg) = acce.acc_authorize_action(req, 'runbibclassify')
    if exit_stat != 0:
        log.info('Access denied: ' + msg)
        msg = _("The site settings do not allow automatic keyword extraction")
        req.write(template.tmpl_page_msg(msg=msg))
        return 0, keywords, None

    # Register the extraction request.
    bibdocfiles = BibRecDocs(recid).list_latest_files()
    if bibdocfiles:
        # The user arrived at the page, but no keywords are available yet.
        inprogress, msg = _doc_already_submitted(recid)
        if argd['generate'] != 'yes':
            # Display a form giving the possibility to generate keywords.
            if inprogress:
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                req.write(template.tmpl_page_generate_keywords(req=req, **argd))
            return 0, keywords, None
        else:
            # The user clicked the "generate" button.
            if inprogress:
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' % _(msg)))
            else:
                schedule_extraction(recid,
                                    taxonomy=bconfig.CFG_EXTRACTION_TAXONOMY)
                req.write(template.tmpl_page_msg(
                    msg='<div class="warningbox">%s</div>' %
                    _('We have registered your request, the automated '
                      'keyword extraction will run after some time. '
                      'Please check back in a while.')))
    else:
        req.write(template.tmpl_page_msg(
            msg='<div class="warningbox">%s</div>' %
            _("Unfortunately, we don't have a PDF fulltext for this record "
              "in the storage, keywords cannot be generated using an "
              "automated process.")))

    return 0, keywords, None
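# A minimal illustrative sketch (not part of the original module) of how the
# keywords tab is expected to fall back to generate_keywords() when nothing
# is stored in the MARC record yet. The helper name _keywords_for_record and
# its wiring are assumptions made only for illustration.
def _keywords_for_record(req, recid, argd):
    status, keywords, marcrec = record_get_keywords(recid)
    if status and keywords:
        # Keywords already stored in the metadata; nothing to generate.
        return keywords
    # Otherwise try the cached extraction results, or offer the "generate"
    # form / schedule extraction for authorized users (handled inside).
    status, keywords, marcrec = generate_keywords(req, recid, argd)
    return keywords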
def main_page(req, recid, tabs, ln, webstyle_template, websearch_template):
    """Generates the main page for the keyword tab:
    http://url/record/[recid]/keywords
    @var req: request object
    @var recid: int record id
    @var tabs: list of tab links
    @var ln: language id
    @var webstyle_template: webstyle template object
    @var websearch_template: websearch template object
    @return: nothing, writes output using the req object
    """
    form = req.form
    argd = wash_urlargd(form, {
        'generate': (str, 'no'),
        'sorting': (str, 'occurences'),
        'type': (str, 'tagcloud'),
        'numbering': (str, 'off'),
        'showall': (str, 'off'),
    })
    for k, v in argd.items():
        argd[k] = escape(v)

    req.write(webstyle_template.detailed_record_container_top(recid, tabs, ln))
    req.write(websearch_template.tmpl_record_plots(recID=recid, ln=ln))

    # Get the keywords from MARC (if any).
    success, keywords, marcrec = record_get_keywords(recid)

    if success:
        # The keywords are already in the DB; the cached file is not
        # needed any more, so delete it.
        tmp_file = bibclassify_engine.get_tmp_file(recid)
        if os.path.exists(tmp_file):
            try:
                os.remove(tmp_file)
            except Exception, msg:
                log.error('Error removing the cached file: %s' % tmp_file)
                log.error(msg)
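# Illustrative helper (an assumption, not part of the module) showing how the
# washed 'sorting' argument from main_page() could order the keyword
# dictionary returned by record_get_keywords(); the dictionary values are
# assumed to carry an occurrence count.
def _sort_keywords(keywords, sorting='occurences'):
    if sorting == 'occurences':
        # Most frequent keywords first.
        return sorted(keywords.items(), key=lambda kv: kv[1], reverse=True)
    # Alphabetical (or any other) ordering falls back to plain lexical sort.
    return sorted(keywords.items())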
def _task_run_core():
    """Runs analyse_documents for each ontology, collection and record ids set."""
    automated_daemon_mode_p = True
    recids = bibtask.task_get_option('recids')
    collections = bibtask.task_get_option('collections')
    taxonomy = bibtask.task_get_option('taxonomy')

    if recids or collections:
        # We want to run on some records/collections only, so we are not
        # in the automated daemon mode; this will be useful later.
        automated_daemon_mode_p = False

    # Check if the user specified which documents to extract keywords from.
    if recids:
        onto_recids = _get_recids_foreach_ontology(recids=recids,
                                                   taxonomy=taxonomy)
    elif collections:
        onto_recids = _get_recids_foreach_ontology(collections=collections,
                                                   taxonomy=taxonomy)
    else:
        onto_recids = _get_recids_foreach_ontology()

    if not onto_recids:
        # Nothing to do.
        if automated_daemon_mode_p:
            _update_date_of_last_run(
                bibtask.task_get_task_param('task_starting_time'))
        return 1

    # Write to a temporary file as we go, because we might be processing
    # big collections with many documents.
    _rid = time.strftime("%Y%m%d%H%M%S", time.localtime())
    abs_path = bibclassify_engine.get_tmp_file(_rid)
    fo = open(abs_path, 'w')
    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')

    # Count the total number of records in order to update the progression.
    global _RECIDS_NUMBER
    for onto_rec in onto_recids:
        _RECIDS_NUMBER += len(onto_rec['recIDs'])

    rec_added = False
    for onto_rec in onto_recids:
        bibtask.task_sleep_now_if_required(can_stop_too=False)

        if onto_rec['collection'] is not None:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to collection %s (%s records)' %
                (onto_rec['ontology'], onto_rec['collection'],
                 len(onto_rec['recIDs'])),
                stream=sys.stderr, verbose=3)
        else:
            bibtask.write_message(
                'INFO: Applying taxonomy %s to recIDs %s.' %
                (onto_rec['ontology'],
                 ', '.join([str(recid) for recid in onto_rec['recIDs']])),
                stream=sys.stderr, verbose=3)

        if onto_rec['recIDs']:
            xml = _analyze_documents(onto_rec['recIDs'],
                                     onto_rec['ontology'],
                                     onto_rec['collection'])
            if len(xml) > 5:
                fo.write(xml)
                rec_added = True

    fo.write('</collection>\n')
    fo.close()

    # Apply the changes.
    if rec_added:
        if bconfig.CFG_DB_SAVE_KW:
            bibclassify_webinterface.upload_keywords(abs_path)
        else:
            bibtask.write_message(
                "INFO: CFG_DB_SAVE_KW is false, we don't save results",
                stream=sys.stderr, verbose=0)
    else:
        bibtask.write_message("WARNING: No keywords found, recids: %s" %
                              onto_recids,
                              stream=sys.stderr, verbose=0)
        os.remove(abs_path)

    # Update the date of last run in the clsMETHOD table, but only if
    # we were running in the automated daemon mode.
    if automated_daemon_mode_p:
        _update_date_of_last_run(
            bibtask.task_get_task_param('task_starting_time'))
    return 1
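# A small self-contained sketch of the incremental MARCXML writing pattern
# used by _task_run_core() above: write the collection wrapper once and
# append each batch of <record> elements as it is produced, so large
# collections never have to be kept in memory. The helper name and the
# 'batches' iterable are illustrative assumptions.
def _write_marcxml_batches(path, batches):
    fo = open(path, 'w')
    fo.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    fo.write('<collection xmlns="http://www.loc.gov/MARC21/slim">\n')
    for xml in batches:
        if len(xml) > 5:
            # Skip effectively empty batches, as _task_run_core() does.
            fo.write(xml)
    fo.write('</collection>\n')
    fo.close()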