def test_check_bibdoc_authorization(self):
    """bibdocfile - check_bibdoc_authorization function

    Calls check_bibdoc_authorization() for two users (looked up by
    e-mail) with a series of status strings.  Each call is expected
    either to return exactly ``(0, CFG_WEBACCESS_WARNING_MSGS[0])`` or
    to return a tuple whose first element is not 0.
    """
    from invenio.webuser import collect_user_info, get_uid_from_email

    def expect_zero(user_info, status):
        # The complete (code, message) pair must match the entry for code 0.
        self.assertEqual(check_bibdoc_authorization(user_info, status), (0, CFG_WEBACCESS_WARNING_MSGS[0]))

    def expect_nonzero(user_info, status):
        # Only the code is checked here; the message is not inspected.
        self.assertNotEqual(check_bibdoc_authorization(user_info, status)[0], 0)

    # First user: role- and e-mail-based status strings.
    jekyll = collect_user_info(get_uid_from_email('*****@*****.**'))
    expect_zero(jekyll, 'role:thesesviewer')
    expect_zero(jekyll, 'role: thesesviewer')
    expect_zero(jekyll, 'role: thesesviewer')
    expect_nonzero(jekyll, 'Role: thesesviewer')
    expect_zero(jekyll, 'email: [email protected]')
    expect_zero(jekyll, 'email: [email protected]')

    # Second user: bare status names and their 'status: ' prefixed forms.
    juliet = collect_user_info(get_uid_from_email('*****@*****.**'))
    expect_zero(juliet, 'restricted_picture')
    expect_zero(juliet, 'status: restricted_picture')
    expect_nonzero(juliet, 'restricted_video')
    expect_nonzero(juliet, 'status: restricted_video')
def get_pdf_snippets(recID, patterns, user_info):
    """
    Extract text snippets around 'patterns' from the fulltext of 'recID'.

    The bibdocs of the record are scanned in the order returned by
    BibRecDocs.list_bibdocs() and the first one that has extracted text
    and that may be shown is used.  On Inspire sites (CFG_INSPIRE_SITE)
    every textable document qualifies; elsewhere a document is used only
    when check_bibdoc_authorization() returns a false first element for
    'user_info' and the document status.

    The snippet length and the number of snippets are looked up in
    CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS and
    CFG_WEBSEARCH_FULLTEXT_SNIPPETS under the document status (on
    Inspire: the doctype when the status is empty), falling back to the
    '' entry and finally to 0, which disables snippets.

    Pattern matching itself is delegated to the generator selected by
    CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR ('native' or 'SOLR').

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param user_info: user information handed to
        check_bibdoc_authorization() together with the document status
    @return: the snippets wrapped in a <div class="snippetbox"> element,
        or "" when there is no usable document, no snippet budget or
        no hit
    """
    from invenio.bibdocfile import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # Show excluded fulltext in snippets on Inspire, otherwise depending on authorization
        if bd.get_text() and (CFG_INSPIRE_SITE or not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file

    # Courtesy-specific limits win over the '' (default) entry; dict.get()
    # replaces dict.has_key(), which no longer exists in Python 3.
    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get(
        text_path_courtesy, CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0))
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get(
        text_path_courtesy, CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0))

    if not (text_path and nb_chars and max_snippets):
        return ""

    out = ''
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
        out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)
    elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
        out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

    if not out:
        return ""

    out_courtesy = ""
    if CFG_INSPIRE_SITE and text_path_courtesy:
        out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
    return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
def get_pdf_snippets(recID, patterns, user_info):
    """
    Extract text snippets around 'patterns' from the fulltext of 'recID'.

    The first bibdoc returned by BibRecDocs.list_bibdocs() that has
    extracted text and that may be shown is used: on Inspire sites
    (CFG_INSPIRE_SITE) any textable document, elsewhere only one for
    which check_bibdoc_authorization() returns a false first element
    for 'user_info' and the document status.

    Two limits are then resolved for the "courtesy" key, i.e. the
    document status (on Inspire: the doctype when the status is empty,
    with 'INSPIRE-PUBLIC' treated as empty):
      - characters per snippet from CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS
      - number of snippets from CFG_WEBSEARCH_FULLTEXT_SNIPPETS
    Each falls back to the mapping's '' entry and then to 0; a zero
    limit disables snippets.

    Pattern matching itself is delegated to the generator selected by
    CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR ('native' or 'SOLR').

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param user_info: user information handed to
        check_bibdoc_authorization() together with the document status
    @return: the snippets wrapped in a <div class="snippetbox"> element,
        or "" when there is no usable document, no snippet budget or
        no hit
    """
    from invenio.bibdocfile import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # Show excluded fulltext in snippets on Inspire, otherwise depending on authorization
        if bd.get_text() and (CFG_INSPIRE_SITE or not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file

    chars_default = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    # The snippet-count default must come from the snippet-count mapping;
    # reading it from the ..._CHARS mapping would turn a character budget
    # into a number of snippets.
    count_default = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)

    # dict.get() with a fallback is equivalent to the has_key() test plus
    # indexing and also works where dict.has_key() no longer exists.
    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get(text_path_courtesy, chars_default)
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get(text_path_courtesy, count_default)

    if not (text_path and nb_chars and max_snippets):
        return ""

    out = ''
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
        out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.bibindex_engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)
    elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
        out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

    if not out:
        return ""

    out_courtesy = ""
    if CFG_INSPIRE_SITE and text_path_courtesy:
        out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
    return """<div class="snippetbox">%s%s</div>""" % (out_courtesy, out)
def test_check_bibdoc_authorization(self):
    """bibdocfile - check_bibdoc_authorization function

    Feeds check_bibdoc_authorization() a list of status strings for two
    users (looked up by e-mail) and checks that each call either returns
    exactly ``(0, CFG_WEBACCESS_WARNING_MSGS[0])`` or returns a tuple
    whose first element is not 0.
    """
    from invenio.bibdocfile import check_bibdoc_authorization
    from invenio.webuser import collect_user_info, get_uid_from_email

    # Status strings for which the first user must get the code-0 result.
    jekyll_zero_statuses = (
        'role:thesesviewer',
        'role: thesesviewer',
        'role: thesesviewer',
        'Role: thesesviewer',
        'email: [email protected]',
        'email: [email protected]',
    )
    jekyll = collect_user_info(get_uid_from_email('*****@*****.**'))
    for status in jekyll_zero_statuses:
        self.assertEqual(check_bibdoc_authorization(jekyll, status), (0, CFG_WEBACCESS_WARNING_MSGS[0]))

    juliet = collect_user_info(get_uid_from_email('*****@*****.**'))
    # Bare status name and its 'status: ' prefixed form: code-0 result.
    for status in ('restricted_picture', 'status: restricted_picture'):
        self.assertEqual(check_bibdoc_authorization(juliet, status), (0, CFG_WEBACCESS_WARNING_MSGS[0]))
    # Same two forms for the other status: only a non-zero code is required.
    for status in ('restricted_video', 'status: restricted_video'):
        self.assertNotEqual(check_bibdoc_authorization(juliet, status)[0], 0)