def get_pdf_snippets(recID, patterns, user_info):
    """Extract text snippets around 'patterns' from the newest PDF file of 'recID'.

    The search is case-insensitive. The snippets are meant to look like
    in the results of the popular search engine: using " ... " between
    snippets. For empty patterns it returns "".

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param user_info: the user_info object from collect_user_info
    @return: snippet
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # Show excluded fulltext in snippets on Inspire, otherwise depending
        # on authorization.
        if hasattr(bd, 'get_text') and \
                (CFG_INSPIRE_SITE or
                 not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break  # stop at the first good PDF textable file

    # Per-courtesy snippet configuration, falling back to the '' defaults.
    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get(
        text_path_courtesy, CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0))
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get(
        text_path_courtesy, CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0))

    if not (text_path and nb_chars and max_snippets):
        return ""

    out = ''
    if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
        out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
        if not out:
            # no hit, so check stemmed versions:
            from invenio.legacy.bibindex.engine_stemmer import stem
            stemmed_patterns = [stem(p, 'en') for p in patterns]
            out = get_text_snippets(text_path, stemmed_patterns,
                                    nb_chars, max_snippets)
    elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
        from invenio.legacy.miscutil.solrutils_bibindex_searcher import solr_get_snippet
        out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

    if not out:
        return ""
    out_courtesy = ""
    if CFG_INSPIRE_SITE and text_path_courtesy:
        out_courtesy = ('<strong>Snippets courtesy of ' +
                        text_path_courtesy + '</strong><br>')
    return '%s%s' % (out_courtesy, out)
def apply_stemming(word, stemming_language):
    """Return ``word`` after applying stemming (if stemming language is set).

    You can change your stemming language in database.

    @param word: word to be checked
    @type word: str
    @param stemming_language: abbreviation of language or None
    @type stemming_language: str
    """
    # No language configured means no stemming at all.
    return stem(word, stemming_language) if stemming_language else word
def clean_and_split_words_and_stem(string, ln=CFG_SITE_LANG, stem_p=True):
    """Split and stem words in a string.

    :param ln: language to consider for stemming
    :param stem_p: if True, also stem the word according to ``ln``
    :return: list of (stemmed) word.
    """
    # Replace every non-alphanumeric character by a space, then lowercase.
    normalized = re_non_alphanum_only.sub(" ", string).lower()
    tokens = re_split_words_pattern.split(normalized)
    if not stem_p:
        return tokens
    # lowering must be done after stemming
    return [stem(token, ln) for token in tokens]
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """Answer question given by context, using knowledge base.

    Return (relevance, html_string) where relevance is integer from 0 to
    100 indicating how relevant to the question the answer is (see
    L{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
    and html_string being a formatted answer.
    """
    _ = gettext_set_language(ln)
    # words = [stem(unit[1], ln) for unit in search_units if unit[2] == '']
    words = [stem(unit[1].lower(), CFG_SITE_LANG)
             for unit in search_units if unit[2] == '']
    cache = self.get_data_cache()
    # Count, per cached keyword, how many query words match it.
    matching_values = {}
    for word in words:
        for keyword in cache.get(word, []):
            matching_values[keyword] = matching_values.get(keyword, 0) + 1
    # Order matching values per score.
    # NOTE: .items() + plain lambda replace the Python-2-only .iteritems()
    # and tuple-parameter unpacking (removed by PEP 3113).
    matching_values_sorted = sorted(matching_values.items(),
                                    key=lambda item: (item[1], item[0]),
                                    reverse=True)
    if not matching_values_sorted:
        return (0, '')
    best_score = matching_values_sorted[0][1]
    # Compute relevance: how many words from the query did match.
    # Guard against ZeroDivisionError when every query word is <= 3 chars
    # (matches could still exist via short words).
    nb_long_words = len([word for word in words if len(word) > 3])
    if not nb_long_words:
        return (0, '')
    relevance = min(
        100, max(0, (100 * float(best_score) / nb_long_words) - 10))
    labels_and_links = [m.split("|", 1) for m in matching_values.keys()]
    translated_labels_and_links = [(_(label), url)
                                   for label, url in labels_and_links]
    return (relevance,
            self.display_answer_helper(translated_labels_and_links, ln))
def prepare_data_cache(self):
    """'Index' collection names.

    Build a dict mapping each stemmed word appearing in any translated
    collection name to the list of collection names containing it, so a
    collection can later be looked up with a single dict access.
    """
    from invenio.legacy.search_engine import collection_i18nname_cache
    cache = {}
    # NOTE: .items() replaces the Python-2-only .iteritems(); the original
    # quadruple-nested comprehension is decomposed into explicit loops.
    for coll_name, translations in collection_i18nname_cache.cache.items():
        for ln, translation in translations.items():
            # Keep alphanumeric characters only, then stem each word of
            # the translation in its own language.
            cleaned = non_alphanum_chars_only_re.sub(' ', translation)
            for word in whitespace_re.split(cleaned):
                if not word.strip():
                    continue
                stemmed = stem(word.lower(), ln)
                colls = cache.setdefault(stemmed, [])
                if coll_name not in colls:
                    colls.append(coll_name)
    return cache
def test_stemmer_greek(self):
    """bibindex engine - Greek stemmer"""
    # (input word, expected stem) pairs for the 'el' stemmer.
    cases = (('πληροφορίες', 'ΠΛΗΡΟΦΟΡΙ'),
             ('πείραμα', 'ΠΕΙΡΑΜ'),
             ('πειράματα', 'ΠΕΙΡΑΜ'),
             ('πειραματιστής', 'ΠΕΙΡΑΜΑΤΙΣΤ'),
             ('πειραματίζομαι', 'ΠΕΙΡΑΜΑΤΙΖ'),
             ('πειραματίζεσαι', 'ΠΕΙΡΑΜΑΤΙΖ'),
             ('πειραματίστηκα', 'ΠΕΙΡΑΜΑΤΙΣΤ'),
             ('πειραματόζωο', 'ΠΕΙΡΑΜΑΤΟΖΩ'),
             ('ζώο', 'ΖΩ'),
             ('πειραματισμός', 'ΠΕΙΡΑΜΑΤΙΣΜ'),
             ('πειραματικός', 'ΠΕΙΡΑΜΑΤΙΚ'),
             ('πειραματικά', 'ΠΕΙΡΑΜΑΤ'),
             ('ηλεκτρόνιο', 'ΗΛΕΚΤΡΟΝΙ'),
             ('ηλεκτρονιακός', 'ΗΛΕΚΤΡΟΝΙΑΚ'),
             ('ακτίνα', 'ΑΚΤΙΝ'),
             ('ακτινοβολία', 'ΑΚΤΙΝΟΒΟΛ'),
             ('E=mc^2', 'E=MC^2'),
             ('α+β=γ', 'Α+Β=Γ'))
    for word, expected in cases:
        self.assertEqual(expected, bibindex_engine_stemmer.stem(word, "el"))
def test_stemmer_english(self):
    """bibindex engine - English stemmer"""
    # (input word, expected stem) pairs for the 'en' stemmer.
    # NOTE: raw strings for the '$\\Omega$' cases — the original '$\Omega$'
    # relied on the invalid escape '\O' (SyntaxWarning in modern Python);
    # the runtime value is unchanged.
    english_test_cases = [['information', 'inform'],
                          ['experiment', 'experi'],
                          ['experiments', 'experi'],
                          ['experimented', 'experi'],
                          ['experimenting', 'experi'],
                          ['experimental', 'experiment'],
                          ['experimentally', 'experiment'],
                          ['experimentation', 'experiment'],
                          ['experimentalism', 'experiment'],
                          ['experimenter', 'experiment'],
                          ['experimentalise', 'experimentalis'],
                          ['experimentalist', 'experimentalist'],
                          ['experimentalists', 'experimentalist'],
                          ['GeV', 'GeV'],
                          [r'$\Omega$', r'$\Omega$'],
                          ['e^-', 'e^-'],
                          ['C#', 'C#'],
                          ['C++', 'C++']]
    for test_word, expected_result in english_test_cases:
        self.assertEqual(expected_result,
                         bibindex_engine_stemmer.stem(test_word, "en"))
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance, verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()
    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code
    lwords_old = lwords
    lwords = []
    #Check terms, remove non alphanumeric characters. Use both unstemmed and stemmed version of all terms.
    for i in range(0, len(lwords_old)):
        # NOTE: string.lower/split/replace are Python-2-only module functions.
        term = string.lower(lwords_old[i])
        # NOTE(review): operator precedence makes this read as
        # (stopwords != "True") OR (stopwords AND not is_stopword(term));
        # left as-is -- confirm intent before changing.
        if not methods[rank_method_code]["stopwords"] == "True" or methods[rank_method_code]["stopwords"] and not is_stopword(term):
            # Keep the unstemmed term itself...
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            # ...and split it on the configured separators into sub-terms.
            terms = string.split(string.lower(re.sub(methods[rank_method_code]["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(string.replace(term, ' ', ''), methods[rank_method_code]["stemmer"])
                if lwords_old[i] != term:  #add if stemmed word is different than original word
                    lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
    # lrecIDs_remove is initialized here but never used below.
    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    #For each term, if accepted, get a list of the records using the term
    #calculate then relevance for each term before sorting the list of records
    for (term, table) in lwords:
        term_recs = run_sql("""SELECT term, hitlist FROM %s WHERE term=%%s""" % methods[rank_method_code]["rnkWORD_table"], (term, ))
        if term_recs:  #if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance((term, int(term_recs["Gi"][1])), term_recs, hitset, recdict, rec_termcount, verbose, quick=None)
            del term_recs
    # NOTE(review): lwords holds (term, table) tuples, so lwords[0] == ""
    # can never be true -- this check looks like a latent bug; kept as-is.
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0] == ""):
        return (None, "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.", "", voutput)
    else:  #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount, hitset, rank_limit_relevance, verbose)
    #Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  #using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  #using 6mb
    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql("SELECT count(id) FROM %s" % methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() - startCreate))
        voutput += str(reclist) + "<br />"
    rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"], methods[rank_method_code]["postfix"], voutput)
def test_stemmer_none(self):
    """bibindex engine - no stemmer"""
    # With no language, the stemmer must return the word unchanged.
    word = "information"
    self.assertEqual(word, bibindex_engine_stemmer.stem(word, None))
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :param word: query term; supports '*' truncation and 'a->b' span queries
    :param f: logical field to search in; empty means the global index
    :param decompress: kept for backward compatibility with callers
    :param wl: wildcard query limit (0 = unlimited)
    :return: hitset of recIDs.
    :raises InvenioWebSearchWildcardLimitError: when the wildcard limit
        'wl' was reached; the partial hitset is attached to the exception.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached
    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language
    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)
        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                # BUGFIX: Query.limit() returns a NEW query; the original
                # code discarded the result ('query.limit(wl)'), so the
                # wildcard limit was never actually applied in this branch.
                query = query.limit(wl)
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """Answer question given by context.

    Return (relevance, html_string) where relevance is integer from 0 to
    100 indicating how relevant to the question the answer is (see
    C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
    and html_string being a formatted answer.
    """
    from invenio.legacy.search_engine import \
        get_permitted_restricted_collections, \
        get_coll_i18nname, \
        collection_i18nname_cache, \
        collection_restricted_p
    _ = gettext_set_language(ln)
    # stem search units. remove those with field
    # TODO: search in hosted collection names too
    # TODO: ignore unattached trees
    # TODO: use synonyms
    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
            (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')
    words = [stem(unit[1], ln) for unit in search_units
             if unit[2] in ('', 'collection')]  # Stemming
    if not words:
        return (0, '')
    permitted_restricted_collections = \
        get_permitted_restricted_collections(user_info)
    cache = self.get_data_cache()
    # Score each collection by the number of matching query words.
    matching_collections = {}
    for word in words:
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue
        colls = cache.get(word.lower(), [])
        for coll in colls:
            if collection_restricted_p(coll) and \
                    coll not in permitted_restricted_collections:
                # Skip restricted collection user do not have access
                continue
            # NOTE: 'in' replaces the deprecated dict.has_key().
            if coll not in matching_collections:
                matching_collections[coll] = 0
            matching_collections[coll] += 1
    # NOTE: .items() + plain lambda replace the Python-2-only .iteritems()
    # and tuple-parameter unpacking (removed by PEP 3113).
    matching_collections_sorted = sorted(matching_collections.items(),
                                         key=lambda item: (item[1], item[0]),
                                         reverse=True)
    if not matching_collections_sorted:
        return (0, '')
    matching_collections_names = [
        (get_coll_i18nname(coll, ln, False),
         CFG_SITE_URL + '/collection/' +
         urllib.quote(coll, safe='') + '?ln=en')
        for coll, score in matching_collections_sorted]
    best_score = matching_collections_sorted[0][1]
    best_coll_words = whitespace_re.split(matching_collections_sorted[0][0])
    relevance = min(
        100,
        max(0, (100 * float(2 * best_score) /
                float(len(best_coll_words) + len(words)) - 10)))
    if (('submit' in p.lower()) or (_('submit') in p.lower())) and \
            not (('submit' in best_coll_words) or
                 (_('submit') in best_coll_words)):
        # User is probably looking for a submission. Decrease relevance
        relevance = max(0, relevance - 30)
    return (relevance,
            self.display_answer_helper(matching_collections_names, ln))
def answer(self, req, user_info, of, cc, colls_to_search, p, f, search_units, ln):
    """Answer question given by context.

    Return (relevance, html_string) where relevance is integer from 0 to
    100 indicating how relevant to the question the answer is (see
    C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE} for details),
    and html_string being a formatted answer.
    """
    _ = gettext_set_language(ln)
    if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
            (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
        return (0, '')
    words = [stem(unit[1].lower(), CFG_SITE_LANG)
             for unit in search_units if unit[2] == '']
    if not words:
        return (0, '')
    cache = self.get_data_cache()
    # TODO: If all categories of a submission match, display only submission (not categories)
    matching_submissions = {}
    for word in words:
        # Look for submission names
        if CFG_CERN_SITE and word == 'cern':
            # This keyword is useless here...
            continue
        submissions = cache.get(word, [])
        for doctype, submission_label, category in submissions:
            if acc_authorize_action(
                    req, 'submit',
                    authorized_if_no_roles=not isGuestUser(user_info['uid']),
                    doctype=(CFG_CERN_SITE and doctype.startswith('GENSBM#')
                             and 'GENSBM') or doctype,
                    categ=category)[0] != 0:
                # Not authorized to submit in this submission
                continue
            # NOTE: 'in' replaces the deprecated dict.has_key().
            if (doctype, submission_label) not in matching_submissions:
                matching_submissions[(doctype, submission_label)] = 0
            add_score = 1
            if category != '*':
                # This is the submission category, consider that
                # words that are part of the submission name are
                # less important than others here:
                if not word.lower() in category.lower():
                    # word is only in submission name
                    add_score = 0.5
                else:
                    add_score = 1.5
            matching_submissions[(doctype, submission_label)] += add_score
    # NOTE: .items() + plain lambda replace the Python-2-only .iteritems()
    # and tuple-parameter unpacking (removed by PEP 3113).
    matching_submissions_sorted = sorted(matching_submissions.items(),
                                         key=lambda item: (item[1], item[0]),
                                         reverse=True)
    if not matching_submissions_sorted:
        return (0, '')
    best_score = matching_submissions_sorted[0][1]
    max_score_difference = 1.9
    matching_submissions_names = [
        (submission_label,
         CFG_SITE_URL + '/submit?doctype=' + doctype.split("#", 1)[0] +
         '&ln=' + ln +
         (CFG_CERN_SITE and doctype.startswith('GENSBM#') and
          '#' + doctype.split("#", 1)[-1] or ''))
        for (doctype, submission_label), score in matching_submissions_sorted
        if score > best_score - max_score_difference]
    best_sbm_words = whitespace_re.split(
        matching_submissions_sorted[0][0][1])
    # Boost relevance when the query explicitly mentions submit/revise/modify.
    score_bonus = (((_("Submit").lower() in words) or ("submit" in words)) or
                   ((_("Revise").lower() in words) or ("revise" in words)) or
                   ((_("Modify").lower() in words) or ("modify" in words))) \
        and 40 or 0
    relevance = min(
        100,
        max(0, (score_bonus +
                (100 * float(best_score) /
                 float(len(best_sbm_words) + len(words)))) - 10))
    return (relevance,
            self.display_answer_helper(matching_submissions_names, ln))