Example #1
def get_pdf_snippets(recID, patterns, user_info):
    """
    Extract text snippets around 'patterns' from the newest PDF file of 'recID'
    The search is case-insensitive.
    The snippets are meant to look like the results of popular search
    engines: " ... " is placed between snippets.
    For empty patterns it returns ""

    @param recID: record ID to consider
    @param patterns: list of patterns to retrieve
    @param user_info: the user_info object from collect_user_info
    @return: snippet
    """
    from invenio.legacy.bibdocfile.api import BibRecDocs, check_bibdoc_authorization

    text_path = ""
    text_path_courtesy = ""
    for bd in BibRecDocs(recID).list_bibdocs():
        # On INSPIRE, show snippets even for excluded fulltext; otherwise this depends on authorization
        if hasattr(bd, 'get_text') and (CFG_INSPIRE_SITE or not check_bibdoc_authorization(user_info, bd.get_status())[0]):
            text_path = bd.get_text_path()
            text_path_courtesy = bd.get_status()
            if CFG_INSPIRE_SITE and not text_path_courtesy:
                # get courtesy from doctype, since docstatus was empty:
                text_path_courtesy = bd.get_type()
                if text_path_courtesy == 'INSPIRE-PUBLIC':
                    # but ignore 'INSPIRE-PUBLIC' doctype
                    text_path_courtesy = ''
            break # stop at the first good PDF textable file

    nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS.get('', 0)
    max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS.get('', 0)
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS:
        nb_chars = CFG_WEBSEARCH_FULLTEXT_SNIPPETS_CHARS[text_path_courtesy]
    if text_path_courtesy in CFG_WEBSEARCH_FULLTEXT_SNIPPETS:
        max_snippets = CFG_WEBSEARCH_FULLTEXT_SNIPPETS[text_path_courtesy]

    if text_path and nb_chars and max_snippets:
        out = ''
        if CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'native':
            out = get_text_snippets(text_path, patterns, nb_chars, max_snippets)
            if not out:
                # no hit, so check stemmed versions:
                from invenio.legacy.bibindex.engine_stemmer import stem
                stemmed_patterns = [stem(p, 'en') for p in patterns]
                out = get_text_snippets(text_path, stemmed_patterns, nb_chars, max_snippets)
        elif CFG_WEBSEARCH_FULLTEXT_SNIPPETS_GENERATOR == 'SOLR':
            from invenio.legacy.miscutil.solrutils_bibindex_searcher import solr_get_snippet
            out = solr_get_snippet(patterns, recID, nb_chars, max_snippets)

        if out:
            out_courtesy = ""
            if CFG_INSPIRE_SITE and text_path_courtesy:
                out_courtesy = '<strong>Snippets courtesy of ' + text_path_courtesy + '</strong><br>'
            return '%s%s' % (out_courtesy, out)
        else:
            return ""
    else:
        return ""
Example #2
def apply_stemming(word, stemming_language):
    """Returns word after applying stemming (if stemming language is set).
       You can change your stemming language in the database.

       @param word: word to be checked
       @type word: str
       @param stemming_language: abbreviation of language or None
       @type stemming_language: str
    """
    if stemming_language:
        word = stem(word, stemming_language)
    return word
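Assuming an Invenio environment where stem() is importable, usage looks like this; the expected outputs follow from the stemmer tests later in this section:

# hypothetical usage, assuming invenio's stem() is on the path:
apply_stemming('experiments', 'en')   # -> 'experi' (see the English stemmer tests below)
apply_stemming('experiments', None)   # -> 'experiments' (no language set, word returned unchanged)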
Example #3
def clean_and_split_words_and_stem(string, ln=CFG_SITE_LANG, stem_p=True):
    """Split and stemp words in a string.

    :param ln: language to consider for stemming
    :param stem_p: if True, also stem the word according to ``ln``
    :return: list of (stemmed) words.
    """
    alphanum_string = re_non_alphanum_only.sub(" ", string).lower()
    words = re_split_words_pattern.split(alphanum_string)
    if stem_p:
        # stem each word (the string was already lowercased above)
        words = [stem(word, ln) for word in words]

    return words
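re_non_alphanum_only and re_split_words_pattern are module-level regexes defined elsewhere in Invenio; here is a self-contained sketch of the same pipeline with plausible equivalents (both regexes and the stemmer hook are assumptions):

import re

# assumed equivalents of the module-level regexes used above:
re_non_alphanum_only = re.compile(r'[^\w\s]', re.UNICODE)  # drop punctuation
re_split_words_pattern = re.compile(r'\s+')                # split on whitespace runs

def clean_and_split(string, stem_fn=None):
    alphanum_string = re_non_alphanum_only.sub(' ', string).lower()
    words = re_split_words_pattern.split(alphanum_string.strip())
    if stem_fn:
        words = [stem_fn(word) for word in words]
    return words

print(clean_and_split("Stemming: a quick test!"))
# -> ['stemming', 'a', 'quick', 'test']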
Example #4
    def answer(self, req, user_info, of, cc, colls_to_search, p, f,
               search_units, ln):
        """Answer question given by context, using knowledge base.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see L{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
        for details), and html_string is a formatted answer.
        """
        _ = gettext_set_language(ln)
        # words = [stem(unit[1], ln) for unit in search_units if unit[2] == '']
        words = [
            stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units
            if unit[2] == ''
        ]
        cache = self.get_data_cache()

        matching_values = {}
        for word in words:
            res = cache.get(word, [])
            for keyword in res:
                if keyword not in matching_values:
                    matching_values[keyword] = 1
                else:
                    matching_values[keyword] += 1

        # order matching values per score
        matching_values_sorted = sorted(matching_values.iteritems(),
                                        key=lambda (k, v): (v, k),
                                        reverse=True)

        if not matching_values_sorted:
            return (0, '')

        best_score = matching_values_sorted[0][1]

        # Compute relevance. How many words from query did match
        relevance = min(
            100,
            max(0, (100 * float(best_score) /
                    len([word for word in words if len(word) > 3])) - 10))
        labels_and_links = [m.split("|", 1) for m in matching_values.keys()]
        translated_labels_and_links = [(_(label), url)
                                       for label, url in labels_and_links]

        return (relevance,
                self.display_answer_helper(translated_labels_and_links, ln))
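The relevance formula above scales the best keyword's hit count by the number of query words longer than three characters, then subtracts 10 as a noise floor. A worked instance with illustrative numbers:

# e.g. the best keyword matched 2 of the 3 query words longer than 3 chars:
best_score = 2
nb_long_words = 3
relevance = min(100, max(0, (100 * float(best_score) / nb_long_words) - 10))
print(relevance)  # -> 56.66...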
Example #5
    def prepare_data_cache(self):
        """
        "Index" collection names.
        """
        from invenio.legacy.search_engine import collection_i18nname_cache
        cache = {}
        words_and_coll = []
        for coll_name, translations in collection_i18nname_cache.cache.iteritems():
            stemmed_names = []
            for ln, translation in translations.iteritems():
                cleaned = non_alphanum_chars_only_re.sub(' ', translation)
                stemmed_names.append(' '.join(
                    stem(word.lower(), ln)
                    for word in whitespace_re.split(cleaned)))
            words_and_coll.append((' '.join(stemmed_names), coll_name))
        for words, coll in words_and_coll:
            for word in whitespace_re.split(words):
                if not word.strip():
                    continue
                cache.setdefault(word, [])
                if coll not in cache[word]:
                    cache[word].append(coll)
        return cache
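The net effect is an inverted index from stemmed, lowercased words to the collections whose translated names contain them. A self-contained miniature (the identity stemmer is an assumption; the real code stems per translation language):

def build_word_to_coll_cache(i18n_names, stem_fn=lambda w: w):
    # i18n_names maps coll_name -> {language: translated name}
    cache = {}
    for coll_name, translations in i18n_names.items():
        for ln, translation in translations.items():
            for word in translation.split():
                word = stem_fn(word.lower())
                cache.setdefault(word, [])
                if coll_name not in cache[word]:
                    cache[word].append(coll_name)
    return cache

names = {'Theses': {'en': 'Theses'}, 'Preprints': {'en': 'Preprints'}}
print(build_word_to_coll_cache(names))
# -> {'theses': ['Theses'], 'preprints': ['Preprints']} (key order may vary)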
Example #6
    def test_stemmer_greek(self):
        """bibindex engine - Greek stemmer"""
        greek_test_cases = [['πληροφορίες', 'ΠΛΗΡΟΦΟΡΙ'],
                            ['πείραμα', 'ΠΕΙΡΑΜ'],
                            ['πειράματα', 'ΠΕΙΡΑΜ'],
                            ['πειραματιστής', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                            ['πειραματίζομαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                            ['πειραματίζεσαι', 'ΠΕΙΡΑΜΑΤΙΖ'],
                            ['πειραματίστηκα', 'ΠΕΙΡΑΜΑΤΙΣΤ'],
                            ['πειραματόζωο', 'ΠΕΙΡΑΜΑΤΟΖΩ'],
                            ['ζώο', 'ΖΩ'],
                            ['πειραματισμός', 'ΠΕΙΡΑΜΑΤΙΣΜ'],
                            ['πειραματικός', 'ΠΕΙΡΑΜΑΤΙΚ'],
                            ['πειραματικά', 'ΠΕΙΡΑΜΑΤ'],
                            ['ηλεκτρόνιο', 'ΗΛΕΚΤΡΟΝΙ'],
                            ['ηλεκτρονιακός', 'ΗΛΕΚΤΡΟΝΙΑΚ'],
                            ['ακτίνα', 'ΑΚΤΙΝ'],
                            ['ακτινοβολία', 'ΑΚΤΙΝΟΒΟΛ'],
                            ['E=mc^2', 'E=MC^2'],
                            ['α+β=γ', 'Α+Β=Γ']]
        for test_word, expected_result in greek_test_cases:
            self.assertEqual(expected_result,
                             bibindex_engine_stemmer.stem(test_word, "el"))
Example #7
    def test_stemmer_english(self):
        """bibindex engine - English stemmer"""
        english_test_cases = [['information', 'inform'],
                              ['experiment', 'experi'],
                              ['experiments', 'experi'],
                              ['experimented', 'experi'],
                              ['experimenting', 'experi'],
                              ['experimental', 'experiment'],
                              ['experimentally', 'experiment'],
                              ['experimentation', 'experiment'],
                              ['experimentalism', 'experiment'],
                              ['experimenter', 'experiment'],
                              ['experimentalise', 'experimentalis'],
                              ['experimentalist', 'experimentalist'],
                              ['experimentalists', 'experimentalist'],
                              ['GeV', 'GeV'],
                              ['$\Omega$', '$\Omega$'],
                              ['e^-', 'e^-'],
                              ['C#', 'C#'],
                              ['C++', 'C++']]
        for test_word, expected_result in english_test_cases:
            self.assertEqual(expected_result,
                             bibindex_engine_stemmer.stem(test_word, "en"))
Example #8
def word_similarity(rank_method_code, lwords, hitset, rank_limit_relevance,
                    verbose, methods):
    """Ranking a records containing specified words and returns a sorted list.
    input:
    rank_method_code - the code of the method, from the name field in rnkMETHOD
    lwords - a list of words from the query
    hitset - a list of hits for the query found by search_engine
    rank_limit_relevance - show only records with a rank value above this
    verbose - verbose value
    output:
    reclist - a list of sorted records: [[23,34], [344,24], [1,01]]
    prefix - what to show before the rank value
    postfix - what to show after the rank value
    voutput - contains extra information, content dependent on verbose value"""
    voutput = ""
    startCreate = time.time()

    if verbose > 0:
        voutput += "<br />Running rank method: %s, using word_frequency function in bibrank_record_sorter<br />" % rank_method_code

    lwords_old = lwords
    lwords = []
    # Check terms and remove non-alphanumeric characters; use both the unstemmed and stemmed versions of all terms.
    for i in range(0, len(lwords_old)):
        term = string.lower(lwords_old[i])
        if not methods[rank_method_code]["stopwords"] == "True" or methods[
                rank_method_code]["stopwords"] and not is_stopword(term):
            lwords.append((term, methods[rank_method_code]["rnkWORD_table"]))
            terms = string.split(
                string.lower(
                    re.sub(
                        methods[rank_method_code]
                        ["chars_alphanumericseparators"], ' ', term)))
            for term in terms:
                if "stemmer" in methods[rank_method_code]:  # stem word
                    term = stem(string.replace(term, ' ', ''),
                                methods[rank_method_code]["stemmer"])
                if lwords_old[
                        i] != term:  #add if stemmed word is different than original word
                    lwords.append(
                        (term, methods[rank_method_code]["rnkWORD_table"]))

    (recdict, rec_termcount, lrecIDs_remove) = ({}, {}, {})
    # For each accepted term, get the list of records using the term and
    # calculate the relevance of each term before sorting the list of records.
    for (term, table) in lwords:
        term_recs = run_sql(
            """SELECT term, hitlist FROM %s WHERE term=%%s""" %
            methods[rank_method_code]["rnkWORD_table"], (term, ))
        if term_recs:  #if term exists in database, use for ranking
            term_recs = deserialize_via_marshal(term_recs[0][1])
            (recdict, rec_termcount) = calculate_record_relevance(
                (term, int(term_recs["Gi"][1])),
                term_recs,
                hitset,
                recdict,
                rec_termcount,
                verbose,
                quick=None)
            del term_recs

    # lwords holds (term, table) tuples, so compare the term itself:
    if len(recdict) == 0 or (len(lwords) == 1 and lwords[0][0] == ""):
        return (
            None,
            "Records not ranked. The query is not detailed enough, or not enough records found, for ranking to be possible.",
            "", voutput)
    else:  #sort if we got something to sort
        (reclist, hitset) = sort_record_relevance(recdict, rec_termcount,
                                                  hitset, rank_limit_relevance,
                                                  verbose)

    #Add any documents not ranked to the end of the list
    if hitset:
        lrecIDs = list(hitset)  #using 2-3mb
        reclist = zip(lrecIDs, [0] * len(lrecIDs)) + reclist  #using 6mb

    if verbose > 0:
        voutput += "<br />Current number of recIDs: %s<br />" % (
            methods[rank_method_code]["col_size"])
        voutput += "Number of terms: %s<br />" % run_sql(
            "SELECT count(id) FROM %s" %
            methods[rank_method_code]["rnkWORD_table"])[0][0]
        voutput += "Terms: %s<br />" % lwords
        voutput += "Prepare and pre calculate time: %s<br />" % (
            str(time.time() - startCreate))
        voutput += "Total time used: %s<br />" % (str(time.time() -
                                                      startCreate))
        voutput += str(reclist) + "<br />"
        rank_method_stat(rank_method_code, reclist, lwords)
    return (reclist, methods[rank_method_code]["prefix"],
            methods[rank_method_code]["postfix"], voutput)
Example #9
    def test_stemmer_none(self):
        """bibindex engine - no stemmer"""
        self.assertEqual("information",
                         bibindex_engine_stemmer.stem("information", None))
Example #10
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :return: hitset of recIDs.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached

    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language

    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)

        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                query = query.limit(wl)  # limit() returns a new query; keep the result
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
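A recurring detail in search_unit_in_bibwords: a trailing '%' (the SQL LIKE wildcard that '*' was rewritten to) must be stripped before stemming and re-appended afterwards, or the stemmer would treat it as part of the word. A self-contained sketch (naive stemmer assumed):

def stem_preserving_wildcard(word, stem_fn):
    # strip a trailing SQL LIKE wildcard before stemming, restore it after
    if word.endswith('%'):
        return stem_fn(word[:-1]) + '%'
    return stem_fn(word)

naive_stem = lambda w: w[:-3] if w.endswith('ing') else w
print(stem_preserving_wildcard('stemming%', naive_stem))  # -> 'stemm%'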
Example #11
    def answer(self, req, user_info, of, cc, colls_to_search, p, f,
               search_units, ln):
        """
        Answer question given by context.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
        for details), and html_string is a formatted answer.
        """
        from invenio.legacy.search_engine import \
             get_permitted_restricted_collections, \
             get_coll_i18nname, \
             collection_i18nname_cache, \
             collection_restricted_p
        _ = gettext_set_language(ln)
        # stem search units. remove those with field
        # TODO: search in hosted collection names too
        # TODO: ignore unattached trees
        # TODO: use synonyms
        if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
               (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
            return (0, '')

        words = [
            stem(unit[1], ln) for unit in search_units
            if unit[2] in ('', 'collection')
        ]  # Stemming

        if not words:
            return (0, '')

        permitted_restricted_collections = get_permitted_restricted_collections(
            user_info)
        cache = self.get_data_cache()

        matching_collections = {}
        for word in words:
            if CFG_CERN_SITE and word == 'cern':
                # This keyword is useless here...
                continue

            colls = cache.get(word.lower(), [])
            for coll in colls:
                if collection_restricted_p(coll) and \
                       not coll in permitted_restricted_collections:
                    # Skip restricted collections the user does not have access to
                    continue
                if not matching_collections.has_key(coll):
                    matching_collections[coll] = 0
                matching_collections[coll] += 1

        matching_collections_sorted = sorted(matching_collections.iteritems(),
                                             key=lambda (k, v): (v, k),
                                             reverse=True)
        if not matching_collections_sorted:
            return (0, '')

        matching_collections_names = [(get_coll_i18nname(coll, ln, False), CFG_SITE_URL + '/collection/' + urllib.quote(coll, safe='') + '?ln=en') \
                                      for coll, score in matching_collections_sorted]

        best_score = matching_collections_sorted[0][1]
        best_coll_words = whitespace_re.split(
            matching_collections_sorted[0][0])

        relevance = min(
            100,
            max(0, (100 * float(2 * best_score) /
                    float(len(best_coll_words) + len(words)) - 10)))

        if (('submit' in p.lower()) or (_('submit') in p.lower())) and \
               not (('submit' in best_coll_words) or (_('submit') in best_coll_words)):
            # User is probably looking for a submission. Decrease relevance
            relevance = max(0, relevance - 30)

        return (relevance,
                self.display_answer_helper(matching_collections_names, ln))
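The relevance here doubles the best collection's score and normalizes by the combined length of the collection name and the query, minus the same 10-point floor. A worked instance with illustrative numbers:

# e.g. a 2-word collection name, a 3-word query, and best_score = 2 matches:
best_score = 2
best_coll_words = ['published', 'articles']
words = ['published', 'articles', 'cern']
relevance = min(100, max(0, (100 * float(2 * best_score) /
                             float(len(best_coll_words) + len(words)) - 10)))
print(relevance)  # -> 70.0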
Example #12
    def answer(self, req, user_info, of, cc, colls_to_search, p, f,
               search_units, ln):
        """
        Answer question given by context.

        Return (relevance, html_string) where relevance is an integer
        from 0 to 100 indicating how relevant the answer is to the
        question (see C{CFG_WEBSEARCH_SERVICE_MAX_SERVICE_ANSWER_RELEVANCE}
        for details), and html_string is a formatted answer.
        """
        _ = gettext_set_language(ln)
        if f or (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH < 0) or \
               (CFG_WEBSEARCH_COLLECTION_NAMES_SEARCH == 0 and cc != CFG_SITE_NAME):
            return (0, '')

        words = [
            stem(unit[1].lower(), CFG_SITE_LANG) for unit in search_units
            if unit[2] == ''
        ]

        if not words:
            return (0, '')

        cache = self.get_data_cache()

        # TODO: If all categories of a submission match, display only submission (not categories)

        matching_submissions = {}

        for word in words:
            # Look for submission names
            if CFG_CERN_SITE and word == 'cern':
                # This keyword is useless here...
                continue

            submissions = cache.get(word, [])
            for doctype, submission_label, category in submissions:
                if acc_authorize_action(req, 'submit', \
                                        authorized_if_no_roles=not isGuestUser(user_info['uid']), \
                                        doctype=(CFG_CERN_SITE and doctype.startswith('GENSBM#') and 'GENSBM') or doctype,
                                        categ=category)[0] != 0:
                    # Not authorized to submit in this submission
                    continue

                if not matching_submissions.has_key(
                    (doctype, submission_label)):
                    matching_submissions[(doctype, submission_label)] = 0
                add_score = 1
                if category != '*':
                    # This is the submission category, consider that
                    # words that are part of the submission name are
                    # less important than others here:
                    if not word.lower() in category.lower():
                        # word is only in submission name
                        add_score = 0.5
                    else:
                        add_score = 1.5

                matching_submissions[(doctype, submission_label)] += add_score

        matching_submissions_sorted = sorted(matching_submissions.iteritems(),
                                             key=lambda (k, v): (v, k),
                                             reverse=True)
        if not matching_submissions_sorted:
            return (0, '')
        best_score = matching_submissions_sorted[0][1]
        max_score_difference = 1.9

        matching_submissions_names = [(submission_label, \
                                       CFG_SITE_URL + '/submit?doctype=' + doctype.split("#", 1)[0] + '&ln=' + ln + (CFG_CERN_SITE and doctype.startswith('GENSBM#') and '#' + doctype.split("#", 1)[-1] or '') ) \
                                      for (doctype, submission_label), score in matching_submissions_sorted if score > best_score - max_score_difference]

        best_sbm_words = whitespace_re.split(
            matching_submissions_sorted[0][0][1])

        score_bonus = (((_("Submit").lower() in words) or ("submit" in words)) or \
                       ((_("Revise").lower() in words) or ("revise" in words)) or \
                       ((_("Modify").lower() in words) or ("modify" in words))) and 40 or 0
        relevance = min(
            100,
            max(0, (score_bonus + (100 * float(best_score) /
                                   float(len(best_sbm_words) + len(words)))) -
                10))

        return (relevance,
                self.display_answer_helper(matching_submissions_names, ln))
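The add_score weighting above boosts query words that appear in the submission's category (1.5), damps words found only in the submission name (0.5), and stays neutral (1) for the wildcard category. A minimal sketch of that rule:

def submission_word_score(word, category):
    if category == '*':
        return 1                  # no specific category: neutral weight
    if word.lower() in category.lower():
        return 1.5                # word matches the category: boost
    return 0.5                    # word only in the submission name: damp

print(submission_word_score('thesis', 'Thesis committee'))  # -> 1.5
print(submission_word_score('upload', 'Thesis committee'))  # -> 0.5
print(submission_word_score('upload', '*'))                 # -> 1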