Example 1
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block,
                                               self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(
                        subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock,
                                                      self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(
                            alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(
                            stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(
                            stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
Example 2
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
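Both examples above implement the same three-level cascade: the phrase is split on whitespace into blocks, each block is split on punctuation into subblocks, and each subblock is split on separator characters into alphanumeric groups, with every surviving token recorded once. Below is a minimal, self-contained sketch of that cascade; the regular expressions and the tokenize_words_sketch helper are illustrative assumptions (in Invenio, re_punctuation and re_separators come from the index configuration), and the stopword removal, length check, stemming, and HTML/LaTeX washing of the real tokenizer are deliberately left out.

import re

# Illustrative patterns only -- the real re_punctuation / re_separators
# are built from the index configuration and may differ.
re_punct = re.compile(r'[.,:;?!"]+')
re_sep = re.compile(r'[^a-z0-9]+')

def tokenize_words_sketch(phrase):
    """Collect whitespace blocks, punctuation subblocks and
       alphanumeric groups, mimicking the three-level split above."""
    words = set()
    for block in phrase.lower().split():
        # strip leading/trailing punctuation, as re_block_punctuation_* do
        block = block.strip('.,:;?!"\'()')
        if not block:
            continue
        words.add(block)
        for subblock in re_punct.split(block):
            if subblock:
                words.add(subblock)
            for group in re_sep.split(subblock):
                if group:
                    words.add(group)
    return sorted(words)

print(tokenize_words_sketch('Higgs-boson searches, arXiv:1007.5048'))
# ['1007', '5048', 'arxiv', 'arxiv:1007.5048', 'boson', 'higgs',
#  'higgs-boson', 'searches']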
Example 3
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock,
                                                self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(
                                subblock):
                            alphanumeric_group = remove_stopwords(
                                alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(
                                alphanumeric_group)
                            alphanumeric_group = apply_stemming(
                                alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' %
                                          (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
Example 4
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock, self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(subblock):
                            alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(alphanumeric_group)
                            alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' % (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
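tokenize_for_pairs follows the same washing and splitting cascade as Examples 1 and 2, but instead of indexing the individual groups it indexes each adjacent pair of alphanumeric groups; because last_word is only reset at the start of the phrase, pairs are also formed across block boundaries. A minimal sketch of just that pairing step (the re_sep pattern and the tokenize_pairs_sketch helper are assumptions for illustration; stopwords, length checks and stemming are omitted):

import re

# Illustrative separator pattern; the real re_separators is configurable.
re_sep = re.compile(r'[^a-z0-9]+')

def tokenize_pairs_sketch(phrase):
    """Emit every pair of adjacent alphanumeric groups, carrying
       last_word across group boundaries like tokenize_for_pairs."""
    pairs = set()
    last_word = ''
    for group in re_sep.split(phrase.lower()):
        if not group:
            continue
        if last_word:
            pairs.add('%s %s' % (last_word, group))
        last_word = group
    return sorted(pairs)

print(tokenize_pairs_sketch('standard model Higgs boson'))
# ['higgs boson', 'model higgs', 'standard model']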
Example 5
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :return: hitset of recIDs.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import (
        lower_index_term,
        wash_index_term,
    )
    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached

    # if no field is specified, search in the global index.
    f = f or 'anyfield'
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language

    # wash 'word' argument and run query:
    if f.endswith('count') and word.endswith('+'):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub('', words[0])
        word1 = re_word.sub('', words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith('%'):
                word0 = stem(word0[:-1], stemming_language) + '%'
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith('%'):
                word1 = stem(word1[:-1], stemming_language) + '%'
            else:
                word1 = stem(word1, stemming_language)

        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith('count'):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(
            model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values('term', 'hitlist')
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub('', word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith('%'):
                word = stem(word[:-1], stemming_language) + '%'
            else:
                word = stem(word, stemming_language)
        if word.find('%') >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                query = query.limit(wl)  # reassign: Query.limit() returns a new query
            res = query.values('term', 'hitlist')
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(
                wash_index_term(word))).values('term', 'hitlist')
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
Example 6
def search_unit_in_bibwords(word, f, decompress=zlib.decompress, wl=0):
    """Search for 'word' inside bibwordsX table for field 'f'.

    :return: hitset of recIDs.
    """
    from invenio.legacy.bibindex.engine_stemmer import stem
    from invenio.legacy.bibindex.engine_washer import lower_index_term, wash_index_term

    # FIXME: Should not be used for journal field.
    hitset = intbitset()  # will hold output result set
    limit_reached = 0  # flag for knowing if the query limit has been reached

    # if no field is specified, search in the global index.
    f = f or "anyfield"
    index = IdxINDEX.get_from_field(f)
    if index is None:
        return hitset
    model = index.wordf
    stemming_language = index.stemming_language

    # wash 'word' argument and run query:
    if f.endswith("count") and word.endswith("+"):
        # field count query of the form N+ so transform N+ to N->99999:
        word = word[:-1] + "->99999"
    word = word.replace("*", "%")  # we now use '*' as the truncation character
    words = word.split("->", 1)  # check for span query
    if len(words) == 2:
        word0 = re_word.sub("", words[0])
        word1 = re_word.sub("", words[1])
        if stemming_language:
            word0 = lower_index_term(word0)
            word1 = lower_index_term(word1)
            # We remove trailing truncation character before stemming
            if word0.endswith("%"):
                word0 = stem(word0[:-1], stemming_language) + "%"
            else:
                word0 = stem(word0, stemming_language)
            if word1.endswith("%"):
                word1 = stem(word1[:-1], stemming_language) + "%"
            else:
                word1 = stem(word1, stemming_language)

        word0_washed = wash_index_term(word0)
        word1_washed = wash_index_term(word1)
        if f.endswith("count"):
            # field count query; convert to integers in order
            # to have numerical behaviour for 'BETWEEN n1 AND n2' query
            try:
                word0_washed = int(word0_washed)
                word1_washed = int(word1_washed)
            except ValueError:
                pass
        query = model.query.filter(model.term.between(word0_washed, word1_washed))
        if wl > 0:
            query = query.limit(wl)
        res = query.values("term", "hitlist")
        if wl > 0 and len(res) == wl:
            limit_reached = 1  # set the limit reached flag to true
    else:
        word = re_word.sub("", word)
        if stemming_language:
            word = lower_index_term(word)
            # We remove trailing truncation character before stemming
            if word.endswith("%"):
                word = stem(word[:-1], stemming_language) + "%"
            else:
                word = stem(word, stemming_language)
        if word.find("%") >= 0:  # do we have wildcard in the word?
            query = model.query.filter(model.term.like(wash_index_term(word)))
            if wl > 0:
                query = query.limit(wl)  # reassign: Query.limit() returns a new query
            res = query.values("term", "hitlist")
            # set the limit reached flag to true
            limit_reached = wl > 0 and len(res) == wl
        else:
            res = model.query.filter(model.term.like(wash_index_term(word))).values("term", "hitlist")
    # fill the result set:
    for word, hitlist in res:
        # add the results:
        hitset |= intbitset(hitlist)
    # check to see if the query limit was reached
    if limit_reached:
        # raise an exception, so we can print a nice message to the user
        raise InvenioWebSearchWildcardLimitError(hitset)
    # okay, return result set:
    return hitset
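Both versions wash the query term the same way before touching the database: a count-field query of the form N+ becomes the span N->99999, the user-facing truncation character * becomes the SQL wildcard %, a term containing -> is treated as a span (BETWEEN) query with numeric bounds for count fields, and anything else is looked up with LIKE. A minimal sketch of just that washing logic, without the stemming and the SQLAlchemy queries (the wash_search_term helper and the field names are assumptions for illustration):

def wash_search_term(word, field):
    """Return either a (low, high) span or a single LIKE-able term,
       mirroring the query-term washing done above."""
    if field.endswith('count') and word.endswith('+'):
        # field count query of the form N+ becomes the open span N->99999
        word = word[:-1] + '->99999'
    word = word.replace('*', '%')  # '*' is the user-facing truncation character
    bounds = word.split('->', 1)
    if len(bounds) == 2:
        if field.endswith('count'):
            # numeric bounds so that BETWEEN compares numbers, not strings
            try:
                return int(bounds[0]), int(bounds[1])
            except ValueError:
                pass
        return bounds[0], bounds[1]
    return word

print(wash_search_term('3+', 'authorcount'))        # (3, 99999)
print(wash_search_term('ellis*', 'author'))         # 'ellis%'
print(wash_search_term('muon->neutrino', 'title'))  # ('muon', 'neutrino')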