def tokenize(self, phrase):
    """Return list of phrases found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    phrase = wash_for_utf8(phrase)
    return [phrase]
    ## Note that we don't break phrases, they are used for exact style
    ## of searching.
    # NOTE: the code below is never reached because of the early return above.
    words = {}
    phrase = strip_accents(phrase)
    # 1st split phrase into blocks according to whitespace
    for block1 in phrase_delimiter_re.split(strip_accents(phrase)):
        block1 = block1.strip()
        if block1 and self.stemming_language:
            new_words = []
            for block2 in re_punctuation.split(block1):
                block2 = block2.strip()
                if block2:
                    for block3 in block2.split():
                        block3 = block3.strip()
                        if block3:
                            # Note that we don't stem phrases, they
                            # are used for exact style of searching.
                            new_words.append(block3)
            block1 = ' '.join(new_words)
        if block1:
            words[block1] = 1
    return words.keys()
def test_strip_accents(self):
    """textutils - transliterate to ascii (basic)"""
    self.assertEqual("memememe", strip_accents('mémêmëmè'))
    self.assertEqual("MEMEMEME", strip_accents('MÉMÊMËMÈ'))
    self.assertEqual("oe", strip_accents('œ'))
    self.assertEqual("OE", strip_accents('Œ'))
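# A minimal, self-contained sketch of the transliteration behaviour the test
# above expects; this strip_accents is a hypothetical stand-in, not the
# project's actual implementation (sketch assumes Python 3 str semantics).
# NFKD decomposition removes combining accents, but ligatures such as
# 'œ'/'Œ' do not decompose, so they are mapped explicitly.
import unicodedata

_LIGATURES = {'œ': 'oe', 'Œ': 'OE', 'æ': 'ae', 'Æ': 'AE'}  # hypothetical map


def strip_accents(text):
    """Return an ASCII approximation of TEXT with accents removed."""
    for src, dst in _LIGATURES.items():
        text = text.replace(src, dst)
    decomposed = unicodedata.normalize('NFKD', text)
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))


assert strip_accents('mémêmëmè') == 'memememe'
assert strip_accents('Œ') == 'OE'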
def _sort_alphanumerically_remove_leading_articles_strip_accents(self, val):
    """
    Convert:
        'The title' => 'title'
        'A title'   => 'title'
        'Title'     => 'title'
    """
    if not val:
        return ''
    # split into leading word and the phrase without the leading word
    val_tokens = str(val).split(" ", 1)
    if len(val_tokens) == 2 and val_tokens[0].lower() in LEADING_ARTICLES:
        return strip_accents(val_tokens[1].strip().lower())
    return strip_accents(val.lower())
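# Hypothetical usage sketch for the sort key above.  LEADING_ARTICLES and
# strip_accents are assumed imports in the real snippet; a minimal stand-in
# article list is defined here (and accent stripping is omitted) only so the
# example runs on its own.
LEADING_ARTICLES = ('the', 'a', 'an')  # hypothetical stand-in


def leading_article_sort_key(val):
    if not val:
        return ''
    tokens = str(val).split(' ', 1)
    if len(tokens) == 2 and tokens[0].lower() in LEADING_ARTICLES:
        return tokens[1].strip().lower()
    return val.lower()


titles = ['The Zoo', 'An Apple', 'Banana']
# Sorted ignoring leading articles: ['An Apple', 'Banana', 'The Zoo']
print(sorted(titles, key=leading_article_sort_key))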
def tokenize_for_phrases(self, phrase):
    """Return list of phrases found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    return [strip_accents(phrase)]
def tokenize_for_phrases(self, phrase):
    """
    Another name for tokenize_for_fuzzy_authors, kept for compatibility.
    See: tokenize_for_fuzzy_authors
    """
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    return self.tokenize_for_fuzzy_authors(strip_accents(phrase))
def tokenize_for_words(self, phrase):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(':', 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
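# Self-contained sketch of the three-level splitting cascade used above
# (whitespace -> punctuation -> separators).  The regular expressions are
# illustrative stand-ins for the configured re_punctuation / re_separators /
# re_block_punctuation patterns, and stemming, stopword removal and the
# arXiv special case are omitted; the real tokenizer reads its character
# classes from the config file.
import re

RE_PUNCTUATION = re.compile(r"[\.,:;\?!\"]")            # hypothetical
RE_SEPARATORS = re.compile(r"[\s\-+/_']")               # hypothetical
RE_BLOCK_PUNCTUATION = re.compile(r"^[^\w]+|[^\w]+$")   # leading/trailing


def tokenize_for_words_sketch(phrase):
    words = {}
    # 1st: split on whitespace
    for block in phrase.lower().split():
        # 2nd: strip leading/trailing punctuation, keep the block
        block = RE_BLOCK_PUNCTUATION.sub("", block)
        if block:
            words[block] = 1
            # 3rd: split on punctuation, keep the subblocks
            for subblock in RE_PUNCTUATION.split(block):
                if subblock:
                    words[subblock] = 1
                    # 4th: split on separators, keep the groups
                    for group in RE_SEPARATORS.split(subblock):
                        if group:
                            words[group] = 1
    return sorted(words)


# 'state-of-the-art' yields the whole block plus its hyphen-separated parts:
# ['a', 'art', 'method', 'of', 'state', 'state-of-the-art', 'the']
print(tokenize_for_words_sketch("A state-of-the-art method."))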
def tokenize_for_words(self, phrase):
    """
    If CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES is 1 we tokenize
    only family names; otherwise we perform the standard word tokenization.
    """
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    phrase = strip_accents(phrase)
    if CFG_BIBINDEX_AUTHOR_WORD_INDEX_EXCLUDE_FIRST_NAMES:
        return self.get_author_family_name_words_from_phrase(phrase)
    else:
        return self.tokenize_for_words_default(phrase)
def tokenize_for_words(self, phrase, recid):
    """Return list of words found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    formulas = []
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        formulas = latex_formula_re.findall(phrase)
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            stemmed_block = remove_stopwords(block, self.remove_stopwords)
            stemmed_block = length_check(stemmed_block)
            stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
            if stemmed_block:
                words[stemmed_block] = 1
            if re_arxiv.match(block):
                # special case for blocks like `arXiv:1007.5048' where
                # we would like to index the part after the colon
                # regardless of dot or other punctuation characters:
                words[block.split(":", 1)[1]] = 1
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                stemmed_subblock = length_check(stemmed_subblock)
                stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                if stemmed_subblock:
                    words[stemmed_subblock] = 1
                # 4th break each subblock into alphanumeric groups and add groups:
                for alphanumeric_group in re_separators.split(subblock):
                    stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                    stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                    stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                    if stemmed_alphanumeric_group:
                        words[stemmed_alphanumeric_group] = 1
    for block in formulas:
        words[block] = 1
    return words.keys()
def tokenize_for_pairs(self, phrase):
    """Return list of word pairs found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(' ', phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ''
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words['%s %s' % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
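# Self-contained sketch of the pair ("word couple") indexing idea used above:
# adjacent tokens are joined into bigrams across the phrase.  Stemming,
# stopword removal and the configured regexes are omitted here; tokenization
# is reduced to a plain whitespace split purely for illustration.
def tokenize_for_pairs_sketch(phrase):
    pairs = {}
    last_word = ''
    for word in phrase.lower().split():
        if last_word:
            pairs['%s %s' % (last_word, word)] = 1
        last_word = word
    return list(pairs.keys())


# ['higgs boson', 'boson search']
print(tokenize_for_pairs_sketch("Higgs boson search"))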
def tokenize_for_pairs(self, phrase, recid):
    """Return list of word pairs found in PHRASE.  Note that the phrase is
       split into groups depending on the alphanumeric characters and
       punctuation characters definition present in the config file.
    """
    if not self.isAuthority(recid):
        return []
    words = {}
    if self.remove_html_markup and phrase.find("</") > -1:
        phrase = remove_html_markup(phrase)
    if self.remove_latex_markup:
        phrase = remove_latex_markup(phrase)
        phrase = latex_formula_re.sub(" ", phrase)
    phrase = wash_for_utf8(phrase)
    phrase = lower_index_term(phrase)
    # 1st split phrase into blocks according to whitespace
    last_word = ""
    for block in strip_accents(phrase).split():
        # 2nd remove leading/trailing punctuation and add block:
        block = re_block_punctuation_begin.sub("", block)
        block = re_block_punctuation_end.sub("", block)
        if block:
            block = remove_stopwords(block, self.remove_stopwords)
            block = length_check(block)
            block = apply_stemming(block, self.stemming_language)
            # 3rd break each block into subblocks according to punctuation and add subblocks:
            for subblock in re_punctuation.split(block):
                subblock = remove_stopwords(subblock, self.remove_stopwords)
                subblock = length_check(subblock)
                subblock = apply_stemming(subblock, self.stemming_language)
                if subblock:
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        alphanumeric_group = length_check(alphanumeric_group)
                        alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                        if alphanumeric_group:
                            if last_word:
                                words["%s %s" % (last_word, alphanumeric_group)] = 1
                            last_word = alphanumeric_group
    return words.keys()
def test_strip_accents(self):
    """textutils - stripping of accented letters"""
    self.assertEqual("memememe", strip_accents('mémêmëmè'))
    self.assertEqual("MEMEMEME", strip_accents('MÉMÊMËMÈ'))
def _sort_case_insensitive_strip_accents(self, val):
    """Remove accents and convert to lower case"""
    if not val:
        return ''
    return strip_accents(str(val).lower())
def test_strip_accents(self):
    """textutils - transliterate to ascii (basic)"""
    self.assertEqual("memememe", strip_accents("mémêmëmè"))
    self.assertEqual("MEMEMEME", strip_accents("MÉMÊMËMÈ"))
    self.assertEqual("oe", strip_accents("œ"))
    self.assertEqual("OE", strip_accents("Œ"))