def extract_searchable_text(entry):
    """Build the searchable text lines for one bibliography entry.

    Returns a tuple `(cite_key, text_lines)`, where `cite_key` is the
    entry's "pid" field and `text_lines` is a list of lowercase,
    ASCII-transliterated strings extracted from the fields named in
    `FIELDS_TO_GREP` (plus the cite-key itself).
    """
    cite_key = entry["pid"]
    # Include the cite-key in the text that can be searched.
    text_lines = [cite_key]
    for field, funcs in FIELDS_TO_GREP:
        # NOTE: was `entry.has_key(field)` -- deprecated in Python 2,
        # removed in Python 3; the `in` operator is equivalent.
        if field not in entry:
            continue
        value = entry[field]
        # Apply field-specific processing functions.  A function may map a
        # single value to a list, so flatten one level as we go.
        for func in funcs:
            # `isinstance` instead of `type(x) == types.ListType`: same
            # behavior here, but idiomatic and portable to Python 3.
            if isinstance(value, list):
                result_values = []
                for v in value:
                    res = func(v)
                    if isinstance(res, list):
                        result_values.extend(res)
                    else:
                        result_values.append(res)
                value = result_values
            else:
                value = func(value)
        # Transliterate to ASCII and convert to lowercase.
        if isinstance(value, list):
            value = [
                unicode_string_utils.transliterate_to_ascii(v).lower()
                for v in value
            ]
        else:
            value = unicode_string_utils.transliterate_to_ascii(value).lower()
        # Add to the searchable text lines for this entry.
        if isinstance(value, list):
            # Remove duplicates.
            value = list(set(value))
            text_lines.extend(value)
        else:
            text_lines.append(value)
    return (cite_key, text_lines)
# NOTE(review): this is a byte-for-byte duplicate of the
# `extract_searchable_text` defined earlier in this file; the later
# definition silently shadows the earlier one.  Consider deleting one.
def extract_searchable_text(entry):
    """Build the searchable text lines for one bibliography entry.

    Returns a tuple `(cite_key, text_lines)`, where `cite_key` is the
    entry's "pid" field and `text_lines` is a list of lowercase,
    ASCII-transliterated strings extracted from the fields named in
    `FIELDS_TO_GREP` (plus the cite-key itself).
    """
    cite_key = entry["pid"]
    # Include the cite-key in the text that can be searched.
    text_lines = [cite_key]
    for field, funcs in FIELDS_TO_GREP:
        # `field in entry` replaces `entry.has_key(field)`, which is
        # deprecated in Python 2 and removed in Python 3.
        if field not in entry:
            continue
        value = entry[field]
        # Apply field-specific processing functions.  A function may map a
        # single value to a list, so flatten one level as we go.
        for func in funcs:
            # `isinstance` replaces the `type(x) == types.ListType`
            # anti-pattern; behavior is identical for plain lists.
            if isinstance(value, list):
                result_values = []
                for v in value:
                    res = func(v)
                    if isinstance(res, list):
                        result_values.extend(res)
                    else:
                        result_values.append(res)
                value = result_values
            else:
                value = func(value)
        # Transliterate to ASCII and convert to lowercase.
        if isinstance(value, list):
            value = [
                unicode_string_utils.transliterate_to_ascii(v).lower()
                for v in value
            ]
        else:
            value = unicode_string_utils.transliterate_to_ascii(value).lower()
        # Add to the searchable text lines for this entry.
        if isinstance(value, list):
            # Remove duplicates.
            value = list(set(value))
            text_lines.extend(value)
        else:
            text_lines.append(value)
    return (cite_key, text_lines)
def normalise_hyphens_strip_punctuation(w):
    """Lowercase/ASCII-fold word `w`, strip punctuation, and tidy hyphens.

    `w` is assumed to contain no whitespace.  Hyphens inside the word are
    preserved, but runs of adjacent hyphens collapse to one, and the result
    never starts or ends with a hyphen.
    """
    # Transliterate first so that any hyphen-like Unicode characters become
    # real ASCII hyphens, then lowercase the now-ASCII text.
    folded = unicode_string_utils.transliterate_to_ascii(w).lower()
    # Split on hyphens BEFORE stripping punctuation so the hyphens survive
    # the cleanup and can be re-inserted afterwards.  Dropping empty pieces
    # collapses multiple adjacent hyphens and also guarantees the rejoined
    # word has no leading or trailing hyphen.
    cleaned_pieces = [
        strip_punctuation_and_whitespace(piece)
        for piece in folded.split("-")
        if piece
    ]
    return "-".join(cleaned_pieces)
def build_query(expr):
    """Tokenize query expression `expr` into normalized search terms.

    Colons are treated as token separators; each token is transliterated
    to ASCII and lowercased.  Returns the list of normalized tokens.
    """
    normalized_terms = []
    for token in expr.replace(":", " ").split():
        ascii_token = unicode_string_utils.transliterate_to_ascii(token)
        normalized_terms.append(ascii_token.lower())
    return normalized_terms
def normalise_to_ascii_lower(s):
    """Return `s` transliterated to ASCII, de-punctuated, and lowercased.

    The transliteration step runs first because it may itself introduce
    punctuation or whitespace that the stripping step must then remove.
    """
    ascii_form = unicode_string_utils.transliterate_to_ascii(s)
    stripped = strip_punctuation_and_whitespace(ascii_form)
    return stripped.lower()
# NOTE(review): this duplicates the `build_query` defined earlier in this
# file; the later definition shadows the earlier one.  Consider deleting one.
def build_query(expr):
    """Tokenize query expression `expr` into normalized search terms.

    Colons are treated as token separators; each token is transliterated
    to ASCII and lowercased.  Returns the list of normalized tokens.
    """
    raw_tokens = expr.replace(":", " ").split()
    to_ascii = unicode_string_utils.transliterate_to_ascii
    return [to_ascii(tok).lower() for tok in raw_tokens]