def replace_tokens(string):
    """Replace words of ``string`` that are keys of ``REPLACEMENT_TOKENS``.

    Every word reported by ``get_words(string)`` is looked up in the
    ``REPLACEMENT_TOKENS`` mapping; on a hit, all occurrences of that word's
    text in ``string`` are replaced with the mapped value.

    NOTE(review): ``str.replace`` operates on substrings, so a matching word
    is also replaced where it occurs inside a longer word -- confirm that
    this is acceptable for the token table in use.

    :param string: text to process.
    :return: the text with the replacements applied.
    """
    for word in get_words(string):
        # Direct mapping lookup instead of comparing the word against every
        # key in turn.
        if word in REPLACEMENT_TOKENS:
            string = string.replace(word, REPLACEMENT_TOKENS[word])
    return string
def remove_house_token(string):
    """Blank out house tokens that are followed by a number.

    The text is split with ``get_words(string, True)``. For each element
    found in ``HOUSE_TOKENS`` (the last element is never considered, since
    nothing follows it) the first following element that is not a separator
    is inspected; when it consists of digits only, the house token is
    replaced with an empty string.

    :param string: text to process.
    :return: the elements joined back together with matched tokens removed.
    """
    words = get_words(string, True)
    # ``words[:-1]`` bounds the scan to elements that have a successor; this
    # replaces the former per-iteration ``index not in xrange(...)`` test.
    for index, word in enumerate(words[:-1]):
        if word not in HOUSE_TOKENS:
            continue
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if next_word.isdigit():
                words[index] = u''
            # Only the first non-separator element decides the outcome.
            break
    return u''.join(words)
def _parse(self, token_types, get_tokens, set_token,
           is_parse_default_token=lambda: False, known_tokens=None):
    """Extract typed tokens from ``self.address`` and strip them from it.

    For every ``token_type`` the address is split with
    ``get_words(self.address, True)``. For each element equal to
    ``token_type`` (except when it is the last element),
    ``get_tokens(words, index + 1, known_tokens)`` is called and is expected
    to return a ``(token, token_indexes)`` pair. When ``token`` is truthy it
    is reported through ``set_token(token, token_type)``, and both the
    type word and the elements at ``token_indexes`` are removed.
    ``self.address`` is then rebuilt from the remaining elements.

    :param token_types: iterable of type words to look for.
    :param get_tokens: callable ``(words, start_index, known_tokens)``
        returning ``(token, token_indexes)``.
    :param set_token: callable ``(token, token_type)`` receiving each hit.
    :param is_parse_default_token: not used by this method body; kept so the
        signature stays compatible with existing callers.
    :param known_tokens: passed through to ``get_tokens``; defaults to a
        fresh empty list per call.
    """
    # A literal ``[]`` default would be one list shared by every call.
    if known_tokens is None:
        known_tokens = []

    for token_type in token_types:
        words = get_words(self.address, True)
        # Snapshot the positions up front (the original ``filter`` call was
        # eager too), without the Python-2-only tuple-parameter lambda.
        type_indexes = [index for index, word in enumerate(words)
                        if word == token_type]

        # A set ensures an index reported twice is popped only once;
        # popping it twice would delete an unrelated element.
        indexes_to_remove = set()
        for index in type_indexes:
            if index >= len(words) - 1:
                # The type word is the last element: nothing follows it.
                break
            token, token_indexes = get_tokens(words, index + 1, known_tokens)
            if token:
                set_token(token, token_type)
                indexes_to_remove.add(index)
                indexes_to_remove.update(token_indexes)

        # Pop from the end so earlier indexes stay valid.
        for index in sorted(indexes_to_remove, reverse=True):
            words.pop(index)
        self.address = u''.join(words)
def spaces_round_second_hyphen_in_word(string):
    """Put spaces around the second hyphen of each multi-hyphen word.

    For every word from ``get_words(string)`` the second ``'-'`` is located.
    If it exists and is not the last character of the word,
    ``insert_space`` is applied at the position right after it and then at
    its own position, and every occurrence of the original word in
    ``string`` is replaced with the result.

    :param string: text to process.
    :return: the text with the affected words replaced.
    """
    for original in get_words(string):
        first_hyphen = original.find('-')
        if first_hyphen == -1:
            continue
        second_hyphen = original.find('-', first_hyphen + 1)
        # No second hyphen, or it is the trailing character: leave as is.
        if second_hyphen == -1 or second_hyphen + 1 >= len(original):
            continue
        # Insert after the hyphen first so its own index stays valid.
        spaced = insert_space(original, second_hyphen + 1)
        spaced = insert_space(spaced, second_hyphen)
        string = string.replace(original, spaced)
    return string
def separate_house_and_housing(string):
    """Split combined "house<sep>housing" elements into separate parts.

    The text is split with ``get_words(string, True)``. An element is split
    when, for some token of ``HOUSE_NUMBER_SEPARATORS``, it divides into
    exactly two parts, both parts satisfy ``is_digit_or_one_alpha`` and at
    least one of them is all digits. The element is replaced by the first
    part, followed by a joiner and the second part. The joiner is
    ``u', корп. '`` when the second part is all digits, otherwise a single
    space.

    :param string: text to process.
    :return: the elements joined back together after splitting.
    """
    words = get_words(string, True)
    index = 0
    # ``words`` grows while it is scanned, so the bound is re-read each pass.
    while index < len(words):
        word = words[index]
        for token in HOUSE_NUMBER_SEPARATORS:
            subwords = word.split(token)
            # ``all``/``any`` replace ``len(filter(...))``, which only works
            # where ``filter`` returns a list.
            if (len(subwords) == 2
                    and all(is_digit_or_one_alpha(subword)
                            for subword in subwords)
                    and any(subword.isdigit() for subword in subwords)):
                house, housing = subwords
                joiner = u', корп. ' if housing.isdigit() else u' '
                words[index] = house
                words[index + 1:index + 1] = [joiner, housing]
                # One split per element: a repeated separator entry must not
                # insert the housing a second time.
                break
        index += 1
    return u''.join(words)
def _insert_comma_after_previous_word(words, index):
    """Prefix a comma to the element that follows the previous real word.

    Walks backwards from ``index - 1``. The walk stops without changes when
    a ``u','`` element is met first; separators are skipped. At the first
    other element, a comma is prepended to the element right after it.
    """
    # An explicit descending range avoids the ``words[index-1::-1]`` slice,
    # which for ``index == 0`` turned into ``words[-1::-1]`` and walked the
    # whole list starting from its end.
    for previous_index in range(index - 1, -1, -1):
        previous_word = words[previous_index]
        if previous_word == u',':
            return
        if is_separator(previous_word):
            continue
        words[previous_index + 1] = u',%s' % words[previous_index + 1]
        return


def replace_house_number_tokens(string):
    """Normalise housing tokens to ``u'корп.'`` and ensure a comma before.

    The text is split with ``get_words(string, True)``. An element (other
    than the last one) that is in ``HOUSE_NUMBER_TOKENS`` is replaced with
    ``u'корп.'`` when some following non-separator element satisfies
    ``is_digit_or_one_alpha``. A comma is then inserted after the nearest
    preceding non-separator element, unless a comma is met first while
    walking backwards.

    :param string: text to process.
    :return: the elements joined back together after the replacements.
    """
    words = get_words(string, True)
    # The last element has nothing after it, so it is never a candidate.
    for index in range(len(words) - 1):
        if words[index] not in HOUSE_NUMBER_TOKENS:
            continue
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if is_digit_or_one_alpha(next_word):
                words[index] = u'корп.'
                _insert_comma_after_previous_word(words, index)
                # Stop at the first match: handling further matches for the
                # same token would insert the comma again.
                break
    return u''.join(words)
def _swap_with_previous_capitalized(words, index):
    """Swap ``words[index]`` with the nearest previous word if capitalised.

    Walks backwards from ``index - 1``, skipping separators. The walk stops
    at a ``','`` element or at the first non-separator element; that element
    is swapped with ``words[index]`` only when its first character is
    upper-case.
    """
    for previous_index in range(index - 1, -1, -1):
        previous_word = words[previous_index]
        if previous_word == ',':
            return
        if is_separator(previous_word):
            continue
        # ``[:1]`` instead of ``[0]`` so an empty element cannot raise.
        if previous_word[:1].isupper():
            words[index], words[previous_index] = previous_word, words[index]
        return


def rearranged_words(string):
    """Move ``REARRANGED_WORDS`` entries in front of a capitalised word.

    The text is split with ``get_words(string, True)``. Each element found
    in ``REARRANGED_WORDS`` is swapped with the nearest preceding
    non-separator element when that element starts with an upper-case
    character and no ``','`` lies in between. After re-joining, every key of
    ``REARRANGED_WORDS_REPLACEMENT_TOKENS`` is replaced by its value until
    the key no longer occurs in the text.

    :param string: text to process.
    :return: the rearranged text with the replacement tokens applied.
    """
    words = get_words(string, True)
    for index, word in enumerate(words):
        # A membership test handles the word once even if the token
        # collection lists it more than once.
        if word in REARRANGED_WORDS:
            _swap_with_previous_capitalized(words, index)

    string = u''.join(words)
    for token in REARRANGED_WORDS_REPLACEMENT_TOKENS:
        # An empty token is "in" every string; skip it so the loop can end.
        while token and token in string:
            replacement = REARRANGED_WORDS_REPLACEMENT_TOKENS[token]
            string = string.replace(token, replacement)
            if token in replacement:
                # The replacement re-introduces the token, so repeating
                # would never finish; one pass is all that can be done.
                break
    return string
def remove_house_token_from_names(string):
    """Strip a trailing ``HOUSE_TOKEN`` from every element of ``string``.

    The text is split with ``get_words(string, True)``; each element whose
    tail equals ``HOUSE_TOKEN`` loses that tail, and the elements are joined
    back together.

    :param string: text to process.
    :return: the text with the trailing tokens removed.
    """
    pieces = get_words(string, True)
    tail_length = len(HOUSE_TOKEN)

    def without_tail(piece):
        # Compare the last ``tail_length`` characters with the token.
        if piece[-tail_length:] == HOUSE_TOKEN:
            return piece[:-tail_length]
        return piece

    return u''.join(without_tail(piece) for piece in pieces)