Esempio n. 1
0
def replace_tokens(string):
    """Replace every known token found in ``string`` with its substitute.

    The words of the *original* string are obtained once from ``get_words``.
    For each word that is a key of ``REPLACEMENT_TOKENS``, every occurrence
    of that word in the (progressively rewritten) string is replaced by the
    mapped value.

    :param string: text to normalise.
    :return: the text with the replacements applied.
    """
    for word in get_words(string):
        # A membership test replaces the previous key-by-key comparison loop.
        if word in REPLACEMENT_TOKENS:
            # NOTE(review): str.replace works on substrings, so the token is
            # also rewritten where it is embedded in a longer word. This is
            # unchanged from the previous behaviour; confirm it is intended.
            string = string.replace(word, REPLACEMENT_TOKENS[word])
    return string
Esempio n. 2
0
def remove_house_token(string):
    """Blank out house tokens that are directly followed by a number.

    A word listed in ``HOUSE_TOKENS`` is replaced by an empty string when the
    first following word that is not a separator consists of digits only.
    The last word is never examined because nothing can follow it.

    :param string: text to normalise.
    :return: the words re-joined into a single string.
    """
    words = get_words(string, True)
    last_index = len(words) - 1
    for index, word in enumerate(words):
        # Plain comparison instead of ``index not in xrange(...)``: same
        # result for the non-negative indexes produced by enumerate, without
        # a linear membership scan and without the Python-2-only ``xrange``.
        if index >= last_index:
            break
        if word not in HOUSE_TOKENS:
            continue
        # Only the first non-separator word after the token decides.
        following = next(
            (candidate for candidate in words[index + 1:]
             if not is_separator(candidate)),
            None)
        if following is not None and following.isdigit():
            words[index] = u''
    return u''.join(words)
Esempio n. 3
0
 def _parse(self, token_types, get_tokens, set_token, is_parse_default_token=lambda: False, known_tokens=None):
     """Extract tokens introduced by marker words and strip them from ``self.address``.

     For every ``token_type`` the address is split with ``get_words`` and each
     word equal to ``token_type`` (other than the very last word) is handed to
     ``get_tokens``. When a token is found it is stored through ``set_token``
     and both the marker word and the words reported by ``get_tokens`` are
     removed from the address.

     :param token_types: iterable of marker words to look for.
     :param get_tokens: callable ``(words, start_index, known_tokens)`` that
         returns a ``(token, token_indexes)`` pair; a falsy ``token`` means
         nothing was recognised.
     :param set_token: callable ``(token, token_type)`` that stores a result.
     :param is_parse_default_token: accepted for interface compatibility;
         not used by this method body.
     :param known_tokens: list forwarded unchanged to ``get_tokens``.
         Defaults to a fresh empty list per call.
     """
     # A literal ``[]`` default would be one list object shared by every call,
     # so anything ``get_tokens`` appended to it would leak between calls.
     if known_tokens is None:
         known_tokens = []
     for token_type in token_types:
         words = get_words(self.address, True)
         # Marker positions are collected before any callback runs, as the
         # eager Python 2 ``filter`` did. This avoids the Python-2-only
         # tuple-parameter lambda.
         marker_indexes = [index for index, word in enumerate(words) if word == token_type]
         token_indexes_for_type = []
         for index in marker_indexes:
             # A marker in the last position has nothing after it to parse.
             if index >= len(words) - 1:
                 break
             token, token_indexes = get_tokens(words, index + 1, known_tokens)
             if token:
                 set_token(token, token_type)
                 token_indexes.append(index)
                 token_indexes_for_type.extend(token_indexes)
         # Pop from the highest index down so earlier removals do not shift
         # the positions that are still pending.
         for index in sorted(token_indexes_for_type, reverse=True):
             words.pop(index)
         self.address = u''.join(words)
Esempio n. 4
0
def spaces_round_second_hyphen_in_word(string):
    """Put spaces around the second hyphen of every word that has one.

    A hyphen that is the last character of a word ends the search for that
    word, so such a word is left untouched. For a qualifying word
    ``insert_space`` is called first for the position right after the second
    hyphen and then for the hyphen's own position, and every occurrence of
    the original word in ``string`` is replaced by the result.

    :param string: text to normalise.
    :return: the text with the affected words rewritten.
    """
    for original in get_words(string):
        first = original.find('-')
        if first == -1 or first + 1 >= len(original):
            continue
        second = original.find('-', first + 1)
        if second == -1 or second + 1 >= len(original):
            continue
        # Right-hand position first, then the hyphen position itself.
        spaced = insert_space(insert_space(original, second + 1), second)
        string = string.replace(original, spaced)
    return string
Esempio n. 5
0
def separate_house_and_housing(string):
    """Split words of the form ``<house><separator><housing>`` into two words.

    Each word is split on every token of ``HOUSE_NUMBER_SEPARATORS``. When the
    split yields exactly two parts, both accepted by
    ``is_digit_or_one_alpha`` and at least one of them all digits, the word is
    replaced by the first part, followed by ``u', корп. '`` (second part is
    all digits) or ``u' '`` (otherwise), followed by the second part.

    :param string: text to normalise.
    :return: the words re-joined into a single string.
    """
    words = get_words(string, True)
    index = 0
    # ``words`` grows while it is being walked, so the bound is re-evaluated
    # on every pass instead of iterating over a snapshot.
    while index < len(words):
        word = words[index]
        for token in HOUSE_NUMBER_SEPARATORS:
            subwords = word.split(token)
            # all()/any() replace ``len(filter(...))``, which built throwaway
            # lists and only works where ``filter`` returns a list.
            if (len(subwords) == 2
                    and all(is_digit_or_one_alpha(subword) for subword in subwords)
                    and any(subword.isdigit() for subword in subwords)):
                words[index] = subwords[0]
                # Both inserts target index + 1, so the joiner inserted second
                # ends up between the two parts.
                words.insert(index + 1, subwords[1])
                words.insert(index + 1, u', корп. ' if subwords[1].isdigit() else u' ')
        index += 1
    return u''.join(words)
Esempio n. 6
0
def replace_house_number_tokens(string):
    """Normalise housing markers to ``u'корп.'`` and put a comma before them.

    A word equal to one of ``HOUSE_NUMBER_TOKENS`` is rewritten to
    ``u'корп.'`` when a later word accepted by ``is_digit_or_one_alpha``
    follows it (separators and other words in between are skipped). A comma
    is then placed right after the closest preceding non-separator word,
    unless a ``u','`` word is met first while walking backwards.

    :param string: text to normalise.
    :return: the words re-joined into a single string.
    """
    words = get_words(string, True)
    # The list length never changes here, and the last word cannot be
    # followed by a number, so it is excluded from the walk.
    for index in range(len(words) - 1):
        word = words[index]
        if not any(word == token for token in HOUSE_NUMBER_TOKENS):
            continue
        for next_word in words[index + 1:]:
            if is_separator(next_word):
                continue
            if not is_digit_or_one_alpha(next_word):
                continue
            words[index] = u'корп.'
            # ``words[:index]`` is empty for index 0. The former
            # ``words[index-1::-1]`` wrapped around to the end of the list in
            # that case and could modify the last word.
            for offset, previous_word in enumerate(reversed(words[:index])):
                if previous_word == u',':
                    break
                if is_separator(previous_word):
                    continue
                # previous_word sits at index - 1 - offset, so prefixing the
                # element right after it places the comma directly behind
                # that word.
                target = index - offset
                words[target] = u',%s' % words[target]
                break
            # Stop after the first replacement, otherwise every further
            # number in the string would insert another comma.
            break
    return u''.join(words)
Esempio n. 7
0
def rearranged_words(string):
    """Swap listed words with a preceding capitalised word, then tidy up.

    For each word equal to an entry of ``REARRANGED_WORDS`` the words before
    it are walked backwards, skipping separators. The walk stops at a ``','``
    word or at the first non-separator word; if that word starts with an
    upper-case character the two words trade places. Afterwards every key of
    ``REARRANGED_WORDS_REPLACEMENT_TOKENS`` is replaced by its value until it
    no longer occurs in the joined string.

    :param string: text to normalise.
    :return: the rewritten string.
    """
    words = get_words(string, True)
    for position, current in enumerate(words):
        for candidate in REARRANGED_WORDS:
            if candidate != current:
                continue
            # Walk backwards from the word just before ``position``.
            for earlier in range(position - 1, -1, -1):
                neighbour = words[earlier]
                if neighbour == ',':
                    break
                if is_separator(neighbour):
                    continue
                if neighbour[0].isupper():
                    words[position], words[earlier] = words[earlier], words[position]
                break
    joined = u''.join(words)
    for pattern in REARRANGED_WORDS_REPLACEMENT_TOKENS:
        replacement = REARRANGED_WORDS_REPLACEMENT_TOKENS[pattern]
        # Repeat until stable: one replacement may create a new occurrence.
        while pattern in joined:
            joined = joined.replace(pattern, replacement)
    return joined
Esempio n. 8
0
def remove_house_token_from_names(string):
    """Cut a trailing ``HOUSE_TOKEN`` off every word of ``string``.

    :param string: text to normalise.
    :return: the words, with the suffix removed where present, re-joined
        into a single string.
    """
    suffix_length = len(HOUSE_TOKEN)
    trimmed = [
        word[:-suffix_length] if word[-suffix_length:] == HOUSE_TOKEN else word
        for word in get_words(string, True)
    ]
    return u''.join(trimmed)