Esempio n. 1
0
# Word-level normalizations applied to the lower-cased address. No value is
# itself a key, so the order in which the substitutions run does not matter.
_ADDRESS_ABBR = {
    'road': 'rd', 'street': 'st', 'avenue': 'av', 'ave': 'av', 'drive': 'dr',
    'boulevard': 'blvd', 'lane': 'ln', 'circle': 'cir', 'building': 'building',
    'mount': 'mt',
    'n': 'north', 'e': 'east', 's': 'south', 'w': 'west',
    'suite': 'ste', 'bv': 'blvd', 'suit': 'ste',
    'pky': 'pkwy', 'parkway': 'pkwy',
    'first': '1st', 'second': '2nd', 'third': '3rd', 'fourth': '4th',
    'fifth': '5th', 'sixth': '6th', 'seventh': '7th', 'eighth': '8th',
    'ninth': '9th', 'tenth': '10th',
}


def split_address(x):
    """Split a free-form address string into number, street and suite.

    The input is lower-cased, stripped of line breaks and punctuation
    (except '&'), and common words are normalized via ``_ADDRESS_ABBR``.
    A leading run of digits (plus at most one trailing non-digit) becomes
    ``num``; everything from the first ``ste``/``bldg``/``unit`` marker
    onwards becomes ``suite``; the rest is ``street``.

    Args:
        x: The raw address string. Any float (presumably a NaN placeholder
            for a missing value -- confirm with callers) is treated as an
            empty address.

    Returns:
        A ``pd.Series`` built from the address dict (keys ``num``,
        ``street`` and ``suite`` before it is handed to
        ``lib.strip_address_letters``), with runs of whitespace collapsed.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64.
    if isinstance(x, float):
        x = ''
    address = {'suite': ''}
    x = x.lower().replace('\r', '').replace('\n', '')
    x = x.replace(' ste-', ' ste ')

    words = x.split()
    # some street numbers have a dash, append the second part to suite
    if words and words[0].find('-') > 0:
        parts = words[0].split('-')
        address['suite'] += parts[-1] + ' '
        x = ' '.join([parts[0]] + words[1:])

    x = x.replace('-', ' ')
    x = re.sub('[%s]' % re.escape(string.punctuation.replace('&', '')), '', x)
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for key, value in _ADDRESS_ABBR.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)
    x = x.strip()

    n = re.findall(r'^(\d+\D?)\b', x)
    # NOTE(review): alternation binds looser than the anchors, so \b guards
    # only the 'ste' alternative and \Z (or the trailing \b below) only the
    # 'unit' alternative; a word that merely starts with 'ste' (e.g.
    # 'steven') is therefore treated as a suite marker. Patterns are left
    # unchanged to preserve existing matching -- confirm the intent.
    s = re.search(r'\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z', x)
    # The marker search can fail even though ``s`` matched (e.g. a trailing
    # 'units'); in that case fall back to the plain number/street split
    # instead of raising AttributeError on None.
    i = re.search(r'\b(ste)|(bldg)|(unit)\b', x) if s is not None else None
    if n and i is not None:
        # Split at the position of the regex match itself. Searching for the
        # matched text with str.find could hit the same letters inside an
        # earlier word (e.g. the 'ste' in 'western') and cut the street there.
        cut = i.start()
        address['num'] = n[0]
        address['street'] = re.sub(r'^(\d+\D?)\b', '', x[:cut])
        address['suite'] += x[cut:]
    elif n:
        address['num'] = n[0]
        address['street'] = re.sub(r'^(\d+\D?)\b', '', x)
    elif x:
        # No leading street number: keep the whole cleaned string as the
        # street (previously only its first character was kept).
        address = {'num': '', 'street': x, 'suite': ''}
    else:
        address = {'num': '', 'street': '', 'suite': ''}

    address = lib.strip_address_letters(address)

    # remove extra white spaces
    address = {key: re.sub(r'\s+', ' ', value).strip()
               for key, value in address.items()}

    return pd.Series(address)
Esempio n. 2
0
def split_address(x):
    """Split a free-form address string into number, street and suite.

    The input is lower-cased, stripped of line breaks and punctuation
    (except '&'), and words are normalized with the mapping returned by
    ``lib.address_abbr()``. A leading run of digits (plus at most one
    trailing non-digit) becomes ``num``; everything from the first
    ``ste``/``bldg``/``unit`` marker onwards becomes ``suite``; the rest is
    ``street``. The suite is finally passed through ``lib.clean_string``.

    Args:
        x: The raw address string. Any float (presumably a NaN placeholder
            for a missing value -- confirm with callers) is treated as an
            empty address.

    Returns:
        A ``pd.Series`` built from the address dict (keys ``num``,
        ``street`` and ``suite`` before it is handed to
        ``lib.strip_address_letters``), with runs of whitespace collapsed.
    """
    # isinstance (rather than an exact type comparison) also catches float
    # subclasses such as numpy.float64.
    if isinstance(x, float):
        x = ''
    address = {'suite': ''}
    x = x.lower().replace('\r', '').replace('\n', '')
    x = x.replace(' ste-', ' ste ')

    words = x.split()
    # some street numbers have a dash, append the second part to suite
    if words and words[0].find('-') > 0:
        parts = words[0].split('-')
        address['suite'] += parts[-1] + ' '
        x = ' '.join([parts[0]] + words[1:])

    x = x.replace('-', ' ')
    x = re.sub('[%s]' % re.escape(string.punctuation.replace('&', '')), '', x)
    abbr = lib.address_abbr()
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for key, value in abbr.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)
    x = x.strip()

    n = re.findall(r'^(\d+\D?)\b', x)
    # NOTE(review): alternation binds looser than the anchors, so \b guards
    # only the 'ste' alternative and \Z (or the trailing \b below) only the
    # 'unit' alternative; a word that merely starts with 'ste' (e.g.
    # 'steven') is therefore treated as a suite marker. Patterns are left
    # unchanged to preserve existing matching -- confirm the intent.
    s = re.search(r'\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z', x)
    # The marker search can fail even though ``s`` matched (e.g. a trailing
    # 'units'); in that case fall back to the plain number/street split
    # instead of raising AttributeError on None.
    i = re.search(r'\b(ste)|(bldg)|(unit)\b', x) if s is not None else None
    if n and i is not None:
        # Split at the position of the regex match itself. Searching for the
        # matched text with str.find could hit the same letters inside an
        # earlier word (e.g. the 'ste' in 'western') and cut the street there.
        cut = i.start()
        address['num'] = n[0]
        address['street'] = re.sub(r'^(\d+\D?)\b', '', x[:cut])
        address['suite'] += x[cut:]
    elif n:
        address['num'] = n[0]
        address['street'] = re.sub(r'^(\d+\D?)\b', '', x)
    elif x:
        # No leading street number: keep the whole cleaned string as the
        # street (previously only its first character was kept).
        address = {'num': '', 'street': x, 'suite': ''}
    else:
        address = {'num': '', 'street': '', 'suite': ''}

    address = lib.strip_address_letters(address)

    # remove extra white spaces
    address = {key: re.sub(r'\s+', ' ', value).strip()
               for key, value in address.items()}

    address['suite'] = lib.clean_string(address['suite'])
    return pd.Series(address)
Esempio n. 3
0
def split_address(x):
    """Split a free-form address string into number, street and suite.

    The input is lower-cased, stripped of line breaks and punctuation
    (except "&"), and words are normalized with the mapping returned by
    ``lib.address_abbr()``. A leading run of digits (plus at most one
    trailing non-digit) becomes ``num``; everything from the first
    ``ste``/``bldg``/``unit`` marker onwards becomes ``suite``; the rest is
    ``street``.

    Args:
        x: The raw address string. Any float (presumably a NaN placeholder
            for a missing value -- confirm with callers) is treated as an
            empty address; empty and whitespace-only strings yield empty
            fields instead of raising IndexError.

    Returns:
        A ``pd.Series`` built from the address dict (keys ``num``,
        ``street`` and ``suite`` before it is handed to
        ``lib.strip_address_letters``), with runs of whitespace collapsed.
    """
    # Floats have no .lower(); treat them as a missing address.
    if isinstance(x, float):
        x = ""
    address = {"suite": ""}
    x = x.lower().replace("\r", "").replace("\n", "")
    x = x.replace(" ste-", " ste ")

    words = x.split()
    # some street numbers have a dash, append the second part to suite
    # (the ``words`` check keeps empty input from raising IndexError)
    if words and words[0].find("-") > 0:
        parts = words[0].split("-")
        address["suite"] += parts[-1] + " "
        x = " ".join([parts[0]] + words[1:])

    x = x.replace("-", " ")
    x = re.sub("[%s]" % re.escape(string.punctuation.replace("&", "")), "", x)
    abbr = lib.address_abbr()
    # .items() works on both Python 2 and Python 3 (iteritems is Python 2 only).
    for key, value in abbr.items():
        x = re.sub(r"\b(%s)\b" % key, value, x)
    x = x.strip()

    n = re.findall(r"^(\d+\D?)\b", x)
    # NOTE(review): alternation binds looser than the anchors, so \b guards
    # only the "ste" alternative and \Z (or the trailing \b below) only the
    # "unit" alternative; a word that merely starts with "ste" (e.g.
    # "steven") is therefore treated as a suite marker. Patterns are left
    # unchanged to preserve existing matching -- confirm the intent.
    s = re.search(r"\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z", x)
    # The marker search can fail even though ``s`` matched (e.g. a trailing
    # "units"); in that case fall back to the plain number/street split
    # instead of raising AttributeError on None.
    i = re.search(r"\b(ste)|(bldg)|(unit)\b", x) if s is not None else None
    if n and i is not None:
        # Split at the position of the regex match itself. Searching for the
        # matched text with str.find could hit the same letters inside an
        # earlier word (e.g. the "ste" in "western") and cut the street there.
        cut = i.start()
        address["num"] = n[0]
        address["street"] = re.sub(r"^(\d+\D?)\b", "", x[:cut])
        address["suite"] += x[cut:]
    elif n:
        address["num"] = n[0]
        address["street"] = re.sub(r"^(\d+\D?)\b", "", x)
    else:
        # No leading street number: keep the whole cleaned string as the
        # street (previously only its first character was kept, and an
        # empty string raised IndexError).
        address = {"num": "", "street": x, "suite": ""}

    address = lib.strip_address_letters(address)

    # remove extra white spaces
    address = {
        key: re.sub(r"\s+", " ", value).strip() for key, value in address.items()
    }

    return pd.Series(address)