def split_address(x):
    """Split a free-form street address into number, street and suite parts.

    The address is lower-cased, stripped of punctuation (``&`` is kept),
    common words are rewritten to fixed abbreviations, and the text is then
    cut into a leading street number, the street, and a trailing
    suite/unit/building part.

    Args:
        x: Raw address string.  A float (for example the NaN pandas uses for
            a missing value) is treated as an empty address.

    Returns:
        ``pd.Series`` built from the address dict.  This function fills the
        keys ``'num'``, ``'street'`` and ``'suite'``; the dict is passed
        through ``lib.strip_address_letters`` (project helper, behaviour not
        visible here) before whitespace is collapsed in every value.
    """
    # isinstance (rather than type(x) == float) also covers float subclasses
    # such as numpy.float64.
    if isinstance(x, float):
        x = ''

    address = {'suite': ''}
    x = x.lower().replace('\r', '').replace('\n', '')
    x = x.replace(' ste-', ' ste ')

    words = x.split()
    # Some street numbers have a dash ("12-3 main st"): keep the first part
    # as the number and move the last part into the suite.
    if words and words[0].find('-') > 0:
        parts = words[0].split('-')
        address['suite'] += parts[-1] + ' '
        x = ' '.join([parts[0]] + words[1:])

    x = x.replace('-', ' ')
    # Drop every punctuation character except '&'.
    x = re.sub('[%s]' % re.escape(string.punctuation.replace('&', '')), '', x)

    abbr = {
        'road': 'rd', 'street': 'st', 'avenue': 'av', 'ave': 'av',
        'drive': 'dr', 'boulevard': 'blvd', 'lane': 'ln', 'circle': 'cir',
        'building': 'building', 'mount': 'mt',
        'n': 'north', 'e': 'east', 's': 'south', 'w': 'west',
        'suite': 'ste', 'bv': 'blvd', 'suit': 'ste',
        'pky': 'pkwy', 'parkway': 'pkwy',
        'first': '1st', 'second': '2nd', 'third': '3rd', 'fourth': '4th',
        'fifth': '5th', 'sixth': '6th', 'seventh': '7th', 'eighth': '8th',
        'ninth': '9th', 'tenth': '10th',
    }
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for key, value in abbr.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)
    x = x.strip()

    num_pattern = r'^(\d+\D?)\b'
    n = re.findall(num_pattern, x)
    # Loose pre-check kept from the original: it is true when "ste" starts a
    # word anywhere in the text, or when "unit..." runs to the end of it.
    s = re.search(r'\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z', x)
    # Position where the suite part starts: the first ste/bldg/unit that
    # begins a word and is not followed by another letter, so "ste 5" and
    # "ste100" qualify while "stevens" or "community" do not.  The original
    # pattern r'\b(ste)|(bldg)|(unit)\b' applied each \b to one alternative
    # only, and could fail to match at all even though the pre-check passed.
    marker = re.search(r'\b(?:ste|bldg|unit)(?![a-z])', x)

    if n and s is not None and marker is not None:
        # Split at the match position.  The original used
        # x.find(i.group()), which returns the first occurrence of the
        # substring (e.g. the "ste" inside "western").
        cut = marker.start()
        address['num'] = n[0]
        address['street'] = re.sub(num_pattern, '', x[:cut])
        address['suite'] += x[cut:]
    elif n:
        address['num'] = n[0]
        address['street'] = re.sub(num_pattern, '', x)
    elif x:
        # No leading number: keep the whole text as the street.  The original
        # stored x[0], i.e. only the first character.
        address = {'num': '', 'street': x, 'suite': ''}
    else:
        address = {'num': '', 'street': '', 'suite': ''}

    address = lib.strip_address_letters(address)

    # Remove extra white spaces; values are replaced in place so the mapping
    # returned by the helper keeps its type and key order.
    for key, value in list(address.items()):
        address[key] = re.sub(r'\s+', ' ', value).strip()

    return pd.Series(address)
def split_address(x):
    """Split a free-form street address into number, street and suite parts.

    The address is lower-cased, stripped of punctuation (``&`` is kept),
    words are rewritten using the mapping returned by ``lib.address_abbr()``,
    and the text is then cut into a leading street number, the street, and a
    trailing suite/unit/building part.

    Args:
        x: Raw address string.  A float (for example the NaN pandas uses for
            a missing value) is treated as an empty address.

    Returns:
        ``pd.Series`` built from the address dict.  This function fills the
        keys ``'num'``, ``'street'`` and ``'suite'``; the dict is passed
        through ``lib.strip_address_letters`` and the suite through
        ``lib.clean_string`` (project helpers, behaviour not visible here).
    """
    # isinstance (rather than type(x) == float) also covers float subclasses
    # such as numpy.float64.
    if isinstance(x, float):
        x = ''

    address = {'suite': ''}
    x = x.lower().replace('\r', '').replace('\n', '')
    x = x.replace(' ste-', ' ste ')

    words = x.split()
    # Some street numbers have a dash ("12-3 main st"): keep the first part
    # as the number and move the last part into the suite.
    if words and words[0].find('-') > 0:
        parts = words[0].split('-')
        address['suite'] += parts[-1] + ' '
        x = ' '.join([parts[0]] + words[1:])

    x = x.replace('-', ' ')
    # Drop every punctuation character except '&'.
    x = re.sub('[%s]' % re.escape(string.punctuation.replace('&', '')), '', x)

    # presumably a {word: abbreviation} dict - confirm against lib.address_abbr
    abbr = lib.address_abbr()
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for key, value in abbr.items():
        x = re.sub(r'\b(%s)\b' % key, value, x)
    x = x.strip()

    num_pattern = r'^(\d+\D?)\b'
    n = re.findall(num_pattern, x)
    # Loose pre-check kept from the original: it is true when "ste" starts a
    # word anywhere in the text, or when "unit..." runs to the end of it.
    s = re.search(r'\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z', x)
    # Position where the suite part starts: the first ste/bldg/unit that
    # begins a word and is not followed by another letter, so "ste 5" and
    # "ste100" qualify while "stevens" or "community" do not.  The original
    # pattern r'\b(ste)|(bldg)|(unit)\b' applied each \b to one alternative
    # only, and could fail to match at all even though the pre-check passed.
    marker = re.search(r'\b(?:ste|bldg|unit)(?![a-z])', x)

    if n and s is not None and marker is not None:
        # Split at the match position.  The original used
        # x.find(i.group()), which returns the first occurrence of the
        # substring (e.g. the "ste" inside "western").
        cut = marker.start()
        address['num'] = n[0]
        address['street'] = re.sub(num_pattern, '', x[:cut])
        address['suite'] += x[cut:]
    elif n:
        address['num'] = n[0]
        address['street'] = re.sub(num_pattern, '', x)
    elif x:
        # No leading number: keep the whole text as the street.  The original
        # stored x[0], i.e. only the first character.
        address = {'num': '', 'street': x, 'suite': ''}
    else:
        address = {'num': '', 'street': '', 'suite': ''}

    address = lib.strip_address_letters(address)

    # Remove extra white spaces; values are replaced in place so the mapping
    # returned by the helper keeps its type and key order.
    for key, value in list(address.items()):
        address[key] = re.sub(r'\s+', ' ', value).strip()

    address['suite'] = lib.clean_string(address['suite'])

    return pd.Series(address)
def split_address(x):
    """Split a free-form street address into number, street and suite parts.

    The address is lower-cased, stripped of punctuation (``&`` is kept),
    words are rewritten using the mapping returned by ``lib.address_abbr()``,
    and the text is then cut into a leading street number, the street, and a
    trailing suite/unit/building part.

    Args:
        x: Raw address string.  A float (for example the NaN pandas uses for
            a missing value) is treated as an empty address; an empty or
            whitespace-only string yields empty parts instead of raising
            ``IndexError``.

    Returns:
        ``pd.Series`` built from the address dict.  This function fills the
        keys ``"num"``, ``"street"`` and ``"suite"``; the dict is passed
        through ``lib.strip_address_letters`` (project helper, behaviour not
        visible here) before whitespace is collapsed in every value.
    """
    # A float has no .lower(); treat it as a missing address.
    if isinstance(x, float):
        x = ""

    address = {"suite": ""}
    x = x.lower().replace("\r", "").replace("\n", "")
    x = x.replace(" ste-", " ste ")

    words = x.split()
    # Some street numbers have a dash ("12-3 main st"): keep the first part
    # as the number and move the last part into the suite.  The `words`
    # check avoids indexing an empty list for blank input.
    if words and words[0].find("-") > 0:
        parts = words[0].split("-")
        address["suite"] += parts[-1] + " "
        x = " ".join([parts[0]] + words[1:])

    x = x.replace("-", " ")
    # Drop every punctuation character except '&'.
    x = re.sub("[%s]" % re.escape(string.punctuation.replace("&", "")), "", x)

    # presumably a {word: abbreviation} dict - confirm against lib.address_abbr
    abbr = lib.address_abbr()
    # .items() works on both Python 2 and 3 (iteritems is Python 2 only).
    for key, value in abbr.items():
        x = re.sub(r"\b(%s)\b" % key, value, x)
    x = x.strip()

    num_pattern = r"^(\d+\D?)\b"
    n = re.findall(num_pattern, x)
    # Loose pre-check kept from the original: it is true when "ste" starts a
    # word anywhere in the text, or when "unit..." runs to the end of it.
    s = re.search(r"\b(ste\W?\D?\W?\d*\W?\D?)|(unit\W?\D?\W?\d*\W?\D?)\Z", x)
    # Position where the suite part starts: the first ste/bldg/unit that
    # begins a word and is not followed by another letter, so "ste 5" and
    # "ste100" qualify while "stevens" or "community" do not.  The original
    # pattern r"\b(ste)|(bldg)|(unit)\b" applied each \b to one alternative
    # only, and could fail to match at all even though the pre-check passed.
    marker = re.search(r"\b(?:ste|bldg|unit)(?![a-z])", x)

    if n and s is not None and marker is not None:
        # Split at the match position.  The original used
        # x.find(i.group()), which returns the first occurrence of the
        # substring (e.g. the "ste" inside "western").
        cut = marker.start()
        address["num"] = n[0]
        address["street"] = re.sub(num_pattern, "", x[:cut])
        address["suite"] += x[cut:]
    elif n:
        address["num"] = n[0]
        address["street"] = re.sub(num_pattern, "", x)
    else:
        # No leading number: keep the whole (possibly empty) text as the
        # street.  The original stored x[0], i.e. only the first character,
        # and raised IndexError when x was empty.
        address = {"num": "", "street": x, "suite": ""}

    address = lib.strip_address_letters(address)

    # Remove extra white spaces; values are replaced in place so the mapping
    # returned by the helper keeps its type and key order.
    for key, value in list(address.items()):
        address[key] = re.sub(r"\s+", " ", value).strip()

    return pd.Series(address)