def rm_addresses(doc):
    """Redact address-like tokens from *doc*.

    Each line is parsed with usaddress; tokens whose label is in KEEP are
    preserved, every other token is replaced with a run of '█' of equal length.

    Returns:
        tuple: (redacted document string, list of removed tokens).
    """
    # Labels treated as non-sensitive and kept verbatim.
    KEEP = {'BuildingName', 'Recipient', 'OccupancyType',
            'OccupancyIdentifier', 'LandmarkName'}
    doc_l = []
    rm_st = []
    for sent in doc.split("\n"):
        sent_l = []
        # FIX: the original called usaddress.parse(sent) twice, discarding the
        # first result; the redundant call is removed.
        for token, label in usaddress.parse(sent):
            if label in KEEP:
                sent_l.append(token)
            else:
                # Same-length block characters keep the text layout intact.
                sent_l.append("█" * len(token))
                rm_st.append(token)
        deto = MosesDetokenizer()
        doc_l.append(deto.detokenize(sent_l, return_str=True))
    return "\n".join(doc_l), rm_st
def parse_page(self, response):
    """Scrapy callback: yield ChainItems for the locations under div#tabs-3.

    Two passes: first the container div's own text (one item; the first line
    containing '(' is treated as the phone number), then each child div
    (named stores with a phoneNumberBox element).
    """
    print("========= Checking.......")
    store_list = response.xpath('//div[@id="tabs-3"]/div')
    try:
        item = ChainItem()
        store = response.xpath('//div[@id="tabs-3"]')
        address_temp = self.eliminate_space(store.xpath('./text()').extract())
        address = ''
        for temp in address_temp:
            if '(' not in temp:
                address += temp + ', '
            else:
                # first line containing '(' is assumed to be the phone number
                item['phone_number'] = temp
                break
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                # all other labels are folded into the street address
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        # NOTE(review): 'phone_number' is only assigned when a '(' line was
        # found; otherwise this lookup raises KeyError, silently swallowed by
        # the bare except below — confirm that is intentional.
        if item['phone_number'] != '':
            yield item
    except:
        pass
    for store in store_list:
        try:
            item = ChainItem()
            item['store_name'] = self.validate(store.xpath('.//strong/text()').extract_first())
            address_temp = self.eliminate_space(store.xpath('./text()').extract())
            address = ''
            for temp in address_temp:
                address += temp + ', '
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(address)
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',','') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',','')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',','')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            item['phone_number'] = self.validate(store.xpath('.//div[@class="phoneNumberBox"]/text()').extract_first())
            if item['store_name'] != '':
                yield item
        except:
            pdb.set_trace()
def body(self, response):
    """Scrapy callback: yield one ChainItem per store card in the list tabs."""
    print("========= Checking.......")
    store_list = response.xpath('//div[@class="storelist-inner-tab"]')
    for store in store_list:
        try:
            item = ChainItem()
            item['store_name'] = self.validate(store.xpath('.//h4/text()').extract_first())
            detail = self.eliminate_space(store.xpath('.//p//text()').extract())
            address = ''
            for de in detail:
                # lines before the 'Phone:' line form the address
                if 'phone' in de.lower():
                    item['phone_number'] = de.split(':')[1].strip()
                    break
                address += de + ', '
            item['address'] = ''
            item['city'] = ''
            # [:-2] drops the trailing ', ' separator
            addr = usaddress.parse(address[:-2])
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',','') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',','')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',','')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            h_temp = ''
            hour_list = self.eliminate_space(store.xpath('.//div[@id="StoreServicesContainer"]//span//text()').extract())
            for hour in hour_list:
                h_temp += hour + ', '
            item['store_hours'] = h_temp[:-2]
            yield item
        except:
            pass
def geo_parser(location, gmaps_json):
    """Extract name/lat/lng/city/state from a Google Maps geocoding response.

    Args:
        location: the raw query string (carried through as 'Raw_Name').
        gmaps_json: decoded JSON response from the Geocoding API.

    Returns:
        A one-row pandas DataFrame, or an empty DataFrame when the response
        contains no results.
    """
    try:
        results = gmaps_json["results"][0]
        std_name = results['name']
        print(std_name)
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
        std_address = results['formatted_address']
        # Prefer usaddress.tag; fall back to token-level parse if it fails.
        try:
            parsed_address = usaddress.tag(std_address)
            city = parsed_address[0]['PlaceName']
            state = parsed_address[0]['StateName']
        except:
            parsed_address = usaddress.parse(std_address)
            # traverse parsed address list if the tagger fails
            city = ''
            state = ''
            for addr_tup in parsed_address:
                print(addr_tup)
                if addr_tup[1] == 'PlaceName':
                    city += ' ' + addr_tup[0]
                if addr_tup[1] == 'StateName':
                    state += ' ' + addr_tup[0]
            city = city.strip()
        print(city)
        df = pd.DataFrame([[location, std_name, lat, lng, city, state]],
                          columns=['Raw_Name', 'Name', 'Latitude', 'Longitude',
                                   'City', 'State'])
        return df
    except IndexError:
        # No results in the response.
        print(gmaps_json)
        # FIX: the original assigned an empty DataFrame and then executed a
        # bare `return`, yielding None; return the empty DataFrame as the
        # dead assignment clearly intended.
        return pd.DataFrame()
def parse_page(self, response):
    """Build a ChainItem from the single location shown on this detail page."""
    detail = response.xpath('//div[contains(@class, "location-content")]')
    item = ChainItem()
    item['store_name'] = self.validate(detail.xpath('.//h3[@class="white"]/text()'))
    raw_address = self.validate(detail.xpath('.//pre[@class="white"]/text()')).replace('.', ' ')
    item['address'] = ''
    item['city'] = ''
    # Labels that make up the street portion of the parsed address.
    street_labels = ('AddressNumber', 'StreetName', 'StreetNamePostType',
                     'OccupancyType', 'OccupancyIdentifier')
    for value, label in usaddress.parse(raw_address):
        if label in street_labels:
            item['address'] += value + ' '
        elif label == 'PlaceName':
            item['city'] += value + ' '
        elif label == 'StateName':
            item['state'] = value
        elif label == 'ZipCode':
            item['zip_code'] = value
    item['country'] = 'United States'
    item['phone_number'] = self.validate(detail.xpath('.//p[@class="white"]/text()'))
    item['store_hours'] = self.validate(detail.xpath('.//div[contains(@class, "hours")]//pre/text()'))
    yield item
def body(self, response):
    """Scrapy callback: follow each store's detail link, or parse inline.

    Stores without a link are parsed directly from the card; if no city was
    recognized the raw text is kept as the address with country cleared.
    """
    store_list = response.xpath('//div[contains(@class,"store-info")]')
    for store in store_list:
        store_url = store.xpath('.//a/@href').extract_first()
        if store_url:
            yield scrapy.Request(url=store_url, callback=self.parse_page)
        else :
            item = ChainItem()
            item['store_name'] = self.validate(store.xpath('.//div[@class="box"]//h3//text()'))
            address = self.validate(store.xpath('.//div[@class="box"]//p/text()'))
            addr = usaddress.parse(address)
            item['address'] = ''
            item['city'] = ''
            for temp in addr:
                # only street-level labels are accumulated into 'address'
                if temp[1] == 'AddressNumber':
                    item['address'] += temp[0] + ' '
                elif temp[1] == 'StreetName':
                    item['address'] += temp[0] + ' '
                elif temp[1] == 'StreetNamePostType':
                    item['address'] += temp[0] + ' '
                elif temp[1] == 'OccupancyType':
                    item['address'] += temp[0] + ' '
                elif temp[1] == 'OccupancyIdentifier':
                    item['address'] += temp[0] + ' '
                elif temp[1] == 'PlaceName':
                    item['city'] += temp[0] + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0]
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0]
            item['country'] = 'United States'
            # fallback: keep the raw paragraph text when parsing found no city
            if item['city'] == '' :
                item['address'] = self.validate(store.xpath('.//div[@class="box"]//p/text()'))
                item['country'] = ''
            yield item
def getAddress(self, text):
    """Parse *text* for an address.

    Parameters:
        text (str): text to parse.

    Returns:
        str: space-joined non-Recipient tokens with periods removed, or ''
        when usaddress finds nothing.
    """
    # NOTE(review): `doc` is never used — a leftover from an abandoned
    # spaCy-entity filtering experiment (zipcodes like 60601 were not tagged
    # as entities). The call is kept in case self.nlp has side effects;
    # confirm and remove if it does not.
    doc = self.nlp(text)
    parsed = usaddress.parse(text)
    # filter out recipient tokens
    parsed = [x for x in parsed if x[1] != 'Recipient']
    # str.join replaces the original quadratic `+=` loop; the unused label
    # variable is dropped.
    addr = ' '.join(x[0] for x in parsed)
    if len(addr) > 0:
        addr = addr.replace('.', '').strip()
    return addr
def body(self, response):
    """Scrapy callback: parse store rows, de-duplicating on address+phone."""
    store_list = response.xpath('//div[contains(@class, "altrow")]')
    for store in store_list:
        item = ChainItem()
        detail = self.eliminate_space(store.xpath('.//text()').extract())
        item['store_name'] = self.validate(detail[0])
        address = ''
        item['phone_number'] = ''
        for de in detail:
            # NOTE: case-sensitive match — 'Phone' lines would not trigger it
            if 'phone' in de:
                item['phone_number'] = self.validate(de.split(':')[1])
                break
            else:
                address += de + ', '
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',', '') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',', '')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',', '')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        # emit each unique (address, phone) pair only once per crawl
        if item['address'] + item['phone_number'] not in self.history:
            self.history.append(item['address'] + item['phone_number'])
            yield item
def parse_page(self, response):
    """Scrapy callback: parse one location overlay into a ChainItem.

    Lines with a '-' and no ':' are taken as the phone number; lines with a
    ':' are collected as store hours.
    """
    try:
        item = ChainItem()
        detail = self.eliminate_space(
            response.xpath(
                '//div[@class="location-overlay"]//text()').extract())
        address = ''
        item['phone_number'] = ''
        for de in detail:
            # heuristic: phone numbers contain '-' but no ':' (unlike hours)
            if ':' not in de and '-' in de:
                item['phone_number'] = de
                break
            address += de + ', '
        item['address'] = ''
        item['city'] = ''
        # [:-2] drops the trailing ', ' separator
        addr = usaddress.parse(address[:-2])
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',', '') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',', '')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',', '')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        h_temp = ''
        for de in detail:
            if ':' in de:
                h_temp += de + ', '
        item['store_hours'] = h_temp[:-2]
        yield item
    except:
        pdb.set_trace()
def body(self, response):
    """Scrapy callback: queue detail-page requests with pre-parsed meta.

    Address fields are parsed here and passed via request.meta; pagination
    follows the last link in the pager back into this method.
    """
    print("========= Checking.......")
    store_list_one = response.xpath('//div[contains(@class, "plb")]')
    store_list_two = response.xpath('//div[contains(@class, "plw")]')
    store_list = store_list_one + store_list_two
    for store in store_list:
        # self.domain[:-1] drops a trailing slash before appending the href
        url = self.domain[:-1] + store.xpath(
            './/b//a/@href').extract_first()
        request = scrapy.Request(url=url, callback=self.parse_page)
        detail = self.eliminate_space(store.xpath('./text()').extract())
        request.meta['store_name'] = store.xpath(
            './/b//a/text()').extract_first()
        # NOTE(review): assumes detail has at least 3 entries — fragile
        address = detail[1] + detail[2]
        request.meta['address'] = ''
        request.meta['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                request.meta['city'] += temp[0].replace(',', '') + ' '
            elif temp[1] == 'StateName':
                request.meta['state'] = temp[0]
            elif temp[1] == 'ZipCode':
                request.meta['zip_code'] = temp[0].replace(',', '')
            else:
                request.meta['address'] += temp[0].replace(',', '') + ' '
        yield request
    pagenation = response.xpath('//div[contains(@class, "panelpn")]//a')
    # the last anchor in the pager is the "next page" link
    pagenation = pagenation[len(pagenation) - 1].xpath('./@href').extract_first()
    if pagenation is not None:
        pagenation = self.domain + pagenation
        yield scrapy.Request(url=pagenation, callback=self.body)
def parse_page(self, response):
    """Scrapy callback: parse a Kinney Drugs location page by fixed indices.

    NOTE(review): relies on exact positions in the extracted text list
    (detail[2..11]); any page-layout change silently drops the item via the
    bare except.
    """
    try:
        detail = self.eliminate_space(response.xpath('//div[@id="contentLocator"]//text()').extract())
        item = ChainItem()
        item['store_name'] = 'Kinney Drugs'
        address = detail[2] + ', ' + detail[3]
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        item['phone_number'] = detail[5]
        item['store_hours'] = detail[7] + ', ' + detail[8] + ', '
        item['store_hours'] += detail[9] + ' ' + detail[10] + ', ' + detail[11]
        yield item
    except:
        pass
def body(self, response):
    """Scrapy callback: pair each table with its h3 heading as the store name.

    De-duplicates on address+phone via self.history.
    """
    print("========= Checking.......")
    store_list = response.xpath('//div[@class="indent1"]//table')
    name_list = self.eliminate_space(response.xpath('//div[@class="indent1"]//h3//text()').extract())
    for ind in range(0, len(store_list)):
        try:
            item = ChainItem()
            detail = self.eliminate_space(store_list[ind].xpath('.//text()').extract())
            # tables and headings are assumed to be in the same order
            item['store_name'] = name_list[ind]
            address = detail[3]
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(address)
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',','') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',','')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',','')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            item['phone_number'] = detail[1]
            if item['address']+item['phone_number'] not in self.history:
                self.history.append(item['address']+item['phone_number'])
                yield item
        except:
            pass
def parse_store(self, response):
    """Scrapy callback: build a ChainItem from a store detail page.

    FIX: the original (a) started with an unconditional pdb.set_trace()
    debugger break, (b) passed the raw extract() *list* to usaddress.parse,
    and (c) read address fields from an undefined `store_info` variable —
    a guaranteed NameError that sent every call into the except branch.
    The parsed usaddress labels are now mapped the same way as the other
    spiders in this file.
    """
    item = ChainItem()
    try:
        item['store_number'] = ''
        item['store_name'] = response.xpath(
            './/h2/strong/text()').extract_first()
        # extract() returns a list of text nodes; join before parsing.
        address = ' '.join(response.xpath(
            ".//div[@id='pageContent']/p[2]/text()").extract())
        item['address'] = ''
        item['address2'] = ''
        item['city'] = ''
        item['state'] = ''
        item['zip_code'] = ''
        for value, label in usaddress.parse(address):
            if label == 'PlaceName':
                item['city'] += value.replace(',', '') + ' '
            elif label == 'StateName':
                item['state'] = value.replace(',', '')
            elif label == 'ZipCode':
                item['zip_code'] = value.replace(',', '')
            else:
                item['address'] += value.replace(',', '') + ' '
        item['country'] = 'United States'
        item['phone_number'] = response.xpath(
            './/a[@class="bold mb2 db"]/text()').extract_first()
        item['latitude'] = ''
        item['longitude'] = ''
        item['store_hours'] = ''
        hours = response.xpath(
            './/div[@class="ph1 hours-wrap"]/div[@class="mb2"]/p')
        for hour in hours:
            item['store_hours'] += self.validate("".join(
                hour.xpath('.//text()').extract())) + "; "
        item['other_fields'] = ""
        item['coming_soon'] = "0"
        yield item
    except:
        pdb.set_trace()
def parse_page(self, response):
    """Scrapy callback: parse one location-data page into a ChainItem.

    Hours are limited to lines containing 'a.m.'.
    """
    item = ChainItem()
    item['store_name'] = self.validate(
        response.xpath(
            '//div[contains(@class, "location-data")]//h2/text()').
        extract_first())
    address_temp = self.eliminate_space(
        response.xpath('//div[@class="address"]//p//text()').extract())
    address = ''
    for temp in address_temp:
        address += temp + ', '
    item['address'] = ''
    item['city'] = ''
    addr = usaddress.parse(address)
    for temp in addr:
        if temp[1] == 'PlaceName':
            item['city'] += temp[0].replace(',', '') + ' '
        elif temp[1] == 'StateName':
            item['state'] = temp[0]
        elif temp[1] == 'ZipCode':
            item['zip_code'] = temp[0].replace(',', '')
        else:
            item['address'] += temp[0].replace(',', '') + ' '
    item['country'] = 'United States'
    item['phone_number'] = self.validate(
        response.xpath(
            '//span[@class="address-phone"]/text()').extract_first())
    h_temp = ''
    hour_list = self.eliminate_space(
        response.xpath('//div[@class="hours"]//p//text()').extract())
    for hour in hour_list:
        # only keep lines that look like opening hours
        if 'a.m.' in hour:
            h_temp += hour + ' , '
    item['store_hours'] = self.validate(h_temp[:-2])
    yield item
def body(self, response):
    """Scrapy callback: parse caption tables — name, address lines, phone.

    Layout assumption: detail[0:2] is the name, detail[2:-1] the address,
    detail[-1] the phone number.
    """
    print("========= Checking.......")
    store_list = response.xpath('//table[@class="tr-caption-container"]')
    for store in store_list:
        try:
            item = ChainItem()
            detail = self.eliminate_space(
                store.xpath(
                    './/td[@class="tr-caption"]//text()').extract())
            item['store_name'] = detail[0] + ' ' + detail[1]
            address = ''
            for de in detail[2:-1]:
                address += de + ', '
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(address)
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',', '') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',', '')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',', '')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            item['phone_number'] = detail[-1]
            yield item
        except:
            pass
def preprocess(self, text):
    """Turn *text* into either a parsed-address dict or candidate street words.

    usaddress parses well when the text contains a house number; otherwise we
    fall back to NER LOCATION tags plus POS-tagged nouns as possible cross
    streets.

    Returns:
        (dict, "complete") or (list, "cross streets").
    """
    noun_tags = ('NN', 'NNP', 'NNPS', 'NNS')
    if any(tok.isdigit() for tok in text.split(" ")):
        addr_dict = {}
        for value, key in usaddress.parse(text):
            if key == 'Recipient':
                continue  # recipients carry no location information
            if key in addr_dict:
                addr_dict[key] += " " + value
            else:
                addr_dict[key] = value
        return addr_dict, "complete"
    possible_streets = []
    for word, tag in tagger.tag(text.split()):
        if tag == 'LOCATION':
            possible_streets.append(word)
    for token, pos in nltk.pos_tag(nltk.word_tokenize(text)):
        if pos in noun_tags:
            possible_streets.append(token)
    return possible_streets, "cross streets"
def body(self, response):
    """Scrapy callback: yield one ChainItem per fusion-row store card."""
    print("========= Checking.......")
    rows = response.xpath('//li[@class="fusion-builder-row fusion-row"]')
    for row in rows:
        try:
            item = ChainItem()
            item['store_name'] = self.validate(row.xpath('.//h4/text()').extract_first())
            raw_address = self.validate(row.xpath('.//address/text()').extract_first())
            item['city'] = ''
            item['address'] = ''
            # Map each parsed token onto its output field.
            for value, label in usaddress.parse(raw_address):
                if label == 'PlaceName':
                    item['city'] += value.replace(',', '') + ' '
                elif label == 'StateName':
                    item['state'] = value
                elif label == 'ZipCode':
                    item['zip_code'] = value
                else:
                    item['address'] += value + ' '
            item['country'] = 'United States'
            item['phone_number'] = self.validate(row.xpath('.//a[2]/text()').extract_first())
            yield item
        except:
            pass
def test_simple_addresses(self):
    """Yield an equality check for every simple-pattern training example."""
    test_file = 'training/test_data/simple_address_patterns.xml'
    for address_text, components in parseTrainingData(test_file):
        labels_true = tuple(label for _, label in components)
        labels_pred = tuple(label for _, label in parse(address_text))
        yield equals, address_text, labels_pred, labels_true
def body(self, response):
    """Selenium-backed callback: parse the racebros contact page.

    NOTE(review): relies on fixed positions — store_list[5] for the shared
    hours blurb, store_list[1:4] for the three stores, and detail[0..3] per
    store — any layout change breaks silently via the bare except.
    """
    self.driver.get("https://www.racebros.com/contact")
    source = self.driver.page_source.encode("utf8")
    tree = etree.HTML(source)
    store_list = tree.xpath(
        '//div[@class="c2inlineContent"]//div[@class="txtNew"]')
    # shared store-hours text, extracted from the sixth block
    h_temp = self.eliminate_space(store_list[5].xpath(
        './/text()'))[0].split('stores')[1].strip().replace('or', ',')
    for store in store_list[1:4]:
        try:
            item = ChainItem()
            detail = self.eliminate_space(store.xpath('.//text()'))
            item['store_name'] = detail[0]
            address = detail[1] + ',' + detail[2]
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(address)
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',', '') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',', '')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',', '')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            item['phone_number'] = detail[3]
            item['store_hours'] = h_temp
            yield item
        except:
            pass
def body(self, response):
    """Scrapy callback: parse store table rows, de-duplicating on address+phone."""
    store_list = response.xpath('//table//tr')
    for store in store_list:
        try:
            item = ChainItem()
            address_temp = self.eliminate_space(
                store.xpath('.//td[2]//text()').extract())
            # second cell: name followed by two address lines
            item['store_name'] = address_temp[0]
            address = address_temp[1] + ', ' + address_temp[2]
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(address)
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',', '') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',', '')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',', '')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            # first cell holds the phone number in its second text node
            p_temp = self.eliminate_space(
                store.xpath('.//td[1]//text()').extract())
            item['phone_number'] = p_temp[1]
            if item['address'] + item['phone_number'] not in self.history:
                self.history.append(item['address'] + item['phone_number'])
                yield item
        except:
            pass
def parse_page(self, response):
    """Scrapy callback: parse a store-locations page (name '#' number in h1)."""
    try:
        item = ChainItem()
        item['store_name'] = self.validate(response.xpath('//div[@class="store-locations"]//h1/text()').extract_first()).split('#')[0].strip()
        item['store_number'] = self.validate(response.xpath('//div[@class="store-locations"]//h1/text()').extract_first()).split('#')[1].strip()
        address = self.validate(response.xpath('//div[@class="store-locations"]//div[@class="address"]//p/text()').extract_first())
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        detail = self.eliminate_space(response.xpath('//div[@class="store-locations"]//div[@class="store-info"]//text()').extract())
        # phone number is the text node following the 'Phone' label
        for ind in range(0, len(detail)):
            if 'Phone' in detail[ind]:
                item['phone_number'] = self.validate(detail[ind+1])
                break
        h_temp = ''
        for ind in range(0, len(detail)):
            if 'day' in detail[ind].lower():
                h_temp += self.validate(detail[ind]) + ' ' + self.validate(detail[ind+1]) + ', '
                # NOTE(review): reassigning the loop variable has NO effect on
                # a range() for-loop — the next iteration still processes
                # ind+1; presumably this meant to skip it. Confirm intent.
                ind += 2
        item['store_hours'] = h_temp[:-2]
        yield item
    except:
        pass
def getusaddress(addr):
    """Split *addr* into its components via usaddress.parse.

    Returns:
        tuple: (street, city, state, zipcode) strings, commas stripped.
    """
    # Label fragments whose tokens belong to the street portion; matched by
    # substring, exactly like the original `str.find() != -1` chain.
    STREET_PARTS = ('Address', 'Street', 'Occupancy', 'Recipient',
                    'BuildingName', 'USPSBoxType', 'USPSBoxID')
    street = ""
    city = ""
    state = ""
    pcode = ""
    # Idiomatic for-loop replaces the original manual while/index walk.
    for value, label in usaddress.parse(addr):
        if any(part in label for part in STREET_PARTS):
            street = street + " " + value
        if "PlaceName" in label:
            city = city + " " + value
        if "StateName" in label:
            state = state + " " + value
        if "ZipCode" in label:
            pcode = pcode + " " + value
    street = street.lstrip().replace(',', '')
    city = city.lstrip().replace(',', '')
    state = state.lstrip().replace(',', '')
    pcode = pcode.lstrip().replace(',', '')
    return (street, city, state, pcode)
def parse_page(self, response):
    """Scrapy callback: parse a retailer-details page.

    Hours pairs (day, time) are joined two at a time; fewer than 10 hour
    fragments is treated as "no hours available".
    """
    try:
        item = ChainItem()
        item['store_name'] = self.validate(response.xpath('//div[@class="retailer-details"]//h4//text()').extract()[0])
        # first paragraph text node is the address, second the phone
        address = self.eliminate_space(response.xpath('//div[@class="retailer-details"]/p//text()').extract())[0]
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        item['phone_number'] = self.eliminate_space(response.xpath('//div[@class="retailer-details"]/p//text()').extract())[1]
        h_temp = ''
        hour_list = self.eliminate_space(response.xpath('//ul[@class="hours-results"]//text()').extract())
        cnt = 1
        for hour in hour_list:
            h_temp += hour
            # alternate ' ' within a (day, time) pair and ', ' between pairs
            if cnt % 2 == 0:
                h_temp += ', '
            else:
                h_temp += ' '
            cnt += 1
        item['store_hours'] = h_temp[:-2]
        if len(hour_list) < 10:
            item['store_hours'] = ''
        yield item
    except:
        pdb.set_trace()
def isValidAddress(ady, verbose=False):
    """Heuristically decide whether *ady* parses as a usable street address.

    Rejects parses with fewer than 4 tokens, any Recipient token, a missing
    StreetName (unless BuildingName + PlaceName + StateName are all present,
    e.g. landmarks), or a missing AddressNumber.
    """
    address = usaddress.parse(ady)
    labels = [label for _, label in address]
    if len(address) < 4:
        showFailureReason('Not Enough Terms', ady, address, verbose)
        return False
    if 'Recipient' in labels:
        showFailureReason('Recipient', ady, address, verbose)
        return False
    if 'StreetName' not in labels:
        grouped = {}
        for value, key in address:
            grouped[key] = '%s %s' % (grouped.get(key, ''), value)
        if 'BuildingName' in grouped and 'PlaceName' in grouped and \
                'StateName' in grouped:
            return True
        showFailureReason('StreetName', ady, address, verbose)
        return False
    if 'AddressNumber' not in labels:
        showFailureReason('AddressNumber', ady, address, verbose)
        return False
    return True
def parse(self, response):
    """Scrapy callback: parse main-content stores; phone arrives via meta."""
    phone = response.meta.get('phone')
    store_list = response.xpath('//section[@id="mainContent"]')
    for store in store_list:
        item = ChainItem()
        item['store_name'] = self.validate(store.xpath('.//h2//text()').extract_first())
        addr_list= store.xpath('.//address//text()').extract()
        address = ''
        # last two address nodes are skipped (non-address trailing text)
        for addr in addr_list[:-2]:
            address += addr + ' '
        addr = usaddress.parse(self.validate(address))
        item['address'] = ''
        item['city'] = ''
        item['state'] = ''
        item['zip_code'] = ''
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = "United States"
        item['phone_number'] = phone
        yield item
def body(self, response):
    """Scrapy callback: extract store records from JSON embedded in the page.

    The JSON array sits between the 'initial_locations:' and 'min_zoom:'
    markers in the page script; the trailing comma is dropped with [:-1].
    """
    raw = response.body.split('initial_locations:')[1].split('min_zoom:')[0].strip()[:-1]
    for store in json.loads(raw):
        item = ChainItem()
        item['store_number'] = store['location_id']
        item['store_name'] = store['title']
        item['latitude'] = store['latitude']
        item['longitude'] = store['longitude']
        item['phone_number'] = store['phone']
        address = ''
        city = ''
        state = ''
        zip_code = ''
        for value, label in usaddress.parse(store['address']):
            cleaned = value.replace(',', '')
            if label == 'PlaceName':
                city += cleaned + ' '
            elif label == 'StateName':
                state = cleaned
            elif label == 'ZipCode':
                zip_code = cleaned
            else:
                address += cleaned + ' '
        item['address'] = address
        item['city'] = city
        item['state'] = state
        item['zip_code'] = zip_code
        item['country'] = store['country']
        yield item
def predict(self, X): reload(usaddress ) # tagger object is defined at the module level, update now predictions = [] for address in X: predictions.append([foo[1] for foo in usaddress.parse(address)]) return predictions
def parse_page(self, response):
    """Scrapy callback: parse a CVS/MinuteClinic location page.

    Address lines are space-joined; descriptive lines ('located at', 'corner
    of', …) are trimmed to their first comma-segment before joining.
    """
    try:
        item = ChainItem()
        addr_list = self.eliminate_space(response.xpath('//div[@class="dBox01"][1]//text()').extract())[1:]
        address = ''
        for addr in addr_list:
            # trim free-text descriptions down to the leading address segment
            if 'locate' in addr.lower() or 'corner' in addr.lower() or 'shop' in addr.lower() or 'null' in addr.lower() or 'intersection' in addr.lower() or 'market' in addr.lower():
                addr = self.validate(addr.split(',')[0])
            address += addr + ' '
        item['address'] = ''
        item['city'] = ''
        item['state'] = ''
        # FIX: tokens are joined with a single-space separator, but the
        # original sliced address[:-2], chopping the last character of the
        # final token (the sibling spiders use ', ' + [:-2]); [:-1] removes
        # exactly the trailing space.
        addr = usaddress.parse(address[:-1])
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',','') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',','')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',','')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = self.check_country(item['state'])
        h_temp = 'MinuteClinic hours: '
        hour_list = self.eliminate_space(response.xpath('//div[@class="dMarginBot2"][1]/text()').extract())
        cnt = 1
        for hour in hour_list:
            h_temp += hour + ', '
            # after the third line the remaining entries are lunch hours
            if cnt == 3:
                h_temp += 'Lunch hours: '
            cnt += 1
        item['store_hours'] = h_temp[:-2]
        yield item
    except:
        pass
def parse_page(self, response):
    """Scrapy callback: parse the first location block's address element.

    The phone number comes from the third text node of the *second* location
    block.
    """
    try:
        item = ChainItem()
        detail = self.eliminate_space(
            response.xpath(
                '//div[@class="location"][1]//address//text()').extract())
        address = ''
        # detail[0] (the name/heading line) is skipped
        for de in detail[1:]:
            address += de + ', '
        item['address'] = ''
        item['city'] = ''
        addr = usaddress.parse(address)
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',', '') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',', '')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',', '')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        item['phone_number'] = self.eliminate_space(
            response.xpath(
                '//div[@class="location"][2]//text()').extract())[2]
        yield item
    except:
        pass
def preprocess(self,text):
    """Parse *text* into an address dict, or fall back to street candidates.

    When any whitespace token is a digit, usaddress is trusted to parse the
    full address; otherwise NER LOCATION words and POS-tagged nouns are
    returned as possible cross-street names.

    Returns:
        (dict, "complete") or (list, "cross streets").
    """
    noun_tags = {'NN', 'NNP', 'NNPS', 'NNS'}
    has_number = any(chunk.isdigit() for chunk in text.split(" "))
    if not has_number:
        candidates = []
        for word, ner_tag in tagger.tag(text.split()):
            if ner_tag == 'LOCATION':
                candidates.append(word)
        for token, pos_tag in nltk.pos_tag(nltk.word_tokenize(text)):
            if pos_tag in noun_tags:
                candidates.append(token)
        return candidates, "cross streets"
    components = {}
    for value, key in usaddress.parse(text):
        if key == 'Recipient':
            continue  # drop recipient tokens — not location data
        if key in components:
            components[key] = components[key] + " " + value
        else:
            components[key] = value
    return components, "complete"
def body(self, response):
    """Scrapy callback: parse the CMS store table, keeping only Nob Hill Foods.

    Rows [6:-1] hold the stores; detail[2] packs the phone and hours around
    a 'Grocery Hours:' separator.
    """
    print("========= Checking.......")
    store_list = response.xpath('//div[@id="cms_content_frame"]//tr')
    for store in store_list[6:-1]:
        detail = self.eliminate_space(store.xpath('.//text()').extract())
        try:
            item = ChainItem()
            item['store_name'] = detail[0]
            item['address'] = ''
            item['city'] = ''
            addr = usaddress.parse(detail[1])
            for temp in addr:
                if temp[1] == 'PlaceName':
                    item['city'] += temp[0].replace(',', '') + ' '
                elif temp[1] == 'StateName':
                    item['state'] = temp[0].replace(',', '')
                elif temp[1] == 'ZipCode':
                    item['zip_code'] = temp[0].replace(',', '')
                else:
                    item['address'] += temp[0].replace(',', '') + ' '
            item['country'] = 'United States'
            # [1:-1] strips the surrounding parentheses/brackets
            item['phone_number'] = self.validate(
                detail[2].split('Grocery Hours:')[0].strip()[1:-1])
            item['store_hours'] = self.validate(
                detail[2].split('Grocery Hours:')[1].strip())
            if item['store_name'] == 'Nob Hill Foods':
                yield item
        except:
            pass
def parse_page(self, response):
    """Scrapy callback: parse a store-list-detail page into a ChainItem."""
    item = ChainItem()
    store = response.xpath('//div[@class="store-list-detail"]')
    item['store_name'] = self.validate(response.xpath('.//div[@class="store-details-section"]//h1/text()').extract_first())
    address_temp = self.eliminate_space(store.xpath('.//div[@class="sl-address"]//text()').extract())
    address = ''
    for temp in address_temp:
        address += temp +', '
    item['address'] = ''
    item['city'] = ''
    addr = usaddress.parse(address)
    for temp in addr:
        if temp[1] == 'PlaceName':
            item['city'] += temp[0].replace(',','') + ' '
        elif temp[1] == 'StateName':
            item['state'] = temp[0]
        elif temp[1] == 'ZipCode':
            item['zip_code'] = temp[0].replace(',','')
        else:
            item['address'] += temp[0].replace(',', '') + ' '
    item['country'] = 'United States'
    item['phone_number'] = self.validate(store.xpath('.//a[@itemprop="telephone"]/text()').extract_first())
    h_temp = ''
    hour_list = store.xpath('.//div[contains(@class, "sl-store-hours")]//div[@class="sl-hours"]')
    for hour in hour_list:
        # each hours block contributes its div texts space-joined,
        # blocks separated by ', '
        hour = self.eliminate_space(hour.xpath('.//div/text()').extract())
        for h in hour:
            h_temp += h +' '
        h_temp += ', '
    item['store_hours'] = h_temp[:-2]
    yield item
def parse_page(self, response):
    """Scrapy callback: parse a table-layout store page.

    Table cells up to the 'Phone' label form the address; cells containing a
    ':' are collected as store hours.
    """
    try:
        item = ChainItem()
        detail = self.eliminate_space(
            response.xpath('//table//text()').extract())
        item['store_name'] = self.validate(
            response.xpath('//h1/text()').extract_first())
        address = ''
        for cnt in range(1, len(detail)):
            if 'phone' in detail[cnt].lower():
                # NOTE(review): detail[cnt + 1] raises IndexError when the
                # phone label is the last cell — swallowed by the bare except.
                item['phone_number'] = detail[cnt + 1]
                break
            address += detail[cnt] + ', '
        item['address'] = ''
        item['city'] = ''
        # [:-2] drops the trailing ', ' separator
        addr = usaddress.parse(address[:-2])
        for temp in addr:
            if temp[1] == 'PlaceName':
                item['city'] += temp[0].replace(',', '') + ' '
            elif temp[1] == 'StateName':
                item['state'] = temp[0].replace(',', '')
            elif temp[1] == 'ZipCode':
                item['zip_code'] = temp[0].replace(',', '')
            else:
                item['address'] += temp[0].replace(',', '') + ' '
        item['country'] = 'United States'
        h_temp = ''
        for de in detail:
            if ':' in de:
                h_temp += de + ', '
        item['store_hours'] = h_temp[:-2]
        yield item
    except:
        pass
def test_us_ia_linn(self) :
    """Yield an equality check per Linn County (IA) OpenAddresses example."""
    test_file = 'training_data/openaddress_us_ia_linn.xml'
    for address_text, components in parseTrainingData(test_file):
        labels_true = tuple(label for _, label in components)
        labels_pred = tuple(label for _, label in parse(address_text))
        yield equals, address_text, labels_pred, labels_true
def anonymize(self, address):
    """Return an anonymized form of *address*, memoized per input string."""
    if address not in self._cache:
        parts = usaddress.parse(address)
        self._cache[address] = ' '.join(self._anonymize_parsed(parts))
    return self._cache[address]
def test_Parser(self):
    """Yield an equality check per synthetic-OSM training example."""
    test_file = 'training/test_data/synthetic_osm_data_xml.xml'
    for address_text, components in parseTrainingData(test_file):
        labels_true = tuple(label for _, label in components)
        labels_pred = tuple(label for _, label in parse(address_text))
        yield equals, address_text, labels_pred, labels_true
def test_synthetic_addresses(self):
    """Yield an exact-label check per synthetic OSM address."""
    test_file = "measure_performance/test_data/synthetic_osm_data.xml"
    for address_text, components in readTrainingData([test_file], GROUP_LABEL):
        labels_true = tuple(lbl for _, lbl in components)
        labels_pred = tuple(lbl for _, lbl in parse(address_text))
        yield equals, address_text, labels_pred, labels_true
def test_Parser(self):
    """Nose-style generator: fuzzy label comparison on the tagged
    US50 test file."""
    data_path = 'training/test_data/us50_test_tagged.xml'
    for text, tagged_tokens in parseTrainingData(data_path):
        _, gold = zip(*tagged_tokens)
        _, predicted = zip(*parse(text))
        yield fuzzyEquals, text, predicted, gold
def typoCol(self, x): result = [] print usaddress.parse(x) for r in usaddress.parse(x): if numpy.random.rand(1,1) > self.typoprob: result.append(str(r[0])) elif r[1] == 'ZipCode': pass elif r[1] == 'StreetNamePostType': newr = str(r[0]) for i in self.term_alts: if newr.lower() in i: newr = random.choice(i) result.append(newr) else: result.append(str(r[0])) return ' '.join(result)
def format_address(addr):
    """Normalize *addr* into '<number> <street> <suffix> <city> <state> <zip>'.

    Tokens sharing a usaddress label are space-joined; the street suffix
    is run through format_streetname_post_type(). Raises KeyError when a
    required component is missing from the parse.
    """
    merged = {}
    for value, label in usaddress.parse(addr):
        if label in merged:
            merged[label] += " " + value
        else:
            merged[label] = value
    pieces = [
        merged["AddressNumber"],
        merged["StreetName"],
        format_streetname_post_type(merged["StreetNamePostType"]),
        merged["PlaceName"],
        merged["StateName"],
        merged["ZipCode"],
    ]
    return " ".join(pieces)
def test_us50(self):
    """Nose-style generator: fuzzy label comparison on the US50
    performance-measurement file."""
    data_path = "measure_performance/test_data/us50_test_tagged.xml"
    for text, tagged_tokens in list(readTrainingData([data_path], GROUP_LABEL)):
        _, gold = list(zip(*tagged_tokens))
        _, predicted = list(zip(*parse(text)))
        yield fuzzyEquals, text, predicted, gold
def parse_with_usaddress_parse(self, addr_str):
    """Parse *addr_str* with usaddress.parse().

    Each (token, label) pair becomes a dict with the label mapped
    through self.standard_part_mapping:
    {'code': <standard code>, 'value': <token text>}.
    """
    parts = []
    for token, label in usaddress.parse(addr_str):
        parts.append({'code': self.standard_part_mapping[label],
                      'value': token})
    return parts
def completeStreetTypeAbbreviation(streetName):
    """Expand the street-suffix abbreviation in *streetName* (e.g. 'St' -> 'Street').

    Every token except the StreetNamePostType passes through unchanged.
    The suffix is upper-cased, stripped of non-alpha characters, looked
    up in streetConversionMap, and capitalized; unknown suffixes keep
    their original text (capitalized).
    """
    rebuilt = []
    for token, label in usaddress.parse(streetName):
        if label != "StreetNamePostType":
            rebuilt.append(token)
            continue
        # Normalize to an uppercase alpha-only key for the lookup table.
        key = re.sub("[^A-Z]+", "", token.upper())
        rebuilt.append(streetConversionMap.get(key, token).capitalize())
    return " ".join(rebuilt)
def parse_address(address):
    """Split *address* into (number, pre-directional, name, type, unit).

    Tokens sharing a usaddress label are joined with single spaces;
    labels with no tokens produce ''. Returns a 5-tuple of strings.
    """
    buckets = {
        'AddressNumber': [],
        'StreetNamePreDirectional': [],
        'StreetName': [],
        'StreetNamePostType': [],
        'OccupancyIdentifier': [],
    }
    # One pass over the parse instead of one pass per component.
    for token, label in usaddress.parse(address):
        if label in buckets:
            buckets[label].append(token)
    return (' '.join(buckets['AddressNumber']),
            ' '.join(buckets['StreetNamePreDirectional']),
            ' '.join(buckets['StreetName']),
            ' '.join(buckets['StreetNamePostType']),
            ' '.join(buckets['OccupancyIdentifier']))
def usaddress_to_dict(text):
    """Tag *text* with usaddress and fold the tokens into a {label: value} dict.

    Recipient tokens are discarded; values for a repeated label are
    space-joined in parse order.
    """
    result = {}
    for value, key in usaddress.parse(text):
        if key == 'Recipient':
            continue
        if key in result:
            result[key] += " " + value
        else:
            result[key] = value
    return result
def consoleLabel(raw_addr, label_options):
    """Interactive console loop for hand-labeling address strings.

    For each string in raw_addr, displays usaddress.parse()'s predicted
    tags and asks the user to accept (y), correct (n), skip (s / enter)
    or finish (f).

    Returns (tagged_addr, addrs_left_to_tag): the set of confirmed or
    corrected token sequences, and the strings not yet tagged.
    """
    # Map raw tag name -> human-friendly display name.
    friendly_tag_dict = dict((label[1], label[0]) for label in label_options)
    valid_responses = ['y', 'n', 's', 'f', '']
    addrs_left_to_tag = []
    finished = False
    # NOTE(review): assumes raw_addr supports .copy() and .remove()
    # (e.g. a dict/set, or a Python 3 list) — confirm the caller's type.
    addrs_left_to_tag = raw_addr.copy()
    total_addrs = len(raw_addr)
    tagged_addr = set([])
    for i, addr_string in enumerate(raw_addr, 1):
        if not finished:
            print "(%s of %s)" % (i, total_addrs)
            print "-"*50
            print "ADDRESS STRING: ", addr_string
            preds = usaddress.parse(addr_string)
            user_input = None
            # Re-prompt until a recognised response arrives; a valid
            # response both acts below and terminates the loop.
            while user_input not in valid_responses :
                friendly_addr = [(token[0], friendly_tag_dict[token[1]]) for token in preds]
                print_table(friendly_addr)
                sys.stderr.write('Is this correct? (y)es / (n)o / (s)kip / (f)inish tagging\n')
                user_input = sys.stdin.readline().strip()
                if user_input =='y':
                    # Predictions accepted as-is.
                    tagged_addr.add(tuple(preds))
                    addrs_left_to_tag.remove(addr_string)
                elif user_input =='n':
                    # Let the user fix each label by hand.
                    corrected_addr = manualTagging(preds, label_options, friendly_tag_dict)
                    tagged_addr.add(tuple(corrected_addr))
                    addrs_left_to_tag.remove(addr_string)
                elif user_input in ('' or 's') :
                    # NOTE(review): ('' or 's') evaluates to 's'; this only
                    # works because '' in 's' and 's' in 's' are both True.
                    # Intended form was `in ('', 's')`.
                    print "Skipped\n"
                elif user_input == 'f':
                    finished = True
    print "Done! Yay!"
    return tagged_addr, addrs_left_to_tag
def get_streetnames(text):
    """Collect street names from *text*, appending a following street
    suffix (StreetNamePostType) when one immediately follows.

    Connector words ('and', 'or', 'near', 'between') mistagged as
    StreetName are ignored. Returns a list of street-name strings.
    """
    parsed = usaddress.parse(text)
    names = []
    for idx, (token, label) in enumerate(parsed):
        # Guard clauses replace the original nested ifs.
        if label != "StreetName":
            continue
        if token in ("and", "or", "near", "between"):
            continue
        if idx + 1 < len(parsed) and parsed[idx + 1][1] == "StreetNamePostType":
            names.append(token + " " + parsed[idx + 1][0])
        else:
            names.append(token)
    return names
def _reverse_geocode(geocoder, tag_name, lat, long):
    """Reverse-geocode (lat, long) and return its parsed PlaceName, or None.

    Uses the geopy-style *geocoder* to reverse the point, tags the
    returned address string with usaddress.parse(), and filters via
    SocialExplorer._process_usaddress_parsed() for *tag_name*.
    Quota errors and any other exception are printed and swallowed
    (implicitly returning None).
    """
    point = Point(lat, long)
    try:
        point_location = geocoder.reverse(point, exactly_one=True)
        if point_location and point_location.address:
            raw_parse = usaddress.parse(point_location.address)
            parsed = SocialExplorer._process_usaddress_parsed(raw_parse, [tag_name])
            # Missing or falsy PlaceName is normalised to None.
            return parsed.get('PlaceName') if parsed.get('PlaceName') else None
    except exc.GeocoderQuotaExceeded as e:
        print e
        return
    except Exception as e:
        # NOTE(review): broad catch — every failure is printed and ignored.
        print e
        return
def extract_name_and_address(txt):
    """Split *txt* into a recipient name plus street/city/state/zip fields.

    The module-level regex _name_from_address_splitter separates the
    name (group 1) from the address (group 2); the address is tagged
    with usaddress. Untagged tokens accumulate into 'street'. Every
    value is returned stripped of surrounding whitespace and commas.
    """
    match = _name_from_address_splitter.search(txt)
    fields = {'street': '', 'name': match.group(1)}
    # usaddress yields many small pieces; most reassemble into 'street'.
    for word, word_type in usaddress.parse(match.group(2)):
        if word_type == 'PlaceName':
            fields['city'] = word
        elif word_type == 'StateName':
            fields['state'] = word
        elif word_type == 'ZipCode':
            fields['zip_code'] = word
        else:
            fields['street'] += word + ' '
    return {key: fields[key].strip().strip(',') for key in fields}
def address_is_complete(text):
    """Classify *text* as 'complete', 'cross street', or 'no address information'.

    'complete'      -> at least one StreetName token AND a numeric
                       AddressNumber token.
    'cross street'  -> multiple street names but no house number.
    Anything else   -> 'no address information'.
    """
    streetname_exists = False
    streetnumber_exists = False
    num_streets = 0
    for token, label in usaddress.parse(text):
        if label == "StreetName":
            streetname_exists = True
            num_streets += 1
        # BUG FIX: the original tested elem[1].isdigit() — i.e. the label
        # string "AddressNumber", which is never numeric — so 'complete'
        # was unreachable. Test the token text instead.
        if label == "AddressNumber" and token.isdigit():
            streetnumber_exists = True
    if streetname_exists and streetnumber_exists:
        return "complete"
    elif num_streets > 1 and not streetnumber_exists:
        return "cross street"
    else:
        return "no address information"
def isValidAddress(ady, verbose=False):
    """Heuristic sanity check: does *ady* look like a full street address?

    Rejects strings with fewer than four parsed tokens, containing a
    Recipient token, or missing a StreetName or AddressNumber token,
    reporting the reason through showFailureReason(). Returns bool.
    """
    tokens = usaddress.parse(ady)
    labels = [t[1] for t in tokens]
    if len(tokens) < 4:
        showFailureReason('Not Enough Terms', ady, tokens, verbose)
        return False
    if 'Recipient' in labels:
        showFailureReason('Recipient', ady, tokens, verbose)
        return False
    if 'StreetName' not in labels:
        showFailureReason('StreetName', ady, tokens, verbose)
        return False
    if 'AddressNumber' not in labels:
        showFailureReason('AddressNumber', ady, tokens, verbose)
        return False
    return True
def lookup_geo(g, ady):
    """Resolve address string *ady* through the NYC Geoclient wrapper *g*.

    Tokenizes with usaddress, joins the house-number and Street* tokens,
    guesses the borough from the second-to-last token, queries
    g.address(...), and wraps the response in a schema.org
    Place/PostalAddress/GeoCoordinates dict.
    """
    components = usaddress.parse(ady)
    addressNumber = ' '.join([a[0] for a in components if a[1] == 'AddressNumber'])
    # All Street* labels: name, pre/post directionals, post type, etc.
    streetName = ' '.join([a[0] for a in components if a[1].startswith('Street')])
    streetName = streetName.replace(',', '')
    # assumes the second-to-last token is the borough (or 'NY' for
    # Manhattan) — TODO confirm against the input format.
    borough = components[-2][0].replace(',', '')
    if borough == 'NY':
        borough = 'Manhattan'
    # print '%s : %s : %s ' %(addressNumber, streetName, borough)
    dic = g.address(addressNumber, streetName, borough)
    # Geoclient response fields; .get('', ...) defaults keep missing
    # keys from raising.
    zipcode = dic.get('zipCode', '')
    streetAddress = '%s %s' % (dic.get('houseNumber', ''), dic.get('firstStreetNameNormalized', ''))
    borough = dic.get('firstBoroughName', '')
    longitude = dic.get('longitude', '')
    latitude = dic.get('latitude', '')
    return {
        "refLocation": [{
            "@type": "Place",
            "@context": "http://schema.org",
            "address": {
                "@type": "PostalAddress",
                "addressLocality": "New York City",
                "addressRegion": "NY",
                "postalCode": zipcode,
                "streetAddress": streetAddress.strip(),
                "borough": borough
            },
            "geo": {
                "@type": "GeoCoordinates",
                "latitude": latitude,
                "longitude": longitude
            }}]
    }
def parse(self, name):
    """Extract {'place': ..., 'state': ...} from a location string.

    A trailing ', USA' suffix is removed first. PlaceName tokens are
    accumulated into 'place'; the last StateName token wins. Input that
    usaddress cannot decode yields empty results.
    """
    suffix = ', USA'
    if name.endswith(suffix):
        name = name[:-len(suffix)]
    try:
        tokens = usaddress.parse(name)
    except UnicodeDecodeError:
        tokens = ()
    place_words = []
    state = ''
    for word, label in tokens:
        if label == 'PlaceName':
            place_words.append(word.strip(' ,'))
        elif label == 'StateName':
            state = word.strip(' ,')
    return {'place': ' '.join(place_words), 'state': state}
# NOTE(review): this chunk opens mid-loop — the iterparse/for/if headers
# that enclose the next three statements precede the visible source.
streets.add(tag.get('v').split(" ")[-1])
elem.clear()
streets


# In[58]:

# Trying a more sophisticated approach that uses natural language processing module usaddress
# https://github.com/datamade/usaddress
# Collect the street suffix (StreetNamePostType) of every street-name tag
# in the Los Angeles OSM extract.
streets = Set()
for event, elem in ET.iterparse('los-angeles_california.osm', events=("start",)):
    if elem.tag == "node" or elem.tag == "way":
        for tag in elem.iter("tag"):
            if is_street_name(tag):
                try:
                    streets.add(str(next((st for st, comp in usaddress.parse(tag.get('v')) if comp == "StreetNamePostType"))))
                except:
                    # No recognised suffix (next() raised) — skip the tag.
                    continue
    elem.clear()
streets


# In[4]:

# The file `C1 Street Suffix Abbreviations.html` is the html downloaded from the webpage at http://pe.usps.gov/text/pub28/28apc_002.htm
# The main table with all of the street suffix abbreviations and variations has an id of ep533076
# Its body is structured as follows, and the algorithm below uses this structure
# to make {abbrev: full_name} pairs for each abbreviation, including the standard one if
# it's not already included in the middle column
#-------------------------------------------
#| Full Name | Abbrev #1 | Standard Abbrev |
# NOTE(review): this chunk opens mid-loop — the loop binding
# `column`/`columnNum`/`columnList` starts before the visible source.
if columnNum == 1:
    # Remember the second column; it decides below whether this row is a
    # real data row (contains digits) or a header/violation marker.
    columnTest = column
columnNum+=1
numMatch = re.search(r'\d+', columnTest)
hashTagMatch = re.search(r'#', columnTest)
if hashTagMatch:
    columnList.append('Code Violations')
if numMatch:
    # Real data row: normalise blanks, linkify the address and website,
    # and append the row to the HTML table.
    columnNum = 0
    for column in columnList:
        if column == '':
            columnList[columnNum] = 'Not applicable'
        columnNum+=1
    # Column 2 holds the street address.
    addressString = columnList[2]
    mapsUrl = "http://maps.google.com/?q=" + addressString
    addressDict = usaddress.parse(addressString)
    # assumes usaddress returns [number, street, type, ...] in order —
    # TODO confirm; a differently-shaped parse would misassign these.
    streetNum = (addressDict[0])[0]
    street = (addressDict[1])[0]
    if len(addressDict) >= 3:
        streetType = stTypeToAbbrev((addressDict[2])[0])
    else:
        streetType = ''
    columnList[2] = mapsUrlToHref(mapsUrl, addressString)
    # Column 3 holds the website (or the literal 'No website').
    siteTest = columnList[3]
    noSiteMatch = re.search(r'No website', siteTest)
    if not noSiteMatch:
        columnList[3] = '<a href=\"' + siteTest + '\">Link</a>'
    columnList.append(("<a href=javascript:loadDoc(\'%s\',\'%s\',\'%s\')>Link</a>" %(streetNum, street, streetType)).encode('utf-8').strip())
    # Drop empty cells and emit the row.
    columnList = list(filter(lambda x: x!= '', columnList))
    housingTable = housingTable + rowGenerator(columnList)
# NOTE(review): this chunk opens mid-function — the connection/SQL setup
# for this generator body precedes the visible source.
    curs = conn.cursor()
    rows = curs.execute(sel, params)
    for row in curs.fetchall():
        yield row

def geocode(address):
    """Geocode *address* (assumed Chicago, IL) via the FFIEC geocoder API.

    POSTs a JSON payload for census year 2014 and returns the decoded
    JSON response body.
    """
    url = 'https://geomap.ffiec.gov/FFIECGeocMap/GeocodeMap1.aspx/GetGeocodeData'
    headers = {'content-type': 'application/json; charset=utf-8'}
    params = {'sSingleLine': '{0} Chicago, IL'.format(address), 'iCensusYear': "2014"}
    r = requests.post(url, headers=headers, data=json.dumps(params))
    return r.json()

if __name__ == "__main__":
    import time
    for row in getRecords():
        # Rebuild a minimal street address (number + pre-directional +
        # street name) from the usaddress tokens before geocoding.
        parsed_address = usaddress.parse(row[0])
        add = ' '.join([component for component, label in parsed_address \
                        if label in ['AddressNumber', 'StreetNamePreDirectional', 'StreetName']])
        response = geocode(add)
        latitude = response['d']['sLatitude']
        longitude = response['d']['sLongitude']
        formatted_address = response['d']['sMatchAddr']
        raw_address = row[0]
        if latitude and longitude:
            print raw_address
            # NOTE(review): the INSERT statement below is truncated at
            # the end of this chunk.
            ins = '''
            INSERT INTO addresses (
              status,
              formatted_address,
              raw_address,
              source,
def predict(self, X):
    """Label every address string in *X* with usaddress.

    usaddress is reloaded first so the module-level tagger object picks
    up any freshly trained model. Returns a list of label sequences,
    one list of labels per input address.
    """
    # Refresh the module-level tagger before predicting.
    reload(usaddress)
    return [[pair[1] for pair in usaddress.parse(address)] for address in X]
# NOTE(review): this chunk opens mid-function — the extraction code that
# binds these fields precedes the visible source.
    if outdoor:
        print 'outdoor:', outdoor
    if wifi:
        print 'wifi:', wifi
    if goodFor:
        print 'goodFor:', goodFor
    if alcohol:
        print 'alcohol:', alcohol
    if noise:
        print 'noise:', noise
    if ambience:
        print 'ambience:', ambience
    if tv:
        print 'tv:', tv
    if caters:
        print 'caters:', caters
    if wheelchairAccessible:
        print 'wheelchairAccessible:', wheelchairAccessible
    # Assemble one CSV row from all scraped listing attributes.
    row = dict(title=title, categories=categories, rating=rating, img=img,
               addr=addr, phone=phone, price=price, menu=menu,
               creditCards=creditCards, parking=parking, attire=attire,
               groups=groups, kids=kids, reservations=reservations,
               delivery=delivery, takeout=takeout, waiterService=waiterService,
               outdoor=outdoor, wifi=wifi, goodFor=goodFor, alcohol=alcohol,
               noise=noise, ambience=ambience, tv=tv, caters=caters,
               wheelchairAccessible=wheelchairAccessible)
    # dont know why we need to switch these
    # (usaddress yields (token, label); the dict flips to {label: token})
    parsed_address = usaddress.parse(addr)
    row.update({v: k for k,v in parsed_address})
    writer.writerow({k:v.encode('utf8') for k,v in row.items()})
    return extracted, True

def crawl(zipcode=None):
    """Crawl listings for one zipcode, or every US zipcode when None.

    NOTE(review): the loop body is truncated at the end of this chunk.
    """
    page = 0
    flag = True
    some_zipcodes = [zipcode] if zipcode else get_zips()
    if zipcode is None:
        print '\n**We are attempting to extract all zipcodes in America!**'
    for zipcode in some_zipcodes:
        # print '\n===== Attempting extraction for zipcode <', zipcode, '>=====\n'
def lookupAddress(address):
    """Tokenize *address* with usaddress and return its (token, label) pairs."""
    return usaddress.parse(address)