Example #1
0
def rm_addresses(doc):
    """Redact address tokens from a multi-line document.

    Every line of ``doc`` is tagged with ``usaddress.parse``. Tokens whose
    label is one of ``kept_labels`` are kept verbatim; every other token is
    replaced by a run of ``█`` of the same length and recorded.

    Args:
        doc: text to redact; lines are separated by ``"\\n"``.

    Returns:
        tuple: ``(redacted_doc, removed_tokens)`` where ``redacted_doc`` is
        the detokenized lines re-joined with ``"\\n"`` and ``removed_tokens``
        lists the original text of every redacted token, in order.
    """
    kept_labels = {
        'BuildingName',
        'Recipient',
        'OccupancyType',
        'OccupancyIdentifier',
        'LandmarkName',
    }
    # One detokenizer serves the whole document; it used to be rebuilt for
    # every line.
    detokenizer = MosesDetokenizer()
    redacted_lines = []
    removed_tokens = []
    for line in doc.split("\n"):
        line_tokens = []
        # Parse each line exactly once (a second, discarded parse was dropped).
        for token, label in usaddress.parse(line):
            if label in kept_labels:
                line_tokens.append(token)
            else:
                line_tokens.append("█" * len(token))
                removed_tokens.append(token)
        redacted_lines.append(
            detokenizer.detokenize(line_tokens, return_str=True))
    return "\n".join(redacted_lines), removed_tokens
	def parse_page(self, response):
		"""Yield store items scraped from the ``tabs-3`` panel of a page.

		The panel's own text nodes are treated as one store: lines are
		collected as address text until the first line containing ``(``,
		which is stored as the phone number. Every direct child ``div`` of
		the panel is then treated as a further store. Address text is split
		into street / city / state / zip fields with ``usaddress.parse``.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: the panel store when a phone line was found, then
			each child store whose name is not the empty string.
		"""
		import logging
		log = logging.getLogger(__name__)

		def fill_location(item, address):
			# Route each usaddress token into the matching item field;
			# anything that is not city/state/zip counts as street address.
			item['address'] = ''
			item['city'] = ''
			for token, label in usaddress.parse(address):
				token = token.replace(',', '')
				if label == 'PlaceName':
					item['city'] += token + ' '
				elif label == 'StateName':
					item['state'] = token
				elif label == 'ZipCode':
					item['zip_code'] = token
				else:
					item['address'] += token + ' '
			item['country'] = 'United States'

		print("=========  Checking.......")
		store_list = response.xpath('//div[@id="tabs-3"]/div')

		panel_item = None
		try:
			candidate = ChainItem()
			# Default to '' so a panel without a phone line is skipped
			# explicitly rather than through a swallowed KeyError.
			candidate['phone_number'] = ''
			panel = response.xpath('//div[@id="tabs-3"]')
			address = ''
			for line in self.eliminate_space(panel.xpath('./text()').extract()):
				if '(' in line:
					candidate['phone_number'] = line
					break
				address += line + ', '
			fill_location(candidate, address)
			if candidate['phone_number'] != '':
				panel_item = candidate
		except Exception:
			# Best effort: a malformed panel must not stop the child stores.
			log.debug('parse_page: could not parse the tabs-3 panel store', exc_info=True)
		# Yield outside the try so GeneratorExit/throw() are never swallowed.
		if panel_item is not None:
			yield panel_item

		for store in store_list:
			try:
				item = ChainItem()
				item['store_name'] = self.validate(store.xpath('.//strong/text()').extract_first())
				address = ''
				for line in self.eliminate_space(store.xpath('./text()').extract()):
					address += line + ', '
				fill_location(item, address)
				item['phone_number'] = self.validate(store.xpath('.//div[@class="phoneNumberBox"]/text()').extract_first())
			except Exception:
				# Previously dropped into pdb here, which stalls a crawl.
				log.exception('parse_page: could not parse a tabs-3 child store')
				continue
			if item['store_name'] != '':
				yield item
	def body(self, response):
		"""Yield one item per ``storelist-inner-tab`` block on the page.

		For each block the ``<p>`` text lines are accumulated as address text
		until a line containing "phone" (case-insensitive) is met; the part
		after its first ``:`` becomes the phone number. The address text is
		split into fields with ``usaddress.parse`` and the opening hours are
		the ``StoreServicesContainer`` span texts joined with ``", "``.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: one item per block that could be parsed.
		"""
		import logging
		log = logging.getLogger(__name__)

		print("=========  Checking.......")
		for store in response.xpath('//div[@class="storelist-inner-tab"]'):
			try:
				item = ChainItem()
				item['store_name'] = self.validate(store.xpath('.//h4/text()').extract_first())
				address_lines = []
				for line in self.eliminate_space(store.xpath('.//p//text()').extract()):
					if 'phone' in line.lower():
						item['phone_number'] = line.split(':')[1].strip()
						break
					address_lines.append(line)
				item['address'] = ''
				item['city'] = ''
				for token, label in usaddress.parse(', '.join(address_lines)):
					token = token.replace(',', '')
					if label == 'PlaceName':
						item['city'] += token + ' '
					elif label == 'StateName':
						item['state'] = token
					elif label == 'ZipCode':
						item['zip_code'] = token
					else:
						item['address'] += token + ' '
				item['country'] = 'United States'
				hour_list = self.eliminate_space(store.xpath('.//div[@id="StoreServicesContainer"]//span//text()').extract())
				item['store_hours'] = ', '.join(hour_list)
			except Exception:
				# Best effort: skip a malformed block but keep crawling.
				# ``Exception`` (not a bare except) lets GeneratorExit through.
				log.debug('body: skipping store block that failed to parse', exc_info=True)
				continue
			yield item
def geo_parser(location, gmaps_json):
    """Turn the first result of a places-style JSON response into a one-row frame.

    Args:
        location: raw name that was looked up; stored in the ``Raw_Name`` column.
        gmaps_json: decoded response. ``gmaps_json["results"][0]`` is expected
            to provide ``name``, ``formatted_address`` and
            ``geometry.location.lat`` / ``lng``.

    Returns:
        pandas.DataFrame: one row with the columns ``Raw_Name``, ``Name``,
        ``Latitude``, ``Longitude``, ``City`` and ``State``. When the lookup
        raises ``IndexError`` (for example ``results`` is empty) the response
        is printed and an empty frame is returned.
    """
    try:
        results = gmaps_json["results"][0]
        std_name = results['name']
        print(std_name)
        lat = results['geometry']['location']['lat']
        lng = results['geometry']['location']['lng']
        std_address = results['formatted_address']
        try:
            # Preferred path: the tagger returns one value per label.
            tagged = usaddress.tag(std_address)
            city = tagged[0]['PlaceName']
            state = tagged[0]['StateName']
        except Exception:
            # The tagger failed or a label is missing: fall back to the
            # token-level parse and stitch the city/state tokens together.
            city_tokens = []
            state_tokens = []
            for addr_tup in usaddress.parse(std_address):
                print(addr_tup)
                if addr_tup[1] == 'PlaceName':
                    city_tokens.append(addr_tup[0])
                if addr_tup[1] == 'StateName':
                    state_tokens.append(addr_tup[0])
            city = ' '.join(city_tokens)
            # Joined without the stray leading space the state used to keep.
            state = ' '.join(state_tokens)
        print(city)
        return pd.DataFrame(
            [[location, std_name, lat, lng, city, state]],
            columns=['Raw_Name', 'Name', 'Latitude', 'Longitude', 'City', 'State'])

    except IndexError:
        print(gmaps_json)
        # Return the empty frame; it used to be built and then discarded,
        # so callers received None.
        return pd.DataFrame()
 def parse_page(self, response):
   """Yield a single item built from the page's ``location-content`` block.

   The ``<pre class="white">`` text (periods replaced by spaces) is tagged
   with ``usaddress.parse``. Street-level tokens are appended to
   ``address``, ``PlaceName`` tokens to ``city``, and the last
   ``StateName`` / ``ZipCode`` tokens become ``state`` / ``zip_code``.
   Tokens with any other label are ignored.
   """
   street_labels = (
     'AddressNumber',
     'StreetName',
     'StreetNamePostType',
     'OccupancyType',
     'OccupancyIdentifier',
   )
   block = response.xpath('//div[contains(@class, "location-content")]')
   record = ChainItem()
   record['store_name'] = self.validate(block.xpath('.//h3[@class="white"]/text()'))
   raw_address = self.validate(block.xpath('.//pre[@class="white"]/text()')).replace('.', ' ')
   tagged = usaddress.parse(raw_address)
   record['address'] = ''
   record['city'] = ''
   for token, label in tagged:
     if label in street_labels:
       record['address'] += token + ' '
     elif label == 'PlaceName':
       record['city'] += token + ' '
     elif label == 'StateName':
       record['state'] = token
     elif label == 'ZipCode':
       record['zip_code'] = token
   record['country'] = 'United States'
   record['phone_number'] = self.validate(block.xpath('.//p[@class="white"]/text()'))
   record['store_hours'] = self.validate(block.xpath('.//div[contains(@class, "hours")]//pre/text()'))
   yield record
 def body(self, response):
   """Walk the ``store-info`` blocks of a listing page.

   A block with a link yields a request for that URL (handled by
   ``parse_page``). A block without one yields an item built in place
   from its ``box`` name and address paragraph. When no city could be
   tagged, the raw address text is stored instead and the country is
   left empty.
   """
   street_labels = (
     'AddressNumber',
     'StreetName',
     'StreetNamePostType',
     'OccupancyType',
     'OccupancyIdentifier',
   )
   for block in response.xpath('//div[contains(@class,"store-info")]'):
     link = block.xpath('.//a/@href').extract_first()
     if link:
       yield scrapy.Request(url=link, callback=self.parse_page)
       continue

     record = ChainItem()
     record['store_name'] = self.validate(block.xpath('.//div[@class="box"]//h3//text()'))
     raw_address = self.validate(block.xpath('.//div[@class="box"]//p/text()'))
     tagged = usaddress.parse(raw_address)
     record['address'] = ''
     record['city'] = ''
     for token, label in tagged:
       if label in street_labels:
         record['address'] += token + ' '
       elif label == 'PlaceName':
         record['city'] += token + ' '
       elif label == 'StateName':
         record['state'] = token
       elif label == 'ZipCode':
         record['zip_code'] = token
     record['country'] = 'United States'
     if record['city'] == '':
       # No city was tagged: keep the untouched address text instead.
       record['address'] = self.validate(block.xpath('.//div[@class="box"]//p/text()'))
       record['country'] = ''
     yield record
Example #7
0
    def getAddress(self, text):
        """Return the address portion of ``text`` as a single string.

        ``text`` is tokenised and labelled by ``usaddress.parse``. Tokens
        labelled ``Recipient`` are dropped, the remaining tokens are joined
        with single spaces, periods are removed and surrounding whitespace is
        stripped.

        Parameters:
        text (str): text to parse
        Returns: address string ('' when no token survives the filter)

        """
        # Filtering by spaCy entity type was abandoned because some zip codes
        # (e.g. 60601) are not reported as entities, so the whole text goes
        # straight to usaddress. The unused ``self.nlp(text)`` pass that was
        # left behind is therefore no longer run.
        kept = [
            token for token, label in usaddress.parse(text)
            if label != 'Recipient'
        ]
        return ' '.join(kept).replace('.', '').strip()
Example #8
0
 def body(self, response):
     """Yield one de-duplicated item per ``altrow`` block on the page.

     The first text line is the store name. Every line (the name included)
     is added to the address text until a line containing ``phone`` is met;
     the part after its first ``:`` becomes the phone number. Items whose
     ``address + phone_number`` is already in ``self.history`` are skipped.
     """
     for row in response.xpath('//div[contains(@class, "altrow")]'):
         record = ChainItem()
         lines = self.eliminate_space(row.xpath('.//text()').extract())
         record['store_name'] = self.validate(lines[0])
         record['phone_number'] = ''
         address_lines = []
         for line in lines:
             if 'phone' in line:
                 record['phone_number'] = self.validate(line.split(':')[1])
                 break
             address_lines.append(line)
         raw_address = ''.join(line + ', ' for line in address_lines)
         record['address'] = ''
         record['city'] = ''
         for token, label in usaddress.parse(raw_address):
             cleaned = token.replace(',', '')
             if label == 'PlaceName':
                 record['city'] += cleaned + ' '
             elif label == 'StateName':
                 record['state'] = cleaned
             elif label == 'ZipCode':
                 record['zip_code'] = cleaned
             else:
                 record['address'] += cleaned + ' '
         record['country'] = 'United States'
         fingerprint = record['address'] + record['phone_number']
         if fingerprint in self.history:
             continue
         self.history.append(fingerprint)
         yield record
Example #9
0
    def parse_page(self, response):
        """Yield a single item built from the page's ``location-overlay`` text.

        Lines are accumulated as address text until the first line that
        contains ``-`` but no ``:``, which is taken as the phone number.
        Every line containing ``:`` is collected into the opening hours.
        The address text is split into fields with ``usaddress.parse``.

        Args:
            response: page response; only its ``xpath`` method is used.

        Yields:
            ChainItem: the parsed store, or nothing if parsing failed.
        """
        import logging

        try:
            item = ChainItem()
            detail = self.eliminate_space(
                response.xpath(
                    '//div[@class="location-overlay"]//text()').extract())
            item['phone_number'] = ''
            address_lines = []
            for line in detail:
                if ':' not in line and '-' in line:
                    item['phone_number'] = line
                    break
                address_lines.append(line)
            item['address'] = ''
            item['city'] = ''
            for token, label in usaddress.parse(', '.join(address_lines)):
                token = token.replace(',', '')
                if label == 'PlaceName':
                    item['city'] += token + ' '
                elif label == 'StateName':
                    item['state'] = token
                elif label == 'ZipCode':
                    item['zip_code'] = token
                else:
                    item['address'] += token + ' '

            item['country'] = 'United States'
            item['store_hours'] = ', '.join(
                line for line in detail if ':' in line)
        except Exception:
            # Log instead of dropping into pdb, which would stall a crawl.
            logging.getLogger(__name__).exception(
                'parse_page: could not parse location-overlay block')
            return
        # Yield outside the try so GeneratorExit/throw() are not intercepted.
        yield item
Example #10
0
    def body(self, response):
        """Yield a detail-page request per listed store, then the next page.

        Stores are the ``plb`` and ``plw`` blocks. For each one the second
        and third text lines are concatenated and tagged with
        ``usaddress.parse``; the resulting fields travel to ``parse_page``
        in ``request.meta`` together with the store name. The last link of
        the ``panelpn`` block, if it has an href, is followed with this
        method as callback.

        Args:
            response: listing page response; only ``xpath`` is used.

        Yields:
            scrapy.Request: one per store with a detail link, plus at most
            one pagination request.
        """
        print("=========  Checking.......")
        store_list_one = response.xpath('//div[contains(@class, "plb")]')
        store_list_two = response.xpath('//div[contains(@class, "plw")]')
        for store in store_list_one + store_list_two:
            href = store.xpath('.//b//a/@href').extract_first()
            if href is None:
                # Without a link there is no detail page to request; the
                # string concatenation below would raise TypeError.
                continue
            request = scrapy.Request(
                url=self.domain[:-1] + href, callback=self.parse_page)
            detail = self.eliminate_space(store.xpath('./text()').extract())
            request.meta['store_name'] = store.xpath(
                './/b//a/text()').extract_first()
            address = detail[1] + detail[2]
            request.meta['address'] = ''
            request.meta['city'] = ''
            for token, label in usaddress.parse(address):
                if label == 'PlaceName':
                    request.meta['city'] += token.replace(',', '') + ' '
                elif label == 'StateName':
                    request.meta['state'] = token
                elif label == 'ZipCode':
                    request.meta['zip_code'] = token.replace(',', '')
                else:
                    request.meta['address'] += token.replace(',', '') + ' '
            yield request

        page_links = response.xpath('//div[contains(@class, "panelpn")]//a')
        if not page_links:
            # No pagination block: indexing the last link would raise
            # IndexError.
            return
        next_href = page_links[-1].xpath('./@href').extract_first()
        if next_href is not None:
            yield scrapy.Request(
                url=self.domain + next_href, callback=self.body)
	def parse_page(self, response):
		"""Yield a single 'Kinney Drugs' item from the ``contentLocator`` text.

		Fields are taken by position from the cleaned text lines: lines 2-3
		form the address (split with ``usaddress.parse``), line 5 is the
		phone number and lines 7-11 form the opening hours.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: the parsed store, or nothing if parsing failed (for
			example when the page has fewer lines than expected).
		"""
		import logging

		try:
			detail = self.eliminate_space(response.xpath('//div[@id="contentLocator"]//text()').extract())
			item = ChainItem()
			item['store_name'] = 'Kinney Drugs'
			item['address'] = ''
			item['city'] = ''
			for token, label in usaddress.parse(detail[2] + ', ' + detail[3]):
				token = token.replace(',', '')
				if label == 'PlaceName':
					item['city'] += token + ' '
				elif label == 'StateName':
					item['state'] = token
				elif label == 'ZipCode':
					item['zip_code'] = token
				else:
					item['address'] += token + ' '
			item['country'] = 'United States'
			item['phone_number'] = detail[5]
			item['store_hours'] = detail[7] + ', ' + detail[8] + ', '
			item['store_hours'] += detail[9] + ' ' + detail[10] + ', ' + detail[11]
		except Exception:
			# Best effort: skip pages that do not match the expected layout.
			# ``Exception`` (not a bare except) lets GeneratorExit through.
			logging.getLogger(__name__).debug('parse_page: unexpected contentLocator layout', exc_info=True)
			return
		yield item
	def body(self, response):
		"""Yield one de-duplicated item per store table in the ``indent1`` block.

		Tables are paired positionally with the ``<h3>`` names. Inside a
		table, text line 1 is the phone number and line 3 the address, which
		is split into fields with ``usaddress.parse``. Items whose
		``address + phone_number`` is already in ``self.history`` are skipped.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: one item per table that parsed and was not seen before.
		"""
		import logging
		log = logging.getLogger(__name__)

		print("=========  Checking.......")

		store_list = response.xpath('//div[@class="indent1"]//table')
		name_list = self.eliminate_space(response.xpath('//div[@class="indent1"]//h3//text()').extract())
		# zip stops at the shorter list; tables without a matching name were
		# previously dropped through a swallowed IndexError.
		for name, table in zip(name_list, store_list):
			try:
				item = ChainItem()
				detail = self.eliminate_space(table.xpath('.//text()').extract())
				item['store_name'] = name
				item['address'] = ''
				item['city'] = ''
				for token, label in usaddress.parse(detail[3]):
					token = token.replace(',', '')
					if label == 'PlaceName':
						item['city'] += token + ' '
					elif label == 'StateName':
						item['state'] = token
					elif label == 'ZipCode':
						item['zip_code'] = token
					else:
						item['address'] += token + ' '
				item['country'] = 'United States'
				item['phone_number'] = detail[1]
				dedupe_key = item['address'] + item['phone_number']
			except Exception:
				# Best effort: skip a malformed table but keep crawling.
				log.debug('body: skipping store table that failed to parse', exc_info=True)
				continue
			# Yield outside the try so GeneratorExit is never swallowed.
			if dedupe_key not in self.history:
				self.history.append(dedupe_key)
				yield item
Example #13
0
    def parse_store(self, response):
        """Yield a single item built from a store detail page.

        The text nodes of the second ``<p>`` in ``pageContent`` are joined
        into one address string and split into street / city / state / zip
        with ``usaddress.parse``. The previous version referenced an
        undefined ``store_info`` mapping and passed a list to
        ``usaddress.parse``, so it could never yield an item.

        Args:
            response: page response; only its ``xpath`` method is used.

        Yields:
            ChainItem: the parsed store, or nothing if parsing failed.
        """
        import logging

        item = ChainItem()
        try:
            item['store_number'] = ''
            item['store_name'] = response.xpath(
                './/h2/strong/text()').extract_first()
            # extract() returns a list of text nodes; usaddress.parse needs
            # a single string.
            address_lines = response.xpath(
                ".//div[@id='pageContent']/p[2]/text()").extract()
            address = ', '.join(
                line.strip() for line in address_lines if line.strip())
            street_tokens = []
            city_tokens = []
            state = ''
            zip_code = ''
            for token, label in usaddress.parse(address):
                token = token.replace(',', '')
                if label == 'PlaceName':
                    city_tokens.append(token)
                elif label == 'StateName':
                    state = token
                elif label == 'ZipCode':
                    zip_code = token
                else:
                    street_tokens.append(token)
            # NOTE(review): self.validate is assumed to accept a plain
            # string, as in the store-hours loop below -- confirm.
            item['address'] = self.validate(' '.join(street_tokens))
            # usaddress gives no separate second address line.
            item['address2'] = ''
            item['city'] = self.validate(' '.join(city_tokens))
            item['state'] = self.validate(state)
            item['zip_code'] = self.validate(zip_code)
            item['country'] = 'United States'
            item['phone_number'] = response.xpath(
                './/a[@class="bold mb2 db"]/text()').extract_first()
            item['latitude'] = ''
            item['longitude'] = ''

            item['store_hours'] = ''
            hours = response.xpath(
                './/div[@class="ph1 hours-wrap"]/div[@class="mb2"]/p')
            for hour in hours:
                item['store_hours'] += self.validate("".join(
                    hour.xpath('.//text()').extract())) + "; "

            item['other_fields'] = ""
            item['coming_soon'] = "0"
        except Exception:
            # Log instead of dropping into pdb, which would stall a crawl.
            logging.getLogger(__name__).exception(
                'parse_store: could not parse store page')
            return
        # Yield outside the try so GeneratorExit/throw() are not intercepted.
        yield item
Example #14
0
 def parse_page(self, response):
     """Yield a single item built from a store detail page.

     The name comes from the ``location-data`` heading, the address from the
     ``address`` paragraphs (split with ``usaddress.parse``), the phone from
     the ``address-phone`` span, and the hours from every ``hours``
     paragraph line that contains ``a.m.``.
     """
     record = ChainItem()
     heading = response.xpath(
         '//div[contains(@class, "location-data")]//h2/text()')
     record['store_name'] = self.validate(heading.extract_first())

     address_lines = self.eliminate_space(
         response.xpath('//div[@class="address"]//p//text()').extract())
     raw_address = ''.join(line + ', ' for line in address_lines)
     record['address'] = ''
     record['city'] = ''
     for token, label in usaddress.parse(raw_address):
         if label == 'PlaceName':
             record['city'] += token.replace(',', '') + ' '
         elif label == 'StateName':
             # The state token is stored as-is (commas are not removed).
             record['state'] = token
         elif label == 'ZipCode':
             record['zip_code'] = token.replace(',', '')
         else:
             record['address'] += token.replace(',', '') + ' '
     record['country'] = 'United States'

     phone = response.xpath('//span[@class="address-phone"]/text()')
     record['phone_number'] = self.validate(phone.extract_first())

     hour_lines = self.eliminate_space(
         response.xpath('//div[@class="hours"]//p//text()').extract())
     hours_text = ''.join(
         line + ' , ' for line in hour_lines if 'a.m.' in line)
     record['store_hours'] = self.validate(hours_text[:-2])
     yield record
    def body(self, response):
        """Yield one item per ``tr-caption-container`` table on the page.

        The caption text lines are used by position: the first two form the
        store name, the last one is the phone number and everything in
        between is the address, split into fields with ``usaddress.parse``.

        Args:
            response: page response; only its ``xpath`` method is used.

        Yields:
            ChainItem: one item per table that could be parsed.
        """
        import logging
        log = logging.getLogger(__name__)

        print("=========  Checking.......")
        for store in response.xpath('//table[@class="tr-caption-container"]'):
            try:
                item = ChainItem()
                detail = self.eliminate_space(
                    store.xpath(
                        './/td[@class="tr-caption"]//text()').extract())
                item['store_name'] = detail[0] + ' ' + detail[1]
                address = ''.join(line + ', ' for line in detail[2:-1])
                item['address'] = ''
                item['city'] = ''
                for token, label in usaddress.parse(address):
                    token = token.replace(',', '')
                    if label == 'PlaceName':
                        item['city'] += token + ' '
                    elif label == 'StateName':
                        item['state'] = token
                    elif label == 'ZipCode':
                        item['zip_code'] = token
                    else:
                        item['address'] += token + ' '

                item['country'] = 'United States'
                item['phone_number'] = detail[-1]
            except Exception:
                # Best effort: skip a malformed caption but keep crawling.
                # ``Exception`` (not a bare except) lets GeneratorExit through.
                log.debug('body: skipping caption that failed to parse',
                          exc_info=True)
                continue
            yield item
Example #16
0
 def preprocess(self, text):
     """Extract address information from ``text`` by one of two strategies.

     usaddress parses poorly when the text has no number, but with a number
     it is good at finding where an address starts inside a longer text. It
     is weak on approximate locations, so the strategy depends on whether
     any space-separated piece of ``text`` is all digits.

     Returns:
         tuple: ``(addr_dict, "complete")`` when a number is present, where
         ``addr_dict`` maps each usaddress label (``Recipient`` excluded) to
         its tokens joined by spaces; otherwise
         ``(possible_streets, "cross streets")``, a list holding the words
         ``tagger`` marks ``LOCATION`` followed by every noun found by
         ``nltk.pos_tag``.
     """
     has_number = any(piece.isdigit() for piece in text.split(" "))
     if not has_number:
         candidates = [
             word for word, tag in tagger.tag(text.split())
             if tag == 'LOCATION'
         ]
         noun_tags = ('NN', 'NNP', 'NNPS', 'NNS')
         tagged_words = nltk.pos_tag(nltk.word_tokenize(text))
         candidates.extend(
             pair[0] for pair in tagged_words if pair[1] in noun_tags)
         return candidates, "cross streets"

     by_label = {}
     for value, key in usaddress.parse(text):
         if key == 'Recipient':
             continue
         if key in by_label:
             by_label[key] = by_label[key] + " " + value
         else:
             by_label[key] = value
     return by_label, "complete"
	def body(self, response):
		"""Yield one item per ``fusion-builder-row fusion-row`` list entry.

		The ``<address>`` text is tagged with ``usaddress.parse``. Commas are
		removed from city tokens only; state, zip and street tokens are
		stored as returned by the parser.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: one item per list entry that could be parsed.
		"""
		import logging
		log = logging.getLogger(__name__)

		print("=========  Checking.......")
		for store in response.xpath('//li[@class="fusion-builder-row fusion-row"]'):
			try:
				item = ChainItem()
				item['store_name'] = self.validate(store.xpath('.//h4/text()').extract_first())
				address = self.validate(store.xpath('.//address/text()').extract_first())
				tagged = usaddress.parse(address)
				item['city'] = ''
				item['address'] = ''
				for token, label in tagged:
					if label == 'PlaceName':
						item['city'] += token.replace(',', '') + ' '
					elif label == 'StateName':
						item['state'] = token
					elif label == 'ZipCode':
						item['zip_code'] = token
					else:
						item['address'] += token + ' '
				item['country'] = 'United States'
				item['phone_number'] = self.validate(store.xpath('.//a[2]/text()').extract_first())
			except Exception:
				# Best effort: skip a malformed entry but keep crawling.
				# ``Exception`` (not a bare except) lets GeneratorExit through.
				log.debug('body: skipping list entry that failed to parse', exc_info=True)
				continue
			yield item
Example #18
0
    def test_simple_addresses(self):
        """Generate one label-comparison check per fixture address.

        For every ``(address_text, components)`` pair read from the
        simple-pattern fixture, yields ``equals`` together with the address
        text, the labels predicted by ``parse`` and the labelled truth.
        """
        fixture_path = 'training/test_data/simple_address_patterns.xml'

        for raw_text, tagged_tokens in parseTrainingData(fixture_path):
            # zip(*pairs) transposes (token, label) pairs; only the labels
            # are compared.
            _tokens, expected_labels = zip(*tagged_tokens)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield equals, raw_text, predicted_labels, expected_labels
    def body(self, response):
        """Yield the stores listed on the racebros.com contact page.

        The page is loaded through ``self.driver`` and parsed with ``etree``.
        The shared opening hours come from the sixth ``txtNew`` block (text
        after the word ``stores``, with ``or`` replaced by ``,``). Blocks
        1-3 each describe one store by position: name, two address lines
        and the phone number.

        Args:
            response: unused; the page is fetched through ``self.driver``.

        Yields:
            ChainItem: one item per store block that could be parsed.
        """
        import logging
        log = logging.getLogger(__name__)

        self.driver.get("https://www.racebros.com/contact")
        source = self.driver.page_source.encode("utf8")
        tree = etree.HTML(source)
        store_list = tree.xpath(
            '//div[@class="c2inlineContent"]//div[@class="txtNew"]')
        h_temp = self.eliminate_space(store_list[5].xpath(
            './/text()'))[0].split('stores')[1].strip().replace('or', ',')
        for store in store_list[1:4]:
            try:
                item = ChainItem()
                detail = self.eliminate_space(store.xpath('.//text()'))
                item['store_name'] = detail[0]
                item['address'] = ''
                item['city'] = ''
                for token, label in usaddress.parse(detail[1] + ',' + detail[2]):
                    token = token.replace(',', '')
                    if label == 'PlaceName':
                        item['city'] += token + ' '
                    elif label == 'StateName':
                        item['state'] = token
                    elif label == 'ZipCode':
                        item['zip_code'] = token
                    else:
                        item['address'] += token + ' '

                item['country'] = 'United States'
                item['phone_number'] = detail[3]
                item['store_hours'] = h_temp
            except Exception:
                # Best effort: skip a malformed block but keep going.
                # ``Exception`` (not a bare except) lets GeneratorExit through.
                log.debug('body: skipping store block that failed to parse',
                          exc_info=True)
                continue
            yield item
Example #20
0
 def body(self, response):
     """Yield one de-duplicated item per table row on the page.

     In each row the second cell supplies the store name (line 0) and the
     address (lines 1-2, split with ``usaddress.parse``); the first cell's
     line 1 is the phone number. Items whose ``address + phone_number`` is
     already in ``self.history`` are skipped.

     Args:
         response: page response; only its ``xpath`` method is used.

     Yields:
         ChainItem: one item per row that parsed and was not seen before.
     """
     import logging
     log = logging.getLogger(__name__)

     for row in response.xpath('//table//tr'):
         try:
             item = ChainItem()
             address_temp = self.eliminate_space(
                 row.xpath('.//td[2]//text()').extract())
             item['store_name'] = address_temp[0]
             item['address'] = ''
             item['city'] = ''
             for token, label in usaddress.parse(
                     address_temp[1] + ', ' + address_temp[2]):
                 token = token.replace(',', '')
                 if label == 'PlaceName':
                     item['city'] += token + ' '
                 elif label == 'StateName':
                     item['state'] = token
                 elif label == 'ZipCode':
                     item['zip_code'] = token
                 else:
                     item['address'] += token + ' '
             item['country'] = 'United States'
             p_temp = self.eliminate_space(
                 row.xpath('.//td[1]//text()').extract())
             item['phone_number'] = p_temp[1]
             dedupe_key = item['address'] + item['phone_number']
         except Exception:
             # Rows that do not match the layout are skipped on purpose.
             # ``Exception`` (not a bare except) lets GeneratorExit through.
             log.debug('body: skipping table row that failed to parse',
                       exc_info=True)
             continue
         if dedupe_key not in self.history:
             self.history.append(dedupe_key)
             yield item
	def parse_page(self, response):
		"""Yield a single item built from a ``store-locations`` detail page.

		The heading is split on ``#`` into store name and store number. The
		first address paragraph is split into fields with
		``usaddress.parse``. In the ``store-info`` text lines, the line after
		the first one containing ``Phone`` is the phone number, and each line
		containing "day" (case-insensitive) is paired with the line after it
		to form the opening hours.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: the parsed store, or nothing if parsing failed (for
			example when the heading has no ``#``).
		"""
		import logging

		try:
			item = ChainItem()
			# Evaluate the heading once instead of once per field.
			heading_parts = self.validate(response.xpath('//div[@class="store-locations"]//h1/text()').extract_first()).split('#')
			item['store_name'] = heading_parts[0].strip()
			item['store_number'] = heading_parts[1].strip()
			address = self.validate(response.xpath('//div[@class="store-locations"]//div[@class="address"]//p/text()').extract_first())
			item['address'] = ''
			item['city'] = ''
			for token, label in usaddress.parse(address):
				token = token.replace(',', '')
				if label == 'PlaceName':
					item['city'] += token + ' '
				elif label == 'StateName':
					item['state'] = token
				elif label == 'ZipCode':
					item['zip_code'] = token
				else:
					item['address'] += token + ' '
			item['country'] = 'United States'
			detail = self.eliminate_space(response.xpath('//div[@class="store-locations"]//div[@class="store-info"]//text()').extract())
			last = len(detail) - 1
			for ind, line in enumerate(detail):
				if 'Phone' in line:
					# A trailing "Phone" label has no value line; leave the
					# field unset rather than lose the whole item.
					if ind < last:
						item['phone_number'] = self.validate(detail[ind + 1])
					break
			hours = []
			ind = 0
			# A while loop so the value line really is skipped; ``ind += 2``
			# inside a ``for ... in range`` loop had no effect.
			while ind < len(detail):
				if 'day' in detail[ind].lower() and ind < last:
					hours.append(self.validate(detail[ind]) + ' ' + self.validate(detail[ind + 1]))
					ind += 2
				else:
					ind += 1
			item['store_hours'] = ', '.join(hours)
		except Exception:
			# Best effort: skip pages that do not match the expected layout.
			# ``Exception`` (not a bare except) lets GeneratorExit through.
			logging.getLogger(__name__).debug('parse_page: could not parse store-locations page', exc_info=True)
			return
		yield item
Example #22
0
def getusaddress(addr):
    """Split ``addr`` into ``(street, city, state, pcode)`` strings.

    Tokens come from ``usaddress.parse``. A token goes to the street when
    its label contains any of the ``street_markers`` substrings, to the
    city / state / postal code when the label contains ``PlaceName`` /
    ``StateName`` / ``ZipCode``. The tests are independent substring checks,
    and tokens with any other label are dropped. Each part is joined with
    single spaces and has its commas removed.
    """
    street_markers = (
        "Address",
        "Street",
        'Occupancy',
        "Recipient",
        "BuildingName",
        "USPSBoxType",
        "USPSBoxID",
    )
    street_tokens = []
    city_tokens = []
    state_tokens = []
    pcode_tokens = []
    for token, label in usaddress.parse(addr):
        if any(marker in label for marker in street_markers):
            street_tokens.append(token)
        if "PlaceName" in label:
            city_tokens.append(token)
        if "StateName" in label:
            state_tokens.append(token)
        if "ZipCode" in label:
            pcode_tokens.append(token)

    def tidy(tokens):
        # Same clean-up for every part: join, trim the left edge, drop commas.
        return " ".join(tokens).lstrip().replace(',', '')

    return (tidy(street_tokens), tidy(city_tokens), tidy(state_tokens),
            tidy(pcode_tokens))
	def parse_page(self, response):
		"""Yield a single item built from a ``retailer-details`` page.

		The first ``<h4>`` text is the store name. Of the cleaned ``<p>`` text
		lines, line 0 is the address (split with ``usaddress.parse``) and
		line 1 the phone number. The ``hours-results`` text lines are joined
		alternately with ``' '`` and ``', '``; the hours are blanked when
		fewer than 10 lines were found.

		Args:
			response: page response; only its ``xpath`` method is used.

		Yields:
			ChainItem: the parsed store, or nothing if parsing failed.
		"""
		import logging

		try:
			item = ChainItem()
			item['store_name'] = self.validate(response.xpath('//div[@class="retailer-details"]//h4//text()').extract()[0])
			# Evaluate the paragraph text once; it feeds address and phone.
			lines = self.eliminate_space(response.xpath('//div[@class="retailer-details"]/p//text()').extract())
			address = lines[0]
			item['address'] = ''
			item['city'] = ''
			for token, label in usaddress.parse(address):
				token = token.replace(',', '')
				if label == 'PlaceName':
					item['city'] += token + ' '
				elif label == 'StateName':
					item['state'] = token
				elif label == 'ZipCode':
					item['zip_code'] = token
				else:
					item['address'] += token + ' '
			item['country'] = 'United States'
			item['phone_number'] = lines[1]
			hour_list = self.eliminate_space(response.xpath('//ul[@class="hours-results"]//text()').extract())
			# Odd positions are followed by a space, even ones by ', '.
			pieces = [
				text + (', ' if pos % 2 == 0 else ' ')
				for pos, text in enumerate(hour_list, 1)
			]
			item['store_hours'] = ''.join(pieces)[:-2]
			if len(hour_list) < 10:
				item['store_hours'] = ''
		except Exception:
			# Log instead of dropping into pdb, which would stall a crawl.
			logging.getLogger(__name__).exception('parse_page: could not parse retailer-details page')
			return
		# Yield outside the try so GeneratorExit/throw() are not intercepted.
		yield item
Example #24
0
def isValidAddress(ady, verbose=False):
    """Return True when ``ady`` parses into an acceptable address.

    An address is rejected (after reporting through ``showFailureReason``)
    when it has fewer than four parsed terms, contains a Recipient term,
    lacks a StreetName, or lacks an AddressNumber.  The one exception: an
    address without a StreetName is still accepted when it carries a
    BuildingName, a PlaceName and a StateName.
    """
    address = usaddress.parse(ady)
    labels = [label for _, label in address]

    if len(address) < 4:
        showFailureReason('Not Enough Terms', ady, address, verbose)
        return False

    if 'Recipient' in labels:
        showFailureReason('Recipient', ady, address, verbose)
        return False

    if 'StreetName' not in labels:
        # A named building with a city and a state passes without a street.
        building_labels = ('BuildingName', 'PlaceName', 'StateName')
        if all(needed in labels for needed in building_labels):
            return True

        showFailureReason('StreetName', ady, address, verbose)
        return False

    if 'AddressNumber' not in labels:
        showFailureReason('AddressNumber', ady, address, verbose)
        return False

    return True
 def parse(self, response):
     """Yield one ChainItem per ``mainContent`` section of the page.

     The phone number comes from ``response.meta``; the address text is
     split into address / city / state / zip_code with usaddress.
     """
     phone = response.meta.get('phone')
     for store in response.xpath('//section[@id="mainContent"]'):
         item = ChainItem()
         item['store_name'] = self.validate(store.xpath('.//h2//text()').extract_first())
         text_nodes = store.xpath('.//address//text()').extract()
         # The last two text nodes are left out of the address string.
         joined = ''.join(node + ' ' for node in text_nodes[:-2])
         parsed = usaddress.parse(self.validate(joined))
         for field in ('address', 'city', 'state', 'zip_code'):
             item[field] = ''
         for token, label in parsed:
             cleaned = token.replace(',', '')
             if label == 'PlaceName':
                 item['city'] += cleaned + ' '
             elif label == 'StateName':
                 item['state'] = cleaned
             elif label == 'ZipCode':
                 item['zip_code'] = cleaned
             else:
                 item['address'] += cleaned + ' '
         item['country'] = "United States"
         item['phone_number'] = phone
         yield item
Example #26
0
 def body(self, response):
     """Yield a ChainItem for every store in the page's embedded JSON.

     The JSON list is cut out of the response body between the
     ``initial_locations:`` and ``min_zoom:`` markers (minus the final
     character of the stripped slice).  Each store's single-line address is
     split into address / city / state / zip_code with usaddress.
     """
     payload = response.body.split('initial_locations:')[1].split('min_zoom:')[0].strip()[:-1]
     copied_fields = (
         ('store_number', 'location_id'),
         ('store_name', 'title'),
         ('latitude', 'latitude'),
         ('longitude', 'longitude'),
         ('phone_number', 'phone'),
     )
     for store in json.loads(payload):
         item = ChainItem()
         for field, key in copied_fields:
             item[field] = store[key]
         city_parts = []
         street_parts = []
         state = ''
         zip_code = ''
         for token, label in usaddress.parse(store['address']):
             cleaned = token.replace(',', '')
             if label == 'PlaceName':
                 city_parts.append(cleaned + ' ')
             elif label == 'StateName':
                 state = cleaned
             elif label == 'ZipCode':
                 zip_code = cleaned
             else:
                 street_parts.append(cleaned + ' ')
         item['address'] = ''.join(street_parts)
         item['city'] = ''.join(city_parts)
         item['state'] = state
         item['zip_code'] = zip_code
         item['country'] = store['country']
         yield item
Example #27
0
 def predict(self, X):
     """Return, for each address in ``X``, the list of usaddress labels."""
     # tagger object is defined at the module level, update now
     reload(usaddress)
     return [[tagged[1] for tagged in usaddress.parse(address)]
             for address in X]
	def parse_page(self, response):
		"""Build one ChainItem from a clinic detail page and yield it.

		Address lines mentioning a locator word are cut at their first comma
		before being joined and parsed with usaddress.  Hours are emitted as a
		"MinuteClinic hours: ..." string with a "Lunch hours: " marker inserted
		after the third entry.  Any exception silently aborts the item.
		"""
		locator_words = ('locate', 'corner', 'shop', 'null', 'intersection', 'market')
		try:
			item = ChainItem()
			addr_list = self.eliminate_space(response.xpath('//div[@class="dBox01"][1]//text()').extract())[1:]
			address = ''
			for line in addr_list:
				lowered = line.lower()
				if any(word in lowered for word in locator_words):
					line = self.validate(line.split(',')[0])
				address += line + ' '
			item['address'] = ''
			item['city'] = ''
			item['state'] = ''
			# NOTE(review): the joined string ends in a single space, yet two
			# characters are trimmed here — confirm this is intended.
			for token, label in usaddress.parse(address[:-2]):
				cleaned = token.replace(',', '')
				if label == 'PlaceName':
					item['city'] += cleaned + ' '
				elif label == 'StateName':
					item['state'] = cleaned
				elif label == 'ZipCode':
					item['zip_code'] = cleaned
				else:
					item['address'] += cleaned + ' '
			item['country'] = self.check_country(item['state'])
			hours_text = 'MinuteClinic hours: '
			hour_list = self.eliminate_space(response.xpath('//div[@class="dMarginBot2"][1]/text()').extract())
			for position, hour in enumerate(hour_list, 1):
				hours_text += hour + ', '
				if position == 3:
					hours_text += 'Lunch hours: '
			item['store_hours'] = hours_text[:-2]
			yield item
		except:
			pass
 def parse_page(self, response):
     """Build one ChainItem from the first/second ``location`` divs.

     All address text nodes after the first are joined with ", " and parsed
     with usaddress; the phone number is the third cleaned text node of the
     second ``location`` div.  Any exception silently aborts the item.
     """
     try:
         item = ChainItem()
         detail = self.eliminate_space(
             response.xpath(
                 '//div[@class="location"][1]//address//text()').extract())
         joined = ''.join(piece + ', ' for piece in detail[1:])
         item['address'] = ''
         item['city'] = ''
         for token, label in usaddress.parse(joined):
             cleaned = token.replace(',', '')
             if label == 'PlaceName':
                 item['city'] += cleaned + ' '
             elif label == 'StateName':
                 item['state'] = cleaned
             elif label == 'ZipCode':
                 item['zip_code'] = cleaned
             else:
                 item['address'] += cleaned + ' '
         item['country'] = 'United States'
         contact_nodes = self.eliminate_space(
             response.xpath(
                 '//div[@class="location"][2]//text()').extract())
         item['phone_number'] = contact_nodes[2]
         yield item
     except:
         pass
 def preprocess(self,text):
     """Extract address information from ``text``.

     Returns ``(dict, "complete")`` when any space-separated word is all
     digits: the dict maps each usaddress label (Recipient excluded) to its
     space-joined tokens.  Otherwise returns ``(list, "cross streets")``
     with the words tagged LOCATION followed by all noun tokens.
     """
     # usaddress parses poorly when the text carries no number, but with a
     # number it tends to do better than streetaddress, so it is only used
     # for the numbered case; approximate locations take the fallback path.
     nouns = ['NN','NNP','NNPS','NNS']
     has_number = any(word.isdigit() for word in text.split(" "))
     if not has_number:
         candidates = [word for word,tag in tagger.tag(text.split())
                       if tag == 'LOCATION']
         tagged_words = nltk.pos_tag(nltk.word_tokenize(text))
         candidates.extend(word for word,pos in tagged_words if pos in nouns)
         return candidates,"cross streets"

     grouped = {}
     for value,key in usaddress.parse(text):
         if key == 'Recipient':
             continue
         if key in grouped:
             grouped[key] += " "+value
         else:
             grouped[key] = value
     return grouped,"complete"
 def body(self, response):
     """Yield a ChainItem for each 'Nob Hill Foods' row of the store table.

     The first six rows and the last row are skipped.  A row's second text
     cell is parsed with usaddress; its third cell holds the phone number
     and, after 'Grocery Hours:', the opening hours.  Rows that raise are
     skipped silently.
     """
     print("=========  Checking.......")
     rows = response.xpath('//div[@id="cms_content_frame"]//tr')
     for row in rows[6:-1]:
         detail = self.eliminate_space(row.xpath('.//text()').extract())
         try:
             item = ChainItem()
             item['store_name'] = detail[0]
             item['address'] = ''
             item['city'] = ''
             for token, label in usaddress.parse(detail[1]):
                 cleaned = token.replace(',', '')
                 if label == 'PlaceName':
                     item['city'] += cleaned + ' '
                 elif label == 'StateName':
                     item['state'] = cleaned
                 elif label == 'ZipCode':
                     item['zip_code'] = cleaned
                 else:
                     item['address'] += cleaned + ' '
             item['country'] = 'United States'
             contact_parts = detail[2].split('Grocery Hours:')
             # The phone text loses its first and last character after stripping.
             item['phone_number'] = self.validate(contact_parts[0].strip()[1:-1])
             item['store_hours'] = self.validate(contact_parts[1].strip())
             if item['store_name'] == 'Nob Hill Foods':
                 yield item
         except:
             pass
Example #32
0
	def parse_page(self, response):
		"""Build one ChainItem from a store detail page and yield it.

		The ``sl-address`` text nodes are joined with ", " and parsed with
		usaddress; each ``sl-hours`` div contributes its space-joined texts,
		and the groups are separated by ", ".
		"""
		item = ChainItem()
		store = response.xpath('//div[@class="store-list-detail"]')
		item['store_name'] = self.validate(response.xpath('.//div[@class="store-details-section"]//h1/text()').extract_first())
		address_lines = self.eliminate_space(store.xpath('.//div[@class="sl-address"]//text()').extract())
		joined = ''.join(line + ', ' for line in address_lines)
		item['address'] = ''
		item['city'] = ''
		for token, label in usaddress.parse(joined):
			if label == 'StateName':
				# The state token is stored as-is, without stripping commas.
				item['state'] = token
				continue
			cleaned = token.replace(',', '')
			if label == 'PlaceName':
				item['city'] += cleaned + ' '
			elif label == 'ZipCode':
				item['zip_code'] = cleaned
			else:
				item['address'] += cleaned + ' '
		item['country'] = 'United States'
		item['phone_number'] = self.validate(store.xpath('.//a[@itemprop="telephone"]/text()').extract_first())
		hours_text = ''
		hour_nodes = store.xpath('.//div[contains(@class, "sl-store-hours")]//div[@class="sl-hours"]')
		for hour_node in hour_nodes:
			texts = self.eliminate_space(hour_node.xpath('.//div/text()').extract())
			hours_text += ''.join(text + ' ' for text in texts) + ', '
		item['store_hours'] = hours_text[:-2]
		yield item
 def parse_page(self, response):
     """Build one ChainItem from the page's table text and yield it.

     Table entries after the first are collected as address lines until an
     entry containing "phone" is found; the entry following it becomes the
     phone number.  Every table entry containing ":" is added to the store
     hours.  Any exception silently aborts the item.
     """
     try:
         item = ChainItem()
         detail = self.eliminate_space(
             response.xpath('//table//text()').extract())
         item['store_name'] = self.validate(
             response.xpath('//h1/text()').extract_first())
         address = ''
         for index, entry in enumerate(detail[1:], 1):
             if 'phone' in entry.lower():
                 item['phone_number'] = detail[index + 1]
                 break
             address += entry + ', '
         item['address'] = ''
         item['city'] = ''
         # Drop the trailing ", " before parsing.
         for token, label in usaddress.parse(address[:-2]):
             cleaned = token.replace(',', '')
             if label == 'PlaceName':
                 item['city'] += cleaned + ' '
             elif label == 'StateName':
                 item['state'] = cleaned
             elif label == 'ZipCode':
                 item['zip_code'] = cleaned
             else:
                 item['address'] += cleaned + ' '
         item['country'] = 'United States'
         hours_text = ''.join(entry + ', ' for entry in detail if ':' in entry)
         item['store_hours'] = hours_text[:-2]
         yield item
     except:
         pass
    def test_us_ia_linn(self) :
        """Yield one ``equals`` check per address in openaddress_us_ia_linn.xml."""
        training_path = 'training_data/openaddress_us_ia_linn.xml'
        for raw_text, tagged in parseTrainingData(training_path) :
            _tokens, expected_labels = zip(*tagged)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield equals, raw_text, predicted_labels, expected_labels
Example #35
0
 def anonymize(self, address):
     """Return the anonymized form of ``address``, memoized in ``self._cache``."""
     try:
         result = self._cache[address]
     except KeyError:
         # Cache miss: parse, anonymize the parsed tokens, and remember the result.
         tokens = usaddress.parse(address)
         result = ' '.join(self._anonymize_parsed(tokens))
         self._cache[address] = result
     return result
    def test_Parser(self):
        """Yield one ``equals`` check per address in synthetic_osm_data_xml.xml."""
        training_path = 'training/test_data/synthetic_osm_data_xml.xml'
        for raw_text, tagged in parseTrainingData(training_path) :
            _tokens, expected_labels = zip(*tagged)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield equals, raw_text, predicted_labels, expected_labels
Example #37
0
    def test_synthetic_addresses(self):
        """Yield one ``equals`` check per address in synthetic_osm_data.xml."""
        training_path = "measure_performance/test_data/synthetic_osm_data.xml"
        # Read the whole file up front, before the first check is yielded.
        samples = list(readTrainingData([training_path], GROUP_LABEL))

        for raw_text, tagged in samples:
            _tokens, expected_labels = zip(*tagged)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield equals, raw_text, predicted_labels, expected_labels
    def test_Parser(self):
        """Yield one ``fuzzyEquals`` check per address in us50_test_tagged.xml."""
        training_path = 'training/test_data/us50_test_tagged.xml'
        for raw_text, tagged in parseTrainingData(training_path) :
            _tokens, expected_labels = zip(*tagged)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield fuzzyEquals, raw_text, predicted_labels, expected_labels
  def typoCol(self, x):
    result = []
    print usaddress.parse(x)
    for r in usaddress.parse(x):
      if numpy.random.rand(1,1) > self.typoprob:
        result.append(str(r[0]))
      elif r[1] == 'ZipCode':
        pass
      elif r[1] == 'StreetNamePostType':
        newr = str(r[0])
        for i in self.term_alts:
          if newr.lower() in i:
            newr = random.choice(i)
        result.append(newr)
      else:
        result.append(str(r[0]))

    return ' '.join(result)
Example #40
0
def format_address(addr):
    """Rebuild ``addr`` as "number street type city state zip".

    Tokens sharing a usaddress label are joined with spaces; the street
    post type is passed through ``format_streetname_post_type``.  A missing
    component raises ``KeyError``.
    """
    by_label = {}
    for token, label in usaddress.parse(addr):
        if label in by_label:
            by_label[label] += " "+token
        else:
            by_label[label] = token
    ordered = [
        by_label["AddressNumber"],
        by_label["StreetName"],
        format_streetname_post_type(by_label["StreetNamePostType"]),
        by_label["PlaceName"],
        by_label["StateName"],
        by_label["ZipCode"],
    ]
    return " ".join(ordered)
Example #41
0
    def test_us50(self):
        """Yield one ``fuzzyEquals`` check per address in us50_test_tagged.xml."""
        training_path = "measure_performance/test_data/us50_test_tagged.xml"
        # Read the whole file up front, before the first check is yielded.
        samples = list(readTrainingData([training_path], GROUP_LABEL))

        for raw_text, tagged in samples:
            _tokens, expected_labels = zip(*tagged)
            _tokens, predicted_labels = zip(*parse(raw_text))
            yield fuzzyEquals, raw_text, predicted_labels, expected_labels
Example #42
0
    def parse_with_usaddress_parse(self, addr_str):
        """
        Parse ``addr_str`` with usaddress's `parse()` and return a list of
        ``{'code': ..., 'value': ...}`` dicts, where the code is looked up
        in ``self.standard_part_mapping`` by usaddress label.
        """
        addr_parts = []
        for token, label in usaddress.parse(addr_str):
            addr_parts.append({'code': self.standard_part_mapping[label], 'value': token})
        return addr_parts
def completeStreetTypeAbbreviation(streetName):
    """Return ``streetName`` with its street-type abbreviation expanded.

    StreetNamePostType tokens are upper-cased, stripped of non-letters and
    looked up in ``streetConversionMap``; the result (or the original token
    when no mapping exists) is capitalized.  Other tokens pass through.
    """
    rebuilt = []
    for token, label in usaddress.parse(streetName):
        if label == "StreetNamePostType":
            lookup_key = re.sub("[^A-Z]+", "", token.upper())
            token = streetConversionMap.get(lookup_key, token).capitalize()
        rebuilt.append(token)
    return " ".join(rebuilt)
Example #44
0
def parse_address(address):
    """Split ``address`` into (number, pre-direction, name, type, unit).

    Each element is the space-joined text of all tokens carrying the
    corresponding usaddress label, or '' when there are none.
    """
    parsed = usaddress.parse(address)

    def tokens_for(wanted):
        return ' '.join([token for token, label in parsed if label == wanted])

    wanted_labels = (
        'AddressNumber',
        'StreetNamePreDirectional',
        'StreetName',
        'StreetNamePostType',
        'OccupancyIdentifier',
    )
    return tuple(tokens_for(label) for label in wanted_labels)
def usaddress_to_dict(text):
    """Map each usaddress label in ``text`` to its space-joined tokens.

    Tokens labelled Recipient are ignored.
    """
    grouped = {}
    for token, label in usaddress.parse(text):
        if label == 'Recipient':
            continue
        if label in grouped:
            grouped[label] += " "+token
        else:
            grouped[label] = token
    return grouped
def consoleLabel(raw_addr, label_options): 

    friendly_tag_dict = dict((label[1], label[0])
                            for label in label_options)
    valid_responses = ['y', 'n', 's', 'f', '']
    addrs_left_to_tag = []
    finished = False

    addrs_left_to_tag = raw_addr.copy()

    total_addrs = len(raw_addr)

    tagged_addr = set([])

    for i, addr_string in enumerate(raw_addr, 1):
        if not finished:

            print "(%s of %s)" % (i, total_addrs)
            print "-"*50
            print "ADDRESS STRING: ", addr_string
                
            preds = usaddress.parse(addr_string)

            user_input = None 
            while user_input not in valid_responses :


                friendly_addr = [(token[0], friendly_tag_dict[token[1]]) for token in preds]
                print_table(friendly_addr)

                sys.stderr.write('Is this correct? (y)es / (n)o / (s)kip / (f)inish tagging\n')
                user_input = sys.stdin.readline().strip()

                if user_input =='y':
                    tagged_addr.add(tuple(preds))
                    addrs_left_to_tag.remove(addr_string)

                elif user_input =='n':
                    corrected_addr = manualTagging(preds, 
                                                label_options,
                                                friendly_tag_dict)
                    tagged_addr.add(tuple(corrected_addr))
                    addrs_left_to_tag.remove(addr_string)


                elif user_input in ('' or 's') :
                    print "Skipped\n"
                elif user_input == 'f':
                    finished = True

    print "Done! Yay!"
    
    return tagged_addr, addrs_left_to_tag
Example #47
0
def get_streetnames(text):
    """Return the street names usaddress finds in ``text``.

    A StreetName token directly followed by a StreetNamePostType token is
    returned as "name type"; otherwise the bare name is returned.  The
    connective words "and", "or", "near" and "between" are never returned.
    """
    connectives = ("and","or","near","between")
    tokens = usaddress.parse(text)
    found = []
    for position, (word, label) in enumerate(tokens):
        if label != "StreetName" or word in connectives:
            continue
        follower = tokens[position+1] if position+1 < len(tokens) else None
        if follower is not None and follower[1] == "StreetNamePostType":
            found.append(word + " " + follower[0])
        else:
            found.append(word)
    return found
 def _reverse_geocode(geocoder, tag_name, lat, long):
     point = Point(lat, long)
     try:
         point_location = geocoder.reverse(point, exactly_one=True)
         if point_location and point_location.address:
             raw_parse = usaddress.parse(point_location.address)
             parsed = SocialExplorer._process_usaddress_parsed(raw_parse, [tag_name])
             return parsed.get('PlaceName') if parsed.get('PlaceName') else None
     except exc.GeocoderQuotaExceeded as e:
         print e
         return
     except Exception as e:
         print e
         return
Example #49
0
def extract_name_and_address(txt):
    """Split ``txt`` into a name plus street / city / state / zip_code.

    ``_name_from_address_splitter`` separates the name (group 1) from the
    address (group 2).  usaddress parses the address into many pieces, most
    of which are reassembled into ``street``; the last PlaceName, StateName
    and ZipCode tokens become ``city``, ``state`` and ``zip_code``.  Every
    value is stripped of surrounding whitespace and then of commas.
    """
    match = _name_from_address_splitter.search(txt)
    fields = {'street': '', 'name': match.group(1)}
    field_for_label = {
        'PlaceName': 'city',
        'StateName': 'state',
        'ZipCode': 'zip_code',
    }
    for word, word_type in usaddress.parse(match.group(2)):
        if word_type in field_for_label:
            fields[field_for_label[word_type]] = word
        else:
            fields['street'] += word + ' '
    return dict((key, value.strip().strip(',')) for key, value in fields.items())
Example #50
0
def address_is_complete(text):
    """Classify how much street-address information ``text`` contains.

    Returns:
        "complete" when usaddress finds at least one StreetName token and a
        purely numeric AddressNumber token;
        "cross street" when it finds more than one StreetName token and no
        numeric AddressNumber;
        "no address information" otherwise.
    """
    streetname_exists = False
    streetnumber_exists = False
    num_streets = 0
    for token, label in usaddress.parse(text):
        if label == "StreetName":
            streetname_exists = True
            num_streets += 1
        # Test the token text, not the label: the label string
        # "AddressNumber" is never numeric, so checking it could never
        # report a street number.
        if label == "AddressNumber" and token.isdigit():
            streetnumber_exists = True
    if streetname_exists and streetnumber_exists:
        return "complete"
    elif num_streets > 1 and not streetnumber_exists:
        return "cross street"
    else:
        return "no address information"
Example #51
0
def isValidAddress(ady, verbose=False):
    """Return True when ``ady`` parses into an acceptable address.

    An address is rejected (after reporting through ``showFailureReason``)
    when it has fewer than four parsed terms, contains a Recipient term,
    or lacks a StreetName or an AddressNumber term.
    """
    address = usaddress.parse(ady)
    if len(address) < 4:
        showFailureReason('Not Enough Terms', ady, address, verbose)
        return False

    labels = [label for _, label in address]

    if 'Recipient' in labels:
        showFailureReason('Recipient', ady, address, verbose)
        return False

    # Both of these labels must be present, checked in this order.
    for required in ('StreetName', 'AddressNumber'):
        if required not in labels:
            showFailureReason(required, ady, address, verbose)
            return False

    return True
Example #52
0
def lookup_geo(g, ady):
    """Look ``ady`` up through ``g.address`` and return a schema.org Place.

    The house number and street name come from the usaddress parse of
    ``ady``; the borough is the second-to-last parsed token, with "NY"
    mapped to "Manhattan".  The reply from ``g.address`` supplies the zip
    code, normalized street address, borough and coordinates, each
    defaulting to '' when absent.
    """
    components = usaddress.parse(ady)
    number_tokens = [token for token, label in components
                     if label == 'AddressNumber']
    street_tokens = [token for token, label in components
                     if label.startswith('Street')]
    addressNumber = ' '.join(number_tokens)
    streetName = ' '.join(street_tokens).replace(',', '')

    borough = components[-2][0].replace(',', '')
    if borough == 'NY':
        borough = 'Manhattan'

    dic = g.address(addressNumber, streetName, borough)
    zipcode = dic.get('zipCode', '')
    streetAddress = '%s %s' % (dic.get('houseNumber', ''),
                               dic.get('firstStreetNameNormalized', ''))

    postal_address = {
        "@type": "PostalAddress",
        "addressLocality": "New York City",
        "addressRegion": "NY",
        "postalCode": zipcode,
        "streetAddress": streetAddress.strip(),
        "borough": dic.get('firstBoroughName', ''),
    }
    longitude = dic.get('longitude', '')
    latitude = dic.get('latitude', '')
    coordinates = {
        "@type": "GeoCoordinates",
        "latitude": latitude,
        "longitude": longitude,
    }
    place = {
        "@type": "Place",
        "@context": "http://schema.org",
        "address": postal_address,
        "geo": coordinates,
    }
    return {"refLocation": [place]}
Example #53
0
    def parse(self, name):
        """Return ``{'place': ..., 'state': ...}`` extracted from ``name``.

        A trailing ", USA" is removed before parsing.  PlaceName tokens are
        joined with spaces; the last StateName token wins.  Tokens are
        stripped of surrounding spaces and commas.  A UnicodeDecodeError
        from usaddress yields empty values.
        """
        suffix = ', USA'
        if name.endswith(suffix):
            name = name[:-len(suffix)]

        try:
            parts = usaddress.parse(name)
        except UnicodeDecodeError:
            parts = ()

        places = [token.strip(' ,') for token, label in parts
                  if label == 'PlaceName']
        states = [token.strip(' ,') for token, label in parts
                  if label == 'StateName']

        return {
                'place': ' '.join(places),
                'state': states[-1] if states else ''
                }
                streets.add(tag.get('v').split(" ")[-1])
    elem.clear()
streets


# In[58]:

#Trying a more sophisticated approach that uses natural language processing module usaddress
#https://github.com/datamade/usaddress
streets = Set()
for event, elem in ET.iterparse('los-angeles_california.osm', events=("start",)):
    if elem.tag in ("node", "way"):
        for tag in elem.iter("tag"):
            if not is_street_name(tag):
                continue
            try:
                # Keep only the first token usaddress labels as a street post type.
                post_types = (st for st, comp in usaddress.parse(tag.get('v'))
                              if comp == "StreetNamePostType")
                streets.add(str(next(post_types)))
            except:
                # No post type found (or parsing failed): move on to the next tag.
                continue
    elem.clear()
streets


# In[4]:

#The file `C1 Street Suffix Abbreviations.html` is the html downloaded from the webpage at http://pe.usps.gov/text/pub28/28apc_002.htm 
#The main table with all of the street suffix abbreviations and variations has an id of ep533076
#Its body is structured as follows, and the algorithm below uses this structure 
#to make {abbrev: full_name} pairs for each abbreviation, including the standard one if
#it's not already included in the middle column
#-------------------------------------------
#| Full Name | Abbrev #1 | Standard Abbrev |
            if columnNum == 1:
                columnTest = column
            columnNum+=1
        numMatch = re.search(r'\d+', columnTest)
        hashTagMatch = re.search(r'#', columnTest)
        if hashTagMatch:
            columnList.append('Code Violations')
        if numMatch:
            columnNum = 0
            for column in columnList:
                if column == '':
                    columnList[columnNum] = 'Not applicable'
                columnNum+=1
            addressString = columnList[2]
            mapsUrl = "http://maps.google.com/?q=" + addressString
            addressDict = usaddress.parse(addressString)
            streetNum = (addressDict[0])[0]
            street = (addressDict[1])[0]
            if len(addressDict) >= 3:
                streetType = stTypeToAbbrev((addressDict[2])[0])
            else:
                streetType = ''
            columnList[2] = mapsUrlToHref(mapsUrl, addressString)
            siteTest = columnList[3]
            noSiteMatch = re.search(r'No website', siteTest)
            if not noSiteMatch:
                columnList[3] = '<a href=\"' + siteTest + '\">Link</a>'
            columnList.append(("<a href=javascript:loadDoc(\'%s\',\'%s\',\'%s\')>Link</a>" %(streetNum, street, streetType)).encode('utf-8').strip()) 

        columnList = list(filter(lambda x: x!= '', columnList))
        housingTable = housingTable + rowGenerator(columnList)
    curs = conn.cursor()
    rows = curs.execute(sel, params)
    for row in curs.fetchall():
        yield row

def geocode(address):
    """POST ``address`` (suffixed with " Chicago, IL") to the FFIEC geocoding
    endpoint for census year 2014 and return the decoded JSON reply."""
    endpoint = 'https://geomap.ffiec.gov/FFIECGeocMap/GeocodeMap1.aspx/GetGeocodeData'
    payload = json.dumps({
        'sSingleLine': '{0} Chicago, IL'.format(address),
        'iCensusYear': "2014",
    })
    reply = requests.post(
        endpoint,
        headers={'content-type': 'application/json; charset=utf-8'},
        data=payload,
    )
    return reply.json()

if __name__ == "__main__":
    import time
    for row in getRecords():
        parsed_address = usaddress.parse(row[0])
        add = ' '.join([component for component, label in parsed_address \
                if label in ['AddressNumber', 'StreetNamePreDirectional', 'StreetName']])
        response = geocode(add)
        latitude = response['d']['sLatitude']
        longitude = response['d']['sLongitude']
        formatted_address = response['d']['sMatchAddr']
        raw_address = row[0]
        if latitude and longitude:
            print raw_address
            ins = '''
                INSERT INTO addresses (
                    status,
                    formatted_address,
                    raw_address,
                    source, 
Example #57
0
 def predict(self, X):
     """Return, for each address in ``X``, the list of usaddress labels."""
     # tagger object is defined at the module level, update now
     reload(usaddress)
     return [[tagged[1] for tagged in usaddress.parse(address)]
             for address in X]
Example #58
0
            if outdoor: print 'outdoor:', outdoor
            if wifi: print 'wifi:', wifi
            if goodFor: print 'goodFor:', goodFor
            if alcohol: print 'alcohol:', alcohol
            if noise: print 'noise:', noise
            if ambience: print 'ambience:', ambience
            if tv: print 'tv:', tv
            if caters: print 'caters:', caters
            if wheelchairAccessible: print 'wheelchairAccessible:', wheelchairAccessible

        row = dict(title=title, categories=categories, rating=rating, img=img, addr=addr, phone=phone, price=price, menu=menu,
           creditCards=creditCards, parking=parking, attire=attire, groups=groups, kids=kids, reservations=reservations, delivery=delivery, takeout=takeout,
           waiterService=waiterService, outdoor=outdoor, wifi=wifi, goodFor=goodFor, alcohol=alcohol, noise=noise, ambience=ambience, tv=tv, caters=caters,
           wheelchairAccessible=wheelchairAccessible)
        # dont know why we need to switch these
        parsed_address = usaddress.parse(addr)
        row.update({v: k for k,v in parsed_address})
        writer.writerow({k:v.encode('utf8') for k,v in row.items()})

    return extracted, True

def crawl(zipcode=None):
    page = 0
    flag = True
    some_zipcodes = [zipcode] if zipcode else get_zips()

    if zipcode is None:
        print '\n**We are attempting to extract all zipcodes in America!**'

    for zipcode in some_zipcodes:
        # print '\n===== Attempting extraction for zipcode <', zipcode, '>=====\n'
Example #59
0
def lookupAddress(address):
	"""Return the result of parsing ``address`` with usaddress."""
	return usaddress.parse(address)