import datetime

import phonenumbers
from scrapy.selector import Selector

from ..items import DaytonChamberItem  # import path is an assumption; adjust to the project layout


def extract(self, response):
    """ Takes the data out of the members entries """
    sel = Selector(response)
    items = []
    containers = sel.xpath(
        '//div[@id="membersearchresults"]//div[@id="container"]')
    for container in containers:
        item = DaytonChamberItem()
        item['data_source_url'] = response.url
        item['retrieved_on'] = datetime.datetime.now().strftime(
            "%I:%M%p on %B %d, %Y")
        rows = container.css('div.row')
        row_dict = {}
        for row in rows:
            key = row.css('div.leftcol').xpath('./text()').extract()
            if len(key) == 0:
                # No key, so don't bother looking for a value
                continue
            key = key[0].strip()
            if key == 'Business Name:':
                value = row.css(
                    'div.rightcol').xpath('./strong/text()').extract()
            elif key == 'Website:':
                value = row.xpath('./a/@href').extract()
            else:
                value = row.css('div.rightcol').xpath('./text()').extract()
            if len(value) == 0:
                # No value, so don't bother storing
                continue
            value = value[0].strip()
            # Finally, store the results in the dict
            row_dict[key] = value
        item['name'] = row_dict.get('Business Name:')
        item['category'] = row_dict.get('Business Category:')
        item['contact_name'] = row_dict.get('Contact Name:')
        item['contact_title'] = row_dict.get('Contact Title:')
        item['address'] = row_dict.get('Address:')
        item['website'] = row_dict.get('Website:')
        # Normalize the phone number
        p_original = row_dict.get('Phone Number:')
        try:
            # parse() validates the number; normalize_digits_only()
            # expects the raw string, not the parsed PhoneNumber object
            phonenumbers.parse(p_original, 'US')
            item['phone'] = phonenumbers.normalize_digits_only(p_original)
        except Exception:
            # Non-standard phone number, so just store the original
            item['phone'] = p_original
        items.append(item)
        # Only the first matching container is processed; stop here
        break
    return items
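# For context, a minimal sketch of how the extract() method above could be
# wired into a Scrapy spider. The spider name and start URL are illustrative
# assumptions, not taken from the source; only the Field declarations are
# grounded in the keys extract() actually sets.
import scrapy


class DaytonChamberItem(scrapy.Item):
    data_source_url = scrapy.Field()
    retrieved_on = scrapy.Field()
    name = scrapy.Field()
    category = scrapy.Field()
    contact_name = scrapy.Field()
    contact_title = scrapy.Field()
    address = scrapy.Field()
    website = scrapy.Field()
    phone = scrapy.Field()


class DaytonChamberSpider(scrapy.Spider):
    name = 'dayton_chamber'
    # Hypothetical URL for illustration; the real one is not in the source
    start_urls = ['http://www.daytonchamber.org/membersearch']

    def parse(self, response):
        # extract() (defined above) would live on this class; parse()
        # simply yields whatever it pulls from the page
        for item in self.extract(response):
            yield item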
import datetime
from urllib.parse import urljoin

import lxml.html
import phonenumbers
from scrapy.selector import Selector

from ..items import DaytonlocalItem  # import path is an assumption; adjust to the project layout


def extract(self, response):
    """ Takes the data out of the pages at www.daytonlocal.com/listings/* """
    sel = Selector(response)
    logo = sel.xpath(
        '//*[@id="MainContentArea"]'
        '//div[contains(@class, "dright")]/a/img/@src').extract()
    items = []
    for card in sel.xpath('//div[contains(@class, "vcard")]'):
        # Create a fresh item per card; reusing one shared item would
        # leave every appended entry pointing at the same object
        item = DaytonlocalItem()
        item['data_source_url'] = response.url
        item['retrieved_on'] = datetime.datetime.now().strftime(
            "%I:%M%p on %B %d, %Y")

        # Use './/' so queries stay relative to the current card instead
        # of searching the whole page
        name = card.xpath(
            './/*[contains(@class, "fn")]//strong/text()').extract()
        item['name'] = name[0] if name else None

        website = card.xpath(
            './/*[contains(@class, "fn")]//a/@href').extract()
        item['website'] = website[0] if website else None

        item['logo'] = urljoin(
            'http://www.daytonlocal.com', logo[0]) if logo else None

        address1 = card.xpath(
            './/span[contains(@class, "street-address")]/text()').extract()
        item['address1'] = address1[0] if address1 else None

        # This one's weird: the text we want sits between two <br> tags,
        # so grab the tail text of the first <br> with lxml
        addr_div = card.css('.adr').extract()
        address2 = None
        if addr_div:
            br = lxml.html.fromstring(addr_div[0]).cssselect('br')
            if br:
                address2 = br[0].tail
        item['address2'] = address2

        city = card.xpath(
            './/span[contains(@class, "locality")]/text()').extract()
        item['city'] = city[0] if city else None

        state = card.xpath(
            './/span[contains(@class, "region")]/text()').extract()
        item['state'] = state[0] if state else None

        zipcode = card.xpath(
            './/span[contains(@class, "postal-code")]/text()').extract()
        item['zip'] = zipcode[0] if zipcode else None

        special_divs = card.xpath('div[contains(@class, "clearl")]')
        if special_divs:
            phone = special_divs[0].xpath('text()').extract()
            try:
                # parse() validates the number; normalize_digits_only()
                # expects the raw string, not the parsed PhoneNumber object
                phonenumbers.parse(phone[0], 'US')
                item['phone'] = phonenumbers.normalize_digits_only(phone[0])
            except Exception as e:
                item['phone'] = None
                print(e)
        if len(special_divs) >= 3:
            descr = special_divs[2].xpath('text()').extract()
            item['description'] = descr[0] if descr else None
            item['facebook'] = None
            item['twitter'] = None
            item['category'] = None
            # Social media links
            hrefs = special_divs[1].xpath('a/@href').extract()
            for href in hrefs:
                if 'facebook' in href:
                    item['facebook'] = facebook_matcher.match(href).group(1)
                elif 'twitter' in href:
                    item['twitter'] = twitter_matcher.match(href).group(1)
                else:
                    match = category_matcher.match(href)
                    if match:
                        item['category'] = match.group(1).split('/')

        # Strip all strings
        for k, v in item.items():
            if isinstance(v, str):
                item[k] = v.strip()
        items.append(item)
    return items
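# The listings extractor above depends on three module-level compiled
# regexes that are not shown. The patterns below are plausible sketches
# inferred from how each .group(1) is used (a Facebook/Twitter username,
# and a slash-separated category path); they are assumptions, not the
# original definitions.
import re

facebook_matcher = re.compile(r'https?://(?:www\.)?facebook\.com/([^/?#]+)')
twitter_matcher = re.compile(r'https?://(?:www\.)?twitter\.com/([^/?#]+)')
# Hypothetical shape for the site's category links
category_matcher = re.compile(r'.*?/listings/(.+?)/?$')

# Under these assumed patterns,
# category_matcher.match('/listings/food/restaurants').group(1).split('/')
# yields ['food', 'restaurants'].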
from phonenumbers import normalize_digits_only


def rm_punct_leaving_plus_sign(num):
    """Strip punctuation from a phone number, keeping the '+' country prefix."""
    # COUNTRY_RG (defined elsewhere) is expected to split an optional
    # '+<country code>' prefix from the rest of the number
    prefix, num = COUNTRY_RG.match(num).groups()
    num = normalize_digits_only(num)
    return '{} {}'.format(prefix, num)
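# rm_punct_leaving_plus_sign() relies on a COUNTRY_RG regex defined
# elsewhere. A plausible definition, assumed rather than taken from the
# source: group 1 captures an optional '+<country code>' prefix (matching
# the empty string when absent, so .format() never sees None), and group 2
# captures the rest of the number.
import re

COUNTRY_RG = re.compile(r'^(\+\d+|)[\s.-]*(.*)$')

# Example under this assumption:
#   rm_punct_leaving_plus_sign('+1 (937) 555-0100')  -> '+1 9375550100'
# Numbers with no prefix come back with a leading space ('' + ' ' + digits),
# which suggests the real COUNTRY_RG may handle that case differently.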