def extract(self, response):
        """
        Takes the data out of the members entries.

        Parses every result container on the member-search results page
        into a DaytonChamberItem and returns them as a list.
        """

        sel = Selector(response)

        items = []

        containers = sel.xpath(
            '//div[@id="membersearchresults"]//div[@id="container"]')

        for container in containers:

            item = DaytonChamberItem()

            item['data_source_url'] = response.url
            item['retrieved_on'] = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")

            # Each result is laid out as label/value rows; collect them
            # into a dict keyed by the label text.
            rows = container.css('div.row')

            row_dict = {}

            for row in rows:
                key = row.css('div.leftcol').xpath('./text()').extract()

                if not key:
                    # No key, so don't bother looking for a value
                    continue

                key = key[0].strip()

                # Where the value lives depends on which field this row is.
                if key == 'Business Name:':
                    value = row.css(
                        'div.rightcol').xpath('./strong/text()').extract()
                elif key == 'Website:':
                    value = row.xpath('./a/ @href').extract()
                else:
                    value = row.css('div.rightcol').xpath('./text()').extract()

                if not value:
                    # No value, so don't bother storing
                    continue

                # Finally store the results in the dict
                row_dict[key] = value[0].strip()

            item['name'] = row_dict.get('Business Name:', None)
            item['category'] = row_dict.get('Business Category:', None)
            item['contact_name'] = row_dict.get('Contact Name:', None)
            item['contact_title'] = row_dict.get('Contact Title:', None)
            item['address'] = row_dict.get('Address:', None)
            item['website'] = row_dict.get('Website:', None)

            # Normalize phone numbers; fall back to the raw string for
            # numbers that phonenumbers cannot parse.
            p_original = row_dict.get('Phone Number:', None)
            try:
                p = phonenumbers.parse(p_original, 'US')
                item['phone'] = phonenumbers.normalize_digits_only(p)
            except Exception:
                # Non-standard phone, so just going to store the original
                item['phone'] = p_original

            items.append(item)

            # NOTE(review): removed a debug-leftover `break` here that
            # truncated the results to the first container (the sibling
            # copies of this method mark the same break "#TODO remove").

        return items
    def extract(self, response):
        """
        Takes the data out of the members entries.

        Walks every result container in the member-search results and
        returns a list of populated DaytonChamberItem objects.
        """

        sel = Selector(response)

        items = []

        containers = sel.xpath(
            '//div[@id="membersearchresults"]//div[@id="container"]')

        for container in containers:

            item = DaytonChamberItem()

            item['data_source_url'] = response.url
            item['retrieved_on'] = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")

            # Gather this result's label/value rows into a dict.
            row_dict = {}

            for row in container.css('div.row'):
                key = row.css('div.leftcol').xpath('./text()').extract()

                if not key:
                    # No key, so don't bother looking for a value
                    continue

                key = key[0].strip()

                # The value's location varies by field.
                if key == 'Business Name:':
                    value = row.css('div.rightcol').xpath(
                        './strong/text()').extract()
                elif key == 'Website:':
                    value = row.xpath('./a/ @href').extract()
                else:
                    value = row.css('div.rightcol').xpath('./text()').extract()

                if not value:
                    # No value, so don't bother storing
                    continue

                # Finally store the results in the dict
                row_dict[key] = value[0].strip()

            item['name'] = row_dict.get('Business Name:', None)
            item['category'] = row_dict.get('Business Category:', None)
            item['contact_name'] = row_dict.get('Contact Name:', None)
            item['contact_title'] = row_dict.get('Contact Title:', None)
            item['address'] = row_dict.get('Address:', None)
            item['website'] = row_dict.get('Website:', None)

            # Normalize phone numbers; keep the original text when parsing
            # fails.
            p_original = row_dict.get('Phone Number:', None)
            try:
                p = phonenumbers.parse(p_original, 'US')
                item['phone'] = phonenumbers.normalize_digits_only(p)
            except Exception:
                # Non-standard phone, so just going to store the original
                item['phone'] = p_original

            items.append(item)

            # NOTE(review): removed the "#TODO remove" debug prints and the
            # debug `break` (which limited output to one item); the Python 2
            # print statements were also syntax errors under Python 3.

        return items
    def extract(self, response):
        """
        Takes the data out of the pages at www.daytonlocal.com/listings/*

        Returns a list of DaytonlocalItem, one per vcard block on the page.
        """

        sel = Selector(response)

        # Page-level logo image (shared by the listings on this page).
        logo = sel.xpath('//*[@id="MainContentArea"]//div[contains(@class, "dright")]/a/img/ @src').extract()  # noqa

        items = []

        for card in sel.xpath('//div[contains(@class, "vcard")]'):
            # BUG FIX: create a fresh item per card.  The original built a
            # single item before the loop, so every entry in the returned
            # list aliased the same (last-populated) object.
            item = DaytonlocalItem()

            item['data_source_url'] = response.url
            item['retrieved_on'] = datetime.datetime.now().strftime(
                "%I:%M%p on %B %d, %Y")

            # BUG FIX: use './/' so lookups are relative to this card; a
            # bare '//' searches the whole document and returns the first
            # match on the page for every card.
            name = card.xpath(
                './/*[contains(@class, "fn")]//strong/text()').extract()
            item['name'] = name[0] if name else None

            website = card.xpath(
                './/*[contains(@class, "fn")]//a/ @href').extract()
            item['website'] = website[0] if website else None

            item['logo'] = urlparse.urljoin('http://www.daytonlocal.com',
                                            logo[0]) if logo else None

            address1 = card.xpath(
                './/span[contains(@class, "street-address")]/text()').extract()
            item['address1'] = address1[0] if address1 else None

            # This one's weird.. the text we want sits between two <br>
            # tags, so parse the fragment with lxml and take the tail text
            # of the first <br>.
            addr_div = card.css('.adr').extract()
            address2 = None
            if addr_div:
                br = lxml.html.fromstring(addr_div[0]).cssselect('br')
                if br:
                    address2 = br[0].tail
            item['address2'] = address2

            city = card.xpath(
                './/span[contains(@class, "locality")]/text()').extract()
            item['city'] = city[0] if city else None

            state = card.xpath(
                './/span[contains(@class, "region")]/text()').extract()
            item['state'] = state[0] if state else None

            zipcode = card.xpath(
                './/span[contains(@class, "postal-code")]/text()').extract()
            item['zip'] = zipcode[0] if zipcode else None

            # Direct-child divs with class "clearl": [0] holds the phone,
            # [1] the social links, [2] the description (when present).
            special_divs = card.xpath('div[contains(@class, "clearl")]')

            if special_divs:
                phone = special_divs[0].xpath('text()').extract()
                try:
                    p = phonenumbers.parse(phone[0], 'US')
                    p = phonenumbers.normalize_digits_only(p)
                    item['phone'] = p
                except Exception as e:
                    item['phone'] = None
                    print(e)

            if len(special_divs) >= 3:
                descr = special_divs[2].xpath('text()').extract()
                item['description'] = descr[0] if descr else None

            item['facebook'] = None
            item['twitter'] = None
            item['category'] = None

            # Social media links.
            # BUG FIX: guard the index so cards with fewer than two
            # "clearl" divs no longer raise IndexError.
            if len(special_divs) >= 2:
                hrefs = special_divs[1].xpath('a/ @href').extract()
                for href in hrefs:
                    if 'facebook' in href:
                        item['facebook'] = facebook_matcher.match(href).group(1)
                    elif 'twitter' in href:
                        item['twitter'] = twitter_matcher.match(href).group(1)
                    else:
                        match = category_matcher.match(href)
                        if match:
                            item['category'] = match.group(1).split('/')

            # Strip all strings (Python 2: iteritems/basestring).
            for k, v in item.iteritems():
                if isinstance(v, basestring):
                    item[k] = v.strip()

            items.append(item)

        return items
    def extract(self, response):
        """
        Takes the data out of the members entries.

        Builds one DaytonChamberItem per result container found in the
        member-search results markup and returns them all.
        """

        sel = Selector(response)

        items = []

        containers = sel.xpath('//div[@id="membersearchresults"]//div[@id="container"]')

        for container in containers:

            item = DaytonChamberItem()

            item["data_source_url"] = response.url
            item["retrieved_on"] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")

            # Collect the label/value rows of this result into a dict.
            row_dict = {}

            for row in container.css("div.row"):
                key = row.css("div.leftcol").xpath("./text()").extract()

                if not key:
                    # No key, so don't bother looking for a value
                    continue

                key = key[0].strip()

                # The value's position in the markup depends on the field.
                if key == "Business Name:":
                    value = row.css("div.rightcol").xpath("./strong/text()").extract()
                elif key == "Website:":
                    value = row.xpath("./a/ @href").extract()
                else:
                    value = row.css("div.rightcol").xpath("./text()").extract()

                if not value:
                    # No value, so don't bother storing
                    continue

                # Finally store the results in the dict
                row_dict[key] = value[0].strip()

            item["name"] = row_dict.get("Business Name:", None)
            item["category"] = row_dict.get("Business Category:", None)
            item["contact_name"] = row_dict.get("Contact Name:", None)
            item["contact_title"] = row_dict.get("Contact Title:", None)
            item["address"] = row_dict.get("Address:", None)
            item["website"] = row_dict.get("Website:", None)

            # Normalize phone numbers, keeping the raw text when parsing
            # fails.
            p_original = row_dict.get("Phone Number:", None)
            try:
                p = phonenumbers.parse(p_original, "US")
                item["phone"] = phonenumbers.normalize_digits_only(p)
            except Exception:
                # Non-standard phone, so just going to store the original
                item["phone"] = p_original

            items.append(item)

            # NOTE(review): removed the "# TODO remove" debug prints and
            # the debug `break` that cut extraction off after one item;
            # the Python 2 prints were also Python 3 syntax errors.

        return items
# Example #5 (votes: 0)
def rm_punct_leaving_plus_sign(num):
    """Return *num* reduced to digits, keeping its '+' country prefix.

    The country-code prefix matched by COUNTRY_RG is preserved verbatim;
    the remainder of the number is normalized to digits only, and the two
    parts are joined with a single space.
    """
    prefix, rest = COUNTRY_RG.match(num).groups()
    return '{} {}'.format(prefix, normalize_digits_only(rest))
# Example #6 (votes: 0)
    def extract(self, response):
        """
        Takes the data out of the pages at www.daytonlocal.com/listings/*
        """

        sel = Selector(response)

        logo = sel.xpath('//*[@id="MainContentArea"]//div[contains(@class, "dright")]/a/img/ @src').extract()

        item = DaytonlocalItem()

        items = []

        for card in sel.xpath('//div[contains(@class, "vcard")]'):

            item['data_source_url'] = response.url
            item['retrieved_on'] = datetime.datetime.now().strftime("%I:%M%p on %B %d, %Y")

            name = card.xpath('//*[contains(@class, "fn")]//strong/text()').extract()
            item['name'] = name[0] if name else None

            website = card.xpath('//*[contains(@class, "fn")]//a/ @href').extract()
            item['website'] = website[0] if website else None

            item['logo'] = urlparse.urljoin('http://www.daytonlocal.com', logo[0]) if logo else None

            address1 = card.xpath('//span[contains(@class, "street-address")]/text()').extract()
            item['address1'] = address1[0] if address1 else None

            # This ones weird..the text we want is between two <br> tags
            addr_div = card.css('.adr').extract()
            address2 = None
            if addr_div:
                br = lxml.html.fromstring(addr_div[0]).cssselect('br')
                if br:
                    address2 = br[0].tail
            item['address2'] = address2

            city = card.xpath('//span[contains(@class, "locality")]/text()').extract()
            item['city'] = city[0] if city else None

            state = card.xpath('//span[contains(@class, "region")]/text()').extract()
            item['state'] = state[0] if state else None

            zipcode = card.xpath('//span[contains(@class, "postal-code")]/text()').extract()
            item['zip'] = zipcode[0] if zipcode else None

            special_divs = card.xpath('div[contains(@class, "clearl")]')

            if special_divs:
                phone = special_divs[0].xpath('text()').extract()
                try:
                    p = phonenumbers.parse(phone[0], 'US')
                    p = phonenumbers.normalize_digits_only(p)
                    item['phone'] = p
                except Exception, e:
                    item['phone'] = None
                    print e

            if len(special_divs) >=3:
                descr = special_divs[2].xpath('text()').extract()
                item['description'] = descr[0] if descr else None

            item['facebook'] = None
            item['twitter'] = None
            item['category'] = None

            #social media links
            hrefs = special_divs[1].xpath('a/ @href').extract()
            for href in hrefs:
                if 'facebook' in href:
                    item['facebook'] = facebook_matcher.match(href).group(1)
                elif 'twitter' in href:
                    item['twitter'] = twitter_matcher.match(href).group(1)
                else:
                    match = category_matcher.match(href)
                    if match:
                        item['category'] = match.group(1).split('/')

            #Strip all strings
            for k, v in item.iteritems():
                if isinstance(v, basestring):
                    item[k] = v.strip()

            items.append(item)