Ejemplo n.º 1
0
def parse(handle):
    """Read store rows from a results page and create/update ``Dodger`` rows.

    ``handle`` is whatever ``BeautifulSoup.BeautifulSoup`` accepts as markup.
    Every ``<tr>`` whose class matches ``lightBlue`` inside the
    ``div.tableTopBorder`` container is treated as one store:

    * the text of the second cell is split on newlines into name, address
      lines and postcode (first line = name, last line = postcode);
    * the link in the last cell supplies ``lat`` and ``lon`` query
      parameters.

    The function uses ``company_id``, ``brand`` and ``brand_name`` without
    defining them, so they must be provided by an enclosing/module scope.

    Returns None.  A failure on one row is printed and the row is skipped;
    the remaining rows are still processed.
    """
    soup = BeautifulSoup.BeautifulSoup(handle)
    result_div = soup.find('div', {'class': 'tableTopBorder'})
    if result_div is None:
        # No results container on this page: nothing to import.
        return

    row_class = re.compile('lightBlue')
    for result in result_div.findAll('tr', {'class': row_class}):
        try:
            cells = result.findAll('td')

            # Keep only the text nodes of the cell, then split into lines.
            text = ''.join(
                e for e in cells[1].recursiveChildGenerator()
                if isinstance(e, unicode)
            )
            addresses = text.strip().split('\n')

            name = addresses.pop(0)
            postcode = addresses.pop(-1)

            # Whatever is left between name and postcode becomes
            # address1, address2, ...
            addresses_dict = dict(
                ("address%s" % i, address)
                for i, address in enumerate(addresses, 1)
            )

            url = cells[-1].a['href']
            # parse_qs expects a bare query string; feeding it the whole
            # href glues the path onto the first key.  Fall back to the raw
            # href when it has no "?" part so bare query strings still work.
            query = urlparse.urlparse(url).query or url
            parsed_url = urlparse.parse_qs(query)
            lat = parsed_url['lat']
            lon = parsed_url['lon']

            try:
                d = Dodger.objects.get(name=name, company=company_id, postcode=postcode)
            except Dodger.DoesNotExist:
                # Only a genuine "no such row" creates a new record; other
                # lookup errors fall through to the per-row handler below
                # instead of silently producing duplicates.
                d = Dodger(name=name, company=company_id, postcode=postcode)

            d.address1 = addresses_dict.get('address1')
            d.address2 = addresses_dict.get('address2')
            d.address3 = addresses_dict.get('address3')
            d.address4 = addresses_dict.get('address4')
            d.country = "United Kingdom"
            # NOTE(review): if Point is the GEOS Point, its arguments are
            # (x, y) i.e. (lon, lat).  Order kept as-is to stay consistent
            # with rows already stored - confirm before changing.
            d.location = Point(float(lat[0]), float(lon[0]))
            d.brand = brand
            d.save()
        except Exception as e:
            # Best effort: report the broken row and carry on.
            print("%s %s" % (brand_name, e))
Ejemplo n.º 2
0
def _parse_labelled_hours(str_time):
    """Convert a line such as ``"Sunday: 1000-1630"`` to ``["10:00", "16:30"]``.

    The text before the first colon is treated as the day label and
    discarded.  Dots are accepted as hour/minute separators.  Returns None
    when the line is empty, has no label separator, or has no ``-`` range.
    """
    if not str_time:
        return None

    # Split on the first colon only, so times that already contain colons
    # ("Sunday: 10:00-16:30") are not truncated.
    _label, sep, hours = str_time.partition(':')
    if not sep:
        return None

    open_close = hours.split('-')
    if len(open_close) < 2:
        return None

    normalised = []
    for value in open_close:
        value = value.replace('.', ':').strip()
        if ':' not in value:
            # Compact "HHMM" form: put the separator after the hour digits.
            value = value[:2] + ':' + value[2:]
        normalised.append(value)
    return normalised


def parse_details(handle):
    """Read a store-details page and update phone number and opening times.

    ``handle`` is whatever ``BeautifulSoup.BeautifulSoup`` accepts as markup.
    Each ``div`` whose class matches ``transBack`` inside
    ``#template9_middle_bottom_left`` is one store; its
    ``p.paddingTop10`` paragraphs are read by position:

    * index 1 - opening times (lines after the first two are taken as
      Monday..Sunday, in that order);
    * index 2 - phone number (text before ``" or "``);
    * index 3 - name, address lines and postcode, one per line.

    The matching ``Dodger`` is looked up by name, company and postcode
    (created when missing), its phone is saved, and its opening times are
    deleted and recreated.  ``company_id`` is not defined here and must come
    from an enclosing/module scope.

    Returns None.  A failure on one store or one day is printed and
    processing continues with the next one.
    """
    soup = BeautifulSoup.BeautifulSoup(handle)
    result_div = soup.find('div', {'id': 'template9_middle_bottom_left'})
    if result_div is None:
        # Container missing: nothing to import from this page.
        return

    block_class = re.compile('transBack')
    for result in result_div.findAll('div', {'class': block_class}):
        try:
            cells = result.findAll('p', {'class': 'paddingTop10'})

            # Keep only the text nodes of the paragraph, then split on lines.
            text = ''.join(
                e for e in cells[3].recursiveChildGenerator()
                if isinstance(e, unicode)
            )
            addresses = text.strip().split('\n')

            name = addresses.pop(0)
            postcode = addresses.pop(-1)

            phone = cells[2].contents[1].split(' or ')[0].strip()

            try:
                d = Dodger.objects.get(name=name, company=company_id, postcode=postcode)
            except Dodger.DoesNotExist:
                d = Dodger(name=name, company=company_id, postcode=postcode)
                print("unknown store")

            d.phone = phone
            d.save()

            # Day lines stay empty when the paragraph holds no timetable, so
            # the loop below simply creates nothing.
            day_lines = []
            opening_times = cells[1]
            if len(opening_times) > 3:
                times_text = ''.join(
                    e for e in opening_times.recursiveChildGenerator()
                    if isinstance(e, unicode)
                )
                # The first two lines are skipped; the rest are the days.
                day_lines = times_text.split('\n')[2:]

            # Delete all opening times for this Dodger before recreating them.
            d.opening_times.all().delete()

            # day_of_week: 0 = Monday ... 6 = Sunday.
            for day_of_week, line in enumerate(day_lines[:7]):
                open_close = None
                try:
                    open_close = _parse_labelled_hours(line)
                    if open_close:
                        d.opening_times.create(
                            day_of_week=day_of_week,
                            open_time=open_close[0],
                            close_time=open_close[1],
                        )
                except Exception as e:
                    # One bad day must not drop the remaining days.
                    print("error parsing opening times")
                    print(e)
                    print(open_close)
        except Exception as e:
            # One malformed store block must not abort the rest of the page.
            print("error parsing store details")
            print(e)
Ejemplo n.º 3
0
def _split_open_close(str_time):
    """Split ``"08:00-21:00"`` into ``["08:00", "21:00"]``.

    Dots are rewritten to colons.  Returns None when the value is empty or
    does not contain a ``-`` separated range.
    """
    if not str_time:
        return None
    open_close = str_time.split('-')
    if len(open_close) < 2:
        return None
    return [v.replace('.', ':') for v in open_close]


def scrape():
    """Download the store list of every listed brand and store it.

    For each brand the ``Brand`` row is fetched (or created), the JSONP
    store feed is downloaded, and every store in it is written to a
    ``Dodger`` row keyed by company, store id and brand.  The opening times
    of each store are deleted and recreated from the ``openingMon`` ..
    ``openingSun`` fields of the feed.

    Returns None.  Network and JSON errors propagate to the caller; a
    failure while storing one day's opening time is printed and skipped.
    """
    company_id = Dodger.C_ARCADIA

    # Same endpoint for every brand; only the brand id differs.
    url_template = (
        "http://cloudservices.arcadiagroup.co.uk/storestock/storestock"
        "?brand=%d&jsonp_callback=jsonp1292077523475"
        "&lat=51.461752&long=-0.114286&dist=50000&res=10000000"
        "&_=1292077544710"
    )
    user_agent = "Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"

    brands = (
        (12551, 'Burton'),
        (12552, 'Dorothy Perkins'),
        (12553, 'Evans'),
        (12554, 'Miss selfridge'),
        (12555, 'Topshop'),
        (12556, 'Topman'),
        (12557, 'Wallis'),
    )

    # Feed keys in day_of_week order: 0 = Monday ... 6 = Sunday.
    day_keys = (
        'openingMon', 'openingTue', 'openingWed', 'openingThu',
        'openingFri', 'openingSat', 'openingSun',
    )

    for brand_id, brand_name in brands:
        try:
            brand = Brand.objects.get(brand_id=brand_id)
        except Brand.DoesNotExist:
            brand = Brand(brand_id=brand_id, name=brand_name)
            brand.save()

        request = urllib2.Request(url_template % brand_id,
                                  headers={'User-Agent': user_agent})
        response = urllib2.urlopen(request)
        try:
            body = response.read()
        finally:
            # Always release the connection, even if the read fails.
            response.close()

        # Strip the JSONP wrapper "callback( ... )" without depending on the
        # length of the callback name.
        payload = body[body.index('(') + 1:body.rindex(')')]
        data = json.loads(payload)

        for store in data['stores']['store']:
            for k, v in store.items():
                # Only text values can be encoded; leave anything else alone.
                if isinstance(v, unicode):
                    store[k] = v.encode('utf8')

            try:
                d = Dodger.objects.get(company=company_id, doger_id=store['storeId'], brand=brand)
            except Dodger.DoesNotExist:
                d = Dodger()
            d.name = store['storeName']
            d.company = company_id
            d.brand = brand
            d.doger_id = store['storeId']
            d.address1 = store.get('address1')
            d.address2 = store.get('address2')
            d.address3 = store.get('address3')
            d.address4 = store.get('address4')
            d.postcode = store.get('postcode')
            d.phone = store.get('telephoneNumber')

            # A missing or unparsable coordinate just means "no location";
            # it should not abort the whole scrape.
            try:
                latitude = float(store['latitude'])
                longitude = float(store['longitude'])
            except (KeyError, TypeError, ValueError):
                latitude = longitude = 0.0
            if latitude and longitude:
                # NOTE(review): if Point is the GEOS Point, its arguments
                # are (x, y) i.e. (lon, lat).  Order kept as-is to match
                # rows already stored - confirm before changing.
                d.location = Point(latitude, longitude)
            d.country = store['country']
            d.save()

            # Delete all opening times for this Dodger before recreating them.
            d.opening_times.all().delete()

            for day_of_week, key in enumerate(day_keys):
                open_close = None
                try:
                    open_close = _split_open_close(store.get(key))
                    if open_close:
                        d.opening_times.create(
                            day_of_week=day_of_week,
                            open_time=open_close[0],
                            close_time=open_close[1],
                        )
                except Exception as e:
                    # One bad day must not drop the remaining days.
                    print(e)
                    print(open_close)