def _cleanlist(self, listvids):
    """Normalize a list of scraped video dicts.

    For each dict: unescape HTML entities in 'url', 'thumb' and 'label',
    absolutize site-relative URLs against ``self.url``, and keep only
    entries whose thumbnail ends in a known image extension.

    :param listvids: list of dicts, each with 'url', 'thumb', 'label' keys
    :return: list of cleaned dicts with the same three keys
    """
    # The base URL depends only on self.url — hoist it out of the loop
    # instead of re-parsing it once per video.
    upr = urlparse.urlparse(self.url)
    vbase = upr.scheme + '://' + upr.netloc + '/'
    resultlist = []
    for vid in listvids:
        assert isinstance(vid, dict)
        # Removed the dead `vid.setdefault(vid.keys()[0])` /
        # `newvid.setdefault(newvid.keys()[0])` calls: the key already
        # exists, so setdefault was a guaranteed no-op.
        url = HTMLParser().unescape(vid.get('url'))
        thumb = HTMLParser().unescape(vid.get('thumb'))
        label = HTMLParser().unescape(vid.get('label'))
        # Turn site-relative links into absolute ones.
        if not url.startswith('http'):
            url = urlparse.urlparse(vbase + url.lstrip('/')).geturl()
        if not thumb.startswith('http'):
            thumb = urlparse.urlparse(vbase + thumb.lstrip('/')).geturl()
        # Only keep entries whose thumbnail looks like an image file.
        if thumb.endswith(('.jpg', '.png', '.jpeg')):
            resultlist.append(dict(url=url, thumb=thumb, label=label))
    return resultlist
def clean_html(value):
    """Clean an HTML fragment so appy.pod can render it.

    Wraps bare fragments in ``<p>...</p>``, normalizes the markup with
    BeautifulSoup, resolves special entities (like ``&nbsp;``) with
    HTMLParser, sanitizes with the lxml ``Cleaner``, and strips the
    ``<div>`` wrapper that ``Cleaner.clean_html`` adds.

    :param value: HTML string; may be empty or None
    :return: cleaned HTML string, or *value* unchanged when falsy
    """
    # NOTE(review): the original guard was `if clean_html and value:` —
    # `clean_html` names this very function, which is always truthy, so
    # only the `value` test ever mattered. Replaced with an early return.
    if not value:
        return value
    # we need a surrounding <p></p> or the content is not generated by appy.pod
    if not value.startswith(u'<p>') or not value.endswith(u'</p>'):
        value = u'<p>%s</p>' % value
    soup = BeautifulSoup(safe_unicode(value))
    soup_contents = soup.renderContents()
    if not isinstance(soup_contents, unicode):
        soup_contents = safe_unicode(soup_contents)
    # clean HTML with HTMLParser, it will remove special entities like &nbsp;
    soup_contents = HTMLParser().unescape(soup_contents)
    # clean HTML with lxml Cleaner
    cleaner = Cleaner()
    soup_contents = cleaner.clean_html(soup_contents)
    # clean_html surrounds the cleaned HTML with <div>...</div>... removes it!
    if soup_contents.startswith(u'<div>') and soup_contents.endswith(u'</div>'):
        soup_contents = soup_contents[5:-6]
    if soup_contents != value:
        value = soup_contents
    return value
def populate_restaurants(c): print 'Populating Restaurants table...' if not (os.access('restaurants', os.R_OK) and os.path.isdir('restaurants')): print >> sys.stderr, "Error: cannot access raw data directory 'restaurants'" sys.exit(1) if not (os.access('suburbs.txt', os.R_OK) and os.path.isfile('suburbs.txt')): print >> sys.stderr, "Error: cannot access raw data file 'suburbs.txt'" sys.exit(1) #get postcodes from file and cache in dict suburbs = open('suburbs.txt').readlines() postcodes = {} for suburb in suburbs: lat, lng, pst, sub = suburb.strip().split('\t') postcodes[sub] = pst postcodes['CBD'] = 2000 #special case not in data file users = c.execute('SELECT username FROM Users').fetchall() num_users = c.execute('SELECT COUNT(*) FROM Users').fetchone()[0] i = 0 for restaurant in glob.glob('restaurants/*'): r = open(restaurant).readlines() #extract info from file try: name = r[0].strip() name = HTMLParser().unescape(name) address = r[1].strip() address = HTMLParser().unescape(address) address = re.sub(r'nsw', 'NSW', address, flags=re.I) if not address.endswith(', NSW'): address = address + ', NSW' suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1) suburb = HTMLParser().unescape(suburb) phone = r[2].strip().replace('(', '').replace(')', '') if re.match('Not available', phone): phone = 'Not provided' hours = r[3].strip() hours = re.sub(r'\s*,\s*', ', ', hours) hours = HTMLParser().unescape(hours) cuisine = r[4].strip() cuisine = HTMLParser().unescape(cuisine) cost = r[5].strip() image = r[6].strip() except: print >> sys.stderr, "Error: skipping '%s'" % restaurant continue #lookup postcode using suburb postcode = '' if not suburb in postcodes: continue else: postcode = postcodes[suburb] #and append it to the address address = address + ' ' + str(postcode) #chose a random protocol for the website protocol = 'http://' if random.randint(0, 1) == 1: protocol = 'https://' #make site of the form protocol://www.lowercase.name.of.restaurant.fake.com website = 
name.replace(' ', ' ').replace(' ', '.').replace( '-', '').strip() + '.fake.com' website = HTMLParser().unescape(website) website = urllib.quote(website) #encode as url website = protocol + 'www.' + website #avoid encoding the protocol website = website.lower().replace('..', '.') #ensure only some restaurants have owners owner = None if random.randint(0, 3) == 0: owner = users[random.randint(0, num_users - 1)][0] i += 1 data = (i, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image) c.execute( '''INSERT INTO Restaurants (id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)