Ejemplo n.º 1
0
 def _cleanlist(self, listvids):
     """Normalize scraped video dicts into a clean result list.

     For each dict in *listvids*: HTML-unescape the ``url``, ``thumb``
     and ``label`` fields, resolve relative URLs against the
     scheme+netloc of ``self.url``, and keep only entries whose
     thumbnail ends in a known image extension.

     Returns a new list of dicts with keys ``url``/``thumb``/``label``;
     the input dicts are not modified.
     """
     resultlist = []
     # Hoist loop-invariant work: one entity parser and one base URL
     # (the originals rebuilt both on every iteration).
     parser = HTMLParser()
     upr = urlparse.urlparse(self.url)
     vbase = upr.scheme + '://' + upr.netloc + '/'
     for vid in listvids:
         assert isinstance(vid, dict)
         # NOTE(review): the former vid.setdefault(vid.keys()[0]) /
         # newvid.setdefault(...) calls were no-ops (the key always
         # existed) and crashed on an empty dict, so they were dropped.
         url = parser.unescape(vid.get('url'))
         thumb = parser.unescape(vid.get('thumb'))
         label = parser.unescape(vid.get('label'))
         if not url.startswith('http'):
             url = urlparse.urlparse(vbase + url.lstrip('/')).geturl()
         if not thumb.startswith('http'):
             thumb = urlparse.urlparse(vbase + thumb.lstrip('/')).geturl()
         # endswith() accepts a tuple: one call instead of an or-chain.
         if thumb.endswith(('.jpg', '.png', '.jpeg')):
             resultlist.append(dict(url=url, thumb=thumb, label=label))
     return resultlist
Ejemplo n.º 2
0
def clean_html(value):
    """Clean an HTML snippet so appy.pod can render it.

    Wraps bare content in ``<p>...</p>``, normalizes it through
    BeautifulSoup, unescapes special entities (like &#xa0;) and runs
    the result through the lxml Cleaner, stripping the ``<div>``
    wrapper that Cleaner adds.  Falsy input is returned unchanged.
    """
    # BUG FIX: the original guard was `if clean_html and value:` --
    # `clean_html` is this very function, which is always truthy, so
    # the condition reduces to `if value:`.
    if value:
        # we need a surrounding <p></p> or the content is not generated by appy.pod
        if not value.startswith(u'<p>') or not value.endswith(u'</p>'):
            value = u'<p>%s</p>' % value
        soup = BeautifulSoup(safe_unicode(value))
        soup_contents = soup.renderContents()
        # renderContents may hand back bytes; force unicode before parsing
        if not isinstance(soup_contents, unicode):
            soup_contents = safe_unicode(soup_contents)
        # clean HTML with HTMLParser, it will remove special entities like &#xa0;
        soup_contents = HTMLParser().unescape(soup_contents)
        # clean HTML with lxml Cleaner
        cleaner = Cleaner()
        soup_contents = cleaner.clean_html(soup_contents)
        # clean_html surrounds the cleaned HTML with <div>...</div>... removes it!
        if soup_contents.startswith(u'<div>') and soup_contents.endswith(
                u'</div>'):
            soup_contents = soup_contents[5:-6]
        # (the former `if not soup_contents == value` guard was an
        # unconditional assignment in disguise)
        value = soup_contents
    return value
Ejemplo n.º 3
0
def populate_restaurants(c):
    print 'Populating Restaurants table...'

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        print >> sys.stderr, "Error: cannot access raw data directory 'restaurants'"
        sys.exit(1)

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        print >> sys.stderr, "Error: cannot access raw data file 'suburbs.txt'"
        sys.exit(1)

    #get postcodes from file and cache in dict
    suburbs = open('suburbs.txt').readlines()
    postcodes = {}
    for suburb in suburbs:
        lat, lng, pst, sub = suburb.strip().split('\t')
        postcodes[sub] = pst
    postcodes['CBD'] = 2000  #special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()
    num_users = c.execute('SELECT COUNT(*) FROM Users').fetchone()[0]

    i = 0
    for restaurant in glob.glob('restaurants/*'):
        r = open(restaurant).readlines()

        #extract info from file
        try:
            name = r[0].strip()
            name = HTMLParser().unescape(name)
            address = r[1].strip()
            address = HTMLParser().unescape(address)
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = HTMLParser().unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = HTMLParser().unescape(hours)
            cuisine = r[4].strip()
            cuisine = HTMLParser().unescape(cuisine)
            cost = r[5].strip()
            image = r[6].strip()
        except:
            print >> sys.stderr, "Error: skipping '%s'" % restaurant
            continue

        #lookup postcode using suburb
        postcode = ''
        if not suburb in postcodes:
            continue
        else:
            postcode = postcodes[suburb]

        #and append it to the address
        address = address + ' ' + str(postcode)

        #chose a random protocol for the website
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        #make site of the form protocol://www.lowercase.name.of.restaurant.fake.com
        website = name.replace('  ', ' ').replace(' ', '.').replace(
            '-', '').strip() + '.fake.com'
        website = HTMLParser().unescape(website)
        website = urllib.quote(website)  #encode as url
        website = protocol + 'www.' + website  #avoid encoding the protocol
        website = website.lower().replace('..', '.')

        #ensure only some restaurants have owners
        owner = None
        if random.randint(0, 3) == 0:
            owner = users[random.randint(0, num_users - 1)][0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants
				(id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)