def main(): table = Datasheet() tel = '' street = '' locality = '' title = '' for i in range(3): page = i+1 url = URL("http://torino.paginegialle.it/pgol/4-veterinari/3-torino/p-%s?mr=50" % page) print "collecting from %s" % url connection = url.open() doc = Document( connection.read() ) items = doc.by_class('item_sx') row = [] for j, item in enumerate(items): divs = item.by_class('address') try: title = item.by_class('item_head')[0].by_tag('a')[0].content except IndexError, e: print >> sys.stderr, "%s" % j, e pass for z, div in enumerate(divs): if div != None: try: street = div.by_class('street-address')[0].content locality = div.by_class('locality')[0].content tel = div.by_class('tel')[0].by_class('value')[0].content except IndexError, e: print >> sys.stderr, "%s" % z, e pass save = "%s, %s %s, %s \n" % ( plaintext(title), plaintext(street).replace(",", ""), plaintext(locality).replace('(TO)', ''), plaintext(tel).replace(",", "") ) print >> sys.stderr, save row.append(save)
def main():
    """Scrape the list of Turin hospitals from comuniecitta.it and save it.

    Downloads the page, walks every element of class ``ulamm`` except the
    first one, and for each ``<li>`` inside it builds an ``"entry, kind"``
    line, where ``kind`` is the plain text of the node that precedes the
    list (presumably its heading -- verify against the page markup).
    Newlines inside an entry are turned into commas.

    All lines are appended to the datasheet as a single row and written to
    ``files/h_torino.txt``.

    NOTE(review): ``table.append(row)`` stores every line as one column of
    a single row; kept as in the original, confirm this is the intended
    output layout.
    """
    table = Datasheet()
    url = URL("http://www.comuniecitta.it/torino/elenco-ospedali-di-torino.html")

    connection = url.open()
    try:
        doc = Document(connection.read())
    finally:
        # Release the HTTP connection as soon as the body has been read.
        connection.close()

    row = []
    # The first 'ulamm' element is deliberately skipped.
    for ul in doc.by_class('ulamm')[1:]:
        entries = ul.by_tag('li')
        kind = plaintext(ul.previous.content)
        for el in entries:
            if el is None:
                continue
            row.append("%s, %s \n" % (
                plaintext(el.content).replace('\n', ','),
                kind,
            ))

    table.append(row)
    table.save("files/h_torino.txt")