def parse_location_info(self):
    """Parse the location page in ``self.root`` and save it to the database.

    Collects the location name, the line-map id (from ``self.parse_map()``)
    and the suburbs paragraph, sets the ``parsed`` flag to ``'T'`` and passes
    the collected fields to ``db.update_table_with_dict`` for the
    ``location`` row whose ``id`` is ``self.id``.

    Raises:
        Exception: if the page has no name heading, i.e. it is not a valid
            location page.
    """
    result = {}

    name = self.get_node_text(self.root, "//div[@class='top_left']/h1")
    # A single pre-formatted argument makes print(...) emit the same text
    # under both Python 2 and Python 3.
    print('  %s' % name)
    if name is None:
        raise Exception('%s is not a valid location' % self.id)
    result['name'] = name.strip()

    # parse the line map
    map_id = self.parse_map()
    if map_id:
        result['map_id'] = map_id

    # parse suburbs
    suburbs = self.get_node_text(self.root, "//div[@class='suburbsInner']/p")
    if suburbs is None:
        # Unexpected but not fatal: report it and leave 'suburbs' out of the
        # update instead of failing on None.split().
        print("empty suburbs, should not happen")
    else:
        # Collapse whitespace runs inside every line, drop lines that hold
        # no visible character, and join the rest with single spaces.
        result['suburbs'] = ' '.join(
            re.sub(r'\s+', ' ', line).strip()
            for line in suburbs.split('\n')
            if re.search(r'\S', line)
        )

    # set the parsed mark and update it to database
    result['parsed'] = 'T'
    db.update_table_with_dict('location', 'id', self.id, result)
def parse_route_info(self):
    """Parse the route page in ``self.root`` and save it to the database.

    Collects the route title, the values returned by
    ``self.parse_line_info`` for the existing ``route`` row (when one
    exists), the line-map id (from ``self.parse_map()``) and the route
    description text, sets the ``parsed`` flag to ``'T'`` and passes the
    collected fields to ``db.update_table_with_dict`` for the ``route`` row
    whose ``id`` is ``self.id``.

    Raises:
        Exception: if the page has no title heading, i.e. it is not a valid
            route page.
    """
    result = {}

    title = self.get_node_text(self.root, "//h1[@class='WheelChairAccess']")
    # A single pre-formatted argument makes print(...) emit the same text
    # under both Python 2 and Python 3.
    print('  %s' % title)
    if title is None:
        raise Exception('%s is not a valid route' % self.id)
    title = re.sub(r'\s+', ' ', title).strip()
    result['title'] = title

    # The stored row supplies the route type needed to interpret the title.
    db.cur.execute("SELECT * from route WHERE id=?", (self.id,))
    row = db.cur.fetchone()
    if row:
        result.update(self.parse_line_info(row['type'], title))

    # parse the line map
    map_id = self.parse_map()
    if map_id:
        result['map_id'] = map_id

    # parse description
    desc_node = self.get_node(self.root, "//div[@class='routeDescriptionInner']")
    # NOTE(review): get_node presumably returns None when nothing matches;
    # in that case an empty description is stored - confirm.
    fragments = desc_node.itertext() if desc_node is not None else ()
    # Gather every text fragment below the node, one per line. Adding the
    # iterator itself to a string would raise TypeError and lose the text.
    text = '\n'.join(fragments)
    # Collapse whitespace runs inside every line and drop lines that hold
    # no visible character.
    result['desc'] = '\n'.join(
        re.sub(r'\s+', ' ', line)
        for line in text.split('\n')
        if re.search(r'\S', line)
    )

    # set the parsed mark and update it to database
    result['parsed'] = 'T'
    db.update_table_with_dict('route', 'id', self.id, result)
def parse_stop_info(self):
    """Parse the stop page in ``self.root`` and save it to the database.

    Reads the stop title, address, owning location, ticket list, waiting
    and bicycle facilities, coordinates and a set of facility cells that
    are addressed by their position in the ``td`` node list. The owning
    location's name is written with ``db.update_table`` and the stop fields
    with ``db.update_table_with_dict`` for the ``stop`` row whose ``id`` is
    ``self.id``.

    Returns:
        dict: the parsed fields, as written to the database.

    Raises:
        Exception: if the title is missing or the location id/name is an
            empty string. Errors raised while reading a positional facility
            cell are reported on stdout and then re-raised.
    """
    result = {}

    # get title; validate it before touching the table so a page without a
    # title fails with the explicit error below
    title = self.get_node_text(self.root, "//h1[@class='fn org']")
    if title is None:
        raise Exception('not a valid stop, title is empty')
    try:
        # keep only the part in front of the last '-'
        result['name'] = title[:title.rindex('-')].strip()
    except ValueError:
        # no '-' in the title: store it unchanged
        result['name'] = title

    table_node = self.get_node(self.root, "//table[@id='stopInfo']")
    # NOTE(review): "//td" is an absolute path; with lxml it is presumably
    # evaluated against the whole document, and the positional indexes used
    # below rely on the resulting order - confirm before changing it.
    td_nodes = table_node.xpath("//td")

    result['address_street'] = self.get_node_text(td_nodes[0], "//span[@class='street-address']")
    result['address_locality'] = self.get_node_text(td_nodes[0], "//span[@class='locality']")
    result['address_postalcode'] = self.get_node_text(td_nodes[0], "//span[@class='postal-code']")

    # The location link supplies both the display name and, as the last
    # segment of its href, the location id.
    location_name = self.get_node_text(td_nodes[1], "a")
    location_id = self.get_node(td_nodes[1], "a").get('href').split('/')[-1]
    if location_id != '' and location_name != '':
        db.update_table('location', 'id', location_id, name=location_name)
        result['location_id'] = location_id
    else:
        raise Exception('not a valid stop, location is empty,')

    # An <li> without leading text has text None; treat it as empty so the
    # join cannot fail.
    result['tickets'] = ','.join([n.text or '' for n in td_nodes[3].xpath("ul/li")])

    waiting_dd = td_nodes[20].xpath("dl/dd")
    result['waiting_indoor'] = self.get_node_text(waiting_dd[0])
    result['waiting_sheltered'] = self.get_node_text(waiting_dd[1])

    bicycle_dd = td_nodes[22].xpath("dl/dd")
    result['bicycle_racks'] = self.get_node_text(bicycle_dd[0])
    result['bicycle_lockers'] = self.get_node_text(bicycle_dd[1])
    result['bicycle_cage'] = self.get_node_text(bicycle_dd[2])

    result['geo_latitude'] = self.get_node_text(self.root, "//span[@class='latitude']")
    result['geo_longitude'] = self.get_node_text(self.root, "//span[@class='longitude']")

    # field name -> position of its cell in td_nodes
    fields_index = {
        'phone_lostproperty': 2,
        'phone_feedback': 4,
        'staff_available': 5,
        'phone_station': 6,
        'accessible': 7,
        'metcard_ticket_machine': 8,
        'myki_machine': 9,
        'myki_checks': 10,
        'vline_booking': 11,
        'seating': 12,
        'lighting': 13,
        'stairs': 14,
        'escalator': 15,
        'lifts': 16,
        'lockers': 17,
        'public_phone': 18,
        'public_toilet': 19,
        'car_parking': 21,
        'taxi_rank': 23,
        'tactile_paths': 24,
        'hearing_loop': 25,
    }
    for field, position in fields_index.items():
        try:
            result[field] = self.get_node_text(td_nodes[position])
        except Exception:
            # Say which cell failed, then let the original error propagate.
            print("Parse stop info error on %s, order %d" % (field, position))
            raise

    result['parsed'] = 'T'
    db.update_table_with_dict('stop', 'id', self.id, result)
    return result