/
location_parser.py
57 lines (40 loc) · 1.57 KB
/
location_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
from lxml import etree
import re
from parser import Parser
import utils
import db
class LocationParser(Parser):
    """Parse a location page and store the extracted fields via ``db``.

    The methods read ``self.id`` and ``self.root`` and call
    ``self.get_node`` / ``self.get_node_text``; none of these are defined
    in this class, so they are presumably supplied by ``Parser``
    (``Parser.parse`` is invoked before anything is read) -- TODO confirm
    against the base class.
    """

    # Matches a run of whitespace so it can be collapsed to a single space.
    _WHITESPACE_RUN = re.compile(r'\s+')

    def __init__(self):
        """Create the parser; no state is set up here.

        NOTE(review): ``Parser.__init__`` is not invoked -- confirm the
        base class needs no initialisation of its own.
        """
        pass

    @classmethod
    def _normalize_suburbs(cls, text):
        """Return *text* as one line with whitespace runs collapsed.

        The text is split on newlines; every line has its whitespace runs
        replaced by a single space and is stripped. Lines that end up
        empty are dropped and the rest are joined with single spaces.
        """
        pieces = []
        for line in text.split('\n'):
            collapsed = cls._WHITESPACE_RUN.sub(' ', line).strip()
            if collapsed:
                pieces.append(collapsed)
        return ' '.join(pieces)

    def parse_location_info(self):
        """Extract name, map id and suburbs from the page and save them.

        The collected values are handed to
        ``db.update_table_with_dict('location', 'id', self.id, ...)``
        together with ``parsed='T'``.

        Raises:
            Exception: if the page has no ``top_left`` heading, i.e. the
                location name cannot be found.
        """
        result = {}

        name = self.get_node_text(self.root, "//div[@class='top_left']/h1")
        # Single-argument form: emits the same text whether ``print`` is the
        # statement or the function.
        print('  %s' % name)
        if name is None:
            raise Exception('%s is not a valid location' % self.id)
        result['name'] = name.strip()

        # parse the line map
        map_id = self.parse_map()
        if map_id:
            result['map_id'] = map_id

        # parse suburbs
        suburbs = self.get_node_text(
            self.root, "//div[@class='suburbsInner']/p")
        if suburbs is None:
            # Report the anomaly but keep going: the field is simply left
            # out of the update instead of crashing on ``None.split``.
            print("empty suburbs, should not happen")
        else:
            result['suburbs'] = self._normalize_suburbs(suburbs)

        # set the parsed mark and update it to database
        result['parsed'] = 'T'
        db.update_table_with_dict('location', 'id', self.id, result)

    def parse_map(self):
        """Register the page's map link and return its id.

        Returns:
            ``None`` when the page has no ``mapInner`` link; otherwise the
            value returned by ``db.query`` for the ``map`` row whose
            ``link`` equals the link's ``href``.
        """
        map_node = self.get_node(self.root, "//div[@class='mapInner']/a")
        if map_node is None:
            return None

        # http://ptv.vic.gov.au/
        map_link = map_node.get('href')
        # presumably stores the link in table ``map`` if it is not there
        # yet -- verify against db.update_table
        db.update_table('map', 'link', map_link)
        return db.query("SELECT id FROM map WHERE link=?", (map_link,))

    def parse(self, id, html):
        """Run the base-class parse for *html*, then store location info.

        Args:
            id: identifier forwarded to ``Parser.parse`` (presumably kept
                as ``self.id`` -- TODO confirm).
            html: page content forwarded to ``Parser.parse``.
        """
        Parser.parse(self, id, html)
        self.parse_location_info()