def endElement(self, name):
    """Handle the end of an element and clear the current tag.

    When the tag being tracked is ``title`` the collected text is stored as
    the page title.  When it is ``text`` and the stored title starts with
    ``File:``, the text is handed to ``wikiparser.parse``; if the result has
    both ``lat`` and ``lon`` a row is inserted into ``wpc_img`` through the
    ``cc`` cursor and ``self.count`` is incremented.

    ``name`` is only used for the debug trace.
    """
    if self.debug > 4:
        print("END TAG %s" % name)

    current = self.tag
    if current == 'title':
        self.page['title'] = self.text

    if current == 'text' and re.search('^File:', self.page['title']):
        title = self.page['title']
        meta = wikiparser.parse(self.text)
        if meta.has_key("lat") and meta.has_key("lon"):
            if not meta.has_key("desc"):
                # No description: use the title minus the "File:" prefix
                # and the trailing ".extension".
                bare_name = title.replace('File:', '', 1)
                meta["desc"] = re.sub(u'\\.[^\\.]+$', '', bare_name)

            # take only first paragraph from description
            first_par = re.search(u'<p>(.+?)</p>', meta["desc"], re.M | re.S)
            if first_par:
                meta["desc"] = first_par.group(1)

            print("%d>%s: [%s,%s] [%s]" % (
                self.count, title, meta['lat'], meta['lon'], meta['desc']))

            # FIXME validate lat and lon as float
            try:
                values = (
                    sqlesc(title),
                    sqlesc(meta['desc']),
                    float(meta['lat']),
                    float(meta['lon']),
                )
                q = u"INSERT INTO wpc_img (page, \"desc\", point) VALUES(%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326))" % values
                cc.execute(q)
            except ValueError:
                # float() rejected a coordinate; report it and carry on.
                print("ValueError: meta=%s" % repr(meta))

            # NOTE(review): placed at this level when re-indenting the
            # whitespace-collapsed source -- confirm against history.
            self.count += 1

    self.tag = None
def process(page):
    """Fetch the raw wikitext of one Commons page and sync it to the database.

    The page is downloaded with ``action=raw`` and passed to
    ``wikiparser.parse``.  If the result has both ``lat`` and ``lon`` the
    ``wpc_img`` row for ``page`` is updated (or inserted when the preceding
    SELECT found none) and a row is added to ``wpc_done``.  Otherwise any
    existing ``wpc_img`` row for ``page`` is deleted.

    All statements bind their values as query parameters, matching the
    SELECT/DELETE calls, instead of interpolating escaped text into the SQL.
    Coordinates are therefore sent as floats rather than as six-decimal
    ``%lf`` text.

    On ``urllib2.HTTPError`` the error is printed and the process exits.
    A ``ValueError`` raised while converting the coordinates or executing
    the statements is printed and swallowed.
    """
    from urllib import quote_plus
    import urllib2

    url = ("http://commons.wikimedia.org/w/index.php?title="
           + quote_plus(page) + "&action=raw")
    try:
        p = http_get(url).read()
    except urllib2.HTTPError:
        print("HTTP Error [%s]" % url)
        # NOTE(review): exits with status 0 although the fetch failed --
        # kept as-is because callers may rely on it; confirm intent.
        sys.exit(0)

    meta = wikiparser.parse(p)
    if not ("lat" in meta and "lon" in meta):
        # No coordinates: make sure no row for this page remains.
        cu.execute("DELETE FROM wpc_img WHERE page=%s", (page,))
        return

    if "desc" not in meta:
        # No description: use the page name minus the "File:" prefix and
        # the trailing ".extension".
        meta["desc"] = re.sub(u'\\.[^\\.]+$', '', page.replace('File:', '', 1))

    # take only first paragraph from description
    m = re.compile(u'<p>(.+?)</p>', re.M | re.S).search(meta["desc"])
    if m:
        meta["desc"] = m.group(1)

    print(">%s: [%s,%s] [%s]" % (page, meta['lat'], meta['lon'], meta['desc']))

    cu.execute("SELECT * FROM wpc_img WHERE page=%s", (page,))
    try:
        # Convert before issuing any write so a malformed coordinate
        # leaves the database untouched.
        lat = float(meta['lat'])
        lon = float(meta['lon'])
        if cu.rowcount > 0:
            cu.execute(
                u"UPDATE wpc_img SET \"desc\"=%s, "
                u"point=ST_SetSRID(ST_MakePoint(%s,%s),4326), date=NOW() "
                u"WHERE page=%s",
                (meta['desc'], lat, lon, page))
        else:
            cu.execute(
                u"INSERT INTO wpc_img (page, \"desc\", point, date) "
                u"VALUES(%s,%s,ST_SetSRID(ST_MakePoint(%s,%s),4326),NOW())",
                (page, meta['desc'], lat, lon))
        cu.execute(
            u"INSERT INTO wpc_done (page, \"desc\", lat, lon, done) "
            u"VALUES (%s,%s,%s,%s,NOW())",
            (page, meta['desc'], lat, lon))
    except ValueError:
        print("ValueError: meta=%s" % repr(meta))
def endElement(self, name):
    """Handle the end of an element and clear the current tag.

    Depending on ``self.tag``:

    * ``title``: the collected text becomes ``self.page["title"]``.
    * ``text``: for pages whose title starts with ``File:``, the text is
      handed to ``wikiparser.parse``.  A result containing ``error`` is
      reported and skipped.  A result containing ``lat`` and ``lon`` is
      stored by calling the SQL function ``wpc_upsert`` through the ``cc``
      cursor, and ``self.count`` is incremented.
    * ``timestamp``: a ``YYYY-MM-DDThh:mm:ssZ`` value is rewritten as
      ``YYYY-MM-DD hh:mm:ss+00`` and kept in ``self.page["timestamp"]``.

    ``self.tag`` is reset to ``None`` on every non-raising path, including
    the parse-error path (which previously returned early and left the
    stale ``text`` tag in place).

    ``name`` is only used for the debug trace.

    Raises:
        BaseException: if the timestamp text does not match the expected
            format.
    """
    if self.debug > 4:
        print("END TAG %s" % name)

    if self.tag == "title":
        self.page["title"] = self.text
    elif self.tag == "text":
        if re.search("^File:", self.page["title"]):
            meta = wikiparser.parse(self.text)
            if "error" in meta:
                # Report and fall through to the tag reset below rather
                # than returning with self.tag still set to "text".
                print("Error parsing [%s]: %s" % (self.page["title"], meta["error"]))
            elif "lat" in meta and "lon" in meta:
                if "desc" not in meta:
                    # No description: use the title minus the "File:"
                    # prefix and the trailing ".extension".
                    meta["desc"] = re.sub(
                        u"\\.[^\\.]+$", "", self.page["title"].replace("File:", "", 1)
                    )
                # take only first paragraph from description
                m = re.compile(u"<p>(.+?)</p>", re.M | re.S).search(meta["desc"])
                if m:
                    meta["desc"] = m.group(1)
                print(
                    "%d>%s: [%s,%s] [%s]"
                    % (
                        self.count,
                        self.page["title"],
                        meta["lat"],
                        meta["lon"],
                        meta["desc"],
                    )
                )
                # FIXME validate lat and lon as float
                try:
                    # assumes a timestamp element was already seen for this
                    # page; otherwise this lookup raises KeyError -- TODO confirm
                    q = "SELECT wpc_upsert (%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326),timestamptz '%s')" % (
                        sqlesc(self.page["title"]),
                        sqlesc(meta["desc"]),
                        float(meta["lat"]),
                        float(meta["lon"]),
                        self.page["timestamp"],
                    )
                    cc.execute(q)
                except ValueError:
                    # float() rejected a coordinate; report it and carry on.
                    print("ValueError: meta=%s" % repr(meta))
                # NOTE(review): placed at this level when re-indenting the
                # whitespace-collapsed source -- confirm against history.
                self.count = self.count + 1
    elif self.tag == "timestamp":
        m = re.match(r"(\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)Z", self.text.strip())
        if not m:
            # NOTE(review): BaseException bypasses ``except Exception``
            # handlers; kept so existing callers see the same type.
            raise BaseException("Invalid timestamp format [%s]" % self.text)
        self.page["timestamp"] = "%s %s+00" % (m.group(1), m.group(2))

    self.tag = None
def endElement(self, name):
    """Handle the end of an element and clear the current tag.

    Depending on ``self.tag``:

    * ``title``: the collected text becomes ``self.page['title']``.
    * ``text``: for pages whose title starts with ``File:``, the text is
      handed to ``wikiparser.parse``.  A result containing ``error`` is
      reported and skipped.  A result containing ``lat`` and ``lon`` is
      stored by calling the SQL function ``wpc_upsert`` through the ``cc``
      cursor, and ``self.count`` is incremented.
    * ``timestamp``: a ``YYYY-MM-DDThh:mm:ssZ`` value is rewritten as
      ``YYYY-MM-DD hh:mm:ss+00`` and kept in ``self.page['timestamp']``.

    ``self.tag`` is reset to ``None`` on every non-raising path, including
    the parse-error path (which previously returned early and left the
    stale ``text`` tag in place).

    ``name`` is only used for the debug trace.

    Raises:
        BaseException: if the timestamp text does not match the expected
            format.
    """
    if self.debug > 4:
        print('END TAG %s' % name)

    if self.tag == 'title':
        self.page['title'] = self.text
    elif self.tag == 'text':
        if re.search('^File:', self.page['title']):
            meta = wikiparser.parse(self.text)
            if 'error' in meta:
                # Report and fall through to the tag reset below rather
                # than returning with self.tag still set to 'text'.
                print("Error parsing [%s]: %s" % (self.page['title'], meta['error']))
            elif 'lat' in meta and 'lon' in meta:
                if 'desc' not in meta:
                    # No description: use the title minus the "File:"
                    # prefix and the trailing ".extension".
                    meta['desc'] = re.sub(
                        u'\\.[^\\.]+$', '',
                        self.page['title'].replace('File:', '', 1))
                # take only first paragraph from description
                m = re.compile(u'<p>(.+?)</p>', re.M | re.S).search(meta['desc'])
                if m:
                    meta['desc'] = m.group(1)
                print("%d>%s: [%s,%s] [%s]" % (
                    self.count, self.page['title'],
                    meta['lat'], meta['lon'], meta['desc']))
                # FIXME validate lat and lon as float
                try:
                    # assumes a timestamp element was already seen for this
                    # page; otherwise this lookup raises KeyError -- TODO confirm
                    q = "SELECT wpc_upsert (%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326),timestamptz '%s')" % (
                        sqlesc(self.page['title']),
                        sqlesc(meta['desc']),
                        float(meta['lat']),
                        float(meta['lon']),
                        self.page['timestamp'])
                    cc.execute(q)
                except ValueError:
                    # float() rejected a coordinate; report it and carry on.
                    print("ValueError: meta=%s" % repr(meta))
                # NOTE(review): placed at this level when re-indenting the
                # whitespace-collapsed source -- confirm against history.
                self.count = self.count + 1
    elif self.tag == 'timestamp':
        m = re.match(r'(\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)Z', self.text.strip())
        if not m:
            # NOTE(review): BaseException bypasses ``except Exception``
            # handlers; kept so existing callers see the same type.
            raise BaseException('Invalid timestamp format [%s]' % self.text)
        self.page['timestamp'] = "%s %s+00" % (m.group(1), m.group(2))

    self.tag = None