Beispiel #1
0
 def endElement(self, name):
   """Process a closing XML tag, dispatching on the tag tracked in self.tag.

   NOTE(review): presumably a SAX ContentHandler callback -- confirm against
   the enclosing class, which is not visible here.

   * 'title': stores self.text as self.page['title'].
   * 'text': if the page title starts with 'File:', the text is parsed with
     wikiparser.parse().  When the result carries both 'lat' and 'lon', a row
     is inserted into wpc_img through the cursor ``cc``.  A missing
     description defaults to the title without the 'File:' prefix and without
     the trailing extension; a description containing <p>...</p> is reduced
     to the content of its first paragraph.

   self.count is incremented for every page that has 'lat' and 'lon', even
   when the insert is skipped because a coordinate is not a valid float.
   self.tag is reset to None before returning.

   name -- name of the closing element; only used for debug output, the
           dispatch itself relies on self.tag.
   """
   if self.debug > 4:
     print("END TAG %s" % name)
   if self.tag == 'title':
     self.page['title'] = self.text
   if self.tag == 'text' and re.search('^File:', self.page['title']):
     meta = wikiparser.parse(self.text)
     if "lat" in meta and "lon" in meta:
       if "desc" not in meta:
         # Fall back to the file name: drop the 'File:' prefix and the
         # extension.  (u'\\.' spells the same pattern as the former ur'\.'
         # literal, which Python 3 no longer accepts.)
         meta["desc"] = re.sub(u'\\.[^\\.]+$', '', self.page['title'].replace('File:', '', 1))
       # take only first paragraph from description
       m = re.compile(u'<p>(.+?)</p>', re.M|re.S).search(meta["desc"])
       if m:
         meta["desc"] = m.group(1)
       print("%d>%s: [%s,%s] [%s]" % (self.count, self.page['title'], meta['lat'], meta['lon'], meta['desc']))
       try:
         # float() doubles as the coordinate validation: a non-numeric
         # value raises ValueError and the insert is skipped.
         q = u"INSERT INTO wpc_img (page, \"desc\", point) VALUES(%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326))" % (sqlesc(self.page['title']), sqlesc(meta['desc']), float(meta['lat']), float(meta['lon']))
         cc.execute(q)
       except ValueError:
         print("ValueError: meta=%s" % repr(meta))
       self.count = self.count + 1
   self.tag = None
Beispiel #2
0
def process(page):
  """Fetch the raw wikitext of a Commons page and sync its geotag to the DB.

  The page is downloaded with http_get() and parsed with wikiparser.parse().

  * If the result has both 'lat' and 'lon', the wpc_img row for the page is
    updated (when one already exists) or inserted, and a row is appended to
    wpc_done.  A missing description defaults to the page name without the
    'File:' prefix and without the trailing extension; a description
    containing <p>...</p> is reduced to the content of its first paragraph.
  * Otherwise any existing wpc_img row for the page is deleted.

  All statements go through the cursor ``cu`` using driver-side parameter
  binding, so no value is interpolated into the SQL text and coordinates are
  no longer rounded by '%lf' formatting.

  page -- page title, e.g. 'File:Example.jpg'.

  Coordinates that cannot be converted to float are reported and leave the
  database untouched.  On an HTTP error the process exits.
  """
  from urllib import quote_plus
  import urllib2
  url = "http://commons.wikimedia.org/w/index.php?title=" + quote_plus(page) + "&action=raw"
  try:
    p = http_get(url).read()
  except urllib2.HTTPError:
    print("HTTP Error [%s]" % url)
    # NOTE(review): exits with status 0 although the fetch failed; kept
    # as-is because callers may depend on it -- confirm whether intended.
    sys.exit(0)

  meta = wikiparser.parse(p)
  if "lat" not in meta or "lon" not in meta:
    # No geotag (any more): make sure the page is not listed.
    cu.execute("DELETE FROM wpc_img WHERE page=%s", (page,))
    return

  if "desc" not in meta:
    # Fall back to the file name: drop the 'File:' prefix and the extension.
    meta["desc"] = re.sub(u'\\.[^\\.]+$', '', page.replace('File:', '', 1))
  # take only first paragraph from description
  m = re.compile(u'<p>(.+?)</p>', re.M|re.S).search(meta["desc"])
  if m:
    meta["desc"] = m.group(1)
  print(">%s: [%s,%s] [%s]" % (page, meta['lat'], meta['lon'], meta['desc']))

  cu.execute("SELECT * FROM wpc_img WHERE page=%s", (page,))
  try:
    # float() doubles as the coordinate validation; it raises ValueError
    # before any write is issued.
    lat = float(meta['lat'])
    lon = float(meta['lon'])
    if cu.rowcount > 0:
      cu.execute(
        u"UPDATE wpc_img SET \"desc\"=%s, point=ST_SetSRID(ST_MakePoint(%s,%s),4326), date=NOW() WHERE page=%s",
        (meta['desc'], lat, lon, page))
    else:
      cu.execute(
        u"INSERT INTO wpc_img (page, \"desc\", point, date) VALUES(%s,%s,ST_SetSRID(ST_MakePoint(%s,%s),4326),NOW())",
        (page, meta['desc'], lat, lon))
    cu.execute(
      u"INSERT INTO wpc_done (page, \"desc\", lat, lon, done) VALUES (%s,%s,%s,%s,NOW())",
      (page, meta['desc'], lat, lon))
  except ValueError:
    print("ValueError: meta=%s" % repr(meta))
 def endElement(self, name):
     """Process a closing XML tag, dispatching on the tag tracked in self.tag.

     NOTE(review): presumably a SAX ContentHandler callback -- confirm
     against the enclosing class, which is not visible here.

     * "title": stores self.text as self.page["title"].
     * "text": if the page title starts with "File:", the text is parsed
       with wikiparser.parse().  A result carrying "error" is reported and
       the page is skipped.  When the result has both "lat" and "lon", the
       page is written through ``SELECT wpc_upsert(...)`` on the cursor
       ``cc``, using self.page["timestamp"] as the timestamp.  A missing
       description defaults to the title without the "File:" prefix and
       without the trailing extension; a description containing <p>...</p>
       is reduced to the content of its first paragraph.
     * "timestamp": converts "YYYY-MM-DDThh:mm:ssZ" into
       "YYYY-MM-DD hh:mm:ss+00" and stores it as self.page["timestamp"].

     self.count is incremented for every page that has "lat" and "lon", even
     when the upsert is skipped because a coordinate is not a valid float.
     self.tag is reset to None on every path that returns normally.

     name -- name of the closing element; only used for debug output, the
             dispatch itself relies on self.tag.

     Raises BaseException if the timestamp text has an unexpected format.
     """
     if self.debug > 4:
         print("END TAG %s" % name)
     if self.tag == "title":
         self.page["title"] = self.text
     if self.tag == "text" and re.search("^File:", self.page["title"]):
         meta = wikiparser.parse(self.text)
         if "error" in meta:
             print("Error parsing [%s]: %s" % (self.page["title"], meta["error"]))
             # Reset the tag before bailing out; otherwise the following end
             # tags would still see "text" and re-parse the same text.
             self.tag = None
             return
         if "lat" in meta and "lon" in meta:
             if "desc" not in meta:
                 # Fall back to the file name: drop the "File:" prefix and
                 # the extension.  (u"\\." spells the same pattern as the
                 # former ur"\." literal, which Python 3 no longer accepts.)
                 meta["desc"] = re.sub(u"\\.[^\\.]+$", "", self.page["title"].replace("File:", "", 1))
             # take only first paragraph from description
             m = re.compile(u"<p>(.+?)</p>", re.M | re.S).search(meta["desc"])
             if m:
                 meta["desc"] = m.group(1)
             print("%d>%s: [%s,%s] [%s]" % (
                 self.count,
                 self.page["title"],
                 meta["lat"],
                 meta["lon"],
                 meta["desc"],
             ))
             try:
                 # float() doubles as the coordinate validation: a
                 # non-numeric value raises ValueError and the upsert is
                 # skipped.  The timestamp is safe to interpolate because it
                 # was rebuilt from regex-matched digits in the "timestamp"
                 # branch below.
                 q = "SELECT wpc_upsert (%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326),timestamptz '%s')" % (
                     sqlesc(self.page["title"]),
                     sqlesc(meta["desc"]),
                     float(meta["lat"]),
                     float(meta["lon"]),
                     self.page["timestamp"],
                 )
                 cc.execute(q)
             except ValueError:
                 print("ValueError: meta=%s" % repr(meta))
             self.count = self.count + 1
     if self.tag == "timestamp":
         m = re.match(r"(\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)Z", self.text.strip())
         if not m:
             # NOTE(review): BaseException bypasses ``except Exception``
             # handlers; kept unchanged so existing callers behave the same.
             raise BaseException("Invalid timestamp format [%s]" % self.text)
         self.page["timestamp"] = "%s %s+00" % (m.group(1), m.group(2))
     self.tag = None
Beispiel #4
0
 def endElement(self, name):
     """Process a closing XML tag, dispatching on the tag tracked in self.tag.

     NOTE(review): presumably a SAX ContentHandler callback -- confirm
     against the enclosing class, which is not visible here.

     * 'title': stores self.text as self.page['title'].
     * 'text': if the page title starts with 'File:', the text is parsed
       with wikiparser.parse().  A result carrying 'error' is reported and
       the page is skipped.  When the result has both 'lat' and 'lon', the
       page is written through ``SELECT wpc_upsert(...)`` on the cursor
       ``cc``, using self.page['timestamp'] as the timestamp.  A missing
       description defaults to the title without the 'File:' prefix and
       without the trailing extension; a description containing <p>...</p>
       is reduced to the content of its first paragraph.
     * 'timestamp': converts 'YYYY-MM-DDThh:mm:ssZ' into
       'YYYY-MM-DD hh:mm:ss+00' and stores it as self.page['timestamp'].

     self.count is incremented for every page that has 'lat' and 'lon', even
     when the upsert is skipped because a coordinate is not a valid float.
     self.tag is reset to None on every path that returns normally.

     name -- name of the closing element; only used for debug output, the
             dispatch itself relies on self.tag.

     Raises BaseException if the timestamp text has an unexpected format.
     """
     if self.debug > 4:
         print("END TAG %s" % name)
     if self.tag == 'title':
         self.page['title'] = self.text
     if self.tag == 'text' and re.search('^File:', self.page['title']):
         meta = wikiparser.parse(self.text)
         if "error" in meta:
             print("Error parsing [%s]: %s" % (self.page['title'],
                                               meta['error']))
             # Reset the tag before bailing out; otherwise the following end
             # tags would still see 'text' and re-parse the same text.
             self.tag = None
             return
         if "lat" in meta and "lon" in meta:
             if "desc" not in meta:
                 # Fall back to the file name: drop the 'File:' prefix and
                 # the extension.  (u'\\.' spells the same pattern as the
                 # former ur'\.' literal, which Python 3 no longer accepts.)
                 meta["desc"] = re.sub(
                     u'\\.[^\\.]+$', '',
                     self.page['title'].replace('File:', '', 1))
             # take only first paragraph from description
             m = re.compile(u'<p>(.+?)</p>',
                            re.M | re.S).search(meta["desc"])
             if m:
                 meta["desc"] = m.group(1)
             print("%d>%s: [%s,%s] [%s]" % (
                 self.count, self.page['title'], meta['lat'],
                 meta['lon'], meta['desc']))
             try:
                 # float() doubles as the coordinate validation: a
                 # non-numeric value raises ValueError and the upsert is
                 # skipped.  The timestamp is safe to interpolate because it
                 # was rebuilt from regex-matched digits in the 'timestamp'
                 # branch below.
                 q = "SELECT wpc_upsert (%s,%s,ST_SetSRID(ST_MakePoint(%lf,%lf),4326),timestamptz '%s')" % (
                     sqlesc(self.page['title']), sqlesc(meta['desc']),
                     float(meta['lat']), float(meta['lon']),
                     self.page['timestamp'])
                 cc.execute(q)
             except ValueError:
                 print("ValueError: meta=%s" % repr(meta))
             self.count = self.count + 1
     if self.tag == 'timestamp':
         m = re.match(r'(\d\d\d\d-\d\d-\d\d)T(\d\d:\d\d:\d\d)Z',
                      self.text.strip())
         if not m:
             # NOTE(review): BaseException bypasses ``except Exception``
             # handlers; kept unchanged so existing callers behave the same.
             raise BaseException('Invalid timestamp format [%s]' %
                                 self.text)
         self.page['timestamp'] = "%s %s+00" % (m.group(1), m.group(2))
     self.tag = None